Step1
提取PDF中的图片,并另存
Step2
去除灰色纸张背景
"""Extract every embedded image from a PDF and save a desaturated,
contrast-boosted copy of each one (Step1 + Step2 of the pipeline)."""
import io
import os
import sys

import PyPDF2
from PIL import Image, ImageEnhance, ImageFilter
import cv2
import numpy as np
from skimage.filters import unsharp_mask
from skimage.filters import gaussian
from skimage.restoration import denoise_tv_chambolle

local = './'

# Input PDF and output folder for the processed images.
PDF_PATH = local + '001.pdf'
OUTPUT_DIR = './image_heibai/'

# Pillow colour-enhancement factor; a value this close to 0 removes
# practically all colour from the image.
SATURATION = 0.001
# cv2.convertScaleAbs parameters: alpha scales contrast, beta shifts brightness.
CONTRAST_ALPHA = 1.4
BRIGHTNESS_BETA = 0


def _iter_page_images(page):
    """Yield the stream data of every image XObject found on *page*.

    Pages that carry no '/Resources' or no '/XObject' entry yield nothing
    instead of raising KeyError.
    """
    try:
        xobjects = page['/Resources']['/XObject'].get_object()
    except KeyError:
        return
    for obj_name in xobjects:
        xobject = xobjects[obj_name]
        if xobject['/Subtype'] == '/Image':
            yield xobject.get_data()


def _desaturate_and_boost(img_data):
    """Return a BGR array of *img_data* with colour removed and contrast raised.

    The image is decoded and converted in memory, so no temporary files are
    written and no extra lossy JPEG re-encoding happens between the steps.

    Raises:
        OSError: if Pillow cannot decode *img_data* as an image file.
    """
    with Image.open(io.BytesIO(img_data)) as image:
        desaturated = ImageEnhance.Color(image).enhance(SATURATION)
        rgb = np.asarray(desaturated.convert('RGB'))
    # OpenCV expects channels in BGR order, Pillow delivers RGB.
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return cv2.convertScaleAbs(bgr, alpha=CONTRAST_ALPHA, beta=BRIGHTNESS_BETA)


def main():
    """Process every image of PDF_PATH and write the results to OUTPUT_DIR."""
    # cv2.imwrite does not create missing directories, so make sure it exists.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # The reader pulls data from the file on demand, so the file has to stay
    # open for the whole loop; the with-block closes it afterwards.
    with open(PDF_PATH, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        print("num : " + str(len(pdf_reader.pages)))

        pic_n = 0
        for page_num, page_obj in enumerate(pdf_reader.pages):
            for img_data in _iter_page_images(page_obj):
                try:
                    img_contrast = _desaturate_and_boost(img_data)
                except OSError as err:
                    # The stream is not in a file format Pillow recognises;
                    # skip this image rather than aborting the whole document.
                    print(
                        f"skipped image on page {page_num}: {err}",
                        file=sys.stderr,
                    )
                    continue

                img_path = f"{OUTPUT_DIR}{pic_n:04d}_page_{page_num:04d}.jpg"
                # imwrite signals failure through its return value only.
                if not cv2.imwrite(img_path, img_contrast):
                    raise OSError(f"could not write {img_path}")
                print("image : " + img_path)
                pic_n += 1


if __name__ == "__main__":
    main()
Step3
去除黑色边框
Step4
去除阴影部分,字清晰