Python 读取电子发票PDF 转成Excel
目录
只识别普通电子发票PDF,提取其中某些关键内容到excel中。
如下4个库是经常更新维护的!
pyPDF/pyPDF2、pdfplumber、PyMuPDF、Camelot等4个库。
个人推荐pdfplumber,它有extract_tables函数
- import pdfplumber
- import re
- import os
- import pandas as pd
- '''
- 处理 普通发票电子PDF 提取关键字段内容信息写入excel中
- 其它类的发票 自己参考对应规律 更改代码即可
- 参考:pdfplumber官网、以及:https://blog.csdn.net/Burannnn/article/details/129393295
- '''
- def re_text(bt, text):
- # re 搜索正则匹配 包含re.compile包含的文字内容
- m1 = re.search(bt, text)
- if m1 is not None:
- return re_block(m1[0])
- return None
-
- def re_block(text):
- # 去掉空格、中英文小括号、中文冒号变英文冒号;去掉中文全角空格
- return text.replace(' ', '').replace(' ', '').replace(')', '').replace(')', '').replace(':', ':')
-
- def get_pdf(dir_path):
- pdf_file = []
- for root, sub_dirs, file_names in os.walk(dir_path):
- for name in file_names:
- if name.endswith('.pdf'):
- filepath = os.path.join(root, name)
- pdf_file.append(filepath)
- return pdf_file
-
- def read(xlsx_path, pdf_root):
- # 构建excel writer 写入器
- writer = pd.ExcelWriter(xlsx_path)
- # 如果字段不通用 则需要单独拎出来判断,这里我全部拎出来做了if判断
- all_fields = {
- "开票日期": [],
- "名称": [],
- "纳税人识别号": [],
- "价税合计(小写)": [],
- "货物或应税劳务、服务名称": [],
- "规格型号": [],
- "单位": [],
- "数量": [],
- "单价": [],
- "金额": [],
- "税率": [],
- "税额": [],
- }
-
- filenames = get_pdf(pdf_root)
- for filename in filenames:
- print(f"正在读取:{filename}")
- with pdfplumber.open(filename) as pdf:
- first_page = pdf.pages[0]
- pdf_text = first_page.extract_text()
- # print(pdf_text)
-
- kaipiao = re_text(re.compile(r'开票日期(.*)'), pdf_text)
- if kaipiao:
- all_fields["开票日期"].append(kaipiao.replace("开票日期:", ""))
- mingcheng = re_text(re.compile(r'名\s*称\s*[::]\s*([\u4e00-\u9fa5]+)'), pdf_text)
- if mingcheng:
- all_fields["名称"].append(mingcheng.replace("名称:", ""))
- # nashuiren = re_text(re.compile(r'纳税人识别号\s*[::]\s*([a-zA-Z0-9]+)'), pdf_text)
- # if nashuiren:
- # all_fields["纳税人识别号"].append(nashuiren.replace("纳税人识别号:", ""))
- jine = re_text(re.compile(r'小写.*(.*[0-9.]+)'), pdf_text)
- if jine:
- all_fields["价税合计(小写)"].append(jine.replace("小写¥", ""))
- table = first_page.extract_tables()[0]
-
- # 纳税人识别号 购买方
- for t in table[0]:
- t_ = str(t).replace(" ", "")
- nashuiren = re_text(re.compile(r'纳税人识别号\s*[::]\s*([a-zA-Z0-9]+)'), t_)
- if nashuiren:
- all_fields["纳税人识别号"].append(nashuiren.replace("纳税人识别号:", ""))
-
- # 这里根据pdfplumber提取table来依次输出,查看规律(适合普通发票,其它发票打印输出看规律即可)
- for t in table[1]:
- if not t:
- continue
- t_ = str(t).replace(" ", "") # 去掉空格
-
- ts = t_.split("\n")
- if "货物或应税劳务、服务名称" in t_:
- if len(ts) > 1:
- all_fields["货物或应税劳务、服务名称"].append(ts[1])
- else:
- all_fields["货物或应税劳务、服务名称"].append("")
- if "规格型号" in t_:
- if len(ts) > 1:
- all_fields["规格型号"].append(ts[1])
- else:
- all_fields["规格型号"].append("")
- if "单位" in t_:
- if len(ts) > 1:
- all_fields["单位"].append(ts[1])
- else:
- all_fields["单位"].append("")
- if "数量" in t_:
- if len(ts) > 1:
- all_fields["数量"].append(ts[1])
- else:
- all_fields["数量"].append("")
- if "单价" in t_:
- if len(ts) > 1:
- all_fields["单价"].append(ts[1])
- else:
- all_fields["单价"].append("")
- if "税率" in t_:
- if len(ts) > 1:
- all_fields["税率"].append(ts[1])
- else:
- all_fields["税率"].append("")
- if "金额" in t_:
- if len(ts) > 1:
- all_fields["金额"].append(ts[1])
- else:
- all_fields["金额"].append("")
- if "税额" in t_:
- if len(ts) > 1:
- all_fields["税额"].append(ts[1])
- else:
- all_fields["税额"].append("")
-
- # print('--------------------------------------------------------')
- # print(re_text(re.compile(r'[\u4e00-\u9fa5]+电子普通发票.*?'), pdf_text))
-
- # # print(re_text(re.compile(r'发票代码(.*\d+)'), pdf_text))
- # print(re_text(re.compile(r'发票号码(.*\d+)'), pdf_text))
- # print(re_text(re.compile(r'开票日期(.*)'), pdf_text))
- # print(re_text(re.compile(r'名\s*称\s*[::]\s*([\u4e00-\u9fa5]+)'), pdf_text))
- # print(re_text(re.compile(r'纳税人识别号\s*[::]\s*([a-zA-Z0-9]+)'), pdf_text))
- # price = re_text(re.compile(r'小写.*(.*[0-9.]+)'), pdf_text)
- # print(price)
- # company = re.findall(re.compile(r'名.*称\s*[::]\s*([\u4e00-\u9fa5]+)'), pdf_text)
- # if company:
- # print(re_block(company[len(company)-1]))
- # print('--------------------------------------------------------')
- print(all_fields)
- df = pd.DataFrame(all_fields)
- df.to_excel(writer)
- writer.save()
-
- return
-
- pdf_root = r"G:\PDF"
- xlsx_path = r"G:\PDF\all_fields.xlsx"
-
- read(xlsx_path, pdf_root)
-
对于专用发票,找到对应的规律即可。这里最好用的是extract_tables函数,打印出来,找规律即可!