- # encoding: utf-8
- # 版权所有 2023 涂聚文有限公司
- # 许可信息查看:
- # 描述:
- # Author : geovindu,Geovin Du 涂聚文.
- # IDE : PyCharm 2023.1 python 311
- # Datetime : 2023/9/30 6:56
- # User : geovindu
- # Product : PyCharm
- # Project : pythonTkinterDemo
- # File : BaiduOCRAPI.py
- # explain : 学习
-
- import os
- import base64
- import requests
- import pandas as pd
- import json
-
-
-
- class BaiduOCR(object):
- """
- 利用百度API读取发票信息(pdf,image文件)
- """
-
- AppID="40226401"
- APIKey="geovindu"
- SecretKey="geovindu"
-
- def __init__(self):
- """
-
- """
- self.AppID="40226401"
- self.APIKey="geovindu" #
- self.SecretKey="geovindu" #
-
-
-
- def getAccessToken(self):
- """
-
- :param APIKey:
- :param SecretKey:
- :return:
- """
- '''
- host = f"https://aip.baidubce.com/oauth/2.0/token?client_secret={self.SecretKey}&grant_type=client_credentials&client_id={self.APIKey}"
- response = requests.get(host)
- return response.json()['access_token']
- '''
- url = "https://aip.baidubce.com/oauth/2.0/token"
- params = {"grant_type": "client_credentials", "client_id": self.APIKey, "client_secret": self.SecretKey}
- return str(requests.post(url, params=params).json().get("access_token"))
-
- def getContent(self,accessToken, pdfFile):
- """
-
- :param accessToken
- :param pdfFile:
- :return:
- """
- #headers = {'content-type': 'application/x-www-form-urlencoded'}
- #request_url = f"https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice?access_token={accessToken}"
- f = open(pdfFile, 'rb')
- pdf = base64.b64encode(f.read())
- print(pdf)
- print(accessToken)
- request_url = f"https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
- params = {"pdf_file": pdf}
- access_token =accessToken # '[调用鉴权接口获取的token]'
- request_url = request_url + "?access_token=" + access_token
- headers = {'content-type': 'application/x-www-form-urlencoded'}
- response = requests.post(request_url, data=params, headers=headers)
- if response:
- print(response.json())
-
- #print(pdf)
- #params = {"pdf_file": pdf}
- #response = requests.post(request_url, data=params, headers=headers)
- #print(response.json())
- return response.json()
-
-
- def getContentPng(self,accessToken, pngFile):
- """
-
- :param accessToken
- :param pngFile:
- :return:
- """
- #headers = {'content-type': 'application/x-www-form-urlencoded'}
- #request_url = f"https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice?access_token={accessToken}"
- f = open(pngFile, 'rb')
- pdf = base64.b64encode(f.read())
- print(pdf)
- print(accessToken)
- request_url = f"https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
- params = {"image": pdf}
- access_token =accessToken # '[调用鉴权接口获取的token]'
- request_url = request_url + "?access_token=" + access_token
- headers = {'content-type': 'application/x-www-form-urlencoded'}
- response = requests.post(request_url, data=params, headers=headers)
- if response:
- print(response.json())
-
- #print(pdf)
- #params = {"pdf_file": pdf}
- #response = requests.post(request_url, data=params, headers=headers)
- #print(response.json())
- return response.json()
-
- def getUsefulInfo(self,content, pdf_name):
- """
-
- :param content
- :param pdf_name:
- :return:
- """
- jsonstr = content
- print("Json",jsonstr)
- words_result = jsonstr['words_result']
- info = {'发票文件名': pdf_name,
- '发票号码': str(words_result['InvoiceNum']),
- '开票日期': words_result['InvoiceDate'],
- '货物名称': words_result['CommodityName'][0]['word'],
- '未税金额': words_result['CommodityAmount'][0]['word'],
- '货物税率': words_result['CommodityTaxRate'][0]['word'],
- '货物税额': words_result['CommodityTax'][0]['word'],
- '合计金额': words_result['TotalAmount'],
- '合计税额': words_result['TotalTax'],
- '价税合计(小写)': words_result['AmountInFiguers'],
- '价税合计(大写)': words_result['AmountInWords'],
- '销售方名称': words_result['SellerName'],
- '销售方纳税人识别号': words_result['SellerRegisterNum'],
- '销售方银行及账户': words_result['SellerBank'],
- '销售方地址及电话': words_result['SellerAddress']}
- return info
调用:用京东多张发票测试成功
- ocr=Common.BaiduOCRAPI.BaiduOCR()
- pdfFilelist = os.listdir("invoice/")
- infolist = []
- for pdfFile in pdfFilelist:
- if pdfFile.split(".")[-1] == 'pdf':
- pdfName = pdfFile.split(".")[:-1]
- print(pdfFile)
- access_token =ocr.getAccessToken()
- content = ocr.getContent(access_token, "invoice/" + pdfFile)
- info = ocr.getUsefulInfo(content, pdfName)
- infolist.append(info)
-
- df = pd.DataFrame(infolist)
- print(df)
- #df.to_excel('增值税发票信息统计.xlsx', sheet_name="geovindu",index=False)
- with pd.ExcelWriter('geovindu.xlsx') as writer: #, mode='a' 附加
- df.to_excel(writer, sheet_name='geovindu', index=False)

