• python: 用百度API读取增值税发票信息


    1. # encoding: utf-8
    2. # 版权所有 2023 涂聚文有限公司
    3. # 许可信息查看:
    4. # 描述:
    5. # Author : geovindu,Geovin Du 涂聚文.
    6. # IDE : PyCharm 2023.1 python 311
    7. # Datetime : 2023/9/30 6:56
    8. # User : geovindu
    9. # Product : PyCharm
    10. # Project : pythonTkinterDemo
    11. # File : BaiduOCRAPI.py
    12. # explain : 学习
    13. import os
    14. import base64
    15. import requests
    16. import pandas as pd
    17. import json
    class BaiduOCR(object):
        """
        Read VAT invoice information (PDF or image files) through the Baidu OCR API.

        Typical use: call ``getAccessToken()`` once, pass the token to
        ``getContent()`` (PDF) or ``getContentPng()`` (image), then flatten the
        returned JSON with ``getUsefulInfo()``.
        """

        # Default credentials used when the constructor gets no override.
        AppID = "40226401"
        APIKey = "geovindu"
        SecretKey = "geovindu"

        # Endpoints used by this class.
        TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"
        VAT_INVOICE_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"

        # Seconds to wait on the network; without a timeout a stalled
        # connection would block the caller forever.
        REQUEST_TIMEOUT = 30

        def __init__(self, AppID=None, APIKey=None, SecretKey=None):
            """
            Create a client.

            :param AppID: application id; defaults to the class-level ``AppID``.
            :param APIKey: API key (sent as OAuth ``client_id``); defaults to the class-level ``APIKey``.
            :param SecretKey: secret key (sent as OAuth ``client_secret``); defaults to the class-level ``SecretKey``.
            """
            cls = type(self)
            self.AppID = cls.AppID if AppID is None else AppID
            self.APIKey = cls.APIKey if APIKey is None else APIKey
            self.SecretKey = cls.SecretKey if SecretKey is None else SecretKey

        def getAccessToken(self):
            """
            Request an access token with the client-credentials grant.

            :return: the access token as a string.
            :raises RuntimeError: when the response carries no ``access_token``
                (previously the literal string "None" was returned and later
                sent to the OCR endpoint).
            """
            params = {
                "grant_type": "client_credentials",
                "client_id": self.APIKey,
                "client_secret": self.SecretKey,
            }
            response = requests.post(self.TOKEN_URL, params=params, timeout=self.REQUEST_TIMEOUT)
            payload = response.json()
            token = payload.get("access_token")
            if not token:
                raise RuntimeError(f"Baidu token request returned no access_token: {payload}")
            return str(token)

        def _recognize(self, accessToken, filePath, fieldName):
            """
            Post one base64-encoded file to the VAT invoice endpoint.

            :param accessToken: token obtained from ``getAccessToken()``.
            :param filePath: path of the file to upload.
            :param fieldName: form field that carries the payload ("pdf_file" or "image").
            :return: the decoded JSON response.
            """
            # The context manager closes the handle even if reading fails.
            with open(filePath, 'rb') as handle:
                encoded = base64.b64encode(handle.read())
            request_url = self.VAT_INVOICE_URL + "?access_token=" + accessToken
            headers = {'content-type': 'application/x-www-form-urlencoded'}
            response = requests.post(
                request_url,
                data={fieldName: encoded},
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            result = response.json()  # parse once, reuse for print and return
            if response:
                # Only successful (non-4xx/5xx) responses are echoed.
                print(result)
            return result

        def getContent(self, accessToken, pdfFile):
            """
            Recognise a VAT invoice stored as a PDF file.

            :param accessToken: token obtained from ``getAccessToken()``.
            :param pdfFile: path of the PDF file.
            :return: the decoded JSON response.
            """
            return self._recognize(accessToken, pdfFile, "pdf_file")

        def getContentPng(self, accessToken, pngFile):
            """
            Recognise a VAT invoice stored as an image file.

            :param accessToken: token obtained from ``getAccessToken()``.
            :param pngFile: path of the image file.
            :return: the decoded JSON response.
            """
            return self._recognize(accessToken, pngFile, "image")

        @staticmethod
        def _first_word(rows):
            """Return the 'word' of the first entry in *rows*, or '' when *rows* is empty."""
            return rows[0]['word'] if rows else ''

        def getUsefulInfo(self, content, pdf_name):
            """
            Flatten an OCR response into a single-row dictionary.

            :param content: decoded JSON response containing a 'words_result' mapping.
            :param pdf_name: name stored under the invoice-file-name column.
            :return: dict keyed by the Chinese column titles used in the report.
            :raises KeyError: when 'words_result' or one of the expected fields is missing.
            """
            jsonstr = content
            print("Json", jsonstr)
            words_result = jsonstr['words_result']
            first_word = self._first_word
            info = {
                '发票文件名': pdf_name,
                '发票号码': str(words_result['InvoiceNum']),
                '开票日期': words_result['InvoiceDate'],
                # Commodity columns are lists of rows; only the first row is reported.
                '货物名称': first_word(words_result['CommodityName']),
                '未税金额': first_word(words_result['CommodityAmount']),
                '货物税率': first_word(words_result['CommodityTaxRate']),
                '货物税额': first_word(words_result['CommodityTax']),
                '合计金额': words_result['TotalAmount'],
                '合计税额': words_result['TotalTax'],
                '价税合计(小写)': words_result['AmountInFiguers'],
                '价税合计(大写)': words_result['AmountInWords'],
                '销售方名称': words_result['SellerName'],
                '销售方纳税人识别号': words_result['SellerRegisterNum'],
                '销售方银行及账户': words_result['SellerBank'],
                '销售方地址及电话': words_result['SellerAddress'],
            }
            return info

    调用:用京东多张发票测试成功

    # Batch example: run OCR on every PDF in "invoice/" and write one row per
    # invoice into an Excel workbook.
    ocr = Common.BaiduOCRAPI.BaiduOCR()
    invoiceDir = "invoice/"
    # The token does not change between files, so request it once up front.
    access_token = ocr.getAccessToken()
    infolist = []
    # sorted() gives a stable row order; os.listdir() order is arbitrary.
    for pdfFile in sorted(os.listdir(invoiceDir)):
        # splitext yields the stem as a plain string; the previous
        # split(".")[:-1] produced a list such as ['invoice1'].
        pdfName, extension = os.path.splitext(pdfFile)
        if extension.lower() != '.pdf':
            continue
        print(pdfFile)
        content = ocr.getContent(access_token, os.path.join(invoiceDir, pdfFile))
        info = ocr.getUsefulInfo(content, pdfName)
        infolist.append(info)
    df = pd.DataFrame(infolist)
    print(df)
    # ExcelWriter is used so more sheets can be added later (mode='a' appends).
    with pd.ExcelWriter('geovindu.xlsx') as writer:
        df.to_excel(writer, sheet_name='geovindu', index=False)

  • 相关阅读:
    1798_GNU pdf阅读器evince_支持的格式
    详解傅立叶变换,看这一文足矣!
    CSS中的定位
    Spring框架(缺SM整合)
    Docker清理
    数据中心浸没液冷中冷却液关键问题研究
    企业落地数字化转型,战略规划应该放在首要位置
    vue2原理初探-数据代理和数据劫持
    Django中使用Ajax时使用CSRF保护
    猿创征文|瑞吉外卖——管理端_菜品管理_2
  • 原文地址:https://blog.csdn.net/geovindu/article/details/133426839