• python 处理excel 识别图片文字 转换成表格内容输出


     pycharm idea 开发利器啊(主要为了进入活动)

    # This is a sample Python script.
    # Press Shift+F10 to execute it or replace it with your code.
    # Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
    import os
    import re
    import urllib
    import urllib.request

    # Read/write Excel 2007+ (.xlsx)
    import openpyxl
    import pytesseract
    import requests
    # Read/write Excel 2003 (.xls)
    import xlrd
    import xlwt
    from PIL import Image
    def deal_space():
        """Placeholder for the whitespace clean-up step; currently does nothing."""
        pass
    def deal_excel():
        """Build one .xlsx workbook from the "#"-separated text files.

        Every file found under ``txt/final_text`` becomes one worksheet named
        after the file (extension removed). Row 1 is a fixed header; each
        non-empty line of the file becomes one row made of a running index
        followed by the line's fields split on "#".

        The workbook is written with openpyxl: xlwt only emits the legacy
        binary .xls format, which does not match the .xlsx output name.
        """
        parentPath = os.path.join("txt", "final_text")
        outPut = "新建 XLSX 工作表.xlsx"
        wb = openpyxl.Workbook()
        # A new openpyxl workbook starts with one empty sheet; remember it so
        # it can be dropped once real sheets have been created.
        placeholder = wb.active
        for root, _dirs, files in os.walk(parentPath):
            for filename in files:
                filename_full_path = os.path.join(root, filename)
                print("filename_full_path " + filename_full_path)
                # splitext strips only the extension, unlike replace(".txt", "").
                sheet = wb.create_sheet(title=os.path.splitext(filename)[0])
                with open(filename_full_path, encoding="utf-8", mode="r") as f:
                    context = f.readlines()
                value = [['序号', "单位名称", "资质类别"]]
                index = 0
                for x in context:
                    text = x.rstrip("\n")
                    if text:
                        index = index + 1
                        value.append([index] + text.split("#"))
                print(value)
                for row in value:
                    for cell in row:
                        print(cell)
                    sheet.append(row)
        # Saving needs at least one sheet, so the placeholder is only removed
        # when some input file produced a real one.
        if len(wb.worksheets) > 1:
            wb.remove(placeholder)
        wb.save(outPut)
        print("写入数据成功!")
    def main():
        """Run the pipeline; only the Excel export step is currently enabled."""
        # Step 1: download the images.
        # download_pic()
        # Step 2: convert the images to grayscale.
        # hex_pic()
        # Step 3: OCR each image into its own text file.
        # generate_text()
        # Step 4: clean up whitespace in the recognized text.
        # deal_space()
        # Step 5: turn the text files into an Excel workbook.
        deal_excel()
    def hex_pic():
        """Convert every image under pic/origin to grayscale in pic/hex_pic.

        The source tree is walked recursively and its sub-directory layout is
        mirrored in the output tree, so the original images are never
        overwritten. The previously hard-coded sample image (02.jpg) is no
        longer handled separately; it is covered by the walk like any other
        file in the source directory.
        """
        src_dir = os.path.join("pic", "origin")
        dst_dir = os.path.join("pic", "hex_pic")
        for root, _dirs, files in os.walk(src_dir):
            # Map e.g. pic/origin/sub -> pic/hex_pic/sub.
            out_root = os.path.normpath(
                os.path.join(dst_dir, os.path.relpath(root, src_dir))
            )
            os.makedirs(out_root, exist_ok=True)
            for filename in files:
                filename_full_path = os.path.join(root, filename)
                print("filename_full_path " + filename_full_path)
                # Mode "L" is 8-bit grayscale: 0 is black, 255 is white and the
                # values in between are shades of gray.
                with Image.open(filename_full_path) as img:
                    img.convert('L').save(os.path.join(out_root, filename))
        # Optional follow-up, currently disabled: binarize a grayscale image
        # ("gray") before OCR. Pixels below the threshold map to 0, all
        # others map to 1.
        # threshold = 200
        # table = [0 if i < threshold else 1 for i in range(256)]
        # photo = gray.point(table, '1')
        # photo.save("test2.png")
        # text = pytesseract.image_to_string(Image.open("test2.png"))
    # Generate one text file per image.
    def generate_text():
        """OCR every image under pic/origin and save the text in txt/hex_pic.

        Each image is recognized with the Simplified Chinese language data
        ("chi_sim") and the result is written, UTF-8 encoded, to a .txt file
        that carries the image's base name.
        """
        parent_path = "pic/origin/"
        out_dir = os.path.join("txt", "hex_pic")
        os.makedirs(out_dir, exist_ok=True)
        for root, _dirs, files in os.walk(parent_path):
            for filename in files:
                filename_full_path = os.path.join(root, filename)
                print("filename_full_path " + filename_full_path)
                with Image.open(filename_full_path) as im:
                    res = pytesseract.image_to_string(im, lang='chi_sim')
                # splitext handles any image extension; replace(".jpg", ".txt")
                # left other names unchanged and wrote text into e.g. "x.png".
                out_name = os.path.splitext(filename)[0] + ".txt"
                with open(os.path.join(out_dir, out_name), encoding="utf-8", mode="w") as f:
                    f.write(res)
    # Step 1: download the images to the local disk.
    def download_pic():
        """Download every image referenced by the page into the pic directory.

        The page is fetched once, all ``data-original`` image URLs matching
        the pattern are collected, duplicates are dropped while keeping first
        occurrence order, and the images are saved as 01.jpg, 02.jpg, ...

        Raises:
            requests.HTTPError: if the page request returns an error status.
        """
        url = "https://xxxx"
        with requests.Session() as s:
            # A timeout keeps an unresponsive server from hanging the script.
            response = s.get(url, timeout=30)
            response.raise_for_status()
            context = response.text
        reg = r'data-original=\"(.*?r.jpg)\"'
        res = re.findall(reg, context)
        print(res)
        # dict.fromkeys removes duplicates and preserves the original order.
        res = list(dict.fromkeys(res))
        print(len(res))
        print(res)
        path = "pic"
        os.makedirs(path, exist_ok=True)
        for index, x in enumerate(res, start=1):
            # Zero-pad to two digits: 01.jpg ... 09.jpg, 10.jpg, ...
            target = os.path.join(path, str(index).zfill(2) + ".jpg")
            urllib.request.urlretrieve(x, target)
    # Press the green button in the gutter to run the script.
    if __name__ == '__main__':
        main()
    # See PyCharm help at https://www.jetbrains.com/help/pycharm/

    处理内容就是将图片生成文字

    然后再把文字处理成 Excel。上面全部是代码,需要有一点 Python 基础才能看懂(好像是句废话)。

    那么识别文字如何做的呢

    用python识别图片

    先下载tesseract

    网址:https://digi.bib.uni-mannheim.de/tesseract/ (页面标题为 Index of /tesseract)

     

    下载并安装好之后,记住安装目录,然后把该安装目录添加到系统环境变量 PATH 中。

    1. pip install --upgrade pip
    2. pip install pytesseract
    3. pip install pillow

    安装目录里面有一个

    tessdata

    这个是存放语言识别文件的。

    下载地址 :

    https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.00/chi_sim.traineddata

    这个是中文简体。繁体也可以去下载这个

    https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.00/chi_tra.traineddata

    下载好了丢进tessdata 即可

    识别中文文字

    with Image.open(filename_full_path) as im:
        # Recognize the text, using the Simplified Chinese language data.
        res = pytesseract.image_to_string(im, lang='chi_sim')
    # Write the recognized text to a .txt file named after the image.
    out_name = os.path.splitext(filename)[0] + ".txt"
    with open(os.path.join("txt", "hex_pic", out_name), encoding="utf-8", mode="w") as f:
        f.write(res)

    识别的结果如果不理想,可以先将图片灰度化(必要时再做二值化)

    with Image.open(filename_full_path) as img:
        # Mode "L" is 8-bit grayscale: 0 is black, 255 is white and the
        # values in between are shades of gray.
        gray = img.convert('L')
        # Save into pic/hex_pic so the original image is not overwritten.
        gray.save(os.path.join("pic", "hex_pic", os.path.basename(filename_full_path)))

    然后再次进行处理

    parentPath = os.path.join("txt", "final_text")
    outPut = "新建 XLSX 工作表.xlsx"
    # openpyxl writes real .xlsx files; xlwt only writes the legacy .xls format.
    wb = openpyxl.Workbook()
    # A new openpyxl workbook starts with one empty sheet; remember it so it
    # can be dropped once real sheets have been created.
    placeholder = wb.active
    for root, _dirs, files in os.walk(parentPath):
        for filename in files:
            filename_full_path = os.path.join(root, filename)
            print("filename_full_path " + filename_full_path)
            # The file name without its extension becomes the sheet name.
            sheet = wb.create_sheet(title=os.path.splitext(filename)[0])
            with open(filename_full_path, encoding="utf-8", mode="r") as f:
                context = f.readlines()
            # Header row, then one row per non-empty line: index + "#" fields.
            value = [['序号', "单位名称", "资质类别"]]
            index = 0
            for x in context:
                text = x.rstrip("\n")
                if text:
                    index = index + 1
                    value.append([index] + text.split("#"))
            print(value)
            for row in value:
                for cell in row:
                    print(cell)
                sheet.append(row)
    # Saving needs at least one sheet, so the placeholder is only removed when
    # some input file produced a real one.
    if len(wb.worksheets) > 1:
        wb.remove(placeholder)
    wb.save(outPut)
    print("写入数据成功!")

    处理后拿到文本,再按固定格式存好:每行一条记录,字段之间用 # 分隔(例如:单位名称#资质类别)

     文件名称就是我们的sheet名称

    我们随意地造一些假数据

    然后根据当前文本特点生成表格

    最后 查看表格内容

     

     

     

     

  • 相关阅读:
    springboot+社区疫苗管理系统 毕业设计-附源码191705
    搜索接口搜索“苏州协同创新智能科技时”超时调优
    C语言高级-5队列
    Qt QPushButton详解
    安装oh-my-zsh后全局包失效怎么解决
    Python写API
    fastadmin tp 安装使用百度富文本编辑器UEditor
    手动关闭PS中的TopazStudio2的登录窗口
    SQLAlchemy关联表删除策略设置
    GitHub项目里的api
  • 原文地址:https://blog.csdn.net/mp624183768/article/details/127562216