• python实现从excel导出csv最完整版本,openpyxl,pandas,xlrd全家桶


    先说一下应用场景,我们项目之前excel导出csv都是通过打开excel执行宏完成的,这样没法做到批量导出,于是我准备优化一下,两个思路:

    1.还是利用excel的宏,写vbs外部执行,export_csv.vbs 具体代码如下:

    1. Function export_csv(filename)
    2. Set oExcel = createobject("Excel.Application")
    3. oExcel.Visible = false
    4. Set oWorkbooks = oExcel.Workbooks.Open(filename)
    5. oExcel.Run "ThisWorkbook.导出CSV"
    6. oWorkbooks.Close
    7. Set oWorkbooks = nothing
    8. oExcel.Quit
    9. Set oExcel= nothing
    10. WSH.Echo filename
    11. End Function
    12. export_csv(WScript.Arguments.Item(0))

    外部的bat调度脚本:

    1. ::扫描当前目录所有xlsm文件,然后调用vbs处理
    2. for %%i in (%~dp0*.xlsm) do ( CScript //B .\export_csv_new.vbs %%i )
    3. pause

    这样可以完全按照策划之前的操作流程处理,但是有个最大的问题就是太慢了,因为具体的执行过程是:

    1)打开excel文件,窗口隐藏

    2)  运行excel文件的导出CSV宏

    3)关闭excel文件

    2.通过第三方库实现excel的读取和csv的写入,首选python,因为库还算多,最终实现了python中的pandas,openpyxl和xlrd三种方案,同时写入csv也有三种方案,包括pandas的dataframe to_csv,还有csv模块的writerow,还有自己实现了一个列拼接写入,代码如下

    1. import sys
    2. import os
    3. import re
    4. import pprint
    5. import inspect
    6. import csv
    7. import openpyxl
    8. import pandas as pd
    9. import xlrd
    10. import codecs
    11. from bs4 import BeautifulSoup
    12. from multiprocessing import Process
    13. from multiprocessing import freeze_support
    14. def scan_files(directory,postfix):
    15. files_list=[]
    16. for root, sub_dirs, files in os.walk(directory):
    17. for special_file in files:
    18. if special_file.endswith(postfix) and special_file.startswith("~$") == False:
    19. files_list.append(os.path.join(root,special_file))
    20. return files_list
    21. def get_csv_file_name(sheet_name, target_dir):
    22. if sheet_name.endswith(".csv"):
    23. csv_file_name = target_dir + "\\" + sheet_name
    24. else:
    25. csv_file_name = target_dir + "\\" + sheet_name+".csv"
    26. return csv_file_name
    27. def export_single_excel_xlrd(file_name, target_dir):
    28. print(file_name)
    29. execl_file = xlrd.open_workbook(file_name)
    30. for sheet_name in execl_file.sheet_names():
    31. csv_file_name = get_csv_file_name(sheet_name, target_dir)
    32. csv_file = csv.writer(open(csv_file_name,'w', encoding='utf-8', newline=''))
    33. sheet_data = execl_file.sheet_by_name(sheet_name)
    34. nrows = sheet_data.nrows
    35. ncols = sheet_data.ncols
    36. for row_idx in range(0,nrows):
    37. row_data = sheet_data.row_values(row_idx)
    38. csv_file.writerow(row_data)
    39. def pandas_dataframe_local_write_csv(sheet_data, csv_file_name):
    40. csv_file = codecs.open(csv_file_name, "w", "utf-8")
    41. sep = ","
    42. row_data=[]
    43. #write head
    44. for col in sheet_data.head():
    45. row_data.append(str(col))
    46. row_str = sep.join(row_data)
    47. csv_file.write(row_str+"\n")
    48. #write content
    49. for row in sheet_data.values:
    50. row_data=[]
    51. for col in row:
    52. row_data.append(str(col))
    53. row_str = sep.join(row_data)
    54. csv_file.write(row_str+"\n")
    55. def pandas_dataframe_lib_write_csv(sheet_data, csv_file_name):
    56. csv_file = csv.writer(open(csv_file_name,'w', encoding='utf-8', newline=''))
    57. for row in sheet_data:
    58. csv_file.writerow(sheet_data[row])
    59. def export_single_excel_pandas_with_write(file_name, target_dir):
    60. print(file_name)
    61. execl_file = pd.read_excel(io=file_name, sheet_name=None)
    62. for sheet_name in execl_file:
    63. csv_file_name = get_csv_file_name(sheet_name, target_dir)
    64. print(csv_file_name)
    65. sheet_data = execl_file[sheet_name].fillna("")
    66. pandas_dataframe_local_write_csv(sheet_data, csv_file_name)
    67. #pandas_dataframe_lib_write_csv(sheet_data, csv_file_name)
    68. def export_single_excel_pandas_to_csv(file_name, target_dir):
    69. execl_file = pd.read_excel(io=file_name, sheet_name=None, dtype={'Power Coefficient (source)':object,})
    70. for sheet_name in execl_file:
    71. #过滤不导出的sheet
    72. if re.match("[a-zA-Z0-9_]", sheet_name) == None or sheet_name.startswith("(O)"):
    73. continue
    74. csv_file_name = get_csv_file_name(sheet_name, target_dir)
    75. execl_file[sheet_name].to_csv(csv_file_name, index=False, float_format = '%g', encoding = 'utf-8', header=True);
    76. def export_single_excel_pandas_to_html_to_csv(file_name, target_dir):
    77. pd.set_option('display.max_colwidth', None)
    78. execl_file = pd.read_excel(io=file_name, sheet_name=None)
    79. for sheet_name in execl_file:
    80. #过滤不导出的sheet
    81. if re.match("[a-zA-Z0-9_]", sheet_name) == None or sheet_name.startswith("(O)"):
    82. continue
    83. html_file_name = target_dir + "\\" + sheet_name+".html"
    84. sheet_df = execl_file[sheet_name].fillna('')
    85. sheet_df.to_html(html_file_name, index=False, encoding = 'utf-8', header=True, notebook=True);
    86. csv_file_name = get_csv_file_name(sheet_name, target_dir)
    87. html_table_to_csv(html_file_name, csv_file_name)
    88. def html_table_to_csv(html_file, csv_file_name):
    89. with open(html_file, 'r', encoding='utf-8') as f:
    90. bsobj = BeautifulSoup(f.read(), 'html.parser')
    91. # 表格是当前页面上的第一个表格
    92. table = bsobj.findAll("table",{"class":"dataframe"})[0]
    93. # 一行
    94. rows = table.findAll("tr")
    95. # 写入
    96. csvFile = open(csv_file_name,'w', encoding='utf-8', newline='')
    97. writer = csv.writer(csvFile)
    98. try:
    99. # 遍历
    100. for row in rows:
    101. # 创建一个空列表
    102. csvRow = []
    103. # 'td'一行中的列,
    104. for cell in row.findAll(['td', 'th']):
    105. # 利用函数get_text()获取-添加
    106. csvRow.append(cell.get_text())
    107. # 写入
    108. writer.writerow(csvRow)
    109. finally:
    110. # 关闭
    111. csvFile.close()
    112. os.unlink(html_file)
    113. def export_single_excel_openpyxl(file_name, target_dir):
    114. print(file_name)
    115. execl_file = openpyxl.load_workbook(file_name, data_only=True)
    116. for sheet_name in execl_file.sheetnames:
    117. #过滤不导出的sheet
    118. if re.match("[a-zA-Z0-9_]", sheet_name) == None or sheet_name.startswith("(O)"):
    119. continue
    120. csv_file_name = get_csv_file_name(sheet_name, target_dir)
    121. csv_file = csv.writer(open(csv_file_name,'w', encoding='utf-8', newline=''))
    122. sheet_data = execl_file[sheet_name]
    123. row_idx = 1
    124. ignore_field = []
    125. for row in sheet_data.rows:
    126. row_data=[]
    127. all_none=True
    128. field_idx = 1
    129. for field in row:
    130. #查找表头中要忽略的列
    131. if row_idx == 1 and field.value != None and field.value.startswith("(O)"):
    132. ignore_field.append(field_idx)
    133. if field_idx not in ignore_field:
    134. row_data.append(field.value)
    135. if field.value != None:
    136. all_none = False
    137. field_idx = field_idx + 1
    138. #row_data = [a.value for a in row]
    139. if all_none != True:
    140. csv_file.writerow(row_data)
    141. row_idx = row_idx + 1
    142. def split_list_by_n(list_collection, n):
    143. """
    144. 将集合均分,每份n个元素
    145. :param list_collection:
    146. :param n:
    147. :return:返回的结果为评分后的每份可迭代对象
    148. """
    149. for i in range(0, len(list_collection), n):
    150. yield list_collection[i: i + n]
    151. def export_process(excel_list):
    152. print("sub total count:" + str(len(excel_list)))
    153. for execl_file in excel_list:
    154. print(execl_file + " start")
    155. if len(sys.argv) >= 2:
    156. if sys.argv[1] == "pandas_csv":
    157. export_single_excel_pandas_to_csv(execl_file, ".")
    158. elif sys.argv[1] == "html_2_csv":
    159. export_single_excel_pandas_to_html_to_csv(execl_file, ".")
    160. elif sys.argv[1] == "openpyxl_csv":
    161. export_single_excel_openpyxl(execl_file, ".")
    162. elif sys.argv[1] == "xlrd_csv":
    163. export_single_excel_xlrd(execl_file, ".")
    164. else:
    165. export_single_excel_xlrd(execl_file, ".")
    166. else:
    167. export_single_excel_pandas_to_csv(execl_file, ".")
    168. print(execl_file + " end")
    169. #cur_processed = cur_processed + 1
    170. def main():
    171. all_excel_files = scan_files(".", "xlsm")
    172. total = len(all_excel_files)
    173. cur_processed = 0
    174. length = int(len(all_excel_files) / 8 + 1);
    175. split_excel_list = split_list_by_n(all_excel_files, length)
    176. print("total count:" + str(total))
    177. process_list = []
    178. for excel_list in split_excel_list:
    179. task = Process(target=export_process, args=(excel_list,))
    180. task.start()
    181. process_list.append(task)
    182. for task in process_list:
    183. task.join()
    184. print("all complete!!!!!")
    185. if __name__ == '__main__':
    186. freeze_support()
    187. main()

    这是一个多进程版本,将目录下所有文件分成8分,8进程同时导出。最后还将这个python打包成了exe,随处可以执行,用的是pyinstaller。

    这其中有一部分pandas先导出html然后转换成csv,这是为了完全模拟excel中的显示效果,float数据同excel保留位数相同。

    过程中遇到的问题:

    1.多进程打包python文件的时候,用到了upx模块,安装方案:

    【python 问题解决】 ---- UPX is not available. - 腾讯云开发者社区-腾讯云

    2.实现excel->html->csv的时候,遇到了列值被截断的情况,

    max_colwidth_doc = """ 
    : int or None
        The maximum width in characters of a column in the repr of
        a pandas data structure. When the column overflows, a "..."
        placeholder is embedded in the output. A 'None' value means unlimited.
    """

    这里需要执行,放开成无限制

    pd.set_option('display.max_colwidth', None)

    3.除了html->csv之外,其他几种方法都遇到了数值被写成科学计数法的情况,我尝试了dtype和converters都不起作用,最后也还没有屏蔽掉,后续会继续探索,也希望解决过这个问题的朋友不吝赐教

    4.html的table转csv的实现代码,源码出处:python之获取HTML表格并写入CSV文件_贾继康的博客-CSDN博客

  • 相关阅读:
    安装Microsoft visual c++ 2022 X86 minimum runtime报错
    LeetCode笔记:Weekly Contest 319
    深度学习常用的Python库(核心库、可视化、NLP、计算机视觉、深度学习等)
    node18 vue2启动报错 error:0308010C:digital envelope routines::unsupported
    PHP如何批量修改二维数组中值
    Oracle P6 -SQLServer数据库乱码案例分享
    【教程】遥感数据与作物模型同化实践
    iperf3交叉编译
    Beego 使用教程 6:Web 输入处理
    嵌入式学习——硬件(ARM体系架构)——day51
  • 原文地址:https://blog.csdn.net/cleanfield/article/details/126532030