先说一下应用场景,我们项目之前excel导出csv都是通过打开excel执行宏完成的,这样没法做到批量导出,于是我准备优化一下,两个思路:
1.还是利用excel的宏,写vbs外部执行,export_csv.vbs 具体代码如下:
' Open the given workbook in a hidden Excel instance, run its built-in
' CSV-export macro, then shut Excel down and report the processed file.
Function export_csv(filename)
    Dim oExcel, oWorkbooks
    Set oExcel = CreateObject("Excel.Application")
    oExcel.Visible = False
    Set oWorkbooks = oExcel.Workbooks.Open(filename)
    oExcel.Run "ThisWorkbook.导出CSV"
    oWorkbooks.Close
    Set oWorkbooks = Nothing
    oExcel.Quit
    Set oExcel = Nothing
    ' Fix: "WSH" is not a defined object under Windows Script Host;
    ' the intrinsic host object is "WScript".
    WScript.Echo filename
End Function

export_csv(WScript.Arguments.Item(0))
外部的bat调度脚本:
:: Scan every .xlsm file in the directory of this script and hand each
:: one to the VBS exporter (//B = batch mode: suppress host prompts).
for %%i in (%~dp0*.xlsm) do (
    CScript //B .\export_csv_new.vbs %%i
)

pause
这样可以完全按照策划之前的操作流程处理,但是有个最大的问题就是太慢了,因为具体的执行过程是:
1)打开excel文件,窗口隐藏
2) 运行excel文件的导出CSV宏
3)关闭excel文件
2.通过第三方库实现excel的读取和csv的写入,首选python,因为库还算多,最终实现了python中的pandas,openpyxl和xlrd三种方案,同时写入csv也有三种方案,包括pandas的dataframe to_csv,还有csv模块的writerow,还有自己实现了一个列拼接写入,代码如下
- import sys
- import os
- import re
- import pprint
- import inspect
- import csv
- import openpyxl
- import pandas as pd
- import xlrd
- import codecs
- from bs4 import BeautifulSoup
- from multiprocessing import Process
- from multiprocessing import freeze_support
-
def scan_files(directory, postfix):
    """Recursively collect files under *directory* whose name ends with *postfix*.

    Excel lock/backup copies (names starting with "~$") are skipped.

    :param directory: root directory to walk
    :param postfix: filename suffix to match, e.g. "xlsm"
    :return: list of full paths (root joined with filename)
    """
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            # "~$" files are Excel's temporary lock copies — never export them
            if special_file.endswith(postfix) and not special_file.startswith("~$"):
                files_list.append(os.path.join(root, special_file))
    return files_list
-
def get_csv_file_name(sheet_name, target_dir):
    """Build the output CSV path for *sheet_name* inside *target_dir*.

    Appends the ".csv" extension unless the sheet name already carries it.
    Uses a Windows-style backslash separator, matching the rest of the tool.
    """
    base = sheet_name if sheet_name.endswith(".csv") else sheet_name + ".csv"
    return target_dir + "\\" + base
-
def export_single_excel_xlrd(file_name, target_dir):
    """Export every sheet of a workbook to CSV using xlrd.

    :param file_name: path of the Excel workbook to read
    :param target_dir: directory receiving one CSV per sheet
    """
    print(file_name)
    execl_file = xlrd.open_workbook(file_name)
    for sheet_name in execl_file.sheet_names():
        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        sheet_data = execl_file.sheet_by_name(sheet_name)

        # Fix: the old code handed an anonymous open() handle to csv.writer
        # and never closed it; close it deterministically with `with`.
        with open(csv_file_name, 'w', encoding='utf-8', newline='') as fh:
            writer = csv.writer(fh)
            for row_idx in range(sheet_data.nrows):
                writer.writerow(sheet_data.row_values(row_idx))
-
def pandas_dataframe_local_write_csv(sheet_data, csv_file_name):
    """Write a DataFrame to CSV by naive comma-joining of str() values.

    NOTE: no quoting/escaping is performed, so cell values containing
    commas or newlines will corrupt the output — same contract as before.

    :param sheet_data: pandas DataFrame to dump
    :param csv_file_name: destination path (written as UTF-8)
    """
    sep = ","
    # Fix: the codecs.open handle was never closed; use a context manager.
    with open(csv_file_name, "w", encoding="utf-8", newline="") as csv_file:
        # Header: the old code iterated sheet_data.head() — iterating any
        # DataFrame yields its column labels, so use .columns explicitly.
        csv_file.write(sep.join(str(col) for col in sheet_data.columns) + "\n")

        # Data rows, each value stringified.
        for row in sheet_data.values:
            csv_file.write(sep.join(str(col) for col in row) + "\n")
-
- def pandas_dataframe_lib_write_csv(sheet_data, csv_file_name):
- csv_file = csv.writer(open(csv_file_name,'w', encoding='utf-8', newline=''))
-
- for row in sheet_data:
- csv_file.writerow(sheet_data[row])
-
def export_single_excel_pandas_with_write(file_name, target_dir):
    """Read every sheet of a workbook with pandas and write each one to CSV
    using the hand-rolled column-join writer (NaN cells become "")."""
    print(file_name)

    sheets = pd.read_excel(io=file_name, sheet_name=None)

    for name, frame in sheets.items():
        target = get_csv_file_name(name, target_dir)
        print(target)

        pandas_dataframe_local_write_csv(frame.fillna(""), target)
        # alternative writer: pandas_dataframe_lib_write_csv(frame.fillna(""), target)
-
def export_single_excel_pandas_to_csv(file_name, target_dir):
    """Export the sheets of a workbook to CSV via DataFrame.to_csv.

    Sheets whose name does not start with [A-Za-z0-9_], or that start with
    "(O)", are treated as non-exported and skipped.

    :param file_name: path of the Excel workbook to read
    :param target_dir: directory receiving one CSV per exported sheet
    """
    # dtype pins one known column to object so its values are not rendered
    # in scientific notation (see the author's note on numeric formatting).
    execl_file = pd.read_excel(io=file_name, sheet_name=None,
                               dtype={'Power Coefficient (source)': object})

    for sheet_name in execl_file:
        # Skip sheets flagged as non-exported.
        if re.match(r"[a-zA-Z0-9_]", sheet_name) is None or sheet_name.startswith("(O)"):
            continue
        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        execl_file[sheet_name].to_csv(csv_file_name, index=False,
                                      float_format='%g', encoding='utf-8',
                                      header=True)
-
def export_single_excel_pandas_to_html_to_csv(file_name, target_dir):
    """Export sheets by rendering each to HTML first, then converting the
    HTML table to CSV.

    The HTML round-trip reproduces Excel's *displayed* formatting (e.g.
    float precision) rather than the raw stored values.

    :param file_name: path of the Excel workbook to read
    :param target_dir: directory receiving the CSVs (intermediate .html
        files are written there too and deleted by html_table_to_csv)
    """
    # Unlimited column width — otherwise long cells are truncated to "...".
    pd.set_option('display.max_colwidth', None)
    execl_file = pd.read_excel(io=file_name, sheet_name=None)

    for sheet_name in execl_file:
        # Skip sheets flagged as non-exported.
        if re.match(r"[a-zA-Z0-9_]", sheet_name) is None or sheet_name.startswith("(O)"):
            continue
        html_file_name = target_dir + "\\" + sheet_name + ".html"

        sheet_df = execl_file[sheet_name].fillna('')
        sheet_df.to_html(html_file_name, index=False, encoding='utf-8',
                         header=True, notebook=True)

        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        html_table_to_csv(html_file_name, csv_file_name)
-
-
def html_table_to_csv(html_file, csv_file_name):
    """Convert the first pandas-generated HTML table in *html_file* to CSV,
    then delete the intermediate HTML file.

    Each <tr> becomes one CSV row; both <td> and <th> cells are taken, so
    the header row is preserved.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # The dataframe table is the first table on the page.
    table = soup.findAll("table", {"class": "dataframe"})[0]

    # The context manager closes the CSV even if a row fails to serialize,
    # matching the old try/finally; on error the HTML is kept (unlink below
    # is only reached on success, as before).
    with open(csv_file_name, 'w', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        for tr in table.findAll("tr"):
            writer.writerow([cell.get_text() for cell in tr.findAll(['td', 'th'])])

    os.unlink(html_file)
-
def export_single_excel_openpyxl(file_name, target_dir):
    """Export the sheets of an .xlsx/.xlsm workbook to CSV using openpyxl.

    Rules (project conventions, unchanged from the original):
      - sheets whose name does not start with [A-Za-z0-9_], or that start
        with "(O)", are skipped entirely;
      - columns whose header cell (row 1) starts with "(O)" are dropped;
      - rows whose remaining cells are all empty are dropped.

    :param file_name: path of the workbook (data_only=True reads cached
        formula results instead of formulas)
    :param target_dir: directory receiving one CSV per exported sheet
    """
    print(file_name)
    execl_file = openpyxl.load_workbook(file_name, data_only=True)
    for sheet_name in execl_file.sheetnames:

        # Skip sheets flagged as non-exported.
        if re.match(r"[a-zA-Z0-9_]", sheet_name) is None or sheet_name.startswith("(O)"):
            continue

        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        sheet_data = execl_file[sheet_name]

        # Fix: the old code handed an anonymous open() handle to csv.writer
        # and never closed it; close it deterministically with `with`.
        with open(csv_file_name, 'w', encoding='utf-8', newline='') as fh:
            csv_file = csv.writer(fh)
            ignore_field = []  # 1-based indices of "(O)"-prefixed columns
            for row_idx, row in enumerate(sheet_data.rows, start=1):
                row_data = []
                all_none = True
                for field_idx, field in enumerate(row, start=1):
                    # Header row: remember columns marked "(O)" to ignore.
                    # NOTE(review): assumes header cells are strings or None;
                    # a numeric header would raise here — confirm with data.
                    if row_idx == 1 and field.value is not None and field.value.startswith("(O)"):
                        ignore_field.append(field_idx)

                    if field_idx not in ignore_field:
                        row_data.append(field.value)
                        if field.value is not None:
                            all_none = False

                # Drop rows that are entirely empty after column filtering.
                if not all_none:
                    csv_file.writerow(row_data)
-
def split_list_by_n(list_collection, n):
    """Yield successive chunks of *n* elements from *list_collection*.

    The final chunk is shorter when the length is not divisible by *n*.

    :param list_collection: any sliceable sequence
    :param n: chunk size (must be positive)
    :return: generator of slices, each of length <= n
    """
    offsets = range(0, len(list_collection), n)
    for offset in offsets:
        yield list_collection[offset:offset + n]
-
def export_process(excel_list):
    """Worker entry point: export every workbook in *excel_list* to CSVs in
    the current directory.

    The export method is selected by sys.argv[1] ("pandas_csv",
    "html_2_csv", "openpyxl_csv", "xlrd_csv"); an unknown name falls back
    to the xlrd exporter, and no argument at all uses pandas to_csv.
    """
    print("sub total count:" + str(len(excel_list)))

    # Dispatch table replaces the original if/elif chain — same mapping.
    exporters = {
        "pandas_csv": export_single_excel_pandas_to_csv,
        "html_2_csv": export_single_excel_pandas_to_html_to_csv,
        "openpyxl_csv": export_single_excel_openpyxl,
        "xlrd_csv": export_single_excel_xlrd,
    }

    for execl_file in excel_list:
        print(execl_file + " start")

        if len(sys.argv) >= 2:
            exporter = exporters.get(sys.argv[1], export_single_excel_xlrd)
        else:
            exporter = export_single_excel_pandas_to_csv
        exporter(execl_file, ".")

        print(execl_file + " end")
-
def main(num_processes=8):
    """Scan the current directory for .xlsm files and export them with
    *num_processes* worker processes, each handling an equal share.

    :param num_processes: number of worker processes (previously a
        hard-coded 8; the default preserves the old behavior)
    """
    all_excel_files = scan_files(".", "xlsm")

    total = len(all_excel_files)
    print("total count:" + str(total))

    # Chunk size so the files split into at most num_processes chunks
    # (same formula as before, minus the stray semicolon).
    length = int(total / num_processes + 1)
    split_excel_list = split_list_by_n(all_excel_files, length)

    process_list = []
    for excel_list in split_excel_list:
        task = Process(target=export_process, args=(excel_list,))
        task.start()
        process_list.append(task)

    # Wait for every worker before declaring completion.
    for task in process_list:
        task.join()

    print("all complete!!!!!")
-
-
if __name__ == '__main__':
    # freeze_support() lets the pyinstaller-frozen exe spawn
    # multiprocessing workers on Windows without re-running main().
    freeze_support()
    main()
-
这是一个多进程版本,将目录下所有文件平均分成 8 份,由 8 个进程同时导出。最后还用 pyinstaller 把这个 python 脚本打包成了 exe,随处可以执行。
这其中有一部分pandas先导出html然后转换成csv,这是为了完全模拟excel中的显示效果,float数据同excel保留位数相同。
过程中遇到的问题:
1.多进程打包python文件的时候,用到了upx模块,安装方案:
【python 问题解决】 ---- UPX is not available. - 腾讯云开发者社区-腾讯云
2.实现excel->html->csv的时候,遇到了列值被截断的情况,
max_colwidth_doc = """
: int or None
The maximum width in characters of a column in the repr of
a pandas data structure. When the column overflows, a "..."
placeholder is embedded in the output. A 'None' value means unlimited.
"""
解决方法是执行下面这行设置,将列宽限制放开为无限制:
pd.set_option('display.max_colwidth', None)
3.除了html->csv之外,其他几种方法都遇到了数值被写成科学计数法的情况,我尝试了dtype和converters都不起作用,最后也还没有屏蔽掉,后续会继续探索,也希望解决过这个问题的朋友不吝赐教
4.html的table转csv的实现代码,源码出处:python之获取HTML表格并写入CSV文件_贾继康的博客-CSDN博客