先说一下应用场景,我们项目之前excel导出csv都是通过打开excel执行宏完成的,这样没法做到批量导出,于是我准备优化一下,两个思路:
1.还是利用excel的宏,写vbs外部执行,export_csv.vbs 具体代码如下:
' Open the given workbook in a hidden Excel instance, run its built-in
' CSV-export macro, then shut Excel down and report the processed file.
Function export_csv(filename)
    Dim oExcel, oWorkbooks
    Set oExcel = CreateObject("Excel.Application")
    oExcel.Visible = False
    Set oWorkbooks = oExcel.Workbooks.Open(filename)
    oExcel.Run "ThisWorkbook.导出CSV"
    oWorkbooks.Close
    Set oWorkbooks = Nothing
    oExcel.Quit
    Set oExcel = Nothing
    ' Fix: "WSH" is not a defined object under Windows Script Host;
    ' the intrinsic host object is "WScript".
    WScript.Echo filename
End Function

export_csv(WScript.Arguments.Item(0))
外部的bat调度脚本:
:: Scan every .xlsm file in the directory of this script and hand each
:: one to the VBS exporter (//B = batch mode: suppress host prompts).
for %%i in (%~dp0*.xlsm) do (
    CScript //B .\export_csv_new.vbs %%i
)

pause
这样可以完全按照策划之前的操作流程处理,但是有个最大的问题就是太慢了,因为具体的执行过程是:
1)打开excel文件,窗口隐藏
2) 运行excel文件的导出CSV宏
3)关闭excel文件
2.通过第三方库实现excel的读取和csv的写入,首选python,因为库还算多,最终实现了python中的pandas,openpyxl和xlrd三种方案,同时写入csv也有三种方案,包括pandas的dataframe to_csv,还有csv模块的writerow,还有自己实现了一个列拼接写入,代码如下
- import sys
- import os
- import re
- import pprint
- import inspect
- import csv
- import openpyxl
- import pandas as pd
- import xlrd
- import codecs
- from bs4 import BeautifulSoup
- from multiprocessing import Process
- from multiprocessing import freeze_support
-
def scan_files(directory, postfix):
    """Recursively collect files under *directory* whose name ends with *postfix*.

    Excel lock/backup copies (names starting with "~$") are skipped.

    :param directory: root directory to walk
    :param postfix: filename suffix to match, e.g. "xlsm"
    :return: list of full paths (root joined with filename)
    """
    files_list = []
    for root, sub_dirs, files in os.walk(directory):
        for special_file in files:
            # "~$" files are Excel's temporary lock copies — never export them
            if special_file.endswith(postfix) and not special_file.startswith("~$"):
                files_list.append(os.path.join(root, special_file))
    return files_list
-
def get_csv_file_name(sheet_name, target_dir):
    """Build the output CSV path for *sheet_name* inside *target_dir*.

    Appends the ".csv" extension unless the sheet name already carries it.
    Uses a Windows-style backslash separator, matching the rest of the tool.
    """
    base = sheet_name if sheet_name.endswith(".csv") else sheet_name + ".csv"
    return target_dir + "\\" + base
-
def export_single_excel_xlrd(file_name, target_dir):
    """Export every sheet of a workbook to CSV using xlrd.

    :param file_name: path of the Excel workbook to read
    :param target_dir: directory receiving one CSV per sheet
    """
    print(file_name)
    execl_file = xlrd.open_workbook(file_name)
    for sheet_name in execl_file.sheet_names():
        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        sheet_data = execl_file.sheet_by_name(sheet_name)

        # Fix: the old code handed an anonymous open() handle to csv.writer
        # and never closed it; close it deterministically with `with`.
        with open(csv_file_name, 'w', encoding='utf-8', newline='') as fh:
            writer = csv.writer(fh)
            for row_idx in range(sheet_data.nrows):
                writer.writerow(sheet_data.row_values(row_idx))
-
def pandas_dataframe_local_write_csv(sheet_data, csv_file_name):
    """Write a DataFrame to CSV by naive comma-joining of str() values.

    NOTE: no quoting/escaping is performed, so cell values containing
    commas or newlines will corrupt the output — same contract as before.

    :param sheet_data: pandas DataFrame to dump
    :param csv_file_name: destination path (written as UTF-8)
    """
    sep = ","
    # Fix: the codecs.open handle was never closed; use a context manager.
    with open(csv_file_name, "w", encoding="utf-8", newline="") as csv_file:
        # Header: the old code iterated sheet_data.head() — iterating any
        # DataFrame yields its column labels, so use .columns explicitly.
        csv_file.write(sep.join(str(col) for col in sheet_data.columns) + "\n")

        # Data rows, each value stringified.
        for row in sheet_data.values:
            csv_file.write(sep.join(str(col) for col in row) + "\n")
-
- def pandas_dataframe_lib_write_csv(sheet_data, csv_file_name):
- csv_file = csv.writer(open(csv_file_name,'w', encoding='utf-8', newline=''))
-
- for row in sheet_data:
- csv_file.writerow(sheet_data[row])
-
def export_single_excel_pandas_with_write(file_name, target_dir):
    """Read every sheet of a workbook with pandas and write each one to CSV
    using the hand-rolled column-join writer (NaN cells become "")."""
    print(file_name)

    sheets = pd.read_excel(io=file_name, sheet_name=None)

    for name, frame in sheets.items():
        target = get_csv_file_name(name, target_dir)
        print(target)

        pandas_dataframe_local_write_csv(frame.fillna(""), target)
        # alternative writer: pandas_dataframe_lib_write_csv(frame.fillna(""), target)
-
def export_single_excel_pandas_to_csv(file_name, target_dir):
    """Export the sheets of a workbook to CSV via DataFrame.to_csv.

    Sheets whose name does not start with [A-Za-z0-9_], or that start with
    "(O)", are treated as non-exported and skipped.

    :param file_name: path of the Excel workbook to read
    :param target_dir: directory receiving one CSV per exported sheet
    """
    # dtype pins one known column to object so its values are not rendered
    # in scientific notation (see the author's note on numeric formatting).
    execl_file = pd.read_excel(io=file_name, sheet_name=None,
                               dtype={'Power Coefficient (source)': object})

    for sheet_name in execl_file:
        # Skip sheets flagged as non-exported.
        if re.match(r"[a-zA-Z0-9_]", sheet_name) is None or sheet_name.startswith("(O)"):
            continue
        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        execl_file[sheet_name].to_csv(csv_file_name, index=False,
                                      float_format='%g', encoding='utf-8',
                                      header=True)
-
def export_single_excel_pandas_to_html_to_csv(file_name, target_dir):
    """Export sheets by rendering each to HTML first, then converting the
    HTML table to CSV.

    The HTML round-trip reproduces Excel's *displayed* formatting (e.g.
    float precision) rather than the raw stored values.

    :param file_name: path of the Excel workbook to read
    :param target_dir: directory receiving the CSVs (intermediate .html
        files are written there too and deleted by html_table_to_csv)
    """
    # Unlimited column width — otherwise long cells are truncated to "...".
    pd.set_option('display.max_colwidth', None)
    execl_file = pd.read_excel(io=file_name, sheet_name=None)

    for sheet_name in execl_file:
        # Skip sheets flagged as non-exported.
        if re.match(r"[a-zA-Z0-9_]", sheet_name) is None or sheet_name.startswith("(O)"):
            continue
        html_file_name = target_dir + "\\" + sheet_name + ".html"

        sheet_df = execl_file[sheet_name].fillna('')
        sheet_df.to_html(html_file_name, index=False, encoding='utf-8',
                         header=True, notebook=True)

        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        html_table_to_csv(html_file_name, csv_file_name)
-
-
def html_table_to_csv(html_file, csv_file_name):
    """Convert the first pandas-generated HTML table in *html_file* to CSV,
    then delete the intermediate HTML file.

    Each <tr> becomes one CSV row; both <td> and <th> cells are taken, so
    the header row is preserved.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # The dataframe table is the first table on the page.
    table = soup.findAll("table", {"class": "dataframe"})[0]

    # The context manager closes the CSV even if a row fails to serialize,
    # matching the old try/finally; on error the HTML is kept (unlink below
    # is only reached on success, as before).
    with open(csv_file_name, 'w', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        for tr in table.findAll("tr"):
            writer.writerow([cell.get_text() for cell in tr.findAll(['td', 'th'])])

    os.unlink(html_file)
-
def export_single_excel_openpyxl(file_name, target_dir):
    """Export the sheets of an .xlsx/.xlsm workbook to CSV using openpyxl.

    Rules (project conventions, unchanged from the original):
      - sheets whose name does not start with [A-Za-z0-9_], or that start
        with "(O)", are skipped entirely;
      - columns whose header cell (row 1) starts with "(O)" are dropped;
      - rows whose remaining cells are all empty are dropped.

    :param file_name: path of the workbook (data_only=True reads cached
        formula results instead of formulas)
    :param target_dir: directory receiving one CSV per exported sheet
    """
    print(file_name)
    execl_file = openpyxl.load_workbook(file_name, data_only=True)
    for sheet_name in execl_file.sheetnames:

        # Skip sheets flagged as non-exported.
        if re.match(r"[a-zA-Z0-9_]", sheet_name) is None or sheet_name.startswith("(O)"):
            continue

        csv_file_name = get_csv_file_name(sheet_name, target_dir)
        sheet_data = execl_file[sheet_name]

        # Fix: the old code handed an anonymous open() handle to csv.writer
        # and never closed it; close it deterministically with `with`.
        with open(csv_file_name, 'w', encoding='utf-8', newline='') as fh:
            csv_file = csv.writer(fh)
            ignore_field = []  # 1-based indices of "(O)"-prefixed columns
            for row_idx, row in enumerate(sheet_data.rows, start=1):
                row_data = []
                all_none = True
                for field_idx, field in enumerate(row, start=1):
                    # Header row: remember columns marked "(O)" to ignore.
                    # NOTE(review): assumes header cells are strings or None;
                    # a numeric header would raise here — confirm with data.
                    if row_idx == 1 and field.value is not None and field.value.startswith("(O)"):
                        ignore_field.append(field_idx)

                    if field_idx not in ignore_field:
                        row_data.append(field.value)
                        if field.value is not None:
                            all_none = False

                # Drop rows that are entirely empty after column filtering.
                if not all_none:
                    csv_file.writerow(row_data)
-
def split_list_by_n(list_collection, n):
    """Yield successive chunks of *n* elements from *list_collection*.

    The final chunk is shorter when the length is not divisible by *n*.

    :param list_collection: any sliceable sequence
    :param n: chunk size (must be positive)
    :return: generator of slices, each of length <= n
    """
    offsets = range(0, len(list_collection), n)
    for offset in offsets:
        yield list_collection[offset:offset + n]
-
def export_process(excel_list):
    """Worker entry point: export every workbook in *excel_list* to CSVs in
    the current directory.

    The export method is selected by sys.argv[1] ("pandas_csv",
    "html_2_csv", "openpyxl_csv", "xlrd_csv"); an unknown name falls back
    to the xlrd exporter, and no argument at all uses pandas to_csv.
    """
    print("sub total count:" + str(len(excel_list)))

    # Dispatch table replaces the original if/elif chain — same mapping.
    exporters = {
        "pandas_csv": export_single_excel_pandas_to_csv,
        "html_2_csv": export_single_excel_pandas_to_html_to_csv,
        "openpyxl_csv": export_single_excel_openpyxl,
        "xlrd_csv": export_single_excel_xlrd,
    }

    for execl_file in excel_list:
        print(execl_file + " start")

        if len(sys.argv) >= 2:
            exporter = exporters.get(sys.argv[1], export_single_excel_xlrd)
        else:
            exporter = export_single_excel_pandas_to_csv
        exporter(execl_file, ".")

        print(execl_file + " end")
-
def main(num_processes=8):
    """Scan the current directory for .xlsm files and export them with
    *num_processes* worker processes, each handling an equal share.

    :param num_processes: number of worker processes (previously a
        hard-coded 8; the default preserves the old behavior)
    """
    all_excel_files = scan_files(".", "xlsm")

    total = len(all_excel_files)
    print("total count:" + str(total))

    # Chunk size so the files split into at most num_processes chunks
    # (same formula as before, minus the stray semicolon).
    length = int(total / num_processes + 1)
    split_excel_list = split_list_by_n(all_excel_files, length)

    process_list = []
    for excel_list in split_excel_list:
        task = Process(target=export_process, args=(excel_list,))
        task.start()
        process_list.append(task)

    # Wait for every worker before declaring completion.
    for task in process_list:
        task.join()

    print("all complete!!!!!")
-
-
if __name__ == '__main__':
    # freeze_support() lets the pyinstaller-frozen exe spawn
    # multiprocessing workers on Windows without re-running main().
    freeze_support()
    main()
-
这是一个多进程版本,将目录下所有文件平均分成 8 份,由 8 个进程同时导出。最后还用 pyinstaller 把这个 python 脚本打包成了 exe,随处可以执行。
这其中有一部分pandas先导出html然后转换成csv,这是为了完全模拟excel中的显示效果,float数据同excel保留位数相同。
过程中遇到的问题:
1.多进程打包python文件的时候,用到了upx模块,安装方案:
【python 问题解决】 ---- UPX is not available. - 腾讯云开发者社区-腾讯云
2.实现excel->html->csv的时候,遇到了列值被截断的情况,
max_colwidth_doc = """
: int or None
The maximum width in characters of a column in the repr of
a pandas data structure. When the column overflows, a "..."
placeholder is embedded in the output. A 'None' value means unlimited.
"""
解决方法是执行下面这行设置,将列宽限制放开为无限制:
pd.set_option('display.max_colwidth', None)
3.除了html->csv之外,其他几种方法都遇到了数值被写成科学计数法的情况,我尝试了dtype和converters都不起作用,最后也还没有屏蔽掉,后续会继续探索,也希望解决过这个问题的朋友不吝赐教
4.html的table转csv的实现代码,源码出处:python之获取HTML表格并写入CSV文件_贾继康的博客-CSDN博客