• VBA处理数据与Python Pandas处理数据案例比较分析


    需求:

    现有一个 csv文件,包含'CNUM'和'COMPANY'两列,数据里包含空行,且有内容重复的行数据。

    要求:

    1)去掉空行;
    2)重复行数据只保留一行有效数据;
    3)修改'COMPANY'列的名称为'Company_New‘;
    4)并在其后增加六列,分别为'C_col',‘D_col',‘E_col',‘F_col',‘G_col',‘H_col'。

    一,使用 Python Pandas来处理

    1

    2

    3

    4

    5

    6

    7

    8

    9

    10

    11

    12

    13

    14

    15

    16

    17

    18

    19

    import pandas as pd

    import numpy as np

    from pandas import DataFrame,Series

    def deal_with_data(filepath,newpath):

      file_obj=open(filepath)

      df=pd.read_csv(file_obj)  # 读取csv文件,创建 DataFrame

      df=df.reindex(columns=['CNUM','COMPANY','C_col','D_col','E_col','F_col','G_col','H_col'],fill_value=None# 重新指定列索引

      df.rename(columns={'COMPANY':'Company_New'}, inplace = True) # 修改列名

      df=df.dropna(axis=0,how='all')         # 去除 NAN 即文件中的空行

      df['CNUM'] = df['CNUM'].astype('int32')    # 将 CNUM 列的数据类型指定为 int32

      df = df.drop_duplicates(subset=['CNUM', 'Company_New'], keep='first') # 去除重复行

      df.to_csv(newpath,index=False,encoding='GBK')

      file_obj.close()

       

    if __name__=='__main__':

      file_path=r'C:\Users\12078\Desktop\python\CNUM_COMPANY.csv'

      file_save_path=r'C:\Users\12078\Desktop\python\CNUM_COMPANY_OUTPUT.csv'

      deal_with_data(file_path,file_save_path)

    二,使用 VBA来处理:

    1

    2

    3

    4

    5

    6

    7

    8

    9

    10

    11

    12

    13

    14

    15

    16

    17

    18

    19

    20

    21

    22

    23

    24

    25

    26

    27

    28

    29

    30

    31

    32

    33

    34

    35

    36

    37

    38

    39

    40

    41

    42

    43

    44

    45

    46

    47

    48

    49

    50

    51

    52

    53

    54

    55

    56

    57

    58

    59

    60

    61

    62

    63

    64

    65

    66

    67

    68

    69

    70

    71

    72

    73

    74

    75

    76

    77

    78

    79

    80

    81

    82

    83

    84

    85

    86

    87

    88

    89

    90

    91

    92

    93

    94

    95

    96

    97

    98

    99

    100

    101

    102

    103

    104

    Option Base 1

    Option Explicit

    Sub main()

     On Error GoTo error_handling

     Dim wb         As Workbook

     Dim wb_out       As Workbook

     Dim sht         As Worksheet

     Dim sht_out       As Worksheet

     Dim rng         As Range

     Dim usedrows      As Byte

     Dim usedrows_out    As Byte

     Dim dict_cnum_company  As Object

     Dim str_file_path    As String

        Dim str_new_file_path  As String

        'assign values to variables:

        str_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY.csv"

        str_new_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY_OUTPUT.csv"

      

     Set wb = checkAndAttachWorkbook(str_file_path)

     Set sht = wb.Worksheets("CNUM_COMPANY")

     Set wb_out = Workbooks.Add

     wb_out.SaveAs str_new_file_path, xlCSV 'create a csv file

     Set sht_out = wb_out.Worksheets("CNUM_COMPANY_OUTPUT")

     Set dict_cnum_company = CreateObject("Scripting.Dictionary")

     usedrows = WorksheetFunction.Max(getLastValidRow(sht, "A"), getLastValidRow(sht, "B"))

     'rename the header 'COMPANY' to 'Company_New',remove blank & duplicate lines/rows.

     Dim cnum_company As String

     cnum_company = ""

     For Each rng In sht.Range("A1", "A" & usedrows)

       If VBA.Trim(rng.Offset(0, 1).Value) = "COMPANY" Then

         rng.Offset(0, 1).Value = "Company_New"

       End If

       cnum_company = rng.Value & "-" & rng.Offset(0, 1).Value

       If VBA.Trim(cnum_company) <> "-" And Not dict_cnum_company.Exists(rng.Value & "-" & rng.Offset(0, 1).Value) Then

         dict_cnum_company.Add rng.Value & "-" & rng.Offset(0, 1).Value, ""

       End If

     Next rng

      

     'loop the keys of dict split the keyes by '-' into cnum array and company array.

     Dim index_dict As Byte

     Dim arr_cnum()

     Dim arr_Company()

     For index_dict = 0 To UBound(dict_cnum_company.keys)

       ReDim Preserve arr_cnum(1 To UBound(dict_cnum_company.keys) + 1)

       ReDim Preserve arr_Company(1 To UBound(dict_cnum_company.keys) + 1)

       arr_cnum(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(0)

       arr_Company(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(1)

       Debug.Print index_dict

     Next

     'assigns the value of the arrays to the celles.

     sht_out.Range("A1", "A" & UBound(arr_cnum)) = Application.WorksheetFunction.Transpose(arr_cnum)

     sht_out.Range("B1", "B" & UBound(arr_Company)) = Application.WorksheetFunction.Transpose(arr_Company)

     'add 6 columns to output csv file:

     Dim arr_columns() As Variant

     arr_columns = Array("C_col", "D_col", "E_col", "F_col", "G_col", "H_col")  '

     sht_out.Range("C1:H1") = arr_columns

     Call checkAndCloseWorkbook(str_file_path, False)

     Call checkAndCloseWorkbook(str_new_file_path, True)

    Exit Sub

    error_handling:

      Call checkAndCloseWorkbook(str_file_path, False)

      Call checkAndCloseWorkbook(str_new_file_path, False)

    End Sub

    ' 辅助函数:

    'Get last row of Column N in a Worksheet

    Function getLastValidRow(in_ws As Worksheet, in_col As String)

      getLastValidRow = in_ws.Cells(in_ws.Rows.count, in_col).End(xlUp).Row

    End Function

    Function checkAndAttachWorkbook(in_wb_path As String) As Workbook

      Dim wb As Workbook

      Dim mywb As String

      mywb = in_wb_path

       

      For Each wb In Workbooks

        If LCase(wb.FullName) = LCase(mywb) Then

          Set checkAndAttachWorkbook = wb

          Exit Function

        End If

      Next

       

      Set wb = Workbooks.Open(in_wb_path, UpdateLinks:=0)

      Set checkAndAttachWorkbook = wb

    End Function

      

    Function checkAndCloseWorkbook(in_wb_path As String, in_saved As Boolean)

      Dim wb As Workbook

      Dim mywb As String

      mywb = in_wb_path

      For Each wb In Workbooks

        If LCase(wb.FullName) = LCase(mywb) Then

          wb.Close savechanges:=in_saved

          Exit Function

        End If

      Next

    End Function

    三,输出结果:

    两种方法输出结果相同:

    四,比较总结:

    Python pandas 内置了大量处理数据的方法,我们不需要重复造轮子,用起来很方便,代码简洁的多。
    Excel VBA 处理这个需求,使用了 数组,字典等数据结构(实际需求中,数据量往往很大,所以一些地方没有直接使用遍历单元格的方法),以及处理字符串,数组和字典的很多方法,对文件的操作也很复杂,一旦出错,调试起来比python也较困难,代码已经尽量优化,但还是远比 Python要多。

  • 相关阅读:
    excel导入存到数据库(老项目 poi)
    Iceberg Flink FLIP-27实现
    【不存在的人】用Python获取生成随机头像,还不侵权
    超详细的前后端实战项目(Spring系列加上vue3)前端篇+后端篇(三)(一步步实现+源码)
    文件包含漏洞-知识点
    为百度翻译花了3元,感觉还是值得的
    Redis:hash类型底层数据结构剖析
    项目管理之jira 9.1安装
    网站列表页加密:三次请求后返回内容多\r
    Socks5 与 HTTP 代理在网络安全中的应用
  • 原文地址:https://blog.csdn.net/jh035512/article/details/128048371