• 【Python】Pandas数据处理教程


    目录

    Pandas基本介绍

    pandas选择数据

    pandas设置值

    pandas处理丢失数据

    pandas数据导入与导出

    pandas 合并 concat

    pandas 合并 merge

    pandas plot 画图



    Pandas基本介绍

    1. # Pandas像是一个字典形式的Numpy
    2. import pandas as pd
    3. import numpy as np
    4. # 两者在应用上一般是互相搭配的
    5. s = pd.Series([1,3,6,np.nan,44,1])
    6. print(s) # 打印出来后会形成一个带序号的表格,最下方还有该列的数据类型(dtype)
    7. dates = pd.date_range('20231008',periods = 6) # 第一个参数是时间序列的开始日期,第二个是时间的长度,也就是打印的天数
    8. print(dates)
    9. df = pd.DataFrame(np.random.randn(6,4),index = dates,columns=['a','b','c','d'])
    10. # DataFrame其实就是一个矩阵,第一个参数就是np的矩阵,用index来定义它的行,columns定义它的列,这就变成一个表格
    11. print(df)
    12. df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
    13. # 若不指定行列的定义,则直接用下标
    14. print(df1)
    15. df2 = pd.DataFrame({
    16. 'A':1.,
    17. 'B':pd.Timestamp('20231008'),
    18. 'C':pd.Series(1,index=list(range(4)),dtype='float32'),
    19. 'D':np.array([3]*4,dtype='int32'),
    20. 'E':pd.Categorical(["test","train","test","train"]),
    21. 'F':'foo'
    22. })
    23. # 还可以在DataFrame的括号里加字典,这样字典里的每个键就代表一列数据,所有列的数据数量要一样多
    24. print(df2)
    25. df2.dtypes
    26. # 打印出所有键的数据类型
    27. df2.index
    28. # 输出所有行的索引(行标签)
    29. df2.columns
    30. # 打印出所有列的名字
    31. df2.values
    32. # 打印出每一行的所有元素值
    33. print(df2.describe())
    34. # 打印数据表中所有数字数据的描述性统计
    35. print(df2.T)
    36. # 表格的行列转置
    37. print(df2.sort_index(axis=1,ascending=False))
    38. # 对列序号按照逆序排序
    39. print(df2.sort_index(axis=0,ascending=False))
    40. print(df2.sort_values(by='E'))
    41. # 按照第E列的数据进行由小到大(升序)的排序,若要由大到小可加上ascending=False
    0     1.0
    1     3.0
    2     6.0
    3     NaN
    4    44.0
    5     1.0
    dtype: float64
    DatetimeIndex(['2023-10-08', '2023-10-09', '2023-10-10', '2023-10-11',
                   '2023-10-12', '2023-10-13'],
                  dtype='datetime64[ns]', freq='D')
                       a         b         c         d
    2023-10-08  0.330878 -1.194733  0.539956  0.121203
    2023-10-09 -0.132819  0.294950  0.067614  0.052185
    2023-10-10  0.311724  1.305968 -1.043187  0.606765
    2023-10-11  0.108810  0.772342 -0.406491  3.065872
    2023-10-12  0.495157 -0.765991  1.580787  0.554438
    2023-10-13 -0.042651  0.822573  0.264911  0.192190
       0  1   2   3
    0  0  1   2   3
    1  4  5   6   7
    2  8  9  10  11
         A          B    C  D      E    F
    0  1.0 2023-10-08  1.0  3   test  foo
    1  1.0 2023-10-08  1.0  3  train  foo
    2  1.0 2023-10-08  1.0  3   test  foo
    3  1.0 2023-10-08  1.0  3  train  foo
             A    C    D
    count  4.0  4.0  4.0
    mean   1.0  1.0  3.0
    std    0.0  0.0  0.0
    min    1.0  1.0  3.0
    25%    1.0  1.0  3.0
    50%    1.0  1.0  3.0
    75%    1.0  1.0  3.0
    max    1.0  1.0  3.0
                         0                    1                    2  \
    A                  1.0                  1.0                  1.0   
    B  2023-10-08 00:00:00  2023-10-08 00:00:00  2023-10-08 00:00:00   
    C                  1.0                  1.0                  1.0   
    D                    3                    3                    3   
    E                 test                train                 test   
    F                  foo                  foo                  foo   
    
                         3  
    A                  1.0  
    B  2023-10-08 00:00:00  
    C                  1.0  
    D                    3  
    E                train  
    F                  foo  
         F      E  D    C          B    A
    0  foo   test  3  1.0 2023-10-08  1.0
    1  foo  train  3  1.0 2023-10-08  1.0
    2  foo   test  3  1.0 2023-10-08  1.0
    3  foo  train  3  1.0 2023-10-08  1.0
         A          B    C  D      E    F
    3  1.0 2023-10-08  1.0  3  train  foo
    2  1.0 2023-10-08  1.0  3   test  foo
    1  1.0 2023-10-08  1.0  3  train  foo
    0  1.0 2023-10-08  1.0  3   test  foo
         A          B    C  D      E    F
    0  1.0 2023-10-08  1.0  3   test  foo
    2  1.0 2023-10-08  1.0  3   test  foo
    1  1.0 2023-10-08  1.0  3  train  foo
    3  1.0 2023-10-08  1.0  3  train  foo

    pandas选择数据

    1. # 本节介绍Pandas数据选择/筛选的功能
    2. import pandas as pd
    3. import numpy as np
    4. dates = pd.date_range('20231008',periods = 6)
    5. df = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns=['A','B','C','D'])
    6. print(df)
    7. print(df['A'])
    8. print(df.A)
    9. # 可以看出这两种选择列数据的方式得到的结果相同
    10. # select by label:loc
    11. print(df.loc['20231008'])
    12. # 以标签名为选择方式
    13. print(df.loc[:,['A','B']])
    14. # 打印出所有行,A、B列的数据
    15. # 可以把loc看成location,用它可以精准定位行列打印一组数据
    16. # selected by position:iloc
    17. print(df.iloc[3:5,1:3])
    18. print(df.iloc[[1,3,5],1:3])
    19. # Boolean indexing
    20. # 通过对比某列数据进行筛选
    21. print(df)
    22. print(df[df.A>8])
                 A   B   C   D
    2023-10-08   0   1   2   3
    2023-10-09   4   5   6   7
    2023-10-10   8   9  10  11
    2023-10-11  12  13  14  15
    2023-10-12  16  17  18  19
    2023-10-13  20  21  22  23
    2023-10-08     0
    2023-10-09     4
    2023-10-10     8
    2023-10-11    12
    2023-10-12    16
    2023-10-13    20
    Freq: D, Name: A, dtype: int32
    2023-10-08     0
    2023-10-09     4
    2023-10-10     8
    2023-10-11    12
    2023-10-12    16
    2023-10-13    20
    Freq: D, Name: A, dtype: int32
    A    0
    B    1
    C    2
    D    3
    Name: 2023-10-08 00:00:00, dtype: int32
                 A   B
    2023-10-08   0   1
    2023-10-09   4   5
    2023-10-10   8   9
    2023-10-11  12  13
    2023-10-12  16  17
    2023-10-13  20  21
                 B   C
    2023-10-11  13  14
    2023-10-12  17  18
                 B   C
    2023-10-09   5   6
    2023-10-11  13  14
    2023-10-13  21  22
                 A   B   C   D
    2023-10-08   0   1   2   3
    2023-10-09   4   5   6   7
    2023-10-10   8   9  10  11
    2023-10-11  12  13  14  15
    2023-10-12  16  17  18  19
    2023-10-13  20  21  22  23
                 A   B   C   D
    2023-10-11  12  13  14  15
    2023-10-12  16  17  18  19
    2023-10-13  20  21  22  23

    pandas设置值

    1. # 本节介绍如何给选定的范围赋值
    2. import pandas as pd
    3. import numpy as np
    4. dates = pd.date_range('20231008',periods=6)
    5. df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
    6. df.iloc[2,2] = 1111
    7. print(df)
    8. df.loc['20231008','B']=2222
    9. print(df)
    10. df.A[df.A>4] = 0
    11. print(df)
    12. # 添加空列或预定义数据元素的列
    13. df['F']=np.nan
    14. print(df)
    15. df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20231008',periods=6))
    16. print(df)
                 A   B     C   D
    2023-10-08   0   1     2   3
    2023-10-09   4   5     6   7
    2023-10-10   8   9  1111  11
    2023-10-11  12  13    14  15
    2023-10-12  16  17    18  19
    2023-10-13  20  21    22  23
                 A     B     C   D
    2023-10-08   0  2222     2   3
    2023-10-09   4     5     6   7
    2023-10-10   8     9  1111  11
    2023-10-11  12    13    14  15
    2023-10-12  16    17    18  19
    2023-10-13  20    21    22  23
                A     B     C   D
    2023-10-08  0  2222     2   3
    2023-10-09  4     5     6   7
    2023-10-10  0     9  1111  11
    2023-10-11  0    13    14  15
    2023-10-12  0    17    18  19
    2023-10-13  0    21    22  23
                A     B     C   D   F
    2023-10-08  0  2222     2   3 NaN
    2023-10-09  4     5     6   7 NaN
    2023-10-10  0     9  1111  11 NaN
    2023-10-11  0    13    14  15 NaN
    2023-10-12  0    17    18  19 NaN
    2023-10-13  0    21    22  23 NaN
                A     B     C   D   F  E
    2023-10-08  0  2222     2   3 NaN  1
    2023-10-09  4     5     6   7 NaN  2
    2023-10-10  0     9  1111  11 NaN  3
    2023-10-11  0    13    14  15 NaN  4
    2023-10-12  0    17    18  19 NaN  5
    2023-10-13  0    21    22  23 NaN  6

    pandas处理丢失数据

    1. # 本节介绍pandas处理丢失数据
    2. import pandas as pd
    3. import numpy as np
    4. dates = pd.date_range('20231008',periods=6)
    5. df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
    6. df.iloc[0,1] = np.nan
    7. df.iloc[1,2] = np.nan
    8. print(df)
    9. print(df.dropna(axis=0,how='any')) # how={'any','all'} ,为all时只有这行全为NaN时才会丢掉
    10. # 当axis等于0时,把所有含有NaN的行丢掉,等于1时就把列丢掉
    11. print(df.fillna(value=0))
    12. # 把所有的NaN换成0
    13. print(df.isnull())
    14. # 返回一个表格,有缺失数据的地方就是True
                 A     B     C   D
    2023-10-08   0   NaN   2.0   3
    2023-10-09   4   5.0   NaN   7
    2023-10-10   8   9.0  10.0  11
    2023-10-11  12  13.0  14.0  15
    2023-10-12  16  17.0  18.0  19
    2023-10-13  20  21.0  22.0  23
                 A     B     C   D
    2023-10-10   8   9.0  10.0  11
    2023-10-11  12  13.0  14.0  15
    2023-10-12  16  17.0  18.0  19
    2023-10-13  20  21.0  22.0  23
                 A     B     C   D
    2023-10-08   0   0.0   2.0   3
    2023-10-09   4   5.0   0.0   7
    2023-10-10   8   9.0  10.0  11
    2023-10-11  12  13.0  14.0  15
    2023-10-12  16  17.0  18.0  19
    2023-10-13  20  21.0  22.0  23
                    A      B      C      D
    2023-10-08  False   True  False  False
    2023-10-09  False  False   True  False
    2023-10-10  False  False  False  False
    2023-10-11  False  False  False  False
    2023-10-12  False  False  False  False
    2023-10-13  False  False  False  False

    pandas数据导入与导出

    1. # 本节介绍用Pandas导入和导出数据
    2. import pandas as pd
    3. # 预先准备好名为student的csv文件,即可读取
    4. data = pd.read_csv('student.csv')
    5. print(data)
    6. # 将csv文件转换为pickle文件,保存在文件夹中
    7. data.to_pickle('student.pickle')

    pandas 合并 concat

    1. # 本节介绍如何使用pandas的concat合并多个DataFrame
    2. import pandas as pd
    3. import numpy as np
    4. # concatenating
    5. df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
    6. df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
    7. df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
    8. print(df1)
    9. print(df2)
    10. print(df3)
    11. res = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # 合并行,axis=1时合并列
    12. print(res)
    13. # join,['inner','outer']
    14. df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
    15. df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
    16. print(df1)
    17. print(df2)
    18. res = pd.concat([df1,df2],join='outer',ignore_index=True) # outer会补齐表格,而inner会裁剪掉再合并
    19. print(res)
    20. # append(注意:DataFrame.append 自 pandas 1.4 起弃用,并已在 pandas 2.0 中移除;新版本请改用 pd.concat)
    21. df3 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
    22. df4 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
    23. res = df3.append(df4,ignore_index=True)
    24. # 将df4补到df3后面,按行补充
    25. print(res)
    26. s1 = pd.Series([1,2,3,4],index=['a','b','c','d']) # 往df3下面再添加一行数据
    27. res = df3.append(s1,ignore_index=True)
    28. print(res)
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
         a    b    c    d
    0  1.0  1.0  1.0  1.0
    1  1.0  1.0  1.0  1.0
    2  1.0  1.0  1.0  1.0
         a    b    c    d
    0  2.0  2.0  2.0  2.0
    1  2.0  2.0  2.0  2.0
    2  2.0  2.0  2.0  2.0
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
    5  1.0  1.0  1.0  1.0
    6  2.0  2.0  2.0  2.0
    7  2.0  2.0  2.0  2.0
    8  2.0  2.0  2.0  2.0
         a    b    c    d
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  0.0  0.0  0.0  0.0
         b    c    d    e
    2  1.0  1.0  1.0  1.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
         a    b    c    d    e
    0  0.0  0.0  0.0  0.0  NaN
    1  0.0  0.0  0.0  0.0  NaN
    2  0.0  0.0  0.0  0.0  NaN
    3  NaN  1.0  1.0  1.0  1.0
    4  NaN  1.0  1.0  1.0  1.0
    5  NaN  1.0  1.0  1.0  1.0
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  1.0  1.0  1.0
    4  1.0  1.0  1.0  1.0
    5  1.0  1.0  1.0  1.0
         a    b    c    d
    0  0.0  0.0  0.0  0.0
    1  0.0  0.0  0.0  0.0
    2  0.0  0.0  0.0  0.0
    3  1.0  2.0  3.0  4.0

    pandas 合并 merge

    这篇博客写得很好,可以参考学习,我就不重复造轮子了:

    Pandas的Merge函数详解

    pandas plot 画图

    1. import pandas as pd
    2. import numpy as np
    3. import matplotlib.pyplot as plt
    4. # plot data
    5. # Series
    6. data = pd.Series(np.random.randn(1000), index = np.arange(1000))
    7. data = data.cumsum()
    8. # data.plot()
    9. # plt.show()
    10. # DataFrame
    11. data = pd.DataFrame(np.random.randn(1000,4), index = np.arange(1000), columns = list("ABCD"))
    12. data = data.cumsum()
    13. print(data.head())
    14. # plot methods:
    15. #'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
    16. data.plot()
    17. ax = data.plot.scatter(x='A',y='B', color = 'DarkBlue', label='Class 1')
    18. data.plot.scatter(x = 'A', y = 'C', color = 'DarkGreen', label = 'Class 2', ax = ax)
    19. plt.show()

  • 相关阅读:
    【资源推荐】一站式机器学习学习资料
    Terraform 系列-Terraform Cloud 比 Terraform OSS 有哪些增强?
    java实现线程安全的三种方式
    LabVIEW合并VI
    计算机二级WPS 选择题(模拟和解析八)
    数值常量如何转化为内存地址?
    聚乙二醇/聚吡咯/多聚赖氨酸(PLL)/聚合物-聚乙烯亚胺(PEI)包裹四氧化三铁磁性纳米颗粒
    SaaSBase:什么是微盛?
    Volatile 可以保证什么特性?有什么作用?
    ES6(三)
  • 原文地址:https://blog.csdn.net/Summerison/article/details/133690772