目录
pandas主要提供了3种数据结构:1)Series,带标签的一维数组;2)DataFrame,带标签且大小可变的二维表格结构;3)Panel,带标签且大小可变的三维数组。
python中关于数据结构Series的讲解_C.DLording的博客-CSDN博客_series在python
- >>> import numpy as np
- >>> import pandas as pd
- >>> x = pd.Series([1, 3, 5, np.nan]) # np.nan表示空值、缺失值
- >>> x
- 0 1.0
- 1 3.0
- 2 5.0
- 3 NaN
- dtype: float64
>>> dates = pd.date_range(start='20200101', end='20201231', freq='M') # 间隔为月,每月最后一天

- import numpy as np
- import matplotlib.pyplot as plt
- import matplotlib.font_manager as fm
- import pandas as pd
- dates = pd.date_range(start='20200101', end='20201231', freq='M')
- dataframe=pd.DataFrame(np.random.randn(12,4), index=dates, columns=list('ABCD'))
- print(dataframe)

- >>> pd.DataFrame([np.random.randint(1, 100, 4) for i in range(12)],
- index=dates, columns=list('ABCD')) # 4列随机数
- >>> df.head() # 默认显示前5行
- A B C D E F
- zhang 20 2020-01-01 1.0 3 test foo
- li 26 2020-01-02 2.0 3 train foo
- zhou 63 2020-01-03 3.0 3 test foo
- wang 69 2020-01-04 4.0 3 train foo
- >>> df.head(3) # 查看前3行
- A B C D E F
- zhang 20 2020-01-01 1.0 3 test foo
- li 26 2020-01-02 2.0 3 train foo
- zhou 63 2020-01-03 3.0 3 test foo
- >>> df.tail(2) # 查看最后2行
- A B C D E F
- zhou 63 2020-01-03 3.0 3 test foo
- wang 69 2020-01-04 4.0 3 train foo
- print(df.index)
- print(df.columns)
- print(df.values)

>>> df.describe() # 平均值、标准差、最小值、最大值等信息
>>> df.T
注:这个地方选的是一个字段名 不是表格内的具体数值
- >>> df['A'] # 选择列
- zhang 20
- li 26
- zhou 63
- wang 69
- Name: A, dtype: int32
- >>> 69 in df['A'] # df['A']是一个Series对象
- False
- >>> 69 in df['A'].values
- True
- >>> df[0:2] # 使用切片选择多行
- A B C D E F
- zhang 20 2020-01-01 1.0 3 test foo
- li 26 2020-01-02 2.0 3 train foo
- >>> df.loc[:, ['A', 'C']] # 选择多列,等价于df[['A', 'C']]
- A C
- zhang 20 1.0
- li 26 2.0
- zhou 63 3.0
- wang 69 4.0
- >>> df.loc[['zhang', 'zhou'], ['A', 'D', 'E']]
- # 同时指定多行与多列进行选择
- A D E
- zhang 20 3 test
- zhou 63 3 test
- >>> df.loc['zhang', ['A', 'D', 'E']]
- A 20
- D 3
- E test
- Name: zhang, dtype: object
- >>> df.at['zhang', 'A'] # 查询指定行、列位置的数据值
- 20
- >>> df.at['zhang', 'D']
- 3
- >>> df.iloc[3] # 查询第3行数据
- A 69
- B 2020-01-04 00:00:00
- C 4
- D 3
- E train
- F foo
- Name: wang, dtype: object
-
- >>> df.iloc[0:3, 0:4] # 查询前3行、前4列数据
- A B C D
- zhang 20 2020-01-01 1.0 3
- li 26 2020-01-02 2.0 3
- zhou 63 2020-01-03 3.0 3
- >>> df.iloc[[0, 2, 3], [0, 4]] # 查询指定的多行、多列数据
- A E
- zhang 20 test
- zhou 63 test
- wang 69 train
- >>> df.iloc[0,1] # 查询第0行第1列位置的数据值
- Timestamp('2020-01-01 00:00:00')
- >>> df.iloc[2,2] # 查询第2行第2列位置的数据值
- 3.0
- >>> df[df.A>50] # 按给定条件进行查询
- A B C D E F
- zhou 63 2020-01-03 3.0 3 test foo
- wang 69 2020-01-04 4.0 3 train foo
- >>> df[df['E']=='test'] # 按给定条件进行查询
- A B C D E F
- zhang 20 2020-01-01 1.0 3 test foo
- zhou 63 2020-01-03 3.0 3 test foo
- >>> df[df['A'].isin([20,69])]
- A B C D E F
- zhang 20 2020-01-01 1.0 3 test foo
- wang 69 2020-01-04 4.0 3 train foo
- >>> df.nlargest(3, ['C']) # 返回指定列最大的前3行
- A B C D E F
- wang 69 2020-01-04 4.0 3 train foo
- zhou 63 2020-01-03 3.0 3 test foo
- li 26 2020-01-02 2.0 3 train foo
- >>> df.nlargest(3, ['A'])
- A B C D E F
- wang 69 2020-01-04 4.0 3 train foo
- zhou 63 2020-01-03 3.0 3 test foo
- li 26 2020-01-02 2.0 3 train foo
- >>> dff = pd.DataFrame({'A':[1,2,3,4], 'B':[10,20,8,40]})
- >>> dff
- A B
- 0 1 10
- 1 2 20
- 2 3 8
- 3 4 40
- >>> dff[dff.sum(axis=1)==11]
- A B
- 0 1 10
- 2 3 8
- >>> df.iat[0, 2] = 3 # 修改指定行、列位置的数据值
- >>> df.loc[:, 'D'] = np.random.randint(50, 60, 4)
- # 修改某列的值
- >>> df['C'] = -df['C'] # 对指定列数据取反
- >>> df # 查看修改结果
- A B C D E F
- zhang 20 2020-01-01 -3.0 53 test foo
- li 26 2020-01-02 -2.0 59 train foo
- zhou 63 2020-01-03 -3.0 59 test foo
- wang 69 2020-01-04 -4.0 50 train foo
- >>> from copy import deepcopy
- >>> dff = deepcopy(df)
- >>> dff
- A B C D E F
- zhang 20 2020-01-01 -3.0 53 test foo
- li 26 2020-01-02 -2.0 59 train foo
- zhou 63 2020-01-03 -3.0 59 test foo
- wang 69 2020-01-04 -4.0 50 train foo
- >>> dff['C'] = dff['C'] ** 2 # 替换列数据
- >>> dff
- A B C D E F
- zhang 20 2020-01-01 9.0 53 test foo
- li 26 2020-01-02 4.0 59 train foo
- zhou 63 2020-01-03 9.0 59 test foo
- wang 69 2020-01-04 16.0 50 train foo
- >>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
- 'k2':[1, 1, 2, 3, 3, 4, 4]})
- >>> data.replace(1, 5) # 把所有1替换为5
- k1 k2
- 0 one 5
- 1 one 5
- 2 one 2
- 3 two 3
- 4 two 3
- 5 two 4
- 6 two 4
-
- >>> data.replace([1,2],[5,6]) # 1->5,2->6
- k1 k2
- 0 one 5
- 1 one 5
- 2 one 6
- 3 two 3
- 4 two 3
- 5 two 4
- 6 two 4
-
- >>> data.replace({1:5, 'one':'ONE'}) # 使用字典指定替换关系
- k1 k2
- 0 ONE 5
- 1 ONE 5
- 2 ONE 2
- 3 two 3
- 4 two 3
- 5 two 4
- 6 two 4
-
- >>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
- 'k2':[1, 1, 2, 3, 3, 4, 4]})
- >>> data
- k1 k2
- 0 one 1
- 1 one 1
- 2 one 2
- 3 two 3
- 4 two 3
- 5 two 4
- 6 two 4
-
- >>> data.drop(5, axis=0) # 删除指定行
- k1 k2
- 0 one 1
- 1 one 1
- 2 one 2
- 3 two 3
- 4 two 3
- 6 two 4
-
- >>> data.drop(3, inplace=True) # 原地删除
- >>> data
- k1 k2
- 0 one 1
- 1 one 1
- 2 one 2
- 4 two 3
- 5 two 4
- 6 two 4
-
- >>> data.drop('k1', axis=1) # 删除指定列
- k2
- 0 1
- 1 1
- 2 2
- 4 3
- 5 4
- 6 4
-
- >>> data = pd.DataFrame({'age':np.random.randint(20,50,5)})
- >>> data
- age
- 0 31
- 1 27
- 2 26
- 3 33
- 4 37
-
- >>> data['rank'] = data['age'].rank() # 增加一列位次序号
- >>> data
- age rank
- 0 31 3.0
- 1 27 2.0
- 2 26 1.0
- 3 33 4.0
- 4 37 5.0
- >>> data = pd.DataFrame({'姓名':['张三','李四','王五','赵六','刘七','孙八'],
- '成绩':[86,92,86,60,78,78]})
- >>> data
- 姓名 成绩
- 0 张三 86
- 1 李四 92
- 2 王五 86
- 3 赵六 60
- 4 刘七 78
- 5 孙八 78
-
- >>> data['排名'] = data['成绩'].rank(method='min') # 倒数名次,并列的取最小值
- >>> data
- 姓名 成绩 排名
- 0 张三 86 4.0
- 1 李四 92 6.0
- 2 王五 86 4.0
- 3 赵六 60 1.0
- 4 刘七 78 2.0
- 5 孙八 78 2.0
-
- >>> data['排名'] = data['成绩'].rank(method='min', ascending=False)
- >>> data # 正数名次,并列的名次取最小值
- 姓名 成绩 排名
- 0 张三 86 2.0
- 1 李四 92 1.0
- 2 王五 86 2.0
- 3 赵六 60 6.0
- 4 刘七 78 4.0
- 5 孙八 78 4.0
-
- >>> data['排名'] = data['成绩'].rank(method='max', ascending=False)
- >>> data # 正数名次,并列的名次取最大值
- 姓名 成绩 排名
- 0 张三 86 3.0
- 1 李四 92 1.0
- 2 王五 86 3.0
- 3 赵六 60 6.0
- 4 刘七 78 5.0
- 5 孙八 78 5.0
-
- >>> data['排名'] = data['成绩'].rank(method='max')
- >>> data # 倒数名次,并列的名次取最大值
- 姓名 成绩 排名
- 0 张三 86 5.0
- 1 李四 92 6.0
- 2 王五 86 5.0
- 3 赵六 60 1.0
- 4 刘七 78 3.0
- 5 孙八 78 3.0
-
- >>> data['排名'] = data['成绩'].rank(method='average')
- >>> data # 倒数名次,并列的名次取平均值
- 姓名 成绩 排名
- 0 张三 86 4.5
- 1 李四 92 6.0
- 2 王五 86 4.5
- 3 赵六 60 1.0
- 4 刘七 78 2.5
- 5 孙八 78 2.5
-
- >>> dff = pd.DataFrame({'A':[1,2,3,4], 'B':[10,20,8,40]})
- >>> dff
- A B
- 0 1 10
- 1 2 20
- 2 3 8
- 3 4 40
-
- >>> dff['ColSum'] = dff.apply(sum, axis=1) # 对行求和,增加1列
- >>> dff.loc['RowSum'] = dff.apply(sum, axis=0) # 对列求和,增加1行
- >>> dff
- A B ColSum
- 0 1 10 11
- 1 2 20 22
- 2 3 8 11
- 3 4 40 44
- RowSum 10 78 88
- >>> df
- A B C D E F
- zhang 20 2020-01-01 9.0 53 test foo
- li 26 2020-01-02 4.0 59 train foo
- zhou 63 2020-01-03 9.0 59 test foo
- wang 69 2020-01-04 16.0 50 train foo
-
- >>> df1 = df.reindex(columns=list(df.columns) + ['G'])
- >>> df1
- A B C D E F G
- zhang 20 2020-01-01 9.0 53 test foo NaN
- li 26 2020-01-02 4.0 59 train foo NaN
- zhou 63 2020-01-03 9.0 59 test foo NaN
- wang 69 2020-01-04 16.0 50 train foo NaN
- >>> df1.iat[0, 6] = 3 # 修改指定位置元素值,该列其他元素为缺失值NaN
- >>> df1
- A B C D E F G
- zhang 20 2020-01-01 9.0 53 test foo 3.0
- li 26 2020-01-02 4.0 59 train foo NaN
- zhou 63 2020-01-03 9.0 59 test foo NaN
- wang 69 2020-01-04 16.0 50 train foo NaN
- >>> pd.isnull(df1) # 测试缺失值,返回值为True/False阵列
- A B C D E F G
- zhang False False False False False False False
- li False False False False False False True
- zhou False False False False False False True
- wang False False False False False False True
- >>> df1.dropna() # 返回不包含缺失值的行
- A B C D E F G
- zhang 20 2020-01-01 9.0 53 test foo 3.0
-
- >>> from copy import deepcopy
- >>> df2 = deepcopy(df1)
- >>> df1['G'].fillna(5, inplace=True) # 使用指定值填充缺失值
-
- >>> df1
- A B C D E F G
- zhang 20 2020-01-01 9.0 53 test foo 3.0
- li 26 2020-01-02 4.0 59 train foo 5.0
- zhou 63 2020-01-03 9.0 59 test foo 5.0
- wang 69 2020-01-04 16.0 50 train foo 5.0
- >>> df2.iat[2, 5] = np.NaN
- >>> df2
- A B C D E F G
- zhang 20 2020-01-01 1.0 53 test foo 3.0
- li 26 2020-01-02 4.0 59 train foo NaN
- zhou 63 2020-01-03 9.0 59 test NaN NaN
- wang 69 2020-01-04 16.0 50 train foo NaN
-
- >>> df2.dropna(thresh=6) # 返回包含6个有效值以上的数据
- A B C D E F G
- zhang 20 2020-01-01 1.0 53 test foo 3.0
- li 26 2020-01-02 4.0 59 train foo NaN
- wang 69 2020-01-04 16.0 50 train foo NaN
- >>> df2.iat[3, 6] = 8
- >>> df2
- A B C D E F G
- zhang 20 2020-01-01 1.0 53 test foo 3.0
- li 26 2020-01-02 4.0 59 train foo NaN
- zhou 63 2020-01-03 9.0 59 test NaN NaN
- wang 69 2020-01-04 16.0 50 train foo 8.0
-
- >>> df2.fillna({'F':'foo', 'G':df2['G'].mean()}) # 填充缺失值
- A B C D E F G
- zhang 20 2020-01-01 1.0 53 test foo 3.0
- li 26 2020-01-02 4.0 59 train foo 5.5
- zhou 63 2020-01-03 9.0 59 test foo 5.5
- wang 69 2020-01-04 16.0 50 train foo 8.0
- >>> import numpy as np
- >>> import pandas as pd
- >>> dft = pd.DataFrame({'a':[1,np.NaN, np.NaN,3]})
-
- >>> dft.fillna(method='pad') # 使用缺失值前最后一个有效值进行填充
- a
- 0 1.0
- 1 1.0
- 2 1.0
- 3 3.0
-
- >>> dft.fillna(method='bfill') # 使用缺失值后第一个有效值往回填充
- a
- 0 1.0
- 1 3.0
- 2 3.0
- 3 3.0
-
- >>> dft.fillna(method='bfill', limit=1) # 只填充一个缺失值
- a
- 0 1.0
- 1 NaN
- 2 3.0
- 3 3.0
- >>> data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,
- 'k2':[1, 1, 2, 3, 3, 4, 4]})
- >>> data
- k1 k2
- 0 one 1
- 1 one 1
- 2 one 2
- 3 two 3
- 4 two 3
- 5 two 4
- 6 two 4
-
- >>> data.duplicated() # 检查重复行
- 0 False
- 1 True
- 2 False
- 3 False
- 4 True
- 5 False
- 6 True
- dtype: bool
-
-
- >>> data.drop_duplicates() # 返回新数组,删除重复行
- k1 k2
- 0 one 1
- 2 one 2
- 3 two 3
- 5 two 4
-
- >>> data.drop_duplicates(['k1']) # 删除k1列的重复数据,只保留第一项
- k1 k2
- 0 one 1
- 3 two 3
-
- >>> data.drop_duplicates(['k1'], keep='last') # 保留最后一项
- k1 k2
- 2 one 2
- 6 two 4
-
- >>> data = pd.Series([3,3,3,2,1,1,1,0])
- >>> data
- 0 3
- 1 3
- 2 3
- 3 2
- 4 1
- 5 1
- 6 1
- 7 0
- dtype: int64
-
- >>> data.drop_duplicates(keep=False) # 只保留出现一次的数字
- 3 2
- 7 0
- dtype: int64
- >>> df2 = pd.DataFrame(np.random.randn(10, 4))
- >>> df2
- 0 1 2 3
- 0 2.064867 -0.888018 0.586441 -0.660901
- 1 -0.465664 -0.496101 0.249952 0.627771
- 2 1.974986 1.304449 -0.168889 -0.334622
- 3 0.715677 2.017427 1.750627 -0.787901
- 4 -0.370020 -0.878282 0.499584 0.269102
- 5 0.184308 0.653620 0.117899 -1.186588
- 6 -0.364170 1.652270 0.234833 0.362925
- 7 -0.329063 0.356276 1.158202 -1.063800
- 8 -0.778828 -0.156918 -0.760394 -0.040323
- 9 -0.391045 -0.374825 -1.016456 0.767481
-
- >>> p1 = df2[:3] # 数据行拆分
- >>> p1
- 0 1 2 3
- 0 2.064867 -0.888018 0.586441 -0.660901
- 1 -0.465664 -0.496101 0.249952 0.627771
- 2 1.974986 1.304449 -0.168889 -0.334622
-
- >>> p2 = df2[3:7]
- >>> p3 = df2[7:]
- >>> df3 = pd.concat([p1, p2, p3]) # 数据行合并
- >>> df4 = pd.DataFrame({'A':np.random.randint(1,5,8),
- 'B':np.random.randint(10,15,8),
- 'C':np.random.randint(20,30,8),
- 'D':np.random.randint(80,100,8)})
- >>> df4
- A B C D
- 0 1 13 26 81
- 1 3 14 29 88
- 2 1 13 28 88
- 3 2 10 21 90
- 4 4 14 28 83
- 5 4 11 24 81
- 6 2 11 26 99
- 7 3 13 25 91
- >>> df4.groupby('A').sum() # 数据分组计算
- B C D
- A
- 1 26 54 169
- 2 21 47 189
- 3 27 54 179
- 4 25 52 164
-
- >>> df4.groupby(by=['A', 'B']).mean()
- C D
- A B
- 1 13 27.0 84.5
- 2 10 21.0 90.0
- 11 26.0 99.0
- 3 13 25.0 91.0
- 14 29.0 88.0
- 4 11 24.0 81.0
- 14 28.0 83.0
-
- >>> df4.groupby(by=['A', 'B'], as_index=False).mean()
- A B C D
- 0 1 13 27.0 84.5
- 1 2 10 21.0 90.0
- 2 2 11 26.0 99.0
- 3 3 13 25.0 91.0
- 4 3 14 29.0 88.0
- 5 4 11 24.0 81.0
- 6 4 14 28.0 83.0
-
- >>> df4.groupby(by=['A', 'B']).aggregate({'C':np.mean, 'D':np.min})
- # 分组后,C列使用平均值,D列使用最小值
- C D
- A B
- 1 13 27 81
- 2 10 21 90
- 11 26 99
- 3 13 25 91
- 14 29 88
- 4 11 24 81
- 14 28 83
-