目录
- # Pandas像是一个字典形式的Numpy
-
- import pandas as pd
- import numpy as np
- # 两者在应用上一般是互相搭配的
-
- s = pd.Series([1,3,6,np.nan,44,1])
- print(s) # 打印出来后会形成一个带序号的表格,最下方还会显示这组数据的数据类型(dtype)
-
- dates = pd.date_range('20231008',periods = 6) # 第一个参数是时间序列的开始日期,第二个是时间的长度,也就是打印的天数
- print(dates)
-
- df = pd.DataFrame(np.random.randn(6,4),index = dates,columns=['a','b','c','d'])
- # DataFrame其实就是一个矩阵,第一个参数就是np的矩阵,用index来定义它的行,columns定义它的列,这就变成一个表格
- print(df)
-
- df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
- # 若不指定行列的定义,则直接用下标
- print(df1)
-
- df2 = pd.DataFrame({
- 'A':1.,
- 'B':pd.Timestamp('20231008'),
- 'C':pd.Series(1,index=list(range(4)),dtype='float32'),
- 'D':np.array([3]*4,dtype='int32'),
- 'E':pd.Categorical(["test","train","test","train"]),
- 'F':'foo'
- })
- # 还可以在DataFrame的括号里加字典,这样字典里的每个键就代表一列数据;序列或数组形式的列长度要一样多,标量(如这里的'A'、'B'、'F')会自动广播到每一行
- print(df2)
-
- df2.dtypes
- # 打印出所有键的数据类型
-
- df2.index
- # 输出所有行的序号(即行索引)
-
- df2.columns
- # 输出所有列的名字(即列索引)
-
- df2.values
- # 打印出每一行的所有元素值
-
- print(df2.describe())
- # 打印数据表中所有数字数据的描述性统计
-
- print(df2.T)
- # 表格的行列转置
-
- print(df2.sort_index(axis=1,ascending=False))
- # 对列序号按照逆序排序
-
- print(df2.sort_index(axis=0,ascending=False))
-
- print(df2.sort_values(by='E'))
- # 按照E列的数据进行由小到大的排序(默认ascending=True为升序,若要由大到小需指定ascending=False)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
DatetimeIndex(['2023-10-08', '2023-10-09', '2023-10-10', '2023-10-11',
'2023-10-12', '2023-10-13'],
dtype='datetime64[ns]', freq='D')
a b c d
2023-10-08 0.330878 -1.194733 0.539956 0.121203
2023-10-09 -0.132819 0.294950 0.067614 0.052185
2023-10-10 0.311724 1.305968 -1.043187 0.606765
2023-10-11 0.108810 0.772342 -0.406491 3.065872
2023-10-12 0.495157 -0.765991 1.580787 0.554438
2023-10-13 -0.042651 0.822573 0.264911 0.192190
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
A B C D E F
0 1.0 2023-10-08 1.0 3 test foo
1 1.0 2023-10-08 1.0 3 train foo
2 1.0 2023-10-08 1.0 3 test foo
3 1.0 2023-10-08 1.0 3 train foo
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
0 1 2 \
A 1.0 1.0 1.0
B 2023-10-08 00:00:00 2023-10-08 00:00:00 2023-10-08 00:00:00
C 1.0 1.0 1.0
D 3 3 3
E test train test
F foo foo foo
3
A 1.0
B 2023-10-08 00:00:00
C 1.0
D 3
E train
F foo
F E D C B A
0 foo test 3 1.0 2023-10-08 1.0
1 foo train 3 1.0 2023-10-08 1.0
2 foo test 3 1.0 2023-10-08 1.0
3 foo train 3 1.0 2023-10-08 1.0
A B C D E F
3 1.0 2023-10-08 1.0 3 train foo
2 1.0 2023-10-08 1.0 3 test foo
1 1.0 2023-10-08 1.0 3 train foo
0 1.0 2023-10-08 1.0 3 test foo
A B C D E F
0 1.0 2023-10-08 1.0 3 test foo
2 1.0 2023-10-08 1.0 3 test foo
1 1.0 2023-10-08 1.0 3 train foo
3 1.0 2023-10-08 1.0 3 train foo
- # 本节介绍Pandas数据选择/筛选的功能
-
- import pandas as pd
- import numpy as np
-
- dates = pd.date_range('20231008',periods = 6)
- df = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns=['A','B','C','D'])
-
- print(df)
-
- print(df['A'])
- print(df.A)
- # 可以看出这两种选择列数据的方式得到的结果相同
-
- # select by label:loc
-
- print(df.loc['20231008'])
- # 以标签名为选择方式
- print(df.loc[:,['A','B']])
- # 打印出所有行,A、B列的数据
- # 可以把loc看成location,用它可以精准定位行列打印一组数据
-
- # selected by position:iloc
- print(df.iloc[3:5,1:3])
- print(df.iloc[[1,3,5],1:3])
-
- # Boolean indexing
- # 通过对比某列数据进行筛选
- print(df)
- print(df[df.A>8])
A B C D
2023-10-08 0 1 2 3
2023-10-09 4 5 6 7
2023-10-10 8 9 10 11
2023-10-11 12 13 14 15
2023-10-12 16 17 18 19
2023-10-13 20 21 22 23
2023-10-08 0
2023-10-09 4
2023-10-10 8
2023-10-11 12
2023-10-12 16
2023-10-13 20
Freq: D, Name: A, dtype: int32
2023-10-08 0
2023-10-09 4
2023-10-10 8
2023-10-11 12
2023-10-12 16
2023-10-13 20
Freq: D, Name: A, dtype: int32
A 0
B 1
C 2
D 3
Name: 2023-10-08 00:00:00, dtype: int32
A B
2023-10-08 0 1
2023-10-09 4 5
2023-10-10 8 9
2023-10-11 12 13
2023-10-12 16 17
2023-10-13 20 21
B C
2023-10-11 13 14
2023-10-12 17 18
B C
2023-10-09 5 6
2023-10-11 13 14
2023-10-13 21 22
A B C D
2023-10-08 0 1 2 3
2023-10-09 4 5 6 7
2023-10-10 8 9 10 11
2023-10-11 12 13 14 15
2023-10-12 16 17 18 19
2023-10-13 20 21 22 23
A B C D
2023-10-11 12 13 14 15
2023-10-12 16 17 18 19
2023-10-13 20 21 22 23
- # 本节介绍如何给选定的范围赋值
-
- import pandas as pd
- import numpy as np
-
- dates = pd.date_range('20231008',periods=6)
- df = pd.DataFrame(np.arange(24).reshape(6,4),index=dates,columns=['A','B','C','D'])
-
- df.iloc[2,2] = 1111
- print(df)
-
- df.loc['20231008','B']=2222
- print(df)
-
- df.loc[df.A>4,'A'] = 0 # 把A列中大于4的值改为0;用loc一次完成定位和赋值,避免链式赋值df.A[df.A>4]=0在新版pandas中不生效
- print(df)
-
- # 添加空列或预定义数据元素的列
-
- df['F']=np.nan
- print(df)
-
- df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20231008',periods=6))
- print(df)
A B C D
2023-10-08 0 1 2 3
2023-10-09 4 5 6 7
2023-10-10 8 9 1111 11
2023-10-11 12 13 14 15
2023-10-12 16 17 18 19
2023-10-13 20 21 22 23
A B C D
2023-10-08 0 2222 2 3
2023-10-09 4 5 6 7
2023-10-10 8 9 1111 11
2023-10-11 12 13 14 15
2023-10-12 16 17 18 19
2023-10-13 20 21 22 23
A B C D
2023-10-08 0 2222 2 3
2023-10-09 4 5 6 7
2023-10-10 0 9 1111 11
2023-10-11 0 13 14 15
2023-10-12 0 17 18 19
2023-10-13 0 21 22 23
A B C D F
2023-10-08 0 2222 2 3 NaN
2023-10-09 4 5 6 7 NaN
2023-10-10 0 9 1111 11 NaN
2023-10-11 0 13 14 15 NaN
2023-10-12 0 17 18 19 NaN
2023-10-13 0 21 22 23 NaN
A B C D F E
2023-10-08 0 2222 2 3 NaN 1
2023-10-09 4 5 6 7 NaN 2
2023-10-10 0 9 1111 11 NaN 3
2023-10-11 0 13 14 15 NaN 4
2023-10-12 0 17 18 19 NaN 5
2023-10-13 0 21 22 23 NaN 6
- # 本节介绍pandas处理丢失数据
-
- import pandas as pd
- import numpy as np
-
- dates = pd.date_range('20231008',periods=6)
- df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
- df.iloc[0,1] = np.nan
- df.iloc[1,2] = np.nan
-
- print(df)
- print(df.dropna(axis=0,how='any')) # how={'any','all'} ,为all时只有这行全为NaN时才会丢掉
- # 当axis等于0时,把所有含有NaN的行丢掉,等于1时就把列丢掉
-
- print(df.fillna(value=0))
- # 把所有的NaN换成0
-
- print(df.isnull())
- # 返回一个表格,有缺失数据的地方就是True
A B C D
2023-10-08 0 NaN 2.0 3
2023-10-09 4 5.0 NaN 7
2023-10-10 8 9.0 10.0 11
2023-10-11 12 13.0 14.0 15
2023-10-12 16 17.0 18.0 19
2023-10-13 20 21.0 22.0 23
A B C D
2023-10-10 8 9.0 10.0 11
2023-10-11 12 13.0 14.0 15
2023-10-12 16 17.0 18.0 19
2023-10-13 20 21.0 22.0 23
A B C D
2023-10-08 0 0.0 2.0 3
2023-10-09 4 5.0 0.0 7
2023-10-10 8 9.0 10.0 11
2023-10-11 12 13.0 14.0 15
2023-10-12 16 17.0 18.0 19
2023-10-13 20 21.0 22.0 23
A B C D
2023-10-08 False True False False
2023-10-09 False False True False
2023-10-10 False False False False
2023-10-11 False False False False
2023-10-12 False False False False
2023-10-13 False False False False
- # 本节介绍用Pandas导入和导出数据
-
- import pandas as pd
-
- # 预先准备好名为student的csv文件,即可读取
- data = pd.read_csv('student.csv')
- print(data)
-
- # 将读入的DataFrame保存为pickle文件,文件会保存在当前工作目录下
- data.to_pickle('student.pickle')
- # 本节介绍如何使用pandas的concat合并多个DataFrame
-
- import pandas as pd
- import numpy as np
-
- # concatenating
-
- df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
- df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
- df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
-
- print(df1)
- print(df2)
- print(df3)
-
- res = pd.concat([df1,df2,df3],axis=0,ignore_index=True) # 合并行,axis=1时合并列
- print(res)
-
- # join,['inner','outer']
- df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
- df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
-
-
- print(df1)
- print(df2)
-
- res = pd.concat([df1,df2],join='outer',ignore_index=True) # outer取所有列的并集,缺失处用NaN补齐;而inner只保留各表共有的列再合并
- print(res)
-
- #append
- df3 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
- df4 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
-
- res = df3.append(df4,ignore_index=True) # 注意:DataFrame.append在pandas 2.0中已被移除,新版本请改用 pd.concat([df3,df4],ignore_index=True)
- # 将df4补到df3后面,按行补充
-
- print(res)
-
- s1 = pd.Series([1,2,3,4],index=['a','b','c','d']) # 往df3下面再添加一行数据
- res = df3.append(s1,ignore_index=True) # 注意:pandas 2.0及以后请改用 pd.concat([df3,s1.to_frame().T],ignore_index=True)
-
- print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
这篇博客写得很好,可以参考学习,我就不重复造轮子了:
- import pandas as pd
- import numpy as np
- import matplotlib.pyplot as plt
-
- # plot data
-
- # Series
-
- data = pd.Series(np.random.randn(1000), index = np.arange(1000))
- data = data.cumsum()
- # data.plot()
- # plt.show()
-
- # DataFrame
- data = pd.DataFrame(np.random.randn(1000,4), index = np.arange(1000), columns = list("ABCD"))
- data = data.cumsum()
- print(data.head())
-
- # plot methods:
- #'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
- data.plot()
- ax = data.plot.scatter(x='A',y='B', color = 'DarkBlue', label='Class 1')
- data.plot.scatter(x = 'A', y = 'C', color = 'DarkGreen', label = 'Class 2', ax = ax)
- plt.show()