pd.merge
pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False,
validate=None)
pandas的设计目标之一就是使得处理缺失数据的任务更加轻松些。pandas使用NaN作为缺失数据的标记。
dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
默认参数下(axis=0, how='any'),效果等同于布尔索引df[df.notnull().all(axis=1)],即只保留不含任何缺失值的行。注意df[df.notnull()]并不会删除行,只是把缺失位置保留为NaN。
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
"toy": [np.nan, 'Batmobile', 'Bullwhip'],
"born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]})
#
# name toy born
# 0 Alfred NaN NaT
# 1 Batman Batmobile 1940-04-25
# 2 Catwoman Bullwhip NaT
df.dropna()
# name toy born
# 1 Batman Batmobile 1940-04-25
drop_duplicates(subset='列名', keep='first', inplace=True)
drop_duplicates函数用于删除DataFrame中的重复行,是否重复根据subset指定的列来判断。
subset用于指定判断重复所依据的列名:单列写作subset='列名1',多列写作subset=['列名1', '列名2'];默认为None,表示使用所有列。
keep有三个可选取值:'first'、'last'、False。注意'first'和'last'是字符串,需要带引号,而False是布尔值,不带引号。'first'保留每组重复项中的第一个,'last'保留最后一个,False则把所有重复项全部删除。
import pandas as pd
dict={'x':[1,2,3,6],'y':[1,4,1,1],'z':[1,2,4,1]}
df=pd.DataFrame(dict)
print(df, "\n")
# x y z
# 0 1 1 1
# 1 2 4 2
# 2 3 1 4
# 3 6 1 1
df.drop_duplicates(subset=['y','z'],keep='first',inplace=True)
print(df)
# x y z
# 0 1 1 1
# 1 2 4 2
# 2 3 1 4
apply用于沿DataFrame的某个轴(axis=0按列,axis=1按行)对每一列或每一行应用函数,作用类似于Python内置的map函数,但返回的是Series或DataFrame,而不是迭代器。
apply(func, axis=0, raw=False, result_type=None, args=(), **kwds)
df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
print(df)
# A B
# 0 4 9
# 1 4 9
# 2 4 9
df.apply(np.sqrt)
# A B
# 0 2.0 3.0
# 1 2.0 3.0
# 2 2.0 3.0
groupby(by=None,
axis=0,
level=None,
as_index: bool = True,
sort: bool = True,
group_keys: bool = True,
squeeze: bool = False,
observed: bool = False)
https://blog.csdn.net/qq1483661204/article/details/79824381
https://blog.csdn.net/zgljl2012/article/details/54880353
https://zhuanlan.zhihu.com/p/142972462
http://localhost:8888/notebooks/Desktop/zsw/code/jupyter_notebook/pandas%E8%BF%9B%E9%98%B6.ipynb