
- import pandas as pd
- import numpy as np
- # import warnings
- # warnings.filterwarnings('ignore')
-
- diabetes = pd.read_csv(r"D:\本科\kaggle数据挖掘\titanic\diabetes.csv")  # hard-coded absolute path on drive D:; adjust for your machine
- titanic = pd.read_csv(r"D:\本科\kaggle数据挖掘\titanic\train.csv")  # same directory as the diabetes file
- titanic.fillna(0, inplace = True)  # replace every missing value with 0, modifying titanic in place
- titanic.head()
diabetes.head()
- from sklearn.preprocessing import PolynomialFeatures
- tmp = PolynomialFeatures(degree=5).fit_transform(diabetes.iloc[:,0:1])  # powers 0..5 of the first diabetes column (six output columns)
- tmp = pd.DataFrame(tmp)  # wrap the returned array so its columns can be labelled
- tmp.rename(columns = {0:'preg$^0$', 1:'preg$^1$', 2:'preg$^2$', 3:'preg$^3$', 4:'preg$^4$', 5:'preg$^5$'}, inplace = True)  # label each column by its power
- new_diabetes = pd.concat([diabetes, tmp], axis = 1, join = 'inner')  # append column-wise; join='inner' keeps only index labels present in both frames
- new_diabetes.head()
- temp = PolynomialFeatures(degree = 5).fit_transform(titanic.iloc[:,9:10])  # same expansion for the titanic column at position 9 (labelled 'fare' below)
- temp = pd.DataFrame(temp)
- temp.rename(columns = {0:'fare$^0$', 1:'fare$^1$', 2:'fare$^2$', 3:'fare$^3$', 4:'fare$^4$', 5:'fare$^5$'}, inplace = True)  # label each column by its power
- new_titanic = pd.concat([titanic, temp], axis = 1, join = 'inner')  # append column-wise, as for new_diabetes
- new_titanic.iloc[0:5, 6:]  # first five rows, columns from position 6 onward
- data = diabetes.iloc[[0,1], [0,1]]  # first two rows and first two columns of diabetes
- data
PolynomialFeatures(degree = 2, include_bias = False).fit_transform(data)  # no constant column; for inputs (a, b) the output order is a, b, a^2, a*b, b^2
array([[6.0000e+00, 1.4800e+02, 3.6000e+01, 8.8800e+02, 2.1904e+04],
[1.0000e+00, 8.5000e+01, 1.0000e+00, 8.5000e+01, 7.2250e+03]])
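That is, each input row (a, b) expands to (a, b, a², ab, b²):

| a | b | a² | ab | b² |
|---|---|---|---|---|
| 6 | 148 | 36 | 888 | 21904 |
| 1 | 85 | 1 | 85 | 7225 |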
PolynomialFeatures(degree = 3, include_bias = False).fit_transform(data)  # degree 3 appends the cubic terms a^3, a^2*b, a*b^2, b^3 after the five degree-2 columns
array([[6.000000e+00, 1.480000e+02, 3.600000e+01, 8.880000e+02,
2.190400e+04, 2.160000e+02, 5.328000e+03, 1.314240e+05,
3.241792e+06],
[1.000000e+00, 8.500000e+01, 1.000000e+00, 8.500000e+01,
7.225000e+03, 1.000000e+00, 8.500000e+01, 7.225000e+03,
6.141250e+05]])
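That is, each input row (a, b) expands to (a, b, a², ab, b², a³, a²b, ab², b³):

| a | b | a² | ab | b² | a³ | a²b | ab² | b³ |
|---|---|---|---|---|---|---|---|---|
| 6 | 148 | 36 | 888 | 21904 | 216 | 5328 | 131424 | 3241792 |
| 1 | 85 | 1 | 85 | 7225 | 1 | 85 | 7225 | 614125 |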


- import pandas as pd
- t = pd.DataFrame()
- t['time'] = ['2022-04-05;13:34:03',  # raw strings: date and time separated by ';'
- '1949-10-03;14:01:06',
- '1945-08-15;09:00:00']
- t
- t['time'] = pd.to_datetime(t['time'])  # parse the strings and overwrite the column with datetime values
- t['time']
0 2022-04-05 13:34:03 1 1949-10-03 14:01:06 2 1945-08-15 09:00:00 Name: time, dtype: datetime64[ns]
- t1 = pd.DataFrame()
- t1['time'] = ['1997-07-01',  # date-only strings, no time part
- '1999-12-20']
- t1['time'] = pd.to_datetime(t1['time'])  # parse the strings and overwrite the column with datetime values
- t1['time']
0 1997-07-01 1 1999-12-20 Name: time, dtype: datetime64[ns]
- t1['time'].values.astype('datetime64[D]')  # returns a new day-precision array; the result is not assigned, so t1['time'] is unchanged
- t1['time']
0 1997-07-01 1 1999-12-20 Name: time, dtype: datetime64[ns]
- t1['time'].values.astype('datetime64[h]')  # returns a new hour-precision array; again not assigned, so t1['time'] is unchanged
- t1['time']
0 1997-07-01 1 1999-12-20 Name: time, dtype: datetime64[ns]

t1['time'].dt.year  # year component of each timestamp, via the .dt accessor
0 1997 1 1999 Name: time, dtype: int64
t1['time'].dt.quarter  # calendar quarter (1-4) of each timestamp
0 3 1 4 Name: time, dtype: int64
- import numpy as np
- a = np.array([[1,2] * 5, [0, 1, 1, 1, 1, 0, 0, 0, 1, 0]]).T
- train = pd.DataFrame(a, columns = ['tenure', 'Churn'])
- train
- from sklearn.model_selection import KFold
- kf = KFold(n_splits = 5)
- for train, text in kf.split(a):
- print('train: %s, text: %s' %(train, text))