# One-hot encode the iris class labels. reshape((-1, 1)) turns the 1-D label
# vector into a single column, because the encoder expects 2-D input.
# The result is a SciPy sparse matrix (see the repr printed below).
c = OneHotEncoder(categories='auto').fit_transform(iris.target.reshape((-1,1)))
c
>> <150x3 sparse matrix of type '<class 'numpy.float64'>' with 150 stored elements in Compressed Sparse Row format>
1
2
3
4
# Convert the sparse one-hot result into a dense NumPy array so it can be inspected.
c.toarray()
1
6.缺失值处理
strategy 参数的可选值:mean(平均值)、median(中位数)、most_frequent(众数)。
from sklearn.impute import SimpleImputer
1
# Replace missing values with the per-column mean.
# NOTE(review): `c` appears to be the one-hot matrix built earlier, which
# presumably has no missing values -- this call only demonstrates the API.
d = SimpleImputer(strategy='mean').fit_transform(c)
d
from sklearn.preprocessing import FunctionTransformer
1
# Build a 2x6 demo array holding the integers 0..11.
a = np.arange(0,12).reshape(2,6)
a
>>array([[0,1,2,3,4,5],[6,7,8,9,10,11]])
1
2
3
4
# Apply np.log element-wise through a FunctionTransformer.
# The 0 entry in `a` maps to -inf, as the printed output shows.
m = FunctionTransformer(np.log).fit_transform(a)
m
>>array([[-inf,0.,0.69314718,1.09861229,1.38629436,1.60943791],[1.79175947,1.94591015,2.07944154,2.19722458,2.30258509,2.39789527]])
1
2
3
4
5
6
2.特征选择
1.删除低方差特征
from sklearn.feature_selection import VarianceThreshold
1
(threshold=2) 方差低于2的特征会被删除
# Keep only the columns of `a` whose variance is above the threshold of 2.
b = VarianceThreshold(threshold=2).fit_transform(a)
b
1
2
删除所有标准差为0的列
# Drop every column whose standard deviation is exactly 0 (constant columns).
# NOTE(review): `X` is defined outside this excerpt -- presumably a pandas DataFrame; confirm.
X = X.drop(X.columns[X.std()==0], axis=1)
1
2.SelectKBest()方法
SelectKBest 按打分函数给每个特征打分,并保留得分最高的 k 个特征。f_classif 利用 ANOVA(方差分析,Analysis of Variance,又称 F 检验)给特征打分;此外还可以使用 mutual_info_classif(基于互信息)或 chi2(卡方检验)。这三种打分函数都是常用的过滤法(filter)特征选择手段。
# Feature indices ordered from highest to lowest score
# (argsort sorts ascending, so the result is reversed).
# NOTE(review): `c` here is presumably the fitted SelectKBest from the complete
# example further down, not the one-hot matrix defined earlier -- confirm.
m = np.argsort(c.scores_)[::-1]
m
# Wrap the iris feature matrix in a DataFrame so columns can be looked up by position.
e = pd.DataFrame(iris.data)
e
1
2
3
4
返回最重要的两个特征的索引名称
list(e.columns.values[m[0:2]])
>> 索引名称为 [2, 3]
1
2
使用卡方检验查看最重要的2个特征完整代码
from sklearn.feature_selection import SelectKBest,chi2,f_classif
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
# Load the iris dataset (features in `iris.data`, class labels in `iris.target`).
iris = load_iris()
# Score every feature against the labels with the chi-squared test and keep the best two.
c = SelectKBest(chi2, k=2)
d = c.fit_transform(iris.data, iris.target)
# Feature indices ordered from highest to lowest chi-squared score
# (argsort sorts ascending, so the result is reversed).
m = np.argsort(c.scores_)[::-1]
# The DataFrame gets default integer column labels, so labels double as feature indices.
e = pd.DataFrame(iris.data)
# Column labels of the two top-scoring features.
list(e.columns.values[m[0:2]])