boruta_py 包安装:
pip install Boruta
estimator : object
A supervised learning estimator, with a 'fit' method that returns the feature_importances_ attribute.
Important features must correspond to high absolute values in the feature_importances_.
n_estimators : int or string, default = 1000
If int sets the number of estimators in the chosen ensemble method. If 'auto' this is determined automatically based on the size of the dataset.
The other parameters of the used estimators need to be set with initialisation.
perc : int, default = 100
Instead of the max we use the percentile defined by the user, to pick our threshold for comparison between shadow and real features. The max tends to be too stringent.
This provides a finer control over this. The lower perc is the more false positives will be picked as relevant but also the less relevant features will be left out.
The usual trade-off. The default is essentially the vanilla Boruta corresponding to the max.
alpha : float, default = 0.05(显著性水平)
Level at which the corrected p-values will get rejected in both correction steps.
two_step : Boolean, default = True
If you want to use the original implementation of Boruta with Bonferroni correction only set this to False.
max_iter : int, default = 100(最大迭代次数)
The number of maximum iterations to perform.
verbose : int, default=0
Controls verbosity of output.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
# Load the feature matrix and the target as NumPy arrays.
X = pd.read_csv('C:/Users/Administrator/Desktop/test_X.csv', index_col=0).values
y = pd.read_csv('C:/Users/Administrator/Desktop/test_y.csv', header=None, index_col=0).values
y = y.ravel()  # flatten the target to a 1-D vector

# Base estimator whose feature_importances_ Boruta compares against the
# shadow features.
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# Define the Boruta feature-selection method.
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

# Find all relevant features. The selector has to be fitted before
# support_, ranking_ or transform() can be used.
feat_selector.fit(X, y)

# Check the selected features.
print(feat_selector.support_)

# Check the feature ranking.
print(feat_selector.ranking_)

# Call transform() on X to reduce it to the selected features.
X_filtered = feat_selector.transform(X)
随机森林因其相对良好的准确性、鲁棒性和易用性,成为最受欢迎的机器学习方法之一。它还提供了两种简单的特征选择方法:1. 基尼重要性(Gini Importance),即平均不纯度减少 (MDI);2. 排列重要性(Permutation Importance),即平均精度下降 (MDA)
一种新的全相关特征选择方法是:Boruta,由Witold R. Rudnicki构思,并在华沙大学(ICM UW)数学和计算建模跨学科中心开发。
在scikit-learn中,对于分类问题,Gini Importance以RandomForestClassifier.feature_importances_ 的形式实现;对于回归问题,以RandomForestRegressor.feature_importances_ 的形式实现。
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from collections import defaultdict
from sklearn.metrics import r2_score
sys.path.insert(0, 'boruta_py-master/boruta')
from boruta import BorutaPy
sys.path.insert(0, 'random-forest-importances-master/src')
from rfpimp import *
%matplotlib inline
# Load the wine-quality and house datasets from local CSV files.
# NOTE(review): these are absolute Windows Desktop paths; adjust them to
# wherever the CSV files live on your machine.
wine = pd.read_csv("C:/Users/Administrator/Desktop/winequality-red.csv")
house = pd.read_csv("C:/Users/Administrator/Desktop/kc_house_data.csv")
# Synthetic demo: three features that are all noisy copies of the same
# seed signal, and a target that is their plain sum.
size = 10000
# np.random.normal(loc, scale, size):
#   1. loc:   mean of the distribution (its centre)
#   2. scale: standard deviation (its width; larger means flatter and wider)
#   3. size:  output shape, None by default
X_seed = np.random.normal(0, 1, size)  # shape (10000,)
X0 = X_seed + np.random.normal(0, .1, size)
X1 = X_seed + np.random.normal(0, .1, size)
X2 = X_seed + np.random.normal(0, .1, size)
X = np.array([X0, X1, X2]).T
Y = X0 + X1 + X2

# Build the forest first, then fit it; feature_importances_ only exists
# on a fitted estimator.
rf = RandomForestRegressor(n_estimators=20, max_features=2)
rf.fit(X, Y)
print("Scores for X0, X1, X2: {}".format(rf.feature_importances_))
Scores for X0, X1, X2: [0.29792321 0.52388941 0.17818738]
def permutation_importances(rf, X_train, y_train, metric, n_repeats=10):
    """Estimate permutation importance for every column of ``X_train``.

    For each column the values are shuffled ``n_repeats`` times, the metric
    is re-evaluated after every shuffle, and the importance is the drop of
    the mean shuffled score below the unshuffled baseline.

    Parameters
    ----------
    rf : object
        Fitted model; it is only passed through to ``metric``.
    X_train : pandas.DataFrame
        Feature table. Columns are shuffled in place one at a time and
        restored afterwards, also when ``metric`` raises.
    y_train : array-like
        Target values; only passed through to ``metric``.
    metric : callable
        ``metric(rf, X_train, y_train)`` returning a score where higher is
        better.
    n_repeats : int, default 10
        Number of shuffles per column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(imp, std)``: per-column ``baseline - mean(shuffled scores)`` and
        the standard deviation of the shuffled scores, in column order.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    baseline = metric(rf, X_train, y_train)
    imp = []
    std = []
    for col in X_train.columns:
        original = X_train[col].copy()
        scores = []
        try:
            for _ in range(n_repeats):
                # permutation() returns a shuffled copy of the given values
                X_train[col] = np.random.permutation(original)
                scores.append(metric(rf, X_train, y_train))
        finally:
            # Always hand the caller's frame back unmodified.
            X_train[col] = original
        imp.append(baseline - np.mean(scores))
        std.append(np.std(scores))
    return np.array(imp), np.array(std)
Boruta是一个R包的名字,它实现了一种新的特征选择算法。它像Permutation Importance一样对变量进行随机排列,但同时对所有变量进行排列,并将洗牌后的特征与原始特征连接起来。它迭代地删除那些被统计测试证明不如随机探针相关的特征。
数据解释: 葡萄酒数据集,第一个数据集与葡萄牙 "Vinho Verde "葡萄酒的红色变体有关。该数据集只有物理化学(输入)和感官(输出)变量可用(没有关于葡萄类型、葡萄酒品牌、葡萄酒销售价格等数据)。
输入连续变量(基于物理化学测试): 固定酸度、挥发性酸度、柠檬酸、残糖、氯化物、游离二氧化硫、总二氧化硫、密度、pH值、硫酸盐、酒精
输出分类变量(基于感官数据): 质量(分数在0和10之间)
# Count of each value of the target variable.
sns.countplot(x='quality', data=wine)

# Turn the response into a binary class by putting a cut-off on quality:
#   bad:  quality <= 6
#   good: quality >= 7
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
quality_bands = pd.cut(wine['quality'], bins=bins, labels=group_names)

# Encode the band names as integers: bad becomes 0, good becomes 1.
label_quality = LabelEncoder()
wine['quality'] = label_quality.fit_transform(quality_bands)

# Count of each value of the new response variable.
sns.countplot(x='quality', data=wine)
# Kernel-density curves of each feature, drawn separately for bad (0) and
# good (1) wines, one feature per panel of a 3x5 grid filled row by row.
kde_features = [
    'fixed acidity',
    'volatile acidity',
    'sulphates',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'alcohol',
]

fig, ax = plt.subplots(nrows=3, ncols=5, figsize=(15, 10))
for axis, feature in zip(ax.flat, kde_features):
    for quality_code, label in ((0, 'Bad Quality'), (1, 'Good Quality')):
        sns.distplot(wine[wine['quality'] == quality_code][feature], hist=False, kde=True,
                     kde_kws={'shade': True, 'linewidth': 3}, label=label, ax=axis)
    # Attach the legend to the panel that was just drawn; plt.legend would
    # act on the figure's current axes instead.
    axis.legend(loc=0, prop={'size': 8})
plt.subplots_adjust(wspace=.3, hspace=.3)
# Pairwise correlation between the input features (target column excluded).
corr = wine.drop('quality', axis=1).corr()

# Build a mask for the upper triangle (diagonal included) so every pair is
# shown once. The builtin bool is used because the np.bool alias was removed
# from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(10, 10))

# Generate a custom diverging colormap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and a square aspect ratio.
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
正相关特征:“总二氧化硫”与“游离二氧化硫”、“固定酸度”与“柠檬酸”、“固定酸度”与“密度”;
负相关特征:“pH值”与“固定酸度”、“pH值”与“柠檬酸”、“柠檬酸”与“挥发性酸度”;