目录
1.数据集介绍
我们首先构建决策树对AllElectronics数据集进行分类,随后构建一个KNN分类器,实现对鸢尾花数据集进行分类。
1.数据集介绍
- import csv
- #定义数据
- data = [
- ['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer'],
- ['1', 'youth', 'high', 'no', 'fair', 'no'],
- ['2', 'youth', 'high', 'no', 'excellent', 'no'],
- ['3', 'middle_aged', 'high', 'no', 'fair', 'yes'],
- ['4', 'senior', 'medium', 'no', 'fair', 'yes'],
- ['5', 'senior', 'low', 'yes', 'fair', 'yes'],
- ['6', 'senior', 'low', 'yes', 'excellent', 'no'],
- ['7', 'middle_aged', 'low', 'yes', 'excellent', 'yes'],
- ['8', 'youth', 'medium', 'no', 'fair', 'no'],
- ['9', 'youth', 'low', 'yes', 'fair', 'yes'],
- ['10', 'senior', 'medium', 'yes', 'fair', 'yes'],
- ['11', 'youth', 'medium', 'yes', 'excellent', 'yes'],
- ['12', 'middle_aged', 'medium', 'no', 'excellent', 'yes'],
- ['13', 'middle_aged', 'high', 'yes', 'fair', 'yes'],
- ['14', 'senior', 'medium', 'no', 'excellent', 'no'],
- ]
- #打开文件以写入数据,使用'newline=''来避免额外的空行
- with open('AllElectronics.csv', 'w', newline='') as csvfile:
- writer = csv.writer(csvfile)
- for row in data:
- writer.writerow(row)
2.代码实现
(1)导入库
- from graphviz import Source
- import pandas as pd
- from sklearn import tree
- from sklearn.feature_extraction import DictVectorizer
- from sklearn import preprocessing
(2)读取数据并进行相应的处理
- data = pd.read_csv('AllElectronics.csv')
- data = pd.DataFrame(data)
-
- valuedata = data.values
- header = list(data.columns)[1:6]
-
- featureList = []
- labelList = data['class_buys_computer']
- for value in valuedata:
- featureDict = {}
- for i in range(4):
- featureDict[header[i]]=value[i+1]
- featureList.append(featureDict)
(3)建立决策树
- vec = DictVectorizer()
- dummyX = vec.fit_transform(featureList).toarray()
-
- lb = preprocessing.LabelBinarizer()
- dummyY = lb.fit_transform(labelList)
-
- clf = tree.DecisionTreeClassifier(criterion='entropy')
-
- vec.get_feature_names()
- print(vec.get_feature_names());
(4)生成决策树图形
- clf = clf.fit(dummyX, dummyY)
- print(clf)
-
- graph = Source(tree.export_graphviz(clf, feature_names=vec.get_feature_names(),
- out_file=None))
- print(graph)
1.数据集介绍
2.代码实现
(1)导入库
(2)加载数据
- import numpy as up
- import pandas as pd
-
- dataset = pd.read_csv('E:\大数据挖掘/Iris.csv')
- dataset.shape
- print(dataset)
(3)数据预处理
(4)数据集切分:训练集和测试集
- dataset.describe()
- dataset.groupby('Species').size()
-
- feature_columns = ['SepalLengthCm',
- 'SepalWidthCm',
- 'PetalLengthCm',
- 'PetalWidthCm']
- X = dataset[feature_columns].values
- y = dataset['Species'].values
- le = LabelEncoder()
- y = le.fit_transform(y)
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
- plt.figure(figsize=(10,6))
-
(5)数据可视化探索
- import numpy as np
- import pandas as pd
- from sklearn.preprocessing import LabelEncoder
- from sklearn.model_selection import train_test_split
- import matplotlib.pylab as plt
- import seaborn as sns
- from pandas.plotting import parallel_coordinates
- import matplotlib.pyplot as plt
-
-
-
- dataset = pd.read_csv('E:\大数据挖掘/iris.csv')
-
- dataset.shape
- dataset.describe()
- dataset.groupby('Species').size()
-
- feature_columns = ['SepalLengthCm',
- 'SepalWidthCm',
- 'PetalLengthCm',
- 'PetalWidthCm']
- X = dataset[feature_columns].values
- y = dataset['Species'].values
-
- le = LabelEncoder()
- y = le.fit_transform(y)
-
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
-
-
-
- plt.figure(figsize=(10,6))
-
- parallel_coordinates(frame=dataset, class_column="Species")
- plt.title('Parallel Coordinates Plot', fontsize=15, fontweight='bold')
- plt.xlabel('Features', fontsize=10)
- plt.ylabel('Features values', fontsize=10)
- plt.legend(loc=1, prop={'size': 10}, frameon=True, shadow=True, facecolor='white', edgecolor='black')
- plt.show()
(6)实现KNN分类模型
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.metrics import confusion_matrix,accuracy_score
- from sklearn.model_selection import cross_val_score
-
- classifier = KNeighborsClassifier(n_neighbors=3)
-
- classifier.fit(X_train, y_train)
-
- y_pred = classifier.predict(X_test)
- cm = confusion_matrix(y_test, y_pred)
- cm
- print(cm)
- accuracy=accuracy_score(y_test, y_pred)*100
- print('模型准确率为:'+str(round(accuracy, 2))+'%.')
(7)使用交叉验证进行参数优化
- k_list=list=(range(1,50,2))#三个参数代表range;开始,结束,步长
- cv_scores=[]
-
- for k in k_list:
- knn=KNeighborsClassifier(n_neighbors=k)
- scores=cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
- cv_scores.append(scores.mean())
(8)可视化结果
(9)得出最优值
- MSE=[1-x for x in cv_scores]
-
- plt.figure()
- plt.figure(figsize=(10, 6))
-
-
- plt.title('the optimal number of neighbors', fontsize=15, fontweght='bold')
-
-
- plt.xlabel('Number of neighbors K', fontsize=10)
-
- sns.set_style("whitegrid")
-
- plt.plot(k_list, MSE)
- plt.sho
- best_k=k_list[MSE.index(min(MSE))]
- print("最优近邻数K为:%d"%best_k)