# (The code follows; it is listed in the next section and omitted here.)
"""
file name: classification.py
author: yichen Li
date: 2023/10/17
"""
# Import packages
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import pandas as pd
# import matplotlib.pyplot as plt
import scipy as stat
import numpy as np
import time
# Load dataset
def initData(path="./agaricus-lepiota.data"):
    """Load a header-less CSV file and return its contents as a NumPy array.

    Args:
        path: Location of the CSV file.  Defaults to the mushroom data file
            in the current working directory, so existing ``initData()``
            calls keep working.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per CSV line.

    Side effects:
        Prints the number of rows that were read.
    """
    data = pd.read_csv(path, header=None)
    print(len(data))
    return np.array(data)
# Mahalanobis Distance
def calculateMahalanobis(y=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of ``y``.

    The distance is measured from the column-wise mean of ``data``.  No
    square root is taken, so the values are squared distances.

    Args:
        y: Observations to score, shape ``(n_obs, n_features)``.  A 1-D
            input is treated as a single observation.  DataFrames and
            ndarrays are both accepted.
        data: Reference sample, shape ``(n_samples, n_features)``, used for
            the mean and (when ``cov`` is not given) the covariance.
        cov: Optional pre-computed ``(n_features, n_features)`` covariance
            matrix.  Estimated from ``data`` when ``None``.

    Returns:
        1-D ``numpy.ndarray`` with one squared distance per row of ``y``.

    Raises:
        TypeError: If ``y`` or ``data`` is ``None``.
        numpy.linalg.LinAlgError: If the covariance matrix is singular.
    """
    if y is None or data is None:
        raise TypeError("calculateMahalanobis() requires both 'y' and 'data'")

    data = np.asarray(data, dtype=float)
    y = np.atleast_2d(np.asarray(y, dtype=float))

    # Centre on the per-feature mean; a bare np.mean(data) collapses to one
    # scalar for ndarrays (and for DataFrames on pandas >= 2).
    y_mu = y - data.mean(axis=0)

    # `cov is None`, not `not cov`: the truth value of an array is ambiguous.
    if cov is None:
        cov = np.cov(data.T)
    inv_covmat = np.linalg.inv(np.atleast_2d(cov))

    left = np.dot(y_mu, inv_covmat)
    # Row-wise dot product == diagonal of left @ y_mu.T, without building the
    # full n_obs x n_obs matrix.
    return np.sum(left * y_mu, axis=1)
# Original Dataset
# Read the raw data set and show it together with its dimensions.
mushroom = initData()
print(mushroom, mushroom.shape, sep="\n")

# Bookkeeping for the missing values of column 11:
#   num_missing - how many '?' entries were seen
#   count[k]    - how many rows were encoded as category code k
#   mean        - placeholder, replaced by the imputation value later on
num_missing = 0
count = np.zeros((6,), dtype=int)
print(count.shape)
mean = 0
# Convert categorical data into numerical data
# One entry per column, in column order: (feature name, category letters).
# A letter's position in the string is the integer code that replaces it,
# e.g. for cap-shape: 'b' -> 0, 'c' -> 1, 'x' -> 2, ...
_FEATURE_LETTERS = (
    ("poisonous", "ep"),                        # 0 (target): e -> 0, p -> 1
    ("cap-shape", "bcxfks"),                    # 1
    ("cap-surface", "fgys"),                    # 2
    ("cap-color", "nbcgrpuewy"),                # 3
    ("bruises", "ft"),                          # 4
    ("odor", "alcyfmnps"),                      # 5
    ("gill-attachment", "adfn"),                # 6
    ("gill-spacing", "cwd"),                    # 7
    ("gill-size", "bn"),                        # 8
    ("gill-color", "knbhgropuewy"),             # 9
    ("stalk-shape", "et"),                      # 10
    ("stalk-root", "bcuezr"),                   # 11 ('?' marks a missing value)
    ("stalk-surface-above-ring", "fyks"),       # 12
    ("stalk-surface-below-ring", "fyks"),       # 13
    ("stalk-color-above-ring", "nbcgopewy"),    # 14
    ("stalk-color-below-ring", "nbcgopewy"),    # 15
    ("veil-type", "pu"),                        # 16
    ("veil-color", "nowy"),                     # 17
    ("ring-number", "not"),                     # 18
    ("ring-type", "ceflnpsz"),                  # 19
    ("spore-print-color", "knbhrouwy"),         # 20
    ("population", "acnsvy"),                   # 21
    ("habitat", "glmpuwd"),                     # 22
)
_CODE_TABLES = [
    {letter: code for code, letter in enumerate(letters)}
    for _name, letters in _FEATURE_LETTERS
]
_STALK_ROOT = 11  # column whose '?' entries are counted as missing

# Encode every row in place.  Iterating over the array itself (instead of a
# fixed range(0, 8124)) keeps this correct for any number of rows.
for row in mushroom:
    for col, table in enumerate(_CODE_TABLES):
        value = row[col]
        if col == _STALK_ROOT and value == '?':
            # Keep the '?' marker: it is replaced by the mean code later on.
            num_missing += 1
            continue
        # Only string categories are translated; anything else (unknown
        # letters, already-encoded numbers) is left untouched.
        code = table.get(value) if isinstance(value, str) else None
        if code is None:
            continue
        row[col] = code
        if col == _STALK_ROOT:
            count[code] += 1
print(mushroom)
# Impute the missing values of feature 11 with the mean of the known category
# codes.  count[k] is the number of rows carrying code k, so the mean is
# sum(k * count[k]) / sum(count[k]); the denominator is derived from the
# counts instead of being hard-coded.
known_total = int(count.sum())
if known_total:
    mean = np.dot(count, np.arange(count.size)) / known_total
else:
    # No known value to average over; fall back to 0.0 rather than
    # dividing by zero.
    mean = 0.0
# Replace every '?' marker, whatever the number of rows.
for row in mushroom:
    if row[11] == '?':
        row[11] = mean
# Training and Test sets
# Training Set
# Split the encoded data in file order: the first three quarters of the rows
# form the training set, the remainder the test set.  For the 8124-row data
# set this gives 6093 training and 2031 test rows, as before.
n_train = (len(mushroom) * 3) // 4

# Column 0 is the label; columns 1..22 are the features.  Slicing copies
# every row and every feature column.  The former index loops stopped one
# row short of each split, left feature column 0 at zero and never copied
# the last feature column.
features = mushroom[:, 1:].astype(float)
labels = mushroom[:, 0].astype(int).reshape(-1, 1)

# Training Set
training_x = features[:n_train]  # Training data without label
training_y = labels[:n_train]    # Training label, shape (n_train, 1)

# Test Set
test_x = features[n_train:]      # Test data without label
test_y = labels[n_train:]        # Test label, shape (n_test, 1)

# Examine training data
print(training_x)
print(training_x.shape)
print(training_y)
print(training_y.shape)
# Examine test data
print(test_x)
print(test_x.shape)
print(test_y)
print(test_y.shape)
# Dimensionality Reduction by PCA Method
# Fit PCA for several target dimensionalities.
# - Column 0 is the class label, so only columns 1.. are projected.
# - A tuple gives a fixed iteration order; a set literal does not.
# - Every projection is kept in pca_features_by_n instead of being
#   overwritten by the next iteration.
pca_input = mushroom[:, 1:].astype(float)
pca_features_by_n = {}
for n in (1, 2, 4, 8, 12, 16, 20):
    model = PCA(n_components=n)
    pca_features = model.fit_transform(pca_input)
    pca_features_by_n[n] = pca_features
    # NOTE: rebinds the module-level `mean` (previously the imputation value)
    # to the per-feature mean estimated by PCA, as the original code did.
    mean = model.mean_
    first_pc = model.components_[0, :]
print(mushroom.shape)
# Dimensionality Reduction by LDA Method
# Project the training features onto a single discriminant component with
# LDA; the (n, 1) label column is flattened to 1-D before being passed as y.
sklearn_lda = LDA(n_components=1)
X_lda_sklearn = sklearn_lda.fit_transform(training_x, y=training_y.ravel())
print(X_lda_sklearn.shape, X_lda_sklearn, sep="\n")
# Classification
def Classify(sample):
    """Placeholder classifier: ignores ``sample`` and returns ``None``.

    No classification logic is implemented yet.
    """
    return None
# Performance Analysis
def Performance(samples=None, labels=None, classifier=None):
    """Return the classification accuracy over a labelled sample set.

    Label ``0`` is treated as the positive class: a prediction of 0 for a
    sample labelled 0 counts as a true positive.

    Args:
        samples: Sequence of feature vectors.  Defaults to the module-level
            ``test_x``.
        labels: True labels, one per sample; an ``(n, 1)`` column is
            flattened.  Defaults to the module-level ``test_y``.
        classifier: Callable mapping one sample to a predicted label.
            Defaults to the module-level ``Classify``.

    Returns:
        ``(TP + TN) / (TP + TN + FP + FN)`` as a float, or ``0.0`` when
        there are no samples.

    Raises:
        ValueError: If ``samples`` and ``labels`` differ in length.
    """
    if samples is None:
        samples = test_x
    if labels is None:
        labels = test_y
    if classifier is None:
        classifier = Classify

    labels = np.asarray(labels).ravel()
    if len(samples) != len(labels):
        raise ValueError("samples and labels must have the same length")

    t_p = t_n = f_p = f_n = 0
    # Iterate over every sample; the former range(0, 2030) skipped the last.
    for sample, label in zip(samples, labels):
        # Compare the classifier's *result* with 0.  The original passed the
        # boolean mask `sample == 0` into the classifier instead.
        predicted_positive = bool(classifier(sample) == 0)
        actual_positive = bool(label == 0)
        if predicted_positive:
            if actual_positive:
                t_p += 1  # True Positive
            else:
                f_p += 1  # False Positive
        elif actual_positive:
            f_n += 1  # False Negative
        else:
            t_n += 1  # True Negative

    total = t_p + t_n + f_p + f_n
    if total == 0:
        return 0.0
    return (t_p + t_n) / total
# Efficiency
# Calculate the time of executing function
def cal_time(func, *args, **kwargs):
    """Return the wall-clock time, in seconds, of one call to ``func``.

    Args:
        func: Callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Elapsed time in seconds as measured by ``time.perf_counter``.  The
        return value of ``func`` is discarded.
    """
    started = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - started