• 机器学习:朴素贝叶斯算法(Python)


    一、朴素贝叶斯算法的实现

    naive_bayes_classifier.py

    1. import numpy as np
    2. import collections as cc # 集合的计数功能
    3. from scipy.stats import norm # 极大似然估计样本的均值和标准方差
    4. from data_bin_wrapper import DataBinsWrapper
    class NaiveBayesClassifier:
        """
        Naive Bayes classifier for discrete, continuous, or mixed features.

        Continuous features are handled in one of two ways:

        1. ``is_binned=True``: they are discretised with ``DataBinsWrapper`` and
           then treated like any other discrete feature (Laplace-smoothed counts).
        2. ``is_binned=False``: the features listed in ``feature_R_idx`` are
           modelled per class with a Gaussian whose mean and standard deviation
           are maximum-likelihood estimates.

        Class-conditional probabilities of discrete features and the class
        priors both use Laplace (add-one) smoothing.
        """

        # Lower bound applied to a fitted standard deviation at prediction time,
        # so that a feature which is constant within a class (sigma == 0) does
        # not make ``norm.pdf`` return NaN.
        _MIN_SIGMA = 1e-6

        def __init__(self, is_binned=False, is_feature_all_R=False, feature_R_idx=None, max_bins=10):
            """
            :param is_binned: whether continuous features are discretised by binning
            :param is_feature_all_R: whether every feature is continuous (only
                consulted when ``is_binned`` is true)
            :param feature_R_idx: indices of the continuous features in mixed data
            :param max_bins: maximum number of bins
            """
            self.is_binned = is_binned
            # Always defined, so that the attributes exist whichever mode is used.
            self.is_feature_all_R = is_feature_all_R
            self.max_bins = max_bins
            self.dbw_XrangeMap = dict()  # per-feature bin edges learned from the training data
            # The binning helper is only created when it is actually needed.
            # NOTE(review): max_bins is stored but not forwarded to DataBinsWrapper;
            # confirm against DataBinsWrapper's constructor whether it should be.
            self.dbw = DataBinsWrapper() if is_binned else None
            self.feature_R_idx = feature_R_idx
            self.class_values, self.n_class = None, 0  # distinct class labels and their number
            self.prior_prob = dict()  # class label -> smoothed prior probability
            # class label -> {feature index -> Counter of values, or {"mu", "sigma"}}
            self.classified_feature_prob = dict()
            self.feature_values_num = dict()  # feature index -> number of distinct values
            self.class_values_num = dict()  # class label -> number of training samples

        def _continuous_idx(self):
            """
            Return the continuous feature indices as a set of ints.

            :return: empty set when ``feature_R_idx`` is None
            """
            if self.feature_R_idx is None:
                return set()
            return {int(i) for i in np.asarray(self.feature_R_idx).ravel()}

        def _prior_probability(self, y_train):
            """
            Compute the Laplace-smoothed prior probability of every class.

            :param y_train: target labels
            :return: None, the result is stored in ``self.prior_prob``
            """
            n_samples = len(y_train)
            self.class_values_num = cc.Counter(y_train)
            self.prior_prob = {
                label: (count + 1) / (n_samples + self.n_class)
                for label, count in self.class_values_num.items()
            }

        def _count_feature_values(self, x_samples):
            """
            Record, for every column, how many distinct values it takes.

            :param x_samples: 2-D sample array
            :return: None, the result is stored in ``self.feature_values_num``
            """
            self.feature_values_num = {
                i: len(np.unique(x_samples[:, i])) for i in range(x_samples.shape[1])
            }

        def _data_bin_wrapper(self, x_samples):
            """
            Bin the columns listed in ``feature_R_idx`` and leave the others as is.

            When no bin edges are stored yet the samples are taken to be the
            training set: each continuous column is fitted and its edges are
            remembered. Otherwise the remembered edges are reused, so that test
            samples are binned exactly like the training samples.

            :param x_samples: training or test samples
            :return: samples with the continuous columns replaced by bin values
            """
            continuous_idx = self._continuous_idx()
            is_training = not self.dbw_XrangeMap
            columns = []
            for i in range(x_samples.shape[1]):
                column = x_samples[:, i]
                if i not in continuous_idx:
                    columns.append(column)
                elif is_training:
                    self.dbw.fit(column)
                    self.dbw_XrangeMap[i] = self.dbw.XrangeMap
                    columns.append(self.dbw.transform(column))
                else:
                    columns.append(self.dbw.transform(column, self.dbw_XrangeMap[i]))
            return np.asarray(columns).T

        def fit(self, x_train, y_train):
            """
            Train the classifier: every probability estimate needed for
            prediction is computed once and stored.

            :param x_train: training samples, shape (n_samples, n_features)
            :param y_train: target labels
            :return: None
            :raises ValueError: if ``y_train`` contains fewer than two classes
            """
            x_train, y_train = np.asarray(x_train), np.asarray(y_train)
            self.class_values = np.unique(y_train)
            self.n_class = len(self.class_values)
            if self.n_class < 2:
                # Raise instead of exiting the interpreter from library code.
                raise ValueError("仅有一个类别,不进行贝叶斯分类器估计...")
            # Drop everything learned by a previous fit, including the bin edges;
            # otherwise a refit would bin the new training data with stale edges.
            self.classified_feature_prob = dict()
            self.dbw_XrangeMap = dict()
            self._prior_probability(y_train)
            self._count_feature_values(x_train)
            if self.is_binned:
                self._binned_fit(x_train, y_train)
            else:
                self._gaussian_fit(x_train, y_train)

        def _binned_fit(self, x_train, y_train):
            """
            Bin the continuous features, then count value frequencies per class.

            :param x_train: training samples
            :param y_train: target labels
            :return: None
            """
            if self.is_feature_all_R:
                self.dbw.fit(x_train)
                x_train = self.dbw.transform(x_train)
            elif self.feature_R_idx is not None:
                x_train = self._data_bin_wrapper(x_train)
            x_train = np.asarray(x_train)
            # The Laplace denominator needs the number of distinct *binned*
            # values, not the number of distinct raw continuous values.
            self._count_feature_values(x_train)
            for c in self.class_values:
                class_x = x_train[y_train == c]
                self.classified_feature_prob[c] = {
                    i: cc.Counter(class_x[:, i]) for i in range(x_train.shape[1])
                }

        def _gaussian_fit(self, x_train, y_train):
            """
            Estimate, per class, a Gaussian for each continuous feature and a
            value-frequency table for each discrete feature.

            :param x_train: training samples
            :param y_train: target labels
            :return: None
            """
            continuous_idx = self._continuous_idx()
            for c in self.class_values:
                class_x = x_train[y_train == c]
                feature_stats = dict()
                for i in range(x_train.shape[1]):
                    if i in continuous_idx:
                        # Maximum-likelihood estimate of mean and standard deviation.
                        mu, sigma = norm.fit(np.asarray(class_x[:, i], dtype=np.float64))
                        feature_stats[i] = {"mu": mu, "sigma": sigma}
                    else:
                        feature_stats[i] = cc.Counter(class_x[:, i])
                self.classified_feature_prob[c] = feature_stats

        def predict_proba(self, x_test):
            """
            Predict the class-membership probabilities of the test samples.

            :param x_test: test samples, shape (n_samples, n_features)
            :return: array of shape (n_samples, n_class); columns follow the
                order of ``self.class_values``
            """
            x_test = np.asarray(x_test)
            if self.is_binned:
                return self._binned_predict_proba(x_test)
            return self._gaussian_predict_proba(x_test)

        def _log_posterior(self, x_test, continuous_idx):
            """
            Compute the unnormalised log posterior of every sample for every class.

            :param x_test: test samples, already binned where applicable
            :param continuous_idx: feature indices scored with the fitted Gaussian;
                every other feature is scored from its Laplace-smoothed counts
            :return: array of shape (n_samples, n_class)
            """
            log_post = np.zeros((x_test.shape[0], self.n_class))
            for k, c in enumerate(self.class_values):
                feature_stats = self.classified_feature_prob[c]
                log_post[:, k] = np.log(self.prior_prob[c])
                for j in range(x_test.shape[1]):
                    column = x_test[:, j]
                    if j in continuous_idx:
                        mu = feature_stats[j]["mu"]
                        sigma = max(feature_stats[j]["sigma"], self._MIN_SIGMA)
                        density = norm.pdf(np.asarray(column, dtype=np.float64), mu, sigma)
                        # The small constant keeps the logarithm finite far from the mean.
                        log_post[:, k] += np.log(density + 1e-8)
                    else:
                        freq = feature_stats[j]  # e.g. Counter({"a": 4, "b": 3})
                        counts = np.array([freq.get(v, 0) for v in column], dtype=np.float64)
                        # Laplace smoothing: unseen values still get a non-zero probability.
                        log_post[:, k] += np.log(
                            (counts + 1) / (self.class_values_num[c] + self.feature_values_num[j]))
            return log_post

        def _binned_predict_proba(self, x_test):
            """
            Predict probabilities after binning the continuous features.

            :param x_test: test samples
            :return: array of shape (n_samples, n_class)
            """
            if self.is_feature_all_R:
                x_test = self.dbw.transform(x_test)
            elif self.feature_R_idx is not None:
                x_test = self._data_bin_wrapper(x_test)
            x_test = np.asarray(x_test)
            # After binning every feature is discrete.
            return self.softmax_func(self._log_posterior(x_test, set()))

        @staticmethod
        def softmax_func(x):
            """
            Softmax over the last axis; the maximum is subtracted first to avoid
            overflow.

            :param x: array of scores, shape (n_class,) or (n_samples, n_class)
            :return: array of the same shape whose last axis sums to 1
            """
            x = np.asarray(x)
            exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
            return exps / np.sum(exps, axis=-1, keepdims=True)

        def _gaussian_predict_proba(self, x_test):
            """
            Predict probabilities without binning: continuous features use the
            fitted Gaussians, discrete features use smoothed counts.

            :param x_test: test samples
            :return: array of shape (n_samples, n_class)
            """
            return self.softmax_func(self._log_posterior(x_test, self._continuous_idx()))

        def predict(self, x_test):
            """
            Predict the class label of every test sample.

            :param x_test: test samples
            :return: array of labels taken from ``self.class_values``
            """
            # Map the winning column back to its label; the bare argmax is only
            # correct when the labels happen to be 0..n_class-1.
            return self.class_values[np.argmax(self.predict_proba(x_test), axis=1)]

    二、可视化分类边界函数

    plt_decision_function.py

    1. import matplotlib.pyplot as plt
    2. import numpy as np
    def plot_decision_function(X, y, clf, is_show=True):
        """
        Visualise the decision regions of a classifier over two features.

        The classifier is evaluated on a 100 x 100 grid spanning the range of the
        first two columns of X (padded by 1 on every side); the predictions are
        drawn as filled contours and the samples are scattered on top.

        :param X: samples; only columns 0 and 1 are used
        :param y: class of each sample, used to colour the scatter points
        :param clf: fitted model exposing ``predict``
        :param is_show: when true, create a new figure and show it; pass false
            when the caller draws into its own subplot
        :return: None
        """
        X = np.asarray(X)  # accept lists and other array-likes, which lack [:, 0] indexing
        if is_show:
            plt.figure(figsize=(7, 5))
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xi, yi = np.meshgrid(np.linspace(x_min, x_max, 100),
                             np.linspace(y_min, y_max, 100))
        # Predict every grid point, then fold the flat result back onto the grid.
        y_pred = clf.predict(np.c_[xi.ravel(), yi.ravel()])
        y_pred = y_pred.reshape(xi.shape)
        plt.contourf(xi, yi, y_pred, cmap="winter", alpha=0.4)
        plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors="k")
        plt.xlabel("Feature 1", fontdict={"fontsize": 12})
        plt.ylabel("Feature 2", fontdict={"fontsize": 12})
        # Title previously read "NativeBayes"; corrected spelling.
        plt.title("NaiveBayes Model Classification Boundary", fontdict={"fontsize": 14})
        if is_show:
            plt.show()

    三、朴素贝叶斯算法的测试

    1. import matplotlib.pyplot as plt
    2. import pandas as pd
    3. import numpy as np
    4. from naive_bayes_classifier import NaiveBayesClassifier
    5. from sklearn.datasets import make_blobs
    6. from sklearn.model_selection import train_test_split
    7. from sklearn.metrics import classification_report
    8. from plt_decision_function import plot_decision_function
    9. # wm = pd.read_csv("watermelon.csv").dropna()
    10. # X, y = np.asarray(wm.iloc[:, 1:-1]), np.asarray(wm.iloc[:, -1])
    11. # # print(X)
    12. # # print(y)
    13. # nbc = NaiveBayesClassifier(is_binned=True, feature_R_idx=[6, 7], max_bins=10)
    14. # nbc.fit(X, y)
    15. # y_proba = nbc.predict_proba(X)
    16. # print(y_proba)
    17. # y_hat = nbc.predict(X)
    18. # print(y_hat)
    # Synthetic data: 500 two-dimensional points around 4 centres, split 80/20
    # with the class proportions preserved.
    X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=0.85, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y)

    # One entry per subplot: the binned classifier on the left, the Gaussian
    # classifier on the right.
    model_settings = (
        (121, dict(is_binned=True, max_bins=20, is_feature_all_R=True)),
        (122, dict(is_binned=False, feature_R_idx=[0, 1])),
    )

    plt.figure(figsize=(14, 5))
    for panel, settings in model_settings:
        nbc = NaiveBayesClassifier(**settings)
        nbc.fit(X_train, y_train)
        y_pred = nbc.predict(X_test)
        print(classification_report(y_test, y_pred))
        plt.subplot(panel)
        plot_decision_function(X_train, y_train, nbc, is_show=False)
    plt.show()
    35. # al = pd.read_csv("mushroom/agaricus-lepiota.data").dropna()

     

     

  • 相关阅读:
    四级单词大全o-z
    操作系统八股
    一篇文章搞懂MYSQL的脏读、不可重复读、幻读出现的原因以及用事务隔离级别来解决问题详解
    Java并发编程解析 | 解析AQS基础同步器的设计与实现
    查找文本文件的差集
    2024级199管理类联考之逻辑核心基础
    JavaScript 基础知识|值的比较
    未来的应用为什么需要安全沙箱
    专为云原生、微服务架构而设计的链路追踪工具 【SkyWalking介绍及搭建】
    第30届深圳礼品展暨1688工厂直采季开幕,携手创增长
  • 原文地址:https://blog.csdn.net/2302_78896863/article/details/136306772