机器学习垃圾邮件分类

关注码龄粉丝数原力等级 -- 被采纳被点赞采纳率大力力力力力力力力出奇迹 2024-06-09 12:40 采纳率: 33.3% 浏览 0 首页/ 人工智能 / 机器学习垃圾邮件分类机器学习 import os import re import string import math import numpy as np from collections import defaultdict from sklearn.model_selection import train_test_split DATA_DIR = r'C:\Users\刘晓丽\Desktop\team7' # 数据集地址 target_names = ['ham', 'spam'] # 正常、垃圾 stopwords = set(open('stopwords.txt', 'r').read().splitlines()) # 加载停用词 def get_data(DATA_DIR): # 假设“未分类邮件”是包含所有邮件的文件夹 mail_folder = '待分类邮件' data = [] target = [] # 获取“未分类邮件”文件夹中的所有文件 all_files = os.listdir(os.path.join(DATA_DIR, mail_folder)) for mail_file in all_files: # 通过文件夹名称判断邮件类型 if 'spam' in mail_file: label = 1 # 垃圾邮件标签为1 else: label = 0 # 正常邮件标签为0 # 打开邮件文件 with open(os.path.join(DATA_DIR, mail_folder, mail_file), encoding="latin-1") as f: data.append(f.read()) target.append(label) return data, target def preprocess(text): text = text.lower() # 转换为小写 text = re.sub(f'[{string.punctuation}]', ' ', text) # 去除标点符号 text = [word for word in text.split() if word not in stopwords] # 去除停用词 return text class NaiveBayesClassifier(): def __init__(self): self.vocabulary = set() # 词汇表 self.class_total = defaultdict(int) # 每个类别的文档数 self.word_total = defaultdict(int) # 每个类别中所有单词出现次数之和 self.word_given_class = defaultdict(lambda: defaultdict(int)) # 每个类别中每个单词出现次数 def fit(self, X, y): for text, label in zip(X, y): words = preprocess(text) self.class_total[label] += 1 for word in words: self.vocabulary.add(word) self.word_given_class[label][word] += 1 self.word_total[label] += 1 def predict(self, X): log_priors = {} for c in self.class_total.keys(): log_priors[c] = math.log(self.class_total[c] / sum(self.class_total.values())) predictions = [] for text in X: words = preprocess(text) log_probs = {} for c in self.class_total.keys(): log_probs[c] = log_priors[c] for word in words: # 对未见过的单词进行平滑处理 log_probs[c] += math.log((self.word_given_class[c][word] + 1) / (self.word_total[c] + len(self.vocabulary))) predictions.append(max(log_probs, 
key=log_probs.get)) return predictions # 加载数据集 X, y = get_data(DATA_DIR) # 划分数据集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 创建实例并训练模型 clf = NaiveBayesClassifier() clf.fit(X_train, y_train) # 对新邮件进行分类 new_email = 'Subject: et & s photo contest - announcing the winners\nCongratulations to the following winners of the 2001 ET & S photo contest. Over 200 entries were submitted! The winning photos will be displayed in the 2001 ET & S public education calendar.' prediction = clf.predict([new_email])[0] # 在测试集上进行预测并计算准确率 predictions = clf.predict(X_test) accuracy = np.sum(np.array(predictions) == np.array(y_test)) / len(y_test) # 创建一个列表来存储每封邮件的预测结果 predicted_emails = [] # 遍历每封测试邮件，并将预测结果添加到列表中 for i, prediction_index in enumerate(predictions): predicted_emails.append((i + 1, target_names[prediction_index])) # 打印预测结果 for email_no, prediction in predicted_emails: print(f'Email {email_no}: Prediction: {prediction}') print(f'Prediction for new email: {target_names[prediction]}') print(f'Accuracy: {accuracy:.2f}') 展开全部收起写回答好问题 0 提建议追加酬金关注问题微信扫一扫点击复制链接分享邀请回答编辑收藏删除结题收藏举报追加酬金 (90%的用户在追加酬金后获得了解决方案) 当前问题酬金 ¥ 0 (可追加 ¥500） ¥ 15¥ 20¥ 50¥ 100¥ 200 支付方式扫码支付二维码出错点击刷新支付金额 15 元提供问题酬金的用户不参与问题酬金结算和分配支付即为同意《付费问题酬金结算规则》结题再想想删除再等等

2条回答默认最新

专家-赤兔[在线] 全栈领域优质创作者 2024-06-09 12:40

关注

引自免费微信小程序：皆我百晓生

为了实现一个基于朴素贝叶斯的垃圾邮件分类器，我已经提供了一个完整的Python脚本。这个脚本分为以下几个部分：数据获取、预处理、定义朴素贝叶斯分类器以及评估。

首先，确保你已经在你的工作目录中有一个名为stopwords.txt的文件，其中包含了停用词。接下来，运行下面的代码，它会执行以下操作：

获取数据：从指定的数据集路径中读取邮件文本和对应的标签。
预处理：对邮件文本进行预处理，包括转换为小写、移除标点符号、去除停用词。
定义朴素贝叶斯分类器：创建一个简单的NaiveBayesClassifier类，用于训练模型和进行预测。
划分数据集：将数据集划分为训练集和测试集。
训练模型：使用训练集对朴素贝叶斯分类器进行训练。
对新邮件进行分类：对给定的新邮件（在这个例子中是一个示例）进行预测。
评估模型：在测试集上进行预测，计算准确率，并输出预测结果。

以下是完整的代码：

import os
import re
import string
import math
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split

DATA_DIR = r'C:\Users\刘晓丽\Desktop\team7'  # root directory of the dataset
target_names = ['ham', 'spam']  # display names indexed by the integer labels 0 and 1

# Load the stop-word list (one word per line, read from the current working
# directory). The context manager closes the file handle right away instead of
# leaving it open until garbage collection.
with open('stopwords.txt', 'r') as _stopwords_file:
    stopwords = set(_stopwords_file.read().splitlines())

def get_data(DATA_DIR):
    """Read every mail file in ``DATA_DIR/待分类邮件`` and label it.

    A file is labelled 1 (spam) when its *file name* contains the substring
    ``'spam'`` and 0 (ham) otherwise.

    Args:
        DATA_DIR: Directory that contains the ``待分类邮件`` mail folder.

    Returns:
        A ``(data, target)`` tuple of two equally long lists: the text of each
        mail and its integer label, ordered by file name.

    Raises:
        OSError: If the mail folder or one of its files cannot be read.
    """
    mail_dir = os.path.join(DATA_DIR, '待分类邮件')
    data = []
    target = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and therefore any seeded train/test split built on it) reproducible.
    for mail_file in sorted(os.listdir(mail_dir)):
        mail_path = os.path.join(mail_dir, mail_file)
        # Skip sub-directories and other non-regular entries: open() would fail.
        if not os.path.isfile(mail_path):
            continue

        label = 1 if 'spam' in mail_file else 0

        # latin-1 maps every byte to a code point, so decoding never raises.
        with open(mail_path, encoding="latin-1") as f:
            data.append(f.read())
        target.append(label)

    return data, target


# string.punctuation contains regex metacharacters (``\``, ``]``, ``^``, ``-``).
# Dropped unescaped into a character class, the sequence ``\]`` is parsed as an
# escaped ``]`` and the backslash itself is never matched, so escape it first.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')


def preprocess(text):
    """Tokenize *text* into lower-case words without punctuation or stop words.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of whitespace-separated tokens, in their original order, with
        every ASCII punctuation character replaced by a space and every token
        found in the module-level ``stopwords`` set removed.
    """
    text = _PUNCTUATION_RE.sub(' ', text.lower())
    return [word for word in text.split() if word not in stopwords]


class NaiveBayesClassifier():
    """Multinomial naive Bayes text classifier with add-one (Laplace) smoothing.

    Documents are tokenized with the module-level ``preprocess`` function.
    Training accumulates per-class document and word counts; prediction picks
    the class with the highest log posterior.
    """

    def __init__(self):
        self.vocabulary = set()  # distinct words seen during training
        self.class_total = defaultdict(int)  # number of documents per class
        self.word_total = defaultdict(int)  # total word occurrences per class
        # occurrence count of each word within each class
        self.word_given_class = defaultdict(lambda: defaultdict(int))

    def fit(self, X, y):
        """Accumulate counts from training documents.

        Calling ``fit`` again adds to the existing counts.

        Args:
            X: Iterable of raw document strings.
            y: Iterable of class labels, aligned with ``X``.
        """
        for text, label in zip(X, y):
            self.class_total[label] += 1
            for word in preprocess(text):
                self.vocabulary.add(word)
                self.word_given_class[label][word] += 1
                self.word_total[label] += 1

    def predict(self, X):
        """Return the most probable class label for every document in ``X``.

        Args:
            X: Iterable of raw document strings.

        Returns:
            A list with one predicted label per document, in input order.

        Raises:
            ValueError: If the classifier has not been fitted yet.
        """
        if not self.class_total:
            raise ValueError('NaiveBayesClassifier must be fitted before calling predict')

        n_docs = sum(self.class_total.values())
        log_priors = {c: math.log(n / n_docs) for c, n in self.class_total.items()}
        vocab_size = len(self.vocabulary)

        predictions = []
        for text in X:
            words = preprocess(text)
            log_probs = {}
            for c, log_prior in log_priors.items():
                # .get() avoids inserting zero entries into the defaultdicts,
                # so predicting never mutates the trained model.
                counts = self.word_given_class.get(c, {})
                denominator = self.word_total.get(c, 0) + vocab_size
                score = log_prior
                # With an empty vocabulary there is no word evidence at all;
                # fall back to the prior instead of dividing by zero.
                if denominator:
                    for word in words:
                        # add-one smoothing keeps unseen words from zeroing the score
                        score += math.log((counts.get(word, 0) + 1) / denominator)
                log_probs[c] = score
            predictions.append(max(log_probs, key=log_probs.get))
        return predictions


# Load the dataset
X, y = get_data(DATA_DIR)

# Hold out 20% of the mails as a test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the classifier and train it
clf = NaiveBayesClassifier()
clf.fit(X_train, y_train)

# Classify a new, unseen mail
new_email = 'Subject: et & s photo contest - announcing the winners\nCongratulations to the following winners of the 2001 ET & S photo contest. Over 200 entries were submitted! The winning photos will be displayed in the 2001 ET & S public education calendar.'
prediction = clf.predict([new_email])[0]

# Predict on the test set and compute the accuracy (share of correct labels)
predictions = clf.predict(X_test)
accuracy = np.mean(np.array(predictions) == np.array(y_test))

# Pair every test mail's 1-based position with its predicted class name
predicted_emails = [
    (email_no, target_names[prediction_index])
    for email_no, prediction_index in enumerate(predictions, start=1)
]

# Print the per-mail results. The loop variable must not be called
# `prediction`: that would overwrite the new-mail label above with a string
# and make target_names[prediction] below raise TypeError.
for email_no, predicted_name in predicted_emails:
    print(f'Email {email_no}: Prediction: {predicted_name}')

print(f'Prediction for new email: {target_names[prediction]}')
print(f'Accuracy: {accuracy:.2f}')

这段代码会根据提供的数据集进行训练，并对新邮件进行分类。你需要替换DATA_DIR变量中的路径为你实际的数据集所在位置，并确保stopwords.txt文件存在且包含有效的停用词。运行代码后，你会看到每封测试邮件的预测结果以及新邮件的分类预测和整个模型的准确率。

展开全部

编辑记录

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
阿里嘎多学长 2024-06-09 12:40
关注
以下内容由CHATGPT及阿里嘎多学长共同生成、有用望采纳：

针对您提供的代码和需求，以下是一些建议和改进措施：

1. 数据预处理优化

文本规范化：除了转换为小写和去除标点符号外，还可以考虑去除数字和特殊字符。
词干提取：使用词干提取（Stemming）或词形还原（Lemmatization）来统一单词的不同形式。
TF-IDF：考虑使用TF-IDF（Term Frequency-Inverse Document Frequency）代替简单的词频统计，以减少常见词的影响。

2. 朴素贝叶斯分类器的实现优化

概率计算：确保先验概率和条件概率的计算是正确的。在您的代码中，条件概率的计算使用了加一平滑（Laplace smoothing），这是一个很好的实践。
模型评估：在确定最终模型之前，可以使用交叉验证（在多个不同的训练/验证划分上重复训练并评估）来衡量模型的稳定性和泛化能力。

3. 代码实现改进

预测函数：在predict函数中，您需要确保对每个类别计算了对数概率，并返回了最高概率的类别。
异常处理：添加异常处理，例如在读取文件或处理数据时捕获并处理可能的错误。

4. 模型测试和评估

准确率计算：您已经计算了准确率，这是一个很好的开始。此外，还可以考虑其他评估指标，如精确率（Precision）、召回率（Recall）和F1分数。
混淆矩阵：使用混淆矩阵来可视化模型性能，特别是在类别不平衡的情况下。

5. 探索改进方法

特征工程：尝试不同的特征提取方法，例如使用n-gram模型来捕捉词序信息。
参数调整：使用网格搜索（Grid Search）或随机搜索（Random Search）来找到最优的模型参数。
其他算法：考虑使用其他机器学习算法，如支持向量机（SVM）、随机森林或梯度提升机（GBM）进行比较。

6. 代码示例

以下是对predict方法的一些改进：

def predict(self, X):
    """Return the most probable class label for every document in ``X``.

    Args:
        X: Iterable of raw document strings.

    Returns:
        A list with one predicted label per document, in input order.
    """
    # Totals that do not change per document are computed once up front.
    n_docs = sum(self.class_total.values())
    log_priors = {c: math.log(c_t / n_docs) for c, c_t in self.class_total.items()}
    vocab_size = len(self.vocabulary)

    predictions = []
    for text in X:
        words = preprocess(text)
        # Start every class at its log prior, then add the word evidence.
        log_probs = dict(log_priors)
        for word in words:
            for c in log_priors:
                # add-one (Laplace) smoothing
                count = self.word_given_class[c][word] + 1
                total = self.word_total[c] + vocab_size
                log_probs[c] += math.log(count / total)
        # pick the class with the highest log probability
        predictions.append(max(log_probs, key=log_probs.get))
    return predictions

7. 参考资料

Scikit-learn官方文档：提供了机器学习算法的实现和使用示例。
Machine Learning Mastery：提供了机器学习入门教程和代码示例。
Towards Data Science：有许多关于机器学习和自然语言处理的文章。

希望这些建议能帮助您改进现有的垃圾邮件分类模型。如果您需要更具体的代码实现或其他帮助，请随时告诉我。
展开全部
解决无用
评论打赏
分享
举报编辑记录
编辑

预览
轻敲空格完成输入
显示为

卡片

标题

链接
评论

按下Enter换行，Ctrl+Enter发表内容