【Natural Language Processing (NLP) in Practice】Chinese Text Sentiment Analysis with an LSTM Network (A Super-Detailed Hands-On Tutorial)


    Table of Contents

    Introduction:

    1. Project files overview:

    1. Source of the Chinese stopword list (hit_stopwords.txt):

    2. The data directory, extracted from chinese_text_cnn-master.zip (open the link on GitHub, then click Code → Download ZIP):

    2. Installing the dependencies:

    3. Data preprocessing (data_set.py):

    train.txt: the training set after stopword removal:

    test.txt: the test set after stopword removal:

    4. Model training and saving (main.py)

    1. Building the LSTM model:

    2. main.py full code listing:

    3. Saving the model

    4. Training results

    5. Testing the LSTM model (test.py)

    1. Test result:

    2. Test result:

    6. Complete code:

    1. data_set.py

    2. main.py

    3. test.py


    Introduction:

    In today's digital era, people produce enormous amounts of text on social media, review platforms, and other online channels. This text carries rich sentiment information, making it a valuable resource for understanding user attitudes, market trends, and even public mood. Advances in natural language processing (NLP) give us powerful tools for analyzing the sentiment expressed in text. Within this field, the long short-term memory network (LSTM), thanks to its ability to capture long-range dependencies in text sequences, has become an important technique for sentiment analysis tasks.

    This post walks you step by step through implementing Chinese text sentiment analysis with an LSTM network. Starting from data preprocessing, we will build an end-to-end sentiment analysis model; the detailed steps and example code show how to process Chinese text, construct the LSTM model, and train and evaluate it.

    1. Project files overview:

    1. Source of the Chinese stopword list (hit_stopwords.txt):

    Project directory preview - stopwords - GitCode

    2. The data directory contains the files extracted from chinese_text_cnn-master.zip. Open the link on GitHub, then click Code → Download ZIP to download it.

    2. Installing the dependencies:

    pip install torch    # to build the LSTM model
    pip install gensim   # to train word vectors for the Chinese text
    pip install numpy    # data cleaning and preprocessing
    pip install pandas   # to read the TSV data files
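
    A quick way to verify that everything installed correctly (a minimal sketch; the exact versions printed depend on your environment):

    import torch, gensim, numpy, pandas
    print(torch.__version__, gensim.__version__, numpy.__version__, pandas.__version__)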

    3. Data preprocessing (data_set.py):

    # -*- coding: utf-8 -*-
    # @Time : 2023/11/15 10:52
    # @Author : Muzi
    # @File : data_set.py
    # @Software: PyCharm
    import pandas as pd
    import jieba

    # Read the TSV data: the last column is the text, the second column the label
    def load_tsv(file_path):
        data = pd.read_csv(file_path, sep='\t')
        data_x = data.iloc[:, -1]
        data_y = data.iloc[:, 1]
        return data_x, data_y

    train_x, train_y = load_tsv("./data/train.tsv")
    test_x, test_y = load_tsv("./data/test.tsv")

    # Tokenize with jieba
    train_x = [list(jieba.cut(x)) for x in train_x]
    test_x = [list(jieba.cut(x)) for x in test_x]

    # Load the stopword list
    with open('./hit_stopwords.txt', 'r', encoding='UTF8') as f:
        stop_words = [word.strip() for word in f.readlines()]
    print('Successfully loaded stopwords')

    # Drop stopwords by filtering into new lists
    # (calling list.remove() while iterating over the same list skips elements)
    def drop_stopword(datas):
        return [[word for word in data if word not in stop_words] for data in datas]

    def save_data(datax, path):
        with open(path, 'w', encoding="UTF8") as f:
            for lines in datax:
                for i, line in enumerate(lines):
                    f.write(str(line))
                    # Add a comma unless this is the last token of the line
                    if i != len(lines) - 1:
                        f.write(',')
                f.write('\n')

    if __name__ == '__main__':
        train_x = drop_stopword(train_x)
        test_x = drop_stopword(test_x)
        save_data(train_x, './train.txt')
        save_data(test_x, './test.txt')
        print('Successfully saved train.txt and test.txt')

    train.txt: the training set after stopword removal:

    test.txt: the test set after stopword removal:
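
    To check the output, you can preview the first few lines of the generated files; a minimal sketch (assuming data_set.py has already been run in the current directory):

    with open('./train.txt', encoding='utf-8') as f:
        for _ in range(3):
            print(f.readline().strip())  # each line: the comma-separated tokens of one review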

    4. Model training and saving (main.py)

    1. Building the LSTM model:

    Different datasets come with different label schemes; the dataset used here is a binary classification problem (positive vs. negative sentiment).

    # Define the LSTM model
    class LSTMModel(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(LSTMModel, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            output = self.fc(lstm_out[:, -1, :])  # take the output of the last time step
            return output

    # Instantiate the model
    input_size = word2vec_model.vector_size
    hidden_size = 50   # adjust the hidden size as needed
    output_size = 2    # number of output classes; depends on your task
    model = LSTMModel(input_size, hidden_size, output_size)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)
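
    A quick way to convince yourself of the tensor shapes (a minimal sketch with made-up numbers, reusing the class defined above: a batch of 4 "sequences" of length 1, each step a 100-dimensional Word2Vec vector):

    import torch
    dummy = torch.randn(4, 1, 100)             # (batch, seq_len, input_size)
    print(LSTMModel(100, 50, 2)(dummy).shape)  # torch.Size([4, 2]): one pair of logits per sample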

    2. main.py full code listing:

    # -*- coding: utf-8 -*-
    # @Time : 2023/11/13 20:31
    # @Author : Muzi
    # @File : main.py
    # @Software: PyCharm
    import pandas as pd
    import torch
    from torch import nn
    import jieba
    from gensim.models import Word2Vec
    import numpy as np
    from data_set import load_tsv
    from torch.utils.data import DataLoader, TensorDataset

    # Read the preprocessed text files (one review per line)
    def load_txt(path):
        with open(path, 'r', encoding='utf-8') as f:
            data = [[line.strip()] for line in f.readlines()]
        return data

    train_x = load_txt('train.txt')
    test_x = load_txt('test.txt')
    train = train_x + test_x
    X_all = [i for x in train for i in x]

    # The labels come from the original TSV files
    _, train_y = load_tsv("./data/train.tsv")
    _, test_y = load_tsv("./data/test.tsv")

    # Train the Word2Vec model
    word2vec_model = Word2Vec(sentences=X_all, vector_size=100, window=5, min_count=1, workers=4)

    # Convert a text to a Word2Vec representation by averaging its word vectors
    def text_to_vector(text):
        vector = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
        return sum(vector) / len(vector) if vector else [0] * word2vec_model.vector_size

    X_train_w2v = [[text_to_vector(text)] for line in train_x for text in line]
    X_test_w2v = [[text_to_vector(text)] for line in test_x for text in line]

    # Convert the word vectors to PyTorch tensors
    X_train_array = np.array(X_train_w2v, dtype=np.float32)
    X_train_tensor = torch.Tensor(X_train_array)
    X_test_array = np.array(X_test_w2v, dtype=np.float32)
    X_test_tensor = torch.Tensor(X_test_array)

    # Batch the data with DataLoader
    train_dataset = TensorDataset(X_train_tensor, torch.LongTensor(train_y))
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_dataset = TensorDataset(X_test_tensor, torch.LongTensor(test_y))
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

    # Define the LSTM model
    class LSTMModel(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(LSTMModel, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            output = self.fc(lstm_out[:, -1, :])  # take the output of the last time step
            return output

    # Instantiate the model
    input_size = word2vec_model.vector_size
    hidden_size = 50   # adjust the hidden size as needed
    output_size = 2    # number of output classes
    model = LSTMModel(input_size, hidden_size, output_size)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)

    if __name__ == "__main__":
        # Train the model
        num_epochs = 10
        log_interval = 100  # log every 100 batches
        loss_min = 100
        for epoch in range(num_epochs):
            model.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                outputs = model(data)
                loss = criterion(outputs, target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if batch_idx % log_interval == 0:
                    print('Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format(
                        epoch + 1, num_epochs, batch_idx, len(train_loader), loss.item()))
                # Save the best model (lowest batch loss seen so far)
                if loss.item() < loss_min:
                    loss_min = loss.item()
                    torch.save(model, 'model.pth')

        # Evaluate the model
        with torch.no_grad():
            model.eval()
            correct = 0
            total = 0
            for data, target in test_loader:
                outputs = model(data)
                _, predicted = torch.max(outputs.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()
            accuracy = correct / total
            print('Test Accuracy: {:.2%}'.format(accuracy))
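
    Note that the evaluation above scores the model as it stands after the final training step, not the best checkpoint written to model.pth. A minimal sketch for evaluating the saved checkpoint instead (assuming training has finished and LSTMModel is defined in scope):

    best_model = torch.load('model.pth')  # unpickling needs the LSTMModel class definition
    best_model.eval()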

    3. Saving the model

    # Save the best model (lowest batch loss seen so far)
    if loss.item() < loss_min:
        loss_min = loss.item()
        torch.save(model, 'model.pth')
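
    torch.save(model, ...) pickles the entire module, which ties model.pth to this exact class definition; that is why test.py below has to redefine LSTMModel before loading. As an alternative, a sketch using the state_dict style (a suggestion, not part of the original code):

    # Save only the parameters
    torch.save(model.state_dict(), 'model_state.pth')
    # Later, e.g. in test.py:
    model = LSTMModel(input_size, hidden_size, output_size)
    model.load_state_dict(torch.load('model_state.pth'))
    model.eval()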

    4. Training results

    5. Testing the LSTM model (test.py)

    # -*- coding: utf-8 -*-
    # @Time : 2023/11/15 15:53
    # @Author : Muzi
    # @File : test.py
    # @Software: PyCharm
    import torch
    import jieba
    from torch import nn
    from gensim.models import Word2Vec
    import numpy as np

    # The model class must be defined here so torch.load can unpickle model.pth
    class LSTMModel(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(LSTMModel, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            output = self.fc(lstm_out[:, -1, :])  # take the output of the last time step
            return output

    # Read the preprocessed text files
    def load_txt(path):
        with open(path, 'r', encoding='utf-8') as f:
            data = [[line.strip()] for line in f.readlines()]
        return data

    # Remove stopwords from a token list
    def drop_stopword(datas):
        with open('./hit_stopwords.txt', 'r', encoding='UTF8') as f:
            stop_words = [word.strip() for word in f.readlines()]
        datas = [x for x in datas if x not in stop_words]
        return datas

    # Tokenize and clean the input text
    def preprocess_text(text):
        text = list(jieba.cut(text))
        text = drop_stopword(text)
        return text

    # Convert a text to a Word2Vec representation by averaging its word vectors.
    # Note: this retrains Word2Vec on every call; training it once and reusing
    # the model (or saving it from main.py) would be faster and more consistent.
    def text_to_vector(text):
        train_x = load_txt('train.txt')
        test_x = load_txt('test.txt')
        train = train_x + test_x
        X_all = [i for x in train for i in x]
        word2vec_model = Word2Vec(sentences=X_all, vector_size=100, window=5, min_count=1, workers=4)
        vector = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
        return sum(vector) / len(vector) if vector else [0] * word2vec_model.vector_size

    if __name__ == '__main__':
        # input_text = "这个车完全就是垃圾,又热又耗油"
        input_text = "这个车我开了好几年,还是不错的"
        label = {1: "正面情绪", 0: "负面情绪"}  # 1 = positive, 0 = negative
        model = torch.load('model.pth')
        # Preprocess the input text
        input_data = preprocess_text(input_text)
        # Make sure the input vector matches the model's input size and dtype
        input_data = [[text_to_vector(input_data)]]
        input_array = np.array(input_data, dtype=np.float32)
        input_tensor = torch.Tensor(input_array)
        # Feed the input to the model
        with torch.no_grad():
            output = model(input_tensor)
        predicted_class = label[torch.argmax(output).item()]
        print(f"predicted_text: {input_text}")
        print(f"Predicted class: {predicted_class}")

    1. Test result:

    2. Test result:

    6. Complete code:

    1. data_set.py

    import pandas as pd
    import jieba

    # Read the TSV data
    def load_tsv(file_path):
        data = pd.read_csv(file_path, sep='\t')
        data_x = data.iloc[:, -1]
        data_y = data.iloc[:, 1]
        return data_x, data_y

    # Load the stopword list
    with open('./hit_stopwords.txt', 'r', encoding='UTF8') as f:
        stop_words = [word.strip() for word in f.readlines()]
    print('Successfully loaded stopwords')

    # Drop stopwords by filtering into new lists
    # (calling list.remove() while iterating over the same list skips elements)
    def drop_stopword(datas):
        return [[word for word in data if word not in stop_words] for data in datas]

    def save_data(datax, path):
        with open(path, 'w', encoding="UTF8") as f:
            for lines in datax:
                for i, line in enumerate(lines):
                    f.write(str(line))
                    # Add a comma unless this is the last token of the line
                    if i != len(lines) - 1:
                        f.write(',')
                f.write('\n')

    if __name__ == '__main__':
        train_x, train_y = load_tsv("./data/train.tsv")
        test_x, test_y = load_tsv("./data/test.tsv")
        train_x = [list(jieba.cut(x)) for x in train_x]
        test_x = [list(jieba.cut(x)) for x in test_x]
        train_x = drop_stopword(train_x)
        test_x = drop_stopword(test_x)
        save_data(train_x, './train.txt')
        save_data(test_x, './test.txt')
        print('Successfully saved train.txt and test.txt')

    2. main.py

    import pandas as pd
    import torch
    from torch import nn
    import jieba
    from gensim.models import Word2Vec
    import numpy as np
    from data_set import load_tsv
    from torch.utils.data import DataLoader, TensorDataset

    # Read the preprocessed text files (one review per line)
    def load_txt(path):
        with open(path, 'r', encoding='utf-8') as f:
            data = [[line.strip()] for line in f.readlines()]
        return data

    train_x = load_txt('train.txt')
    test_x = load_txt('test.txt')
    train = train_x + test_x
    X_all = [i for x in train for i in x]

    # The labels come from the original TSV files
    _, train_y = load_tsv("./data/train.tsv")
    _, test_y = load_tsv("./data/test.tsv")

    # Train the Word2Vec model
    word2vec_model = Word2Vec(sentences=X_all, vector_size=100, window=5, min_count=1, workers=4)

    # Convert a text to a Word2Vec representation by averaging its word vectors
    def text_to_vector(text):
        vector = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
        return sum(vector) / len(vector) if vector else [0] * word2vec_model.vector_size

    X_train_w2v = [[text_to_vector(text)] for line in train_x for text in line]
    X_test_w2v = [[text_to_vector(text)] for line in test_x for text in line]

    # Convert the word vectors to PyTorch tensors
    X_train_array = np.array(X_train_w2v, dtype=np.float32)
    X_train_tensor = torch.Tensor(X_train_array)
    X_test_array = np.array(X_test_w2v, dtype=np.float32)
    X_test_tensor = torch.Tensor(X_test_array)

    # Batch the data with DataLoader
    train_dataset = TensorDataset(X_train_tensor, torch.LongTensor(train_y))
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_dataset = TensorDataset(X_test_tensor, torch.LongTensor(test_y))
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

    # Define the LSTM model
    class LSTMModel(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(LSTMModel, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            output = self.fc(lstm_out[:, -1, :])  # take the output of the last time step
            return output

    # Instantiate the model
    input_size = word2vec_model.vector_size
    hidden_size = 50   # adjust the hidden size as needed
    output_size = 2    # number of output classes
    model = LSTMModel(input_size, hidden_size, output_size)

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()  # cross-entropy loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)

    if __name__ == "__main__":
        # Train the model
        num_epochs = 10
        log_interval = 100  # log every 100 batches
        loss_min = 100
        for epoch in range(num_epochs):
            model.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                outputs = model(data)
                loss = criterion(outputs, target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if batch_idx % log_interval == 0:
                    print('Epoch [{}/{}], Batch [{}/{}], Loss: {:.4f}'.format(
                        epoch + 1, num_epochs, batch_idx, len(train_loader), loss.item()))
                # Save the best model (lowest batch loss seen so far)
                if loss.item() < loss_min:
                    loss_min = loss.item()
                    torch.save(model, 'model.pth')

        # Evaluate the model
        with torch.no_grad():
            model.eval()
            correct = 0
            total = 0
            for data, target in test_loader:
                outputs = model(data)
                _, predicted = torch.max(outputs.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()
            accuracy = correct / total
            print('Test Accuracy: {:.2%}'.format(accuracy))

    3. test.py

    import torch
    import jieba
    from torch import nn
    from gensim.models import Word2Vec
    import numpy as np

    # The model class must be defined here so torch.load can unpickle model.pth
    class LSTMModel(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(LSTMModel, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            output = self.fc(lstm_out[:, -1, :])  # take the output of the last time step
            return output

    # Read the preprocessed text files
    def load_txt(path):
        with open(path, 'r', encoding='utf-8') as f:
            data = [[line.strip()] for line in f.readlines()]
        return data

    # Remove stopwords from a token list
    def drop_stopword(datas):
        with open('./hit_stopwords.txt', 'r', encoding='UTF8') as f:
            stop_words = [word.strip() for word in f.readlines()]
        datas = [x for x in datas if x not in stop_words]
        return datas

    # Tokenize and clean the input text
    def preprocess_text(text):
        text = list(jieba.cut(text))
        text = drop_stopword(text)
        return text

    # Convert a text to a Word2Vec representation by averaging its word vectors.
    # Note: this retrains Word2Vec on every call; training it once and reusing
    # the model (or saving it from main.py) would be faster and more consistent.
    def text_to_vector(text):
        train_x = load_txt('train.txt')
        test_x = load_txt('test.txt')
        train = train_x + test_x
        X_all = [i for x in train for i in x]
        word2vec_model = Word2Vec(sentences=X_all, vector_size=100, window=5, min_count=1, workers=4)
        vector = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
        return sum(vector) / len(vector) if vector else [0] * word2vec_model.vector_size

    if __name__ == '__main__':
        input_text = "这个车完全就是垃圾,又热又耗油"
        # input_text = "这个车我开了好几年,还是不错的"
        label = {1: "正面情绪", 0: "负面情绪"}  # 1 = positive, 0 = negative
        model = torch.load('model.pth')
        # Preprocess the input text
        input_data = preprocess_text(input_text)
        # Make sure the input vector matches the model's input size and dtype
        input_data = [[text_to_vector(input_data)]]
        input_array = np.array(input_data, dtype=np.float32)
        input_tensor = torch.Tensor(input_array)
        # Feed the input to the model
        with torch.no_grad():
            output = model(input_tensor)
        # This is just a simple example
        predicted_class = label[torch.argmax(output).item()]
        print(f"predicted_text: {input_text}")
        print(f"Predicted class: {predicted_class}")

Original post: https://blog.csdn.net/m0_74053536/article/details/134379831