• 自然语言处理实验2 字符级RNN分类实验


    实验2 字符级RNN分类实验

    必做题:

    (1)数据准备:academy_titles.txt为“考硕考博”板块的帖子标题,job_titles.txt为“招聘信息”板块的帖子标题,将上述两个txt进行划分,其中训练集为70%,测试集为30%。二分类标签:考硕考博为0,招聘信息为1。字符使用One-hot方法表示。

    (2)设计模型:在训练集上训练字符级RNN模型。注意,字符级不用分词,是将文本的每个字依次送入模型。

    (3)将训练好的模型在测试数据集上进行验证,计算准确率,并分析实验结果。要给出每一部分的代码。

    import copy

    import numpy as np
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from sklearn.model_selection import train_test_split
    # Load the post titles of both boards, one title per line. Blank lines are
    # dropped so they do not turn into empty (all-padding) labelled samples.
    with open('C:\\Users\\hp\\Desktop\\academy_titles.txt', 'r', encoding='utf-8') as file:
        academy_titles = [line.strip() for line in file if line.strip()]
    with open('C:\\Users\\hp\\Desktop\\job_titles.txt', 'r', encoding='utf-8') as file:
        job_titles = [line.strip() for line in file if line.strip()]

    # Samples and binary labels: 0 = academy board, 1 = job board.
    X = academy_titles + job_titles
    y = [0] * len(academy_titles) + [1] * len(job_titles)

    # 70% / 30% train/test split. Stratifying on the labels keeps the class
    # ratio identical in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Character vocabulary. Set iteration order for strings changes between
    # interpreter runs, so the characters are sorted before indices are
    # assigned; this keeps the one-hot layout reproducible.
    all_chars = set(''.join(X))
    char_to_index = {char: i for i, char in enumerate(sorted(all_chars))}
    def text_to_input(text, max_len, char_to_index):
        """Encode titles as a zero-padded one-hot float tensor.

        Args:
            text: Sequence of title strings.
            max_len: Number of time steps per title. Shorter titles are
                right-padded with all-zero rows; longer titles are truncated.
            char_to_index: Mapping from character to its one-hot column.

        Returns:
            A float32 tensor of shape (len(text), max_len, len(char_to_index)).
            A character missing from ``char_to_index`` leaves an all-zero row
            at its position instead of raising ``KeyError``.
        """
        X_indices = np.zeros((len(text), max_len, len(char_to_index)), dtype=np.float32)
        for i, title in enumerate(text):
            # Slice first so an over-long title cannot index past max_len.
            for t, char in enumerate(title[:max_len]):
                index = char_to_index.get(char)
                if index is not None:
                    X_indices[i, t, index] = 1
        # from_numpy shares the buffer, avoiding a second copy of a large array.
        return torch.from_numpy(X_indices)
    # Pad every title to the length of the longest title in the whole corpus,
    # then one-hot encode the training and test splits with the same settings.
    max_len = max(map(len, X))
    X_train_indices, X_test_indices = (
        text_to_input(split, max_len, char_to_index) for split in (X_train, X_test)
    )
    class CharRNN(nn.Module):
        """Character-level LSTM binary classifier.

        The input is a batch of one-hot encoded, zero-padded character
        sequences of shape (batch, seq_len, input_size). The output is one
        sigmoid probability per sample, shape (batch, 1).
        """

        def __init__(self, input_size, hidden_size):
            """Build the network.

            Args:
                input_size: Size of each one-hot character vector.
                hidden_size: Number of LSTM hidden units.
            """
            super().__init__()
            self.hidden_size = hidden_size
            # batch_first=True is required because the data is laid out as
            # (batch, seq_len, input_size). With the default layout the LSTM
            # would treat the sample axis as time and mix all titles together.
            self.i2h = nn.LSTM(input_size, hidden_size, batch_first=True)
            self.fc = nn.Linear(hidden_size, 1)
            self.sigmoid = nn.Sigmoid()

        def forward(self, input, lengths=None):
            """Return the class-1 probability for every sequence in the batch.

            Args:
                input: Float tensor of shape (batch, seq_len, input_size).
                lengths: Optional true length of every sequence (tensor or
                    list). When omitted, the length is taken as the position
                    of the last non-zero time step, which equals the title
                    length for zero-padded one-hot input.

            Returns:
                Tensor of shape (batch, 1) with values in (0, 1).
            """
            hidden, _ = self.i2h(input)
            if lengths is None:
                positions = torch.arange(1, input.size(1) + 1, device=input.device)
                non_zero = input.abs().sum(dim=2) > 0
                lengths = (non_zero.long() * positions).max(dim=1).values
            else:
                lengths = torch.as_tensor(lengths, device=input.device).long()
            # Classify from the state after the last real character rather
            # than after the padding. All-padding samples fall back to step 0.
            last_step = (lengths - 1).clamp(min=0)
            batch_index = torch.arange(input.size(0), device=input.device)
            output = self.fc(hidden[batch_index, last_step])
            return self.sigmoid(output)
    model = CharRNN(input_size=len(char_to_index), hidden_size=128)

    # Training hyperparameters.
    num_epochs = 30
    learning_rate = 0.01

    # Binary cross-entropy on the probability returned by the model, optimised
    # with Adam. Only one optimizer is created: the script used to build a
    # first one with lr=0.001 that was immediately replaced and never used.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Labels as float column vectors of shape (num_samples, 1), as BCELoss
    # needs float targets with the same shape as the predictions.
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

    # Best training accuracy seen so far and the matching weights snapshot.
    best_accuracy = 0.0
    best_model = None
    # Full-batch training: each epoch is a single gradient step on the whole
    # training set. The training accuracy is printed every epoch and the
    # weights with the highest training accuracy are remembered.
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        output = model(X_train_indices).view(-1, 1)
        # NOTE(review): this slice only has an effect when the model returns
        # fewer rows than there are labels. That would indicate a batch/time
        # axis mix-up in the model and should be investigated, not relied on.
        targets = y_train_tensor[:output.size(0)]
        loss = criterion(output, targets)
        loss.backward()

        # Training accuracy of the weights that produced `output`.
        predictions = (output > 0.5).float()
        correct = (predictions == targets).float()
        accuracy = correct.sum() / len(correct)
        print(f'Epoch {epoch+1}, 训练集准确率: {accuracy.item()}')

        # Snapshot before optimizer.step() so the saved weights are exactly
        # the ones that achieved this accuracy. deepcopy is required because
        # state_dict().copy() is shallow: its tensors alias the live
        # parameters and would silently follow every later update.
        if accuracy.item() > best_accuracy:
            best_accuracy = accuracy.item()
            best_model = copy.deepcopy(model.state_dict())

        optimizer.step()
    # Restore the weights with the highest training accuracy. If no snapshot
    # was taken, the current weights are evaluated instead of crashing on
    # load_state_dict(None).
    if best_model is not None:
        model.load_state_dict(best_model)

    # Evaluate on the held-out test set. No gradients are needed here, so
    # autograd bookkeeping is switched off.
    model.eval()
    with torch.no_grad():
        test_output = model(X_test_indices).view(-1, 1)
        test_targets = y_test_tensor[:test_output.size(0)]
        test_loss = criterion(test_output, test_targets)
    predictions = (test_output > 0.5).float()
    correct = (predictions == test_targets).float()
    accuracy = correct.sum() / len(correct)
    # The model was selected on training accuracy, not on the test set, so
    # the message says so.
    print(f'使用训练集上准确率最高的模型进行测试,测试集准确率: {accuracy.item()}')

    这个实验目前的准确率偏低,但我没有太多时间继续调整参数。

    希望后面有需要的同学可以在此基础上继续调参!

  • 相关阅读:
    c# 多线程处理
    怎样去除DataFrame字段列名
    Ubuntu 常用命令
    springboot整合Redis报错:java.io.IOException 远程主机强迫关闭了一个现有的连接
    插件_创蓝图文滑动验证码
    【uniapp】实现对TXT文本文件的读取和保存下载
    虚拟化特性
    python web服务windows管理工具
    Access注入 - Cookie注入
    Selenium01
  • 原文地址:https://blog.csdn.net/m0_62581697/article/details/136722475