• N3 Text Classification (AG_NEWS)


    Preface

    In earlier weeks we studied text encoding for natural language; this week we put it into practice with a complete classification pipeline.

    Importing dependencies and setting the device

    import torch
    import torch.nn as nn
    import warnings
    
    warnings.filterwarnings("ignore")  # suppress warnings
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    

    This imports the required libraries and selects the device (GPU if available, otherwise CPU).

    Data preprocessing and vocabulary construction

    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator
    from torchtext.datasets import AG_NEWS
    
    train_iter = AG_NEWS(split='train')
    tokenizer = get_tokenizer('basic_english')  # returns a tokenizer function
    
    def yield_tokens(data_iter):
        for _, text in data_iter:
            yield tokenizer(text)
    
    vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])  # index returned for any token not in the vocabulary
    

    Here torchtext loads the AG_NEWS training split, a basic English tokenizer is defined, and the vocabulary is built from the tokenized corpus. Note that train_iter is a one-shot iterator: building the vocabulary consumes it, so it must be re-created before being iterated again.
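
    As a quick sanity check, the vocabulary can be queried directly (the exact ids below are illustrative; they depend on the corpus scan):

    print(vocab(['here', 'is', 'an', 'example']))  # e.g. [475, 21, 30, 5297]
    print(vocab['some-unseen-token'])              # 0, the default <unk> index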

    Data processing pipelines

    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1
    
    text_pipeline('here is an example')
    

    Two pipelines are defined: text_pipeline maps raw text to a sequence of vocabulary indices, and label_pipeline maps the AG_NEWS labels (1 to 4) to zero-based class indices (0 to 3).
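
    For example (token ids again illustrative):

    print(text_pipeline('here is an example'))  # e.g. [475, 21, 30, 5297]
    print(label_pipeline('3'))                  # 2  (AG_NEWS label 3 -> class index 2)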

    Defining the data loader

    from torch.utils.data import DataLoader
    
    def collate_batch(batch):
        label_list, text_list, offsets = [], [], [0]
    
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
            text_list.append(processed_text)
            offsets.append(processed_text.size(0))
    
        label_list = torch.tensor(label_list, dtype=torch.int64)
        text_list = torch.cat(text_list)  # flatten all texts in the batch into one index tensor
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)  # cumulative sum gives the start position of each text
    
        return label_list.to(device), text_list.to(device), offsets.to(device)
    
    train_iter = AG_NEWS(split='train')  # re-create: the earlier iterator was consumed while building the vocabulary
    dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
    

    collate_batch assembles a batch: labels become a tensor, the variable-length texts are flattened into a single index tensor, and offsets records where each text starts. This is exactly the input format nn.EmbeddingBag expects, so no padding is needed.
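
    A toy illustration of the text/offsets layout that nn.EmbeddingBag consumes (the token ids here are made up):

    # Two texts of lengths 3 and 2, flattened into one index tensor
    toy_text = torch.tensor([4, 8, 15, 16, 23])
    toy_offsets = torch.tensor([0, 3])  # text 0 starts at position 0, text 1 at position 3
    toy_bag = nn.EmbeddingBag(num_embeddings=30, embedding_dim=4)
    print(toy_bag(toy_text, toy_offsets).shape)  # torch.Size([2, 4]): one pooled vector per text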

    Defining the model

    class TextClassificationModel(nn.Module):
    
        def __init__(self, vocab_size, embed_dim, num_class):
            super(TextClassificationModel, self).__init__()
            self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
            self.fc = nn.Linear(embed_dim, num_class)
            self.init_weights()
    
        def init_weights(self):
            initrange = 0.5
            self.embedding.weight.data.uniform_(-initrange, initrange)
            self.fc.weight.data.uniform_(-initrange, initrange)
            self.fc.bias.data.zero_()
    
        def forward(self, text, offsets):
            embedded = self.embedding(text, offsets)  # one pooled (mean) embedding per text
            return self.fc(embedded)
    
    train_iter = AG_NEWS(split='train')  # fresh iterator, since iterating consumes it
    num_class = len(set([label for (label, text) in train_iter]))  # AG_NEWS has 4 classes
    vocab_size = len(vocab)
    em_size = 64
    model = TextClassificationModel(vocab_size, em_size, num_class).to(device)
    

    TextClassificationModel consists of an nn.EmbeddingBag layer, which looks up and mean-pools the embeddings of each text in a single step, followed by one linear layer that maps the pooled embedding to class logits. init_weights initializes the weights uniformly in [-0.5, 0.5] and zeroes the bias.
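
    A quick shape check with a dummy batch (the ids and offsets below are arbitrary):

    with torch.no_grad():
        dummy_text = torch.tensor([1, 2, 3, 4, 5], device=device)
        dummy_offsets = torch.tensor([0, 2], device=device)  # two texts: lengths 2 and 3
        print(model(dummy_text, dummy_offsets).shape)  # torch.Size([2, 4]): one logit per class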

    Training and evaluation functions

    import time
    
    def train(dataloader):
        model.train()  # switch to training mode
        total_acc, train_loss, total_count = 0, 0, 0
        log_interval = 500
        start_time = time.time()
    
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            optimizer.zero_grad()                     # reset gradients
            loss = criterion(predicted_label, label)  # loss between predictions and true labels
            loss.backward()                           # backpropagate
            optimizer.step()                          # update parameters
    
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            train_loss += loss.item()
            total_count += label.size(0)
    
            if idx % log_interval == 0 and idx > 0:
                print('| epoch {:1d} | {:4d}/{:4d} batches '
                      '| train_acc {:4.3f} train_loss {:4.5f}'.format(epoch, idx, len(dataloader),
                                                                      total_acc / total_count, train_loss / total_count))
                total_acc, train_loss, total_count = 0, 0, 0
                start_time = time.time()
    
    def evaluate(dataloader):
        model.eval()  # switch to evaluation mode
        total_acc, total_loss, total_count = 0, 0, 0
    
        with torch.no_grad():
            for idx, (label, text, offsets) in enumerate(dataloader):
                predicted_label = model(text, offsets)
                loss = criterion(predicted_label, label)
                total_acc += (predicted_label.argmax(1) == label).sum().item()
                total_loss += loss.item()
                total_count += label.size(0)
    
        return total_acc / total_count, total_loss / total_count
    

    These functions train the model for one pass over the data and evaluate it: train logs the running accuracy and loss every 500 batches, while evaluate returns the accuracy and average loss over the whole loader.
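
    One optional hardening step: with the large learning rate configured below (LR = 5), the upstream PyTorch/torchtext tutorial clips gradients right after loss.backward(); adding the same single line inside train() is a reasonable safeguard:

    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # cap the gradient norm for stability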

    Dataset splitting and data loader creation

    from torch.utils.data.dataset import random_split
    from torchtext.data.functional import to_map_style_dataset
    
    EPOCHS = 10      # number of epochs
    LR = 5           # learning rate
    BATCH_SIZE = 64  # batch size for training
    
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    total_accu = None
    
    train_iter, test_iter = AG_NEWS()  # load both splits
    train_dataset = to_map_style_dataset(train_iter)
    test_dataset = to_map_style_dataset(test_iter)
    num_train = int(len(train_dataset) * 0.95)
    
    split_train_, split_valid_ = random_split(train_dataset,
                                              [num_train, len(train_dataset) - num_train])
    
    train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                                  shuffle=True, collate_fn=collate_batch)
    test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                 shuffle=True, collate_fn=collate_batch)
    

    to_map_style_dataset converts the streaming iterators into map-style (random-access) datasets so they can be split and shuffled; the training data is then split 95/5 into training and validation sets, and a DataLoader is created for each split.
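
    A quick size check (AG_NEWS ships 120,000 training and 7,600 test samples, so the 95/5 split should yield 114,000/6,000):

    print(len(split_train_), len(split_valid_), len(test_dataset))  # 114000 6000 7600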

    Training and validating the model

    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train(train_dataloader)
        val_acc, val_loss = evaluate(valid_dataloader)
    
        if total_accu is not None and total_accu > val_acc:
            scheduler.step()
        else:
            total_accu = val_acc
        print('-' * 69)
        print('| epoch {:1d} | time: {:4.2f}s | '
              'valid_acc {:4.3f} valid_loss {:4.3f}'.format(epoch,
                                                            time.time() - epoch_start_time,
                                                            val_acc, val_loss))
        print('-' * 69)
    

    The main loop trains for EPOCHS epochs and prints validation accuracy and loss after each one. Whenever validation accuracy fails to improve on the best value seen so far, scheduler.step() decays the learning rate by the factor gamma=0.1.
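
    To watch the decay happen, the current learning rate can be read from the optimizer (a small debugging aid, not part of the original script):

    print('current lr:', optimizer.param_groups[0]['lr'])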

    Testing the model

    print('Checking the results of test dataset.')
    test_acc, test_loss = evaluate(test_dataloader)
    print('test accuracy {:8.3f}'.format(test_acc))
    

    Finally, the model is evaluated on the held-out test set and the test accuracy is printed.
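
    As a follow-up, here is a hedged sketch of single-sample inference; ag_news_label and predict are helpers introduced here for illustration (the label names follow the AG_NEWS convention) and are not part of the original post:

    ag_news_label = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tec"}

    def predict(text):
        with torch.no_grad():
            tokens = torch.tensor(text_pipeline(text), dtype=torch.int64, device=device)
            offsets = torch.tensor([0], device=device)
            output = model(tokens, offsets)
            return output.argmax(1).item() + 1  # map class index 0-3 back to label 1-4

    print(ag_news_label[predict("Wall St. rebounds as tech shares rally.")])  # e.g. Business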

    Results

    (The original post shows a screenshot of the training log here; the final run reaches about 90.1% test accuracy.)

    Summary

    This case study implements a complete text classification workflow, from data preprocessing and model definition through training and evaluation. Using torchtext for data loading and PyTorch to build and train the model, it classifies the AG_NEWS dataset and reaches 90.1% accuracy.

  • Original article: https://blog.csdn.net/tjl521314_21/article/details/139630349