• [Datawhale AI Summer Camp] iFLYTEK "Machine Translation Challenge Based on Terminology Dictionary Intervention"


    Background

    Machine translation has a long history of development, and the mainstream approach today is neural machine translation, e.g. LSTM- and Transformer-based models. In specific domains or industries, however, machine translation struggles to keep terminology consistent, so translation quality is still not ideal. For terms, person names, place names, and other expressions that machine translation tends to get wrong, a terminology dictionary can be used to correct the output, avoiding confusion or ambiguity and maximizing translation quality.

    Task

    The Machine Translation Challenge Based on Terminology Dictionary Intervention uses English as the source language and Chinese as the target language. Besides English-Chinese bilingual data, the competition also provides an English-Chinese terminology dictionary. Participating teams need to build and train a machine translation model on the provided training samples, and then produce the final translations based on the test set and the terminology dictionary.

    Baseline Code Walkthrough

    First, import the required packages:

    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import Dataset, DataLoader, Subset
    from torchtext.data.utils import get_tokenizer
    from collections import Counter
    import random
    import time

    Next, define the dataset class and the Encoder, Decoder, and Seq2Seq classes:

    # Define the dataset class
    # TranslationDataset is modified to handle terminology terms
    class TranslationDataset(Dataset):
        def __init__(self, filename, terminology):
            self.data = []
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    en, zh = line.strip().split('\t')
                    self.data.append((en, zh))
            self.terminology = terminology
            # Build the vocabularies; the terms from the terminology dictionary must
            # also be included in the English vocabulary
            self.en_tokenizer = get_tokenizer('basic_english')
            self.zh_tokenizer = list  # character-level tokenization for Chinese
            en_vocab = Counter(self.terminology.keys())  # make sure the terms are in the vocabulary
            zh_vocab = Counter()
            for en, zh in self.data:
                en_vocab.update(self.en_tokenizer(en))
                zh_vocab.update(self.zh_tokenizer(zh))
            # Note: the special-token names were swallowed by the original post's rendering;
            # they are restored here as <pad>/<sos>/<eos> based on how they are used below.
            # Add the terminology terms to the vocabulary
            self.en_vocab = ['<pad>', '<sos>', '<eos>'] + list(self.terminology.keys()) + [word for word, _ in en_vocab.most_common(10000)]
            self.zh_vocab = ['<pad>', '<sos>', '<eos>'] + [word for word, _ in zh_vocab.most_common(10000)]
            self.en_word2idx = {word: idx for idx, word in enumerate(self.en_vocab)}
            self.zh_word2idx = {word: idx for idx, word in enumerate(self.zh_vocab)}

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            en, zh = self.data[idx]
            # Out-of-vocabulary words fall back to <sos>; every sequence ends with <eos>,
            # and the target gets a leading <sos> so that trg[:, 0] is the start token expected by Seq2Seq
            en_tensor = torch.tensor([self.en_word2idx.get(word, self.en_word2idx['<sos>']) for word in self.en_tokenizer(en)] + [self.en_word2idx['<eos>']])
            zh_tensor = torch.tensor([self.zh_word2idx['<sos>']] + [self.zh_word2idx.get(word, self.zh_word2idx['<sos>']) for word in self.zh_tokenizer(zh)] + [self.zh_word2idx['<eos>']])
            return en_tensor, zh_tensor

    def collate_fn(batch):
        en_batch, zh_batch = [], []
        for en_item, zh_item in batch:
            en_batch.append(en_item)
            zh_batch.append(zh_item)
        # Pad the English and Chinese sequences separately (<pad> has index 0)
        en_batch = nn.utils.rnn.pad_sequence(en_batch, padding_value=0, batch_first=True)
        zh_batch = nn.utils.rnn.pad_sequence(zh_batch, padding_value=0, batch_first=True)
        return en_batch, zh_batch
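
    Before moving on to the model classes, a quick toy check (not part of the baseline, and the index values are made up) shows how collate_fn stacks variable-length sequences into one batch tensor by right-padding with index 0, i.e. <pad>:

    batch = [(torch.tensor([5, 6, 2]), torch.tensor([7, 2])),
             (torch.tensor([5, 2]), torch.tensor([8, 9, 10, 2]))]
    en_batch, zh_batch = collate_fn(batch)
    print(en_batch)        # tensor([[5, 6, 2], [5, 2, 0]])
    print(zh_batch.shape)  # torch.Size([2, 4])
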
    class Encoder(nn.Module):
        def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
            super().__init__()
            self.embedding = nn.Embedding(input_dim, emb_dim)
            self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
            self.dropout = nn.Dropout(dropout)

        def forward(self, src):
            # src shape: [batch_size, src_len]
            embedded = self.dropout(self.embedding(src))
            # embedded shape: [batch_size, src_len, emb_dim]
            outputs, hidden = self.rnn(embedded)
            # outputs shape: [batch_size, src_len, hid_dim]
            # hidden shape: [n_layers, batch_size, hid_dim]
            return outputs, hidden

    class Decoder(nn.Module):
        def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
            super().__init__()
            self.output_dim = output_dim
            self.embedding = nn.Embedding(output_dim, emb_dim)
            self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
            self.fc_out = nn.Linear(hid_dim, output_dim)
            self.dropout = nn.Dropout(dropout)

        def forward(self, input, hidden):
            # input shape: [batch_size, 1]
            # hidden shape: [n_layers, batch_size, hid_dim]
            embedded = self.dropout(self.embedding(input))
            # embedded shape: [batch_size, 1, emb_dim]
            output, hidden = self.rnn(embedded, hidden)
            # output shape: [batch_size, 1, hid_dim]
            # hidden shape: [n_layers, batch_size, hid_dim]
            prediction = self.fc_out(output.squeeze(1))
            # prediction shape: [batch_size, output_dim]
            return prediction, hidden

    class Seq2Seq(nn.Module):
        def __init__(self, encoder, decoder, device):
            super().__init__()
            self.encoder = encoder
            self.decoder = decoder
            self.device = device

        def forward(self, src, trg, teacher_forcing_ratio=0.5):
            # src shape: [batch_size, src_len]
            # trg shape: [batch_size, trg_len]
            batch_size = src.shape[0]
            trg_len = trg.shape[1]
            trg_vocab_size = self.decoder.output_dim
            outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
            _, hidden = self.encoder(src)
            input = trg[:, 0].unsqueeze(1)  # start token
            for t in range(1, trg_len):
                output, hidden = self.decoder(input, hidden)
                outputs[:, t, :] = output
                teacher_force = random.random() < teacher_forcing_ratio
                top1 = output.argmax(1)
                input = trg[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)
            return outputs
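
    To see how the three modules fit together, the following smoke test (not part of the baseline; all dimensions are made-up toy values) runs a dummy batch through the Seq2Seq model and checks that the output shape matches the comments above:

    device = torch.device('cpu')
    enc = Encoder(input_dim=100, emb_dim=32, hid_dim=64, n_layers=2, dropout=0.5)
    dec = Decoder(output_dim=120, emb_dim=32, hid_dim=64, n_layers=2, dropout=0.5)
    model = Seq2Seq(enc, dec, device).to(device)
    src = torch.randint(0, 100, (4, 10))  # [batch_size=4, src_len=10]
    trg = torch.randint(0, 120, (4, 12))  # [batch_size=4, trg_len=12]
    outputs = model(src, trg)
    print(outputs.shape)  # expected: torch.Size([4, 12, 120])
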

    Adding the terminology dictionary

    # Newly added: load the terminology dictionary
    def load_terminology_dictionary(dict_file):
        terminology = {}
        with open(dict_file, 'r', encoding='utf-8') as f:
            for line in f:
                en_term, ch_term = line.strip().split('\t')
                terminology[en_term] = ch_term
        return terminology
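
    As the loader above shows, each line of the dictionary file is expected to hold one English term and its Chinese translation separated by a tab. A small self-contained illustration (the file name and entries below are made up; the real en-zh.dic is provided with the competition data):

    with open('sample.dic', 'w', encoding='utf-8') as f:
        f.write('neural network\t神经网络\n')
        f.write('machine translation\t机器翻译\n')
    terminology = load_terminology_dictionary('sample.dic')
    print(terminology)  # {'neural network': '神经网络', 'machine translation': '机器翻译'}
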

    Training the model

    def train(model, iterator, optimizer, criterion, clip):
        model.train()
        epoch_loss = 0
        for i, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)
            optimizer.zero_grad()
            output = model(src, trg)
            output_dim = output.shape[-1]
            # Drop position 0 (the start-token slot, for which no prediction is made)
            # and flatten the remaining time steps before computing the loss
            output = output[:, 1:].contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output, trg)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / len(iterator)

    Main function: set the batch size and the amount of training data

    # Main
    if __name__ == '__main__':
        start_time = time.time()  # start timing
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Load the terminology dictionary
        terminology = load_terminology_dictionary('../dataset/en-zh.dic')
        # Load the training data
        dataset = TranslationDataset('../dataset/train.txt', terminology=terminology)
        # Train on the first N samples of the dataset
        N = 1000  # or a fraction of the dataset, e.g. int(len(dataset) * 0.1)
        subset_indices = list(range(N))
        subset_dataset = Subset(dataset, subset_indices)
        train_loader = DataLoader(subset_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
        # Model hyperparameters
        INPUT_DIM = len(dataset.en_vocab)
        OUTPUT_DIM = len(dataset.zh_vocab)
        ENC_EMB_DIM = 256
        DEC_EMB_DIM = 256
        HID_DIM = 512
        N_LAYERS = 2
        ENC_DROPOUT = 0.5
        DEC_DROPOUT = 0.5
        # Initialize the model
        enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
        dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
        model = Seq2Seq(enc, dec, device).to(device)
        # Optimizer and loss function; <pad> positions are ignored in the loss
        optimizer = optim.Adam(model.parameters())
        criterion = nn.CrossEntropyLoss(ignore_index=dataset.zh_word2idx['<pad>'])
        # Train the model
        N_EPOCHS = 10
        CLIP = 1
        for epoch in range(N_EPOCHS):
            train_loss = train(model, train_loader, optimizer, criterion, CLIP)
            print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')
        # Save the model after the training loop finishes
        torch.save(model.state_dict(), './translation_model_GRU.pth')
        end_time = time.time()  # stop timing
        # Compute and print the total running time
        elapsed_time_minute = (end_time - start_time) / 60
        print(f"Total running time: {elapsed_time_minute:.2f} minutes")

    Since the baseline code was run without any modifications, the results are not very good.

    The next step is to try adjusting the N and N_EPOCHS parameters to reduce the loss and thereby improve the score.
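
    For example (illustrative values only, not tuned):

    N = int(len(dataset) * 1.0)  # train on the full dataset instead of only the first 1000 samples
    N_EPOCHS = 50                # train for more epochs
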

  • Original article: https://blog.csdn.net/weixin_56029873/article/details/140383959