The competition page is linked here.
Neural machine translation has made major breakthroughs, but in specific domains and industries it still struggles to keep terminology consistent, so translation quality falls short of expectations. For terms, personal names, place names, and other expressions that the model translates incorrectly, a terminology dictionary can be used to correct the output, avoiding confusion and ambiguity and raising translation quality as much as possible.
The Terminology-Dictionary-Constrained Machine Translation Challenge uses English as the source language and Chinese as the target language. In addition to English-Chinese bilingual data, the organizers provide an English-Chinese terminology dictionary. Participating teams must build and train a machine translation model from the provided training samples and, using the test set together with the terminology dictionary, submit their final translations. The data consists of:
Training set: about 140,000 English-Chinese bilingual sentence pairs
Development set: 1,000 English-Chinese sentence pairs
Test set: 1,000 English-Chinese sentence pairs
Terminology dictionary: 2,226 English-Chinese entries
All files are UTF-8 encoded. The official training set, development set, test set, and terminology dictionary are plain-text files in the following formats:
The training set is bilingual data; each line is one sentence pair, formatted as shown in Figure 1.
The terminology dictionary is formatted as shown in Figure 2.
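Figures 1 and 2 are not reproduced here. Judging from the loading code below, which splits each line on a tab character, the two files look roughly as follows (the contents are invented placeholders, not real competition data):

train.txt: one tab-separated sentence pair per line, English first
    This is an example sentence.<tab>这是一个示例句子。

en-zh.dic: one tab-separated term pair per line, English term first
    machine translation<tab>机器翻译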
Submitted test-set translations are scored with the automatic metric BLEU-4, computed with the open-source sacrebleu toolkit.
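As a reference point, here is a minimal sketch of how sacrebleu computes a corpus-level BLEU score; the hypothesis and reference sentences below are invented placeholders, not competition data:

from sacrebleu.metrics import BLEU

bleu = BLEU()  # 4-gram BLEU by default
hyps = ["the model translates terminology consistently", "this is a test sentence"]   # system outputs (placeholders)
refs = [["the model translates terms consistently", "this is a test sentence"]]        # one reference stream, aligned with hyps
score = bleu.corpus_score(hyps, refs)
print(score.score)

Note that sacrebleu also ships a dedicated Chinese tokenizer (BLEU(tokenize='zh')); whether the official scorer enables it is not stated here.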
The baseline code is as follows:
import torchtext
torchtext.disable_torchtext_deprecation_warning()  # suppress the torchtext deprecation warning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from collections import Counter
import random
from torch.utils.data import Subset
import time
# Dataset class, modified so that the terminology dictionary is taken into account
class TranslationDataset(Dataset):
    def __init__(self, filename, terminology):
        self.data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                en, zh = line.strip().split('\t')
                self.data.append((en, zh))
        self.terminology = terminology
        # Build the vocabularies; make sure the dictionary terms are included
        self.en_tokenizer = get_tokenizer('basic_english')
        self.zh_tokenizer = list  # character-level tokenization for Chinese
        en_vocab = Counter(self.terminology.keys())  # seed the counter with the terms
        zh_vocab = Counter()
        for en, zh in self.data:
            en_vocab.update(self.en_tokenizer(en))
            zh_vocab.update(self.zh_tokenizer(zh))
        # Dictionary terms are prepended so each of them is guaranteed a vocabulary slot
        self.en_vocab = ['<pad>', '<sos>', '<eos>'] + list(self.terminology.keys()) + \
                        [word for word, _ in en_vocab.most_common(10000)]
        self.zh_vocab = ['<pad>', '<sos>', '<eos>'] + [word for word, _ in zh_vocab.most_common(10000)]
        self.en_word2idx = {word: idx for idx, word in enumerate(self.en_vocab)}
        self.zh_word2idx = {word: idx for idx, word in enumerate(self.zh_vocab)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en, zh = self.data[idx]
        en_tensor = torch.tensor([self.en_word2idx.get(word, self.en_word2idx['<sos>'])
                                  for word in self.en_tokenizer(en)] + [self.en_word2idx['<eos>']])
        zh_tensor = torch.tensor([self.zh_word2idx.get(word, self.zh_word2idx['<sos>'])
                                  for word in self.zh_tokenizer(zh)] + [self.zh_word2idx['<eos>']])
        return en_tensor, zh_tensor

def collate_fn(batch):
    en_batch, zh_batch = [], []
    for en_item, zh_item in batch:
        en_batch.append(en_item)
        zh_batch.append(zh_item)
    # Pad the English and Chinese sequences separately (0 is the <pad> index)
    en_batch = nn.utils.rnn.pad_sequence(en_batch, padding_value=0, batch_first=True)
    zh_batch = nn.utils.rnn.pad_sequence(zh_batch, padding_value=0, batch_first=True)
    return en_batch, zh_batch
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src shape: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))
        # embedded shape: [batch_size, src_len, emb_dim]
        outputs, hidden = self.rnn(embedded)
        # outputs shape: [batch_size, src_len, hid_dim]
        # hidden shape: [n_layers, batch_size, hid_dim]
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input shape: [batch_size, 1]
        # hidden shape: [n_layers, batch_size, hid_dim]
        embedded = self.dropout(self.embedding(input))
        # embedded shape: [batch_size, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)
        # output shape: [batch_size, 1, hid_dim]
        # hidden shape: [n_layers, batch_size, hid_dim]
        prediction = self.fc_out(output.squeeze(1))
        # prediction shape: [batch_size, output_dim]
        return prediction, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src shape: [batch_size, src_len]
        # trg shape: [batch_size, trg_len]
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        _, hidden = self.encoder(src)
        input = trg[:, 0].unsqueeze(1)  # start token
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)
        return outputs
# Loading the terminology dictionary
def load_terminology_dictionary(dict_file):
    terminology = {}
    with open(dict_file, 'r', encoding='utf-8') as f:
        for line in f:
            en_term, ch_term = line.strip().split('\t')
            terminology[en_term] = ch_term
    return terminology
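A minimal usage sketch of the loader; the path matches the one used later, and the looked-up term is an invented example:

terminology = load_terminology_dictionary('../dataset/en-zh.dic')
print(len(terminology))                         # 2226 entries according to the task description
print(terminology.get('machine translation'))   # e.g. '机器翻译', if such an entry exists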
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[:, 1:].contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)
# Main program
if __name__ == '__main__':
    start_time = time.time()  # start timing

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the terminology dictionary
    terminology = load_terminology_dictionary('../dataset/en-zh.dic')

    # Load the data
    dataset = TranslationDataset('../dataset/train.txt', terminology=terminology)

    # Train on the first N samples of the dataset
    N = 1000  # or a fraction of the dataset, e.g. int(len(dataset) * 0.1)
    subset_indices = list(range(N))
    subset_dataset = Subset(dataset, subset_indices)
    train_loader = DataLoader(subset_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    # Model hyperparameters
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Build the model
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Optimizer and loss function (padding positions are ignored)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss(ignore_index=dataset.zh_word2idx['<pad>'])

    # Train the model
    N_EPOCHS = 10
    CLIP = 1
    for epoch in range(N_EPOCHS):
        train_loss = train(model, train_loader, optimizer, criterion, CLIP)
        print(f'Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f}')

    # Save the model after the training loop
    torch.save(model.state_dict(), './translation_model_GRU.pth')

    end_time = time.time()  # stop timing
    elapsed_time_minute = (end_time - start_time) / 60
    print(f"Total running time: {elapsed_time_minute:.2f} minutes")
Epoch: 001 | Train Loss: 6.545
Epoch: 002 | Train Loss: 6.059
Epoch: 003 | Train Loss: 6.020
Epoch: 004 | Train Loss: 5.980
Epoch: 005 | Train Loss: 5.936
Epoch: 006 | Train Loss: 5.867
Epoch: 007 | Train Loss: 5.801
Epoch: 008 | Train Loss: 5.715
Epoch: 009 | Train Loss: 5.638
Epoch: 010 | Train Loss: 5.556
Total running time: 0.81 minutes
Evaluating the model on the development set
import torch
from sacrebleu.metrics import BLEU
from typing import List
# TranslationDataset, Encoder, Decoder and Seq2Seq are assumed to be defined as above
def load_sentences(file_path: str) -> List[str]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

# translate_sentence, updated to take the terminology dictionary into account
def translate_sentence(sentence: str, model: Seq2Seq, dataset: TranslationDataset,
                       terminology, device: torch.device, max_length: int = 50):
    model.eval()
    tokens = dataset.en_tokenizer(sentence)
    tensor = torch.LongTensor([dataset.en_word2idx.get(token, dataset.en_word2idx['<sos>'])
                               for token in tokens]).unsqueeze(0).to(device)  # [1, seq_len]

    with torch.no_grad():
        _, hidden = model.encoder(tensor)

    translated_tokens = []
    input_token = torch.LongTensor([[dataset.zh_word2idx['<sos>']]]).to(device)  # [1, 1]

    for _ in range(max_length):
        output, hidden = model.decoder(input_token, hidden)
        top_token = output.argmax(1)
        translated_token = dataset.zh_vocab[top_token.item()]
        if translated_token == '<eos>':
            break
        # If the predicted token matches a dictionary term, map it through the dictionary
        if translated_token in terminology.values():
            for en_term, ch_term in terminology.items():
                if translated_token == ch_term:
                    translated_token = en_term
                    break
        translated_tokens.append(translated_token)
        input_token = top_token.unsqueeze(1)  # [1, 1]

    return ''.join(translated_tokens)
def evaluate_bleu(model: Seq2Seq, dataset: TranslationDataset, src_file: str,
                  ref_file: str, terminology, device: torch.device):
    model.eval()
    src_sentences = load_sentences(src_file)
    ref_sentences = load_sentences(ref_file)

    translated_sentences = []
    for src in src_sentences:
        translated = translate_sentence(src, model, dataset, terminology, device)
        translated_sentences.append(translated)

    bleu = BLEU()
    score = bleu.corpus_score(translated_sentences, [ref_sentences])
    return score
# Main program
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the terminology dictionary
    terminology = load_terminology_dictionary('../dataset/en-zh.dic')

    # Pass the terminology dictionary when building the dataset
    dataset = TranslationDataset('../dataset/train.txt', terminology)

    # Model hyperparameters (must match the ones used for training)
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Build the model
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Load the trained weights
    model.load_state_dict(torch.load('./translation_model_GRU.pth'))

    # Evaluate the BLEU score on the development set
    bleu_score = evaluate_bleu(model, dataset, '../dataset/dev_en.txt', '../dataset/dev_zh.txt',
                               terminology=terminology, device=device)
    print(f'BLEU-4 score: {bleu_score.score:.2f}')
BLEU-4 score: 0.00
Running inference on the test set
def inference(model: Seq2Seq, dataset: TranslationDataset, src_file: str, save_dir: str,
              terminology, device: torch.device):
    model.eval()
    src_sentences = load_sentences(src_file)

    translated_sentences = []
    for src in src_sentences:
        translated = translate_sentence(src, model, dataset, terminology, device)
        translated_sentences.append(translated)

    # Join the translations into one string, one sentence per line
    text = '\n'.join(translated_sentences)

    # Write the result file ('w' creates the file if it does not exist)
    with open(save_dir, 'w', encoding='utf-8') as f:
        f.write(text)
# Main program
if __name__ == '__main__':
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the terminology dictionary
    terminology = load_terminology_dictionary('../dataset/en-zh.dic')

    # Load the dataset and rebuild the model
    dataset = TranslationDataset('../dataset/train.txt', terminology=terminology)

    # Model hyperparameters (must match the ones used for training)
    INPUT_DIM = len(dataset.en_vocab)
    OUTPUT_DIM = len(dataset.zh_vocab)
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    # Build the model
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device).to(device)

    # Load the trained weights
    model.load_state_dict(torch.load('./translation_model_GRU.pth'))

    save_dir = '../dataset/submit.txt'
    inference(model, dataset, src_file="../dataset/test_en.txt", save_dir=save_dir,
              terminology=terminology, device=device)
    print(f"Translation finished! Results saved to {save_dir}")
Submitting the output after running the baseline end to end gives a score of 0.1139.
To improve on the baseline, we increase the amount of training data used and the number of training epochs by modifying the code as follows:
N = 1000
-> N = int(len(dataset) * 0.75)
N_EPOCHS = 10
-> N_EPOCHS = 20
Running the modified code gives:
Epoch: 001 | Train Loss: 5.342
Epoch: 002 | Train Loss: 4.996
Epoch: 003 | Train Loss: 4.893
Epoch: 004 | Train Loss: 4.835
Epoch: 005 | Train Loss: 4.786
Epoch: 006 | Train Loss: 4.751
Epoch: 007 | Train Loss: 4.721
Epoch: 008 | Train Loss: 4.688
Epoch: 009 | Train Loss: 4.666
Epoch: 010 | Train Loss: 4.642
Epoch: 011 | Train Loss: 4.625
Epoch: 012 | Train Loss: 4.609
Epoch: 013 | Train Loss: 4.591
Epoch: 014 | Train Loss: 4.573
Epoch: 015 | Train Loss: 4.562
Epoch: 016 | Train Loss: 4.550
Epoch: 017 | Train Loss: 4.538
Epoch: 018 | Train Loss: 4.527
Epoch: 019 | Train Loss: 4.513
Epoch: 020 | Train Loss: 4.505
Total running time: 164.66 minutes
BLEU-4 score: 0.02
Submitting this result gives a score of 1.6431.
This is a clear improvement over the baseline.
With the baseline working, we rebuild the pipeline for the next attempt (Task 2): spaCy and jieba tokenization, torchtext vocabularies, and a GRU encoder-decoder with attention. First install the dependencies and import the required packages:

!pip install torchtext
!pip install jieba
!pip install sacrebleu

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_
import torchtext
torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.metrics import bleu_score
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import List, Tuple
import jieba
import random
from torch.nn.utils.rnn import pad_sequence
import sacrebleu
import time
import math
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*pytree.*")
!pip install -U pip setuptools wheel -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install -U 'spacy[cuda12x]' -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install ../dataset/en_core_web_trf-3.7.3-py3-none-any.whl
# Define the tokenizers
en_tokenizer = get_tokenizer('spacy', language='en_core_web_trf')
zh_tokenizer = lambda x: list(jieba.cut(x))  # use jieba for Chinese word segmentation
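A quick sanity check of the two tokenizers; the sentences are made-up examples, and the exact token boundaries depend on the installed spaCy model and on jieba's dictionary:

print(en_tokenizer("Neural machine translation needs consistent terminology."))
# e.g. ['Neural', 'machine', 'translation', 'needs', 'consistent', 'terminology', '.']
print(zh_tokenizer("神经机器翻译需要一致的术语。"))
# e.g. ['神经', '机器翻译', '需要', '一致', '的', '术语', '。']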
def read_data(file_path: str) -> List[str]:
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
def preprocess_data(en_data: List[str], zh_data: List[str]) -> List[Tuple[List[str], List[str]]]:
    processed_data = []
    for en, zh in zip(en_data, zh_data):
        en_tokens = en_tokenizer(en.lower())[:MAX_LENGTH]
        zh_tokens = zh_tokenizer(zh)[:MAX_LENGTH]
        if en_tokens and zh_tokens:  # keep the pair only if both sides are non-empty
            processed_data.append((en_tokens, zh_tokens))
    return processed_data
def build_vocab(data: List[Tuple[List[str], List[str]]]):
    en_vocab = build_vocab_from_iterator(
        (en for en, _ in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    zh_vocab = build_vocab_from_iterator(
        (zh for _, zh in data),
        specials=['<unk>', '<pad>', '<bos>', '<eos>']
    )
    en_vocab.set_default_index(en_vocab['<unk>'])
    zh_vocab.set_default_index(zh_vocab['<unk>'])
    return en_vocab, zh_vocab
class TranslationDataset(Dataset):
    def __init__(self, data: List[Tuple[List[str], List[str]]], en_vocab, zh_vocab):
        self.data = data
        self.en_vocab = en_vocab
        self.zh_vocab = zh_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en, zh = self.data[idx]
        en_indices = [self.en_vocab['<bos>']] + [self.en_vocab[token] for token in en] + [self.en_vocab['<eos>']]
        zh_indices = [self.zh_vocab['<bos>']] + [self.zh_vocab[token] for token in zh] + [self.zh_vocab['<eos>']]
        return en_indices, zh_indices
def collate_fn(batch):
    en_batch, zh_batch = [], []
    for en_item, zh_item in batch:
        if en_item and zh_item:  # keep only pairs where both sequences are non-empty
            en_batch.append(torch.tensor(en_item))
            zh_batch.append(torch.tensor(zh_item))
    if not en_batch or not zh_batch:  # if the whole batch is empty, return empty tensors
        return torch.tensor([]), torch.tensor([])
    en_batch = nn.utils.rnn.pad_sequence(en_batch, batch_first=True, padding_value=en_vocab['<pad>'])
    zh_batch = nn.utils.rnn.pad_sequence(zh_batch, batch_first=True, padding_value=zh_vocab['<pad>'])
    return en_batch, zh_batch
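For intuition, a minimal sketch of what pad_sequence does to a batch of variable-length index sequences; the indices are arbitrary placeholders, and 1 is assumed to be the <pad> index:

import torch
from torch.nn.utils.rnn import pad_sequence

seqs = [torch.tensor([2, 10, 11, 3]), torch.tensor([2, 12, 3])]  # <bos> ... <eos>
padded = pad_sequence(seqs, batch_first=True, padding_value=1)
print(padded)
# tensor([[ 2, 10, 11,  3],
#         [ 2, 12,  3,  1]])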
def load_data(train_path: str, dev_en_path: str, dev_zh_path: str, test_en_path: str):
    # Read the training data
    train_data = read_data(train_path)
    train_en, train_zh = zip(*(line.split('\t') for line in train_data))

    # Read the development and test sets
    dev_en = read_data(dev_en_path)
    dev_zh = read_data(dev_zh_path)
    test_en = read_data(test_en_path)

    # Preprocess the data
    train_processed = preprocess_data(train_en, train_zh)
    dev_processed = preprocess_data(dev_en, dev_zh)
    test_processed = [(en_tokenizer(en.lower())[:MAX_LENGTH], []) for en in test_en if en.strip()]

    # Build the vocabularies from the training data
    global en_vocab, zh_vocab
    en_vocab, zh_vocab = build_vocab(train_processed)

    # Build the datasets
    train_dataset = TranslationDataset(train_processed, en_vocab, zh_vocab)
    dev_dataset = TranslationDataset(dev_processed, en_vocab, zh_vocab)
    test_dataset = TranslationDataset(test_processed, en_vocab, zh_vocab)

    # Keep only the first N training samples
    from torch.utils.data import Subset
    indices = list(range(N))
    train_dataset = Subset(train_dataset, indices)

    # Build the data loaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              collate_fn=collate_fn, drop_last=True)
    dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, drop_last=True)

    return train_loader, dev_loader, test_loader, en_vocab, zh_vocab
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.gru(embedded)
        return outputs, hidden
class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        hidden = hidden.repeat(src_len, 1, 1).transpose(0, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return F.softmax(attention, dim=1)
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(hid_dim + emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden[-1:], encoder_outputs)
        a = a.unsqueeze(1)
        weighted = torch.bmm(a, encoder_outputs)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.gru(rnn_input, hidden)
        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[:, 0]
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        return outputs
def initialize_model(input_dim, output_dim, emb_dim, hid_dim, n_layers, dropout, device):
    attn = Attention(hid_dim)
    enc = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
    dec = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout, attn)
    model = Seq2Seq(enc, dec, device).to(device)
    return model
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src, trg = batch
        if src.numel() == 0 or trg.numel() == 0:
            continue  # skip empty batches
        src, trg = src.to(DEVICE), trg.to(DEVICE)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[:, 1:].contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg)
        loss.backward()
        clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Average loss for this epoch: {epoch_loss / len(iterator)}")
    return epoch_loss / len(iterator)
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src, trg = batch
            if src.numel() == 0 or trg.numel() == 0:
                continue  # skip empty batches
            src, trg = src.to(DEVICE), trg.to(DEVICE)
            output = model(src, trg, 0)  # turn off teacher forcing
            output_dim = output.shape[-1]
            output = output[:, 1:].contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)
def translate_sentence(src_indexes, src_vocab, tgt_vocab, model, device, max_length=50):
    model.eval()
    src_tensor = src_indexes.unsqueeze(0).to(device)  # add a batch dimension
    trg_indexes = [tgt_vocab['<bos>']]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)
        pred_token = output.argmax(2)[:, -1].item()
        trg_indexes.append(pred_token)
        if pred_token == tgt_vocab['<eos>']:
            break
    trg_tokens = [tgt_vocab.get_itos()[i] for i in trg_indexes]
    return trg_tokens[1:-1]  # strip the <bos> and <eos> tokens
def calculate_bleu(dev_loader, src_vocab, tgt_vocab, model, device):
    model.eval()
    translations = []
    references = []
    with torch.no_grad():
        for src, tgt in dev_loader:
            src = src.to(device)
            for sentence in src:
                translated = translate_sentence(sentence, src_vocab, tgt_vocab, model, device)
                translations.append(' '.join(translated))
            for reference in tgt:
                ref_tokens = [tgt_vocab.get_itos()[idx] for idx in reference
                              if idx not in [tgt_vocab['<bos>'], tgt_vocab['<eos>'], tgt_vocab['<pad>']]]
                references.append(' '.join(ref_tokens))
    # sacrebleu expects a list of reference streams, each aligned with the hypotheses
    bleu = sacrebleu.corpus_bleu(translations, [references])
    return bleu.score
def epoch_time(start_time, end_time):
    # Helper used below to report per-epoch wall-clock time
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

def train_model(model, train_iterator, valid_iterator, optimizer, criterion,
                N_EPOCHS=10, CLIP=1, save_path='../model/best-model.pt'):
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, valid_iterator, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), save_path)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
MAX_LENGTH = 100  # maximum sentence length
BATCH_SIZE = 32
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
N = 1000  # number of training samples to use (148,363 at most)

train_path = '../dataset/train.txt'
dev_en_path = '../dataset/dev_en.txt'
dev_zh_path = '../dataset/dev_zh.txt'
test_en_path = '../dataset/test_en.txt'

train_loader, dev_loader, test_loader, en_vocab, zh_vocab = load_data(
    train_path, dev_en_path, dev_zh_path, test_en_path
)

print(f"English vocabulary size: {len(en_vocab)}")
print(f"Chinese vocabulary size: {len(zh_vocab)}")
print(f"Training set size: {len(train_loader.dataset)}")
print(f"Development set size: {len(dev_loader.dataset)}")
print(f"Test set size: {len(test_loader.dataset)}")
if __name__ == '__main__':
    N_EPOCHS = 5
    CLIP = 1

    # Model hyperparameters
    INPUT_DIM = len(en_vocab)
    OUTPUT_DIM = len(zh_vocab)
    EMB_DIM = 128
    HID_DIM = 256
    N_LAYERS = 2
    DROPOUT = 0.5

    # Build the model
    model = initialize_model(INPUT_DIM, OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, DEVICE)
    print(f'The model has {sum(p.numel() for p in model.parameters() if p.requires_grad):,} trainable parameters')

    # Loss function (padding positions are ignored)
    criterion = nn.CrossEntropyLoss(ignore_index=zh_vocab['<pad>'])

    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

    # Train the model
    save_path = '../model/best-model.pt'
    train_model(model, train_loader, dev_loader, optimizer, criterion, N_EPOCHS, CLIP, save_path=save_path)

    print(f"Training finished! Model saved to: {save_path}")
save_dir = '../results/submit_task2.txt'
with open(save_dir, 'w', encoding='utf-8') as f:
    for batch in test_loader:  # iterate over all test batches
        src, _ = batch
        src = src.to(DEVICE)
        for sentence in src:  # translate every sentence in the batch, not just the first one
            translated = translate_sentence(sentence, en_vocab, zh_vocab, model, DEVICE, max_length=50)
            results = "".join(translated)
            f.write(results + '\n')  # write one translation per line
print(f"Translation finished; results saved to {save_dir}")