赞
踩
数据来源: 微博,网上下载的
格式:
每行逗号前面的是label(1表示正向,0表示负向),后面的是微博内容(review列)。
数据预处理:
目前没有做预处理;理想情况下应该去掉微博中转述(转发)别人的内容,只保留作者自己写的部分。
def _sample_reviews(pool, size):
    """Draw ``size`` reviews from ``pool``, without repeats whenever possible."""
    # Sampling with replacement is only used when the pool is too small;
    # otherwise duplicated reviews could end up on both sides of a later
    # train/test split and inflate the measured accuracy.
    return np.random.choice(pool, size=size, replace=size > len(pool))


def get_data(data_cnt_list=(10000, 20000, 50000, 80000),
             source_path="weibo_senti_100k.csv"):
    """Build balanced, shuffled sentiment subsets from the Weibo corpus.

    Reads ``source_path`` (a CSV with a ``label`` column, 1 = positive and
    0 = negative, and a ``review`` column), prints the corpus statistics and,
    for every requested size, writes ``weibo_snti_<size>_parrel.csv`` to the
    current directory with the columns ``data`` and ``label``.

    Args:
        data_cnt_list: Iterable of subset sizes to generate.
        source_path: Path of the raw corpus CSV.

    Returns:
        None. The subsets are written to disk.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when a class has no
            reviews but samples of it are requested.
    """
    pd_all = pd.read_csv(source_path)
    positive_data = np.array(pd_all[pd_all.label == 1]["review"])
    negative_data = np.array(pd_all[pd_all.label == 0]["review"])

    print('评论数目(总体):%d' % pd_all.shape[0])
    print('评论数目(正向):%d' % positive_data.shape[0])
    print('评论数目(负向):%d' % negative_data.shape[0])

    for cnt in data_cnt_list:
        # 1:1 positive/negative; an odd size gets the extra positive example.
        n_pos = (cnt + 1) // 2
        n_neg = cnt // 2
        texts = np.concatenate([
            _sample_reviews(positive_data, n_pos),
            _sample_reviews(negative_data, n_neg),
        ])
        labels = np.concatenate([
            np.ones(n_pos, dtype=np.int64),
            np.zeros(n_neg, dtype=np.int64),
        ])

        # Shuffle texts and labels with the same permutation.
        order = np.random.permutation(cnt)
        subset = pd.DataFrame({"data": texts[order], "label": labels[order]})
        subset.to_csv("weibo_snti_%d_parrel.csv" % cnt)
def read_data(cnt, args):
    """Load one generated subset and wrap it in train/test DataLoaders.

    Args:
        cnt: Size of the subset; selects ``weibo_snti_<cnt>_parrel.csv``.
        args: Object providing ``seq_len``, ``encoding`` and ``batch_size``.

    Returns:
        A ``(train_data_loader, test_data_loader)`` tuple.
    """
    df = pd.read_csv("weibo_snti_%d_parrel.csv" % cnt)
    data = df["data"]
    label = df["label"]

    # Split with train_test_split's default proportions.
    train_inputs, test_inputs, train_label, test_label = \
        train_test_split(data, label)

    # BertDataSet indexes with plain integers, so drop the pandas index.
    train_inputs = train_inputs.tolist()
    train_label = train_label.tolist()
    test_inputs = test_inputs.tolist()
    test_label = test_label.tolist()

    train_dataset = BertDataSet(train_inputs, train_label, "../bert-base-chinese",
                                seq_len=args.seq_len, encoding=args.encoding)
    test_dataset = BertDataSet(test_inputs, test_label, "../bert-base-chinese",
                               seq_len=args.seq_len, encoding=args.encoding)

    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                                   num_workers=4)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                                  num_workers=4)

    # The original built both loaders and then discarded them.
    return train_data_loader, test_data_loader
BertDataSet也需要修改:
from torch.utils.data import Dataset
import torch
import transformers as tfs


class BertDataSet(Dataset):
    """Character-level dataset producing fixed-length BERT inputs.

    Each item is a dict of tensors with the keys ``bert_input`` (token ids),
    ``bert_mask`` (attention mask) and ``label``.
    """

    def __init__(self, dataset, labelset, tokenizer_name, seq_len, encoding="utf-8"):
        """Store the samples and load the tokenizer.

        Args:
            dataset: Indexable collection of texts.
            labelset: Indexable collection of labels, same length as ``dataset``.
            tokenizer_name: Name or path passed to ``BertTokenizer.from_pretrained``.
            seq_len: Length every encoded sample is truncated/padded to.
            encoding: Stored on the instance; not used by this class.

        Raises:
            ValueError: If ``dataset`` and ``labelset`` differ in length.
        """
        super(BertDataSet, self).__init__()
        self.seq_len = seq_len
        self.encoding = encoding
        self.dataset = dataset
        self.labelset = labelset
        # The original asserted len(labelset) == len(labelset), which can
        # never fail; compare the two collections instead.
        if len(self.dataset) != len(self.labelset):
            raise ValueError(
                "dataset and labelset differ in length: %d vs %d"
                % (len(self.dataset), len(self.labelset))
            )
        self.data_len = len(self.labelset)

        self.tokenizer = tfs.BertTokenizer.from_pretrained(tokenizer_name)
        self.cls_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.cls_token)
        self.pad_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
        # bert-base-chinese has no EOS token, so one is registered by name.
        # NOTE(review): "[EOS]" is not added to the vocabulary, so its id
        # presumably resolves to the unknown-token id - confirm.
        self.tokenizer.eos_token = "[EOS]"
        self.eos_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.eos_token)

    def __len__(self):
        """Return the number of samples."""
        return self.data_len

    def __getitem__(self, item):
        """Encode sample ``item`` as ``[CLS] chars... [EOS]`` padded to ``seq_len``.

        Raises:
            IndexError: If ``item`` is past the end of the dataset.
        """
        if item >= self.data_len:
            # IndexError (not AssertionError) is what the sequence protocol
            # expects for out-of-range access.
            raise IndexError("index %d out of range for %d samples"
                             % (item, self.data_len))
        label = self.labelset[item]
        data = self.dataset[item]
        input_ids, mask_ids = self.get_input_id(data)

        bert_input = ([self.cls_id] + input_ids + [self.eos_id])[:self.seq_len]
        # NOTE(review): the trailing EOS slot is masked out (0), matching the
        # original behaviour with bert-base-chinese - confirm this is intended.
        bert_mask = ([1] + mask_ids + [0])[:self.seq_len]

        pad_len = self.seq_len - len(bert_input)
        bert_input.extend([self.pad_id] * pad_len)
        # The mask must be padded with 0, not with the pad token id; the two
        # only coincide when the tokenizer's pad id happens to be 0.
        bert_mask.extend([0] * pad_len)

        output = {
            "bert_input": bert_input,
            "bert_mask": bert_mask,
            "label": label,
        }
        return {key: torch.tensor(value) for key, value in output.items()}

    def get_input_id(self, data):
        """Map a text to per-character token ids and a matching all-ones mask.

        Returns:
            A ``(id_list, mask_list)`` tuple of equal-length lists.
        """
        if isinstance(data, float):
            # pandas yields NaN (a float) for empty CSV cells; treat as empty.
            data = ""
        chars = list(data)
        id_list = self.tokenizer.convert_tokens_to_ids(chars)
        mask_list = [1] * len(chars)
        return id_list, mask_list
import torch
import torch.nn as nn
import transformers as tfs
import json


class BertSentiClassificationModel(nn.Module):
    """BERT encoder with a linear head over the [CLS] position."""

    def __init__(self, pretrained_path="../bert-base-chinese", num_labels=2,
                 dropout=0.1):
        """Load the pretrained encoder and build the classification head.

        Args:
            pretrained_path: Name or path passed to ``BertModel.from_pretrained``.
            num_labels: Number of output classes (2 for binary sentiment).
            dropout: Dropout probability applied to the [CLS] vector.
        """
        super(BertSentiClassificationModel, self).__init__()
        self.bert = tfs.BertModel.from_pretrained(pretrained_path)
        self.dropout = nn.Dropout(dropout)
        # Take the width from the loaded checkpoint instead of assuming 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, batch):
        """Return per-class log-probabilities for a batch.

        Args:
            batch: Mapping with ``bert_input`` (token ids) and ``bert_mask``
                (attention mask).

        Returns:
            Tensor of log-probabilities, one row per sample.
        """
        # as_tensor passes existing tensors through unchanged (same device,
        # no copy, no warning) while still accepting plain lists.
        input_ids = torch.as_tensor(batch['bert_input'])
        attention_mask = torch.as_tensor(batch['bert_mask'])

        bert_output = self.bert(input_ids, attention_mask=attention_mask)
        bert_cls = bert_output[0][:, 0, :]  # hidden state at the [CLS] position
        return self.softmax(self.linear(self.dropout(bert_cls)))
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from model.model import BertSentiClassificationModel
from torch.optim import Adam
from .optim_schedule import ScheduledOptim
import tqdm


class BertSentiTrainer(object):
    """Runs training and evaluation epochs for the sentiment classifier."""

    def __init__(self, bert: BertSentiClassificationModel, vocab_size: int,
                 train_dataloader: DataLoader, test_dataloader: DataLoader,
                 with_cuda: bool, log_freq: int = 50,
                 lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01,
                 warmup_steps=10000):
        """Set up device, loss, optimizer and learning-rate schedule.

        Args:
            bert: Model to train; moved to the selected device.
            vocab_size: Accepted for interface compatibility; unused here.
            train_dataloader: Batches used by ``train``.
            test_dataloader: Batches used by ``test``.
            with_cuda: Use CUDA when it is available.
            log_freq: Log running statistics every ``log_freq`` batches.
            lr: Initial Adam learning rate.
            betas: Adam betas.
            weight_decay: Adam weight decay.
            warmup_steps: Warm-up steps handed to ``ScheduledOptim``.
        """
        super(BertSentiTrainer, self).__init__()
        cuda_condition = torch.cuda.is_available() and with_cuda
        self.device = torch.device("cuda" if cuda_condition else "cpu")
        self.bert = bert
        self.bert.to(self.device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader
        self.criterion = nn.CrossEntropyLoss()
        self.optim = Adam(self.bert.parameters(), lr=lr, betas=betas,
                          weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(self.optim, 512,
                                             n_warmup_steps=warmup_steps)
        self.log_freq = log_freq

    def train(self, epoch):
        """Run one training epoch."""
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        """Run one evaluation epoch."""
        self.iteration(epoch, self.test_data, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """Process every batch of ``data_loader`` once and print statistics.

        Args:
            epoch: Epoch number, used for logging only.
            data_loader: Source of batches (dicts of tensors with a ``label`` key).
            train: When true, back-propagate and step the optimizer.
        """
        str_code = "train" if train else "test"
        data_iter = tqdm.tqdm(enumerate(data_loader),
                              desc="EP_%s:%d" % (str_code, epoch),
                              total=len(data_loader),
                              bar_format="{l_bar}{r_bar}",
                              ncols=80)

        # Enable dropout only while training, and skip autograd bookkeeping
        # during evaluation.
        self.bert.train(train)

        avg_loss = 0.0
        total_correct = 0
        total_element = 0
        with torch.set_grad_enabled(train):
            # Iterate the tqdm wrapper itself so the progress bar advances;
            # the original looped over a fresh enumerate(data_loader).
            for i, data in data_iter:
                data = {key: value.to(self.device) for key, value in data.items()}
                classfilabel = self.bert(data)
                loss = self.criterion(classfilabel, data["label"])

                if train:
                    self.optim_schedule.zero_grad()
                    loss.backward()
                    self.optim_schedule.step_and_update_lr()

                correct = classfilabel.argmax(dim=-1).eq(data["label"]).sum().item()
                avg_loss += loss.item()
                total_correct += correct
                total_element += len(data["label"])

                post_fix = {
                    "epoch": epoch,
                    "iter": i,
                    "avg_loss": avg_loss / (i + 1),
                    "avg_acc": total_correct / total_element * 100,
                    "loss": loss.item()
                }
                if i % self.log_freq == 0:
                    data_iter.write(str(post_fix))

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        n_batches = max(len(data_loader), 1)
        print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / n_batches,
              "total_acc=", total_correct * 100.0 / max(total_element, 1))

    def save(self, epoch, file_path="output/bert_senti.model"):
        """Serialize the model to ``<file_path>.ep<epoch>`` and return that path.

        The model is moved to the CPU for saving and back to the training
        device afterwards.
        """
        import os

        output_path = file_path + ".ep%d" % epoch
        # Create the target directory so a full epoch is not lost to a
        # missing "output/" folder.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        torch.save(self.bert.cpu(), output_path)
        self.bert.to(self.device)
        print("EP:%d Model Saved on:" % epoch, output_path)
        return output_path
# Build the model and trainer, then alternate training, checkpointing and
# evaluation once per epoch.
bert = BertSentiClassificationModel()
trainer = BertSentiTrainer(bert, args.vocab_size, train_data_loader,
                           test_data_loader, True, log_freq=args.freq)
for epoch in range(args.epochs):
    trainer.train(epoch)
    trainer.save(epoch)
    # Evaluate after each epoch when a test loader is available.
    if test_data_loader is not None:
        trainer.test(epoch)
6.实验结果

只训练一个epoch就有92%的正确率,而且这还是在数据没有经过任何预处理的情况下。
优化点:
1、数据预处理
2、embedding层可以自己训练
3、可以自己使用LM模型预训练bert(时间久,数据要求高)
补充:
优化器代码:
'''A wrapper class for optimizer '''
import numpy as np


class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling.

    The learning rate is ``d_model ** -0.5 * min(step ** -0.5,
    step * n_warmup_steps ** -1.5)``: it rises linearly for
    ``n_warmup_steps`` steps and then decays with the inverse square root of
    the step count.
    '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        '''Wrap ``optimizer``.

        Args:
            optimizer: Object exposing ``step()``, ``zero_grad()`` and
                ``param_groups`` (a list of dicts with an ``'lr'`` entry).
            d_model: Model width; sets the base rate ``d_model ** -0.5``.
            n_warmup_steps: Number of linear warm-up steps.
        '''
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = np.power(d_model, -0.5)

    def step_and_update_lr(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        '''Return the schedule multiplier for the current step.'''
        step = self.n_current_steps
        if step <= 0:
            # Before the first update the warm-up term is zero.
            return 0.0
        decay = float(np.power(step, -0.5))
        if self.n_warmup_steps <= 0:
            # No warm-up: pure inverse-square-root decay, without the
            # divide-by-zero warning np.power(0, -1.5) would emit.
            return decay
        warmup = float(np.power(self.n_warmup_steps, -1.5)) * step
        return min(decay, warmup)

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''
        self.n_current_steps += 1
        # Store a plain Python float rather than a numpy scalar.
        lr = float(self.init_lr * self._get_lr_scale())

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。