当前位置:   article > 正文

使用BERT进行情感分析_bert 模型 情感分析

bert 模型 情感分析

年前最后一篇,就写个自己使用BERT的流程步骤,提前祝大家新年快乐~

  1. ## STEP1:构建模型
  class Config(object):
      """Paths, hyper-parameters and tokenizer for the BERT classifier.

      Args:
          dataset: Root directory of the dataset. The constructor reads
              ``<dataset>/data/class.txt`` and derives the train/dev/test
              file paths and the checkpoint path from this root.
      """

      def __init__(self, dataset):
          self.model_name = 'bert'
          self.train_path = dataset + '/data/train.txt'  # training set
          self.dev_path = dataset + '/data/dev.txt'  # validation set
          self.test_path = dataset + '/data/test.txt'  # test set
          # One class name per line. The context manager closes the file
          # handle deterministically, and the encoding is pinned to UTF-8 so
          # the result does not depend on the platform default (the data
          # files in this project are read as UTF-8 as well).
          with open(dataset + '/data/class.txt', 'r', encoding='UTF-8') as class_file:
              self.class_list = [x.strip() for x in class_file]
          self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          # Stop training early when no improvement was seen for more than
          # this many batches.
          self.require_improvement = 1000
          self.num_classes = len(self.class_list)
          self.num_epochs = 3
          self.batch_size = 128
          self.pad_size = 32  # every sample is padded/truncated to this many tokens
          self.learning_rate = 5e-5
          self.bert_path = './bert_pretrain'
          self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
          self.hidden_size = 768  # input width of the classification layer
  class BERT(nn.Module):
      """Pre-trained BERT encoder followed by one linear classification layer."""

      def __init__(self, config):
          """Load the encoder from ``config.bert_path`` and build the classifier head."""
          super().__init__()
          self.bert = BertModel.from_pretrained(config.bert_path)
          # Mark every encoder weight as trainable so the whole network is
          # fine-tuned, not only the linear head.
          for weight in self.bert.parameters():
              weight.requires_grad = True
          self.fc = nn.Linear(config.hidden_size, config.num_classes)

      def forward(self, x):
          """Return class logits for a batch.

          Args:
              x: Indexable batch where ``x[0]`` holds the token ids and
                 ``x[2]`` the attention mask (0 marks padding positions).
          """
          token_ids, attention_mask = x[0], x[2]
          _, pooled_output = self.bert(
              token_ids,
              attention_mask=attention_mask,
              output_all_encoded_layers=False)
          return self.fc(pooled_output)
  1. ## STEP2:构建数据集
  def build_dataset(config):
      """Load and encode the train, dev and test files named in ``config``.

      Each non-blank input line must have the form ``<text>\\t<integer label>``.

      Returns:
          A tuple ``(train, dev, test)``. Each element is a list of
          ``(token_ids, label, seq_len, mask)`` tuples.
      """

      def load_dataset(path, pad_size=32):
          """Encode one file; pad or truncate every sample to ``pad_size`` tokens."""
          samples = []
          with open(path, 'r', encoding='UTF-8') as handle:
              for raw_line in tqdm(handle):
                  stripped = raw_line.strip()
                  if not stripped:
                      continue  # ignore blank lines
                  content, label = stripped.split('\t')
                  # NOTE(review): CLS is presumably a module-level constant
                  # holding the classification token; it is not defined in
                  # the visible part of the file.
                  tokens = [CLS] + config.tokenizer.tokenize(content)
                  seq_len = len(tokens)
                  mask = []
                  token_ids = config.tokenizer.convert_tokens_to_ids(tokens)
                  if pad_size:
                      if len(tokens) < pad_size:
                          # Short sample: pad ids with 0 and mask the padding out.
                          missing = pad_size - len(tokens)
                          mask = [1] * len(token_ids) + [0] * missing
                          token_ids += [0] * missing
                      else:
                          # Long sample: cut down to pad_size tokens.
                          mask = [1] * pad_size
                          token_ids = token_ids[:pad_size]
                          seq_len = pad_size
                  samples.append((token_ids, int(label), seq_len, mask))
          return samples

      train = load_dataset(config.train_path, config.pad_size)
      dev = load_dataset(config.dev_path, config.pad_size)
      test = load_dataset(config.test_path, config.pad_size)
      return train, dev, test
  class DatasetIterater(object):
      """Serve a list of encoded samples as consecutive batches of tensors.

      Each sample is a ``(token_ids, label, seq_len, mask)`` tuple. Iterating
      yields ``((x, seq_len, mask), y)`` where every element is a
      ``torch.LongTensor`` moved to ``device``. When the number of samples is
      not a multiple of ``batch_size`` the last batch is smaller. The iterator
      rewinds itself when exhausted, so it can be looped over once per epoch.

      Args:
          batches: List of encoded samples.
          batch_size: Number of samples per batch.
          device: Device the tensors are moved to.
      """

      def __init__(self, batches, batch_size, device):
          self.batch_size = batch_size
          self.batches = batches
          self.n_batches = len(batches) // batch_size  # number of full batches
          # True when a trailing, smaller batch remains. The remainder must be
          # taken modulo batch_size; taking it modulo n_batches drops leftover
          # samples and divides by zero when there are fewer samples than
          # batch_size.
          self.residue = len(batches) % batch_size != 0
          self.index = 0
          self.device = device

      def _to_tensor(self, datas):
          """Stack the fields of the given samples into LongTensors on ``self.device``."""
          x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
          y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
          # Length before padding (samples longer than pad_size are capped at pad_size).
          seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
          mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
          return (x, seq_len, mask), y

      def __next__(self):
          if self.index >= len(self):
              # Exhausted: rewind so the next epoch starts from the first batch.
              self.index = 0
              raise StopIteration
          start = self.index * self.batch_size
          # Slicing past the end is safe and yields the smaller final batch.
          chunk = self.batches[start:start + self.batch_size]
          self.index += 1
          return self._to_tensor(chunk)

      def __iter__(self):
          return self

      def __len__(self):
          """Return the number of batches, counting a trailing partial batch."""
          if self.residue:
              return self.n_batches + 1
          return self.n_batches
  def build_iterator(dataset, config):
      """Wrap ``dataset`` in a DatasetIterater using the batch size and device from ``config``."""
      return DatasetIterater(dataset, config.batch_size, config.device)
  def get_time_dif(start_time):
      """Return the time elapsed since ``start_time`` as a timedelta rounded to whole seconds."""
      elapsed_seconds = time.time() - start_time
      return timedelta(seconds=int(round(elapsed_seconds)))
  1. ## STEP3:构建训练测试流程函数
  def init_network(model, method='xavier', exclude='embedding', seed=123):
      """Re-initialise the parameters of ``model`` in place.

      Parameters whose name contains ``exclude`` and parameters with fewer
      than two dimensions are left untouched. Of the rest, those named
      ``*weight*`` are drawn with the chosen ``method`` ('xavier', 'kaiming',
      anything else means a plain normal distribution) and those named
      ``*bias*`` are set to zero.

      ``seed`` is accepted but not used by this function.
      """
      weight_initializers = {
          'xavier': nn.init.xavier_normal_,
          'kaiming': nn.init.kaiming_normal_,
      }
      for param_name, tensor in model.named_parameters():
          if exclude in param_name or tensor.dim() < 2:
              continue
          if 'weight' in param_name:
              initialise = weight_initializers.get(method, nn.init.normal_)
              initialise(tensor)
          elif 'bias' in param_name:
              nn.init.constant_(tensor, 0)
  def train(config, model, train_iter, dev_iter, test_iter):
      """Fine-tune ``model`` and then evaluate the best checkpoint on the test set.

      Every 100 batches the model is evaluated on ``dev_iter``; whenever the
      validation loss improves, the state dict is saved to
      ``config.save_path``. Training stops early once more than
      ``config.require_improvement`` batches have passed without improvement.
      Finally ``test`` is called with ``test_iter``.
      """
      started_at = time.time()
      model.train()

      # Biases and LayerNorm parameters are kept out of weight decay.
      decay_exempt = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
      decayed, undecayed = [], []
      for param_name, param in model.named_parameters():
          if any(tag in param_name for tag in decay_exempt):
              undecayed.append(param)
          else:
              decayed.append(param)
      optimizer = BertAdam(
          [{'params': decayed, 'weight_decay': 0.01},
           {'params': undecayed, 'weight_decay': 0.0}],
          lr=config.learning_rate,
          warmup=0.05,
          t_total=len(train_iter) * config.num_epochs)

      total_batch = 0  # number of batches processed so far
      dev_best_loss = float('inf')
      last_improve = 0  # batch index of the last validation-loss improvement
      stop_early = False  # set when there was no improvement for too long
      report_line = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'

      for epoch in range(config.num_epochs):
          print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
          for trains, labels in train_iter:
              outputs = model(trains)
              model.zero_grad()
              loss = F.cross_entropy(outputs, labels)
              loss.backward()
              optimizer.step()

              if total_batch % 100 == 0:
                  # Periodic report on the current training batch and the dev set.
                  gold = labels.data.cpu()
                  predicted = torch.max(outputs.data, 1)[1].cpu()
                  train_acc = metrics.accuracy_score(gold, predicted)
                  dev_acc, dev_loss = evaluate(config, model, dev_iter)
                  if dev_loss < dev_best_loss:
                      dev_best_loss = dev_loss
                      torch.save(model.state_dict(), config.save_path)
                      improve = '*'
                      last_improve = total_batch
                  else:
                      improve = ''
                  print(report_line.format(
                      total_batch, loss.item(), train_acc, dev_loss, dev_acc,
                      get_time_dif(started_at), improve))
                  # evaluate() switched to eval mode; go back to training mode.
                  model.train()

              total_batch += 1
              if total_batch - last_improve > config.require_improvement:
                  # Validation loss has not dropped for too many batches: stop.
                  print("No optimization for a long time, auto-stopping...")
                  stop_early = True
                  break
          if stop_early:
              break

      test(config, model, test_iter)
  def test(config, model, test_iter):
      """Load the checkpoint at ``config.save_path`` and print test-set metrics."""
      model.load_state_dict(torch.load(config.save_path))
      model.eval()
      started_at = time.time()
      test_acc, test_loss, test_report, test_confusion = evaluate(
          config, model, test_iter, test=True)
      print('Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'.format(test_loss, test_acc))
      sections = (
          ("Precision, Recall and F1-Score...", test_report),
          ("Confusion Matrix...", test_confusion),
      )
      for heading, body in sections:
          print(heading)
          print(body)
      print("Time usage:", get_time_dif(started_at))
  def evaluate(config, model, data_iter, test=False):
      """Compute accuracy and mean cross-entropy loss of ``model`` on ``data_iter``.

      The model is put into eval mode and left there.

      Returns:
          ``(acc, mean_loss)``, or ``(acc, mean_loss, report, confusion)``
          when ``test`` is true, where ``report`` is the classification
          report labelled with ``config.class_list`` and ``confusion`` is the
          confusion matrix.
      """
      model.eval()
      loss_total = 0
      gold_all = np.array([], dtype=int)
      predicted_all = np.array([], dtype=int)
      with torch.no_grad():
          for texts, labels in data_iter:
              outputs = model(texts)
              loss_total += F.cross_entropy(outputs, labels)
              gold_all = np.append(gold_all, labels.data.cpu().numpy())
              predicted_all = np.append(
                  predicted_all, torch.max(outputs.data, 1)[1].cpu().numpy())

      acc = metrics.accuracy_score(gold_all, predicted_all)
      if not test:
          return acc, loss_total / len(data_iter)
      report = metrics.classification_report(
          gold_all, predicted_all, target_names=config.class_list, digits=4)
      confusion = metrics.confusion_matrix(gold_all, predicted_all)
      return acc, loss_total / len(data_iter), report, confusion

main函数:

  if __name__ == '__main__':
      # Script entry point: parse the CLI, load the data, then train and test.
      # Arguments are parsed inside the guard so that importing this module
      # does not read sys.argv.
      parser = argparse.ArgumentParser(description='Chinese Text Classification')
      parser.add_argument('--model', type=str, required=True, help='choose a model: Bert, ERNIE')
      args = parser.parse_args()

      dataset = 'THUCNews'  # dataset root directory
      # Kept for command-line compatibility; this file only defines the BERT
      # model, so the value does not select anything here.
      model_name = args.model
      # Config and BERT are defined in this file and are used directly.
      # Calling BERT() without a config raises a TypeError, and a BERT
      # instance has no Config/Model attributes.
      config = Config(dataset)

      # Fix all seeds so that repeated runs give the same result.
      np.random.seed(1)
      torch.manual_seed(1)
      torch.cuda.manual_seed_all(1)
      torch.backends.cudnn.deterministic = True

      start_time = time.time()
      print("Loading data...")
      train_data, dev_data, test_data = build_dataset(config)
      train_iter = build_iterator(train_data, config)
      dev_iter = build_iterator(dev_data, config)
      test_iter = build_iterator(test_data, config)
      time_dif = get_time_dif(start_time)
      print("Time usage:", time_dif)

      # train
      model = BERT(config).to(config.device)
      train(config, model, train_iter, dev_iter, test_iter)

完事~

源码来自:https://github.com/649453932/Bert-Chinese-Text-Classification-Pytorch

本文内容由网友自发贡献,转载请注明出处:【wpsshop博客】
推荐阅读
相关标签
  

闽ICP备14008679号