当前位置:   article > 正文

【深度学习、工程实践】预训练模型进行情感分析(以bert-base-chinese为例)

bert-base-chinese

目录

1.预训练模型下载

2.下载预训练模型

 3.导入需要的库

4.定义数据路径

5.查看数据

 6.定义神经网络

7.使用BertTokenizer 编码成Bert需要的输入格式

8.将数据加载为Tensor格式

9.实例化DataLoader

10.定义验证函数

11.定义训练函数 

12.实例化模型并进行训练与验证

13.定义预测函数

14.使用训练好的模型进行预测

15.获得预测值与预测的概率

16.调用函数计算准确率等指标


1.预训练模型下载

        预训练模型通过 transformers 库加载和使用。bert-base-chinese 预训练模型可以在 Hugging Face 的 Models 页面下载,下载后将模型文件放到服务器上。

2.下载预训练模型

 3.导入需要的库

import csv
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer,BertConfig,AdamW

4.定义数据路径

  1. #读取数据
  2. Data_path = "./data.csv"
  3. Totle_data = pd.read_csv(Data_path)
  4. train_dataset,temp_data = train_test_split(Totle_data,test_size=0.2)
  5. validate_dataset, test_dataset = train_test_split(temp_data,test_size=0.5)
  6. print(len(train_dataset))
  7. print(len(validate_dataset))
  8. print(len(test_dataset))
  9. #设置保存路径
  10. train_data_path="./Train.csv"
  11. dev_data_path = "./Dev.csv"
  12. test_data_path="./Test.csv"
  13. #index参数设置为False表示不保存行索引,header设置为False表示不保存列索引
  14. train_dataset.to_csv(train_data_path,index=False,header=True)
  15. validate_dataset.to_csv(dev_data_path ,index=False,header=True)
  16. test_dataset.to_csv(test_data_path,index=False,header=True)

5.查看数据

# Read the training split back from train_data_path so it can be inspected.
data = pd.read_csv(train_data_path)

 6.定义神经网络

  1. class BertClassificationModel(nn.Module):
  2. def __init__(self):
  3. super(BertClassificationModel, self).__init__()
  4. #加载预训练模型
  5. pretrained_weights="/root/Bert/bert-base-chinese/"
  6. self.bert = transformers.BertModel.from_pretrained(pretrained_weights)
  7. for param in self.bert.parameters():
  8. param.requires_grad = True
  9. #定义线性函数
  10. self.dense = nn.Linear(768, 2) #bert默认的隐藏单元数是768, 输出单元是2,表示二分类
  11. def forward(self, input_ids,token_type_ids,attention_mask):
  12. #得到bert_output
  13. bert_output = self.bert(input_ids=input_ids,token_type_ids=token_type_ids, attention_mask=attention_mask)
  14. #获得预训练模型的输出
  15. bert_cls_hidden_state = bert_output[1]
  16. #将768维的向量输入到线性层映射为二维向量
  17. linear_output = self.dense(bert_cls_hidden_state)
  18. return linear_output

7.使用BertTokenizer 编码成Bert需要的输入格式

        数据送入预训练模型之前需要进行预处理,使用BertTokenizer将数据编码为Bert需要的输入格式。预训练模型有三种输入,分别是input_ids、token_type_ids、attention_mask。

  1. def encoder(max_len,vocab_path,text_list):
  2. #将text_list embedding成bert模型可用的输入形式
  3. #加载分词模型
  4. tokenizer = BertTokenizer.from_pretrained(vocab_path)
  5. tokenizer = tokenizer(
  6. text_list,
  7. padding = True,
  8. truncation = True,
  9. max_length = max_len,
  10. return_tensors='pt' # 返回的类型为pytorch tensor
  11. )
  12. input_ids = tokenizer['input_ids']
  13. token_type_ids = tokenizer['token_type_ids']
  14. attention_mask = tokenizer['attention_mask']
  15. return input_ids,token_type_ids,attention_mask

8.将数据加载为Tensor格式

  1. def load_data(path):
  2. csvFileObj = open(path)
  3. readerObj = csv.reader(csvFileObj)
  4. text_list = []
  5. labels = []
  6. for row in readerObj:
  7. #跳过表头
  8. if readerObj.line_num == 1:
  9. continue
  10. #label在什么位置就改成对应的index
  11. label = int(row[1])
  12. text = row[0]
  13. text_list.append(text)
  14. labels.append(label)
  15. #调用encoder函数,获得预训练模型的三种输入形式
  16. input_ids,token_type_ids,attention_mask = encoder(max_len=150,vocab_path="/root/Bert/bert-base-chinese/vocab.txt",text_list=text_list)
  17. labels = torch.tensor(labels)
  18. #将encoder的返回值以及label封装为Tensor的形式
  19. data = TensorDataset(input_ids,token_type_ids,attention_mask,labels)
  20. return data

9.实例化DataLoader

  1. #设定batch_size
  2. batch_size = 16
  3. #引入数据路径
  4. train_data_path="/root/Data/JD_Bert_Train.csv"
  5. dev_data_path="/root/Data/JD_Bert_Dev.csv"
  6. test_data_path="/root/Data/JD_Bert_Test.csv"
  7. #调用load_data函数,将数据加载为Tensor形式
  8. train_data = load_data(train_data_path)
  9. dev_data = load_data(dev_data_path)
  10. test_data = load_data(test_data_path)
  11. #将训练数据和测试数据进行DataLoader实例化
  12. train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
  13. dev_loader = DataLoader(dataset=dev_data, batch_size=batch_size, shuffle=True)
  14. test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

10.定义验证函数

  1. def dev(model,dev_loader):
  2. #将模型放到服务器上
  3. model.to(device)
  4. #设定模式为验证模式
  5. model.eval()
  6. #设定不会有梯度的改变仅作验证
  7. with torch.no_grad():
  8. correct = 0
  9. total = 0
  10. for step, (input_ids,token_type_ids,attention_mask,labels) in tqdm(enumerate(dev_loader),desc='Dev Itreation:'): input_ids,token_type_ids,attention_mask,labels=input_ids.to(device),token_type_ids.to(device),attention_mask.to(device),labels.to(device)
  11. out_put = model(input_ids,token_type_ids,attention_mask)
  12. _, predict = torch.max(out_put.data, 1)
  13. correct += (predict==labels).sum().item()
  14. total += labels.size(0)
  15. res = correct / total
  16. return res

11.定义训练函数 

  1. def train(model,train_loader,dev_loader) :
  2. #将model放到服务器上
  3. model.to(device)
  4. #设定模型的模式为训练模式
  5. model.train()
  6. #定义模型的损失函数
  7. criterion = nn.CrossEntropyLoss()
  8. param_optimizer = list(model.named_parameters())
  9. no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  10. #设置模型参数的权重衰减
  11. optimizer_grouped_parameters = [
  12. {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
  13. 'weight_decay': 0.01},
  14. {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  15. ]
  16. #学习率的设置
  17. optimizer_params = {'lr': 1e-5, 'eps': 1e-6, 'correct_bias': False}
  18. #使用AdamW 主流优化器
  19. optimizer = AdamW(optimizer_grouped_parameters, **optimizer_params)
  20. #学习率调整器,检测准确率的状态,然后衰减学习率
  21. scheduler = ReduceLROnPlateau(optimizer,mode='max',factor=0.5,min_lr=1e-7, patience=5,verbose= True, threshold=0.0001, eps=1e-08)
  22. t_total = len(train_loader)
  23. #设定训练轮次
  24. total_epochs = 2
  25. bestAcc = 0
  26. correct = 0
  27. total = 0
  28. print('Training and verification begin!')
  29. for epoch in range(total_epochs):
  30. for step, (input_ids,token_type_ids,attention_mask,labels) in enumerate(train_loader):
  31. #从实例化的DataLoader中取出数据,并通过 .to(device)将数据部署到服务器上 input_ids,token_type_ids,attention_mask,labels=input_ids.to(device),token_type_ids.to(device),attention_mask.to(device),labels.to(device)
  32. #梯度清零
  33. optimizer.zero_grad()
  34. #将数据输入到模型中获得输出
  35. out_put = model(input_ids,token_type_ids,attention_mask)
  36. #计算损失
  37. loss = criterion(out_put, labels)
  38. _, predict = torch.max(out_put.data, 1)
  39. correct += (predict == labels).sum().item()
  40. total += labels.size(0)
  41. loss.backward()
  42. optimizer.step()
  43. #每两步进行一次打印
  44. if (step + 1) % 2 == 0:
  45. train_acc = correct / total
  46. print("Train Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,loss.item()))
  47. #每五十次进行一次验证
  48. if (step + 1) % 50 == 0:
  49. train_acc = correct / total
  50. #调用验证函数dev对模型进行验证,并将有效果提升的模型进行保存
  51. acc = dev(model, dev_loader)
  52. if bestAcc < acc:
  53. bestAcc = acc
  54. #模型保存路径
  55. path = '/root/data/savedmodel/span_bert_hide_model1.pkl'
  56. torch.save(model, path)
  57. print("DEV Epoch[{}/{}],step[{}/{}],tra_acc{:.6f} %,bestAcc{:.6f}%,dev_acc{:.6f} %,loss:{:.6f}".format(epoch + 1, total_epochs, step + 1, len(train_loader),train_acc*100,bestAcc*100,acc*100,loss.item()))
  58. scheduler.step(bestAcc)

12.实例化模型并进行训练与验证

  1. device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
  2. #实例化模型
  3. model = BertClassificationModel()
  4. #调用训练函数进行训练与验证
  5. train(model,train_loader,dev_loader)

 

13.定义预测函数

  1. def predict(model,test_loader):
  2. model.to(device)
  3. model.eval()
  4. predicts = []
  5. predict_probs = []
  6. with torch.no_grad():
  7. correct = 0
  8. total = 0
  9. for step, (input_ids,token_type_ids,attention_mask,labels) in enumerate(test_loader):
  10. input_ids,token_type_ids,attention_mask,labels=input_ids.to(device),token_type_ids.to(device),attention_mask.to(device),labels.to(device)
  11. out_put = model(input_ids,token_type_ids,attention_mask)
  12. _, predict = torch.max(out_put.data, 1)
  13. pre_numpy = predict.cpu().numpy().tolist()
  14. predicts.extend(pre_numpy)
  15. probs = F.softmax(out_put).detach().cpu().numpy().tolist()
  16. predict_probs.extend(probs)
  17. correct += (predict==labels).sum().item()
  18. total += labels.size(0)
  19. res = correct / total
  20. print('predict_Accuracy : {} %'.format(100 * res))
  21. #返回预测结果和预测的概率
  22. return predicts,predict_probs

14.使用训练好的模型进行预测

  1. #引进训练好的模型进行测试
  2. path = '/root/data/savedmodel/span_bert_hide_model.pkl'
  3. Trained_model = torch.load(path)
  4. #predicts是预测的(0 1),predict_probs是概率值
  5. predicts,predict_probs = predict(Trained_model,dev_loader)

15.获得预测值与预测的概率

16.调用函数计算准确率等指标

  1. P = sklearn.metrics.precision_score(y_true, y_pred, average=’binary’,sample_weight=None)
  2. R = sklearn.metrics.recall_score(y_true, y_pred, average=’binary’,sample_weight=None)
  3. F1 = sklearn.metrics.f1_score(y_true, y_pred,average=’binary’,sample_weight=None)
参数名 | 含义 | 类型
y_true | 正确值 | 1维矩阵
y_pred | 预测值 | 1维矩阵
average | 计算类型 | 字符串,‘binary’(默认)、‘micro’、‘macro’、‘weighted’、‘samples’
sample_weight | 样本权重 | 1维矩阵(长度等于样本数)

average的选项详解: 

选项 | 含义
binary | 二分类(只报告正类的结果)
micro | 统计全局的TP、FN和FP来计算
macro | 分别计算每个标签的指标,再取未加权均值(不考虑类别不平衡)
weighted | 分别计算每个标签的指标,再按各标签的样本数加权取均值(考虑类别不平衡)
samples | 对每个样本分别计算指标,再取均值(仅适用于多标签分类)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/47183?site
推荐阅读
相关标签
  

闽ICP备14008679号