赞
踩
微调其实很简单,应用时的重点在于理解:BERT 的输入需要先用 tokenizer 格式化成标准输入(即把句子中的字符按词典编号,加上各种特殊 token 标志,并进行补齐和截断);BERT 的输出是形状为 <batch_size, 句子长度, 768> 的 tensor,在其后接上你需要的网络即可。需要特别注意的是,数据和网络必须放在同一个设备上(CPU 或 GPU)。
共两个文件:
注:data、label 均读取为 list;可以使用 sklearn 的 train_test_split 将数据集划分为训练集和测试集
def read_data(filepath):
    """Read a tab-separated sentence-pair file into three parallel lists.

    Every non-blank line is expected to look like
    ``text_1<TAB>text_2<TAB>label``. Blank lines (for example a trailing
    empty line at the end of the file) are skipped.

    Args:
        filepath: Path of the UTF-8 encoded text file to read.

    Returns:
        A tuple ``(texts_1, texts_2, labels)`` where ``texts_1`` and
        ``texts_2`` are lists of strings and ``labels`` is a list of floats,
        all in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
        ValueError: If the third field cannot be converted to ``float``.
    """
    texts_1 = []
    texts_2 = []
    labels = []
    # Explicit encoding: the data is Chinese text, so relying on the platform
    # default encoding would break on systems whose default is not UTF-8.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            items = line.split('\t')
            texts_1.append(items[0])
            texts_2.append(items[1])
            labels.append(float(items[2]))
    return texts_1, texts_2, labels
# Read both splits first; each call yields (first sentences, second sentences, labels)
train_split = read_data('./data/train')
test_split = read_data('./data/test')
train_texts_1, train_texts_2, train_labels = train_split
test_texts_1, test_texts_2, test_labels = test_split

# Report how many sentence pairs each split contains
for split_title, first_sentences in (("训练集:", train_texts_1), ("测试集:", test_texts_1)):
    print(split_title, len(first_sentences))
注:tokenizer是可选的bert数据预处理的格式化工具,用于文本处理成bert的输入格式
from transformers import BertTokenizer
# Tokenizer matching the pretrained Chinese BERT checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Both splits are encoded as sentence pairs with the same options:
# truncation and padding are both switched on.
encode_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(train_texts_1, train_texts_2, **encode_options)
test_encodings = tokenizer(test_texts_1, test_texts_2, **encode_options)
注:迭代器是一个可以使用for循环访问的python对象
import torch


class Dataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap the encoded train / test splits
train_dataset = Dataset(train_encodings, train_labels)
test_dataset = Dataset(test_encodings, test_labels)
注:dataloader二次封装便于按照batchsize来给模型提供数据
from torch.utils.data import DataLoader
# Build the training and test DataLoaders.
# Training: shuffled mini-batches of 32 examples.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
# Test: one example per batch, also shuffled.
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=True)
注:BertModel.from_pretrained(model_name) 用于载入预训练的 BERT 模型;BertModel 以及对应的 model_name 可以替换成你想使用的模型和参数版本,具体可参见 Hugging Face 官网的详细介绍
from transformers import BertModel, AdamW


class myFinrtuneModel(torch.nn.Module):
    """Pretrained BERT encoder followed by a one-logit linear head.

    The token embeddings of the last hidden layer are averaged into one
    sentence embedding, which is fed to a linear layer producing a single
    raw logit per example (no sigmoid is applied here).

    Args:
        model_name: Name or path passed to ``BertModel.from_pretrained``.
        freeze_bert: When true, the encoder parameters are excluded from
            gradient updates so only the linear head is trained.
    """

    def __init__(self, model_name='bert-base-chinese', freeze_bert=False):
        super(myFinrtuneModel, self).__init__()
        # Pretrained BERT encoder
        self.bert = BertModel.from_pretrained(model_name)
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False
        # Network placed after BERT. The input width follows the encoder's
        # configuration (768 for bert-base) so other checkpoints also work.
        self.class_net = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_masks, token_type_ids=None):
        """Score a batch of encoded inputs.

        Args:
            input_ids: Token id tensor produced by the tokenizer.
            attention_masks: Mask tensor with 1 for real tokens and 0 for
                padding; may be ``None``, in which case every position is
                averaged.
            token_type_ids: Optional segment ids for sentence pairs; when
                omitted the encoder falls back to its own default.

        Returns:
            A tensor with one raw logit per example.
        """
        # Run the encoder
        outputs = self.bert(
            input_ids,
            attention_mask=attention_masks,
            token_type_ids=token_type_ids,
        )
        # Hidden features of the last layer, one vector per token
        last_hidden_state = outputs.last_hidden_state
        if attention_masks is None:
            sentences_embeddings = torch.mean(last_hidden_state, dim=1)
        else:
            # Average only over real tokens. Including padded positions would
            # make the embedding depend on how much padding the batch has, so
            # padded training batches and unpadded inference would disagree.
            mask = attention_masks.unsqueeze(-1).to(last_hidden_state.dtype)
            token_sum = (last_hidden_state * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1.0)
            sentences_embeddings = token_sum / token_count
        # Feed the sentence embedding to the head and drop the trailing size-1 axis
        out = self.class_net(sentences_embeddings).squeeze(-1)
        return out
# Maximum number of epochs
max_epoch = 3

# Use CUDA when it is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the custom model, move its parameters to the device,
# and switch it to training mode
model = myFinrtuneModel(model_name='bert-base-chinese')
model.to(device)
model.train()

# Optimizer over all model parameters
# NOTE(review): AdamW here is the one imported from transformers; recent
# transformers releases reportedly deprecate it in favour of
# torch.optim.AdamW — confirm against the installed version.
optim = AdamW(model.parameters(), lr=5e-5)

# Loss function
loss_function = torch.nn.BCEWithLogitsLoss()
#保存函数
import os
from pathlib import Path
def save(model, optimizer, PATH):
    """Write the model and optimizer state dicts to ``PATH/checkpoint``.

    The directory ``PATH`` (including any missing parent directories) is
    created when it does not exist yet; an existing directory is reused.

    Args:
        model: Object exposing ``state_dict()`` whose parameters are saved
            under the key ``'model_state_dict'``.
        optimizer: Object exposing ``state_dict()`` whose state is saved
            under the key ``'optimizer_state_dict'``.
        PATH: Directory that will contain the ``checkpoint`` file.
    """
    # Create the directory through pathlib rather than shelling out to
    # `mkdir`: this works for nested paths and paths containing spaces, is
    # portable, and never passes PATH through a shell.
    Path(PATH).mkdir(parents=True, exist_ok=True)
    torch.save(
        {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        os.path.join(PATH, 'checkpoint'),
    )
    print("保存模型参数")
# Training function
def train(model, train_loader, test_loader, optim, loss_function, max_epoch):
    """Run the fine-tuning loop and evaluate after every epoch.

    For each batch the gradients are cleared, the batch tensors are moved to
    the module-level ``device``, the loss is computed from the model output
    and the labels, and one optimizer step is taken. The loss is printed
    every 100 steps (steps are counted across epochs). After each epoch the
    module-level ``eval`` function is called on ``test_loader``.
    """
    print('-------------- start training ---------------', '\n')
    global_step = 0
    for epoch in range(max_epoch):
        print("========= epoch:", epoch, '==============')
        for minibatch in train_loader:
            global_step += 1
            # Reset gradients accumulated by the previous step
            optim.zero_grad()
            tensors = {
                field: minibatch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'labels')
            }
            # Forward pass and loss
            logits = model(
                input_ids=tensors['input_ids'],
                attention_masks=tensors['attention_mask'],
            )
            loss = loss_function(logits, tensors['labels'])
            if not global_step % 100:
                print('step ', global_step, "loss:", f"{loss.item():.3f}")
            # Backward pass and parameter update
            loss.backward()
            optim.step()
        # Evaluate once per epoch
        eval(model=model, test_loader=test_loader)
# Evaluation function
def eval(model, test_loader):
    """Compute and print binary-classification accuracy over ``test_loader``.

    The model is put into evaluation mode and run without gradient tracking;
    its previous training/evaluation mode is restored before returning. Each
    batch must be a dict with ``'input_ids'``, ``'attention_mask'`` and
    ``'labels'`` tensors; batches of any size are supported. A prediction is
    1 when the sigmoid of the model output is greater than 0.5, else 0.

    NOTE: this function keeps its original name, which shadows the built-in
    ``eval`` inside this module.

    Args:
        model: Module called as ``model(input_ids=..., attention_masks=...)``.
        test_loader: Iterable of batches.

    Returns:
        The accuracy as a float in ``[0, 1]`` (``0.0`` for an empty loader).
    """
    # Run on whatever device the model lives on instead of a hidden global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    # Disable dropout so the reported accuracy is deterministic.
    model.eval()
    right = 0
    total = 0
    try:
        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(run_device)
                attention_mask = batch['attention_mask'].to(run_device)
                labels = batch['labels'].to(run_device).reshape(-1)
                out = torch.sigmoid(model(input_ids=input_ids, attention_masks=attention_mask))
                # Binary decision at the 0.5 threshold
                pred_labels = (out.reshape(-1) > 0.5).to(labels.dtype)
                right += int((pred_labels == labels).sum().item())
                total += labels.numel()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)
    # Guard against an empty loader instead of dividing by zero.
    accuracy = right / total if total else 0.0
    accurcy = format(accuracy, '.3f')
    print("= accurcy:", accurcy)
    print("\n")
    return accuracy
# Train the model
train(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    optim=optim,
    loss_function=loss_function,
    max_epoch=max_epoch,
)

# Save the model
checkpoint_dir = 'save_BertModel_for_text_similarity'
save(model, optim, checkpoint_dir)
注:从finetune_bert_model.py中导入myFinrtuneModel
from transformers import BertTokenizer
from finetune_bert_model import myFinrtuneModel
import torch

# NOTE(review): importing finetune_bert_model presumably executes that
# file's module-level training code unless it is guarded by
# `if __name__ == "__main__":` — confirm before running this script.

# Tokenizer that formats raw text into BERT inputs
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Load the parameters saved after fine-tuning. map_location lets a
# checkpoint written on a GPU be loaded on a CPU-only machine; the model
# below is never moved off the CPU.
checkpoint = torch.load('save_BertModel_for_text_similarity/checkpoint', map_location='cpu')
model = myFinrtuneModel()
model.load_state_dict(checkpoint['model_state_dict'])

# Switch to evaluation mode
model.eval()

# Encode the sentence pair in BERT input format
inputs = tokenizer("吃饭了么", "今天你吃饭了吗", return_tensors="pt")
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Run the model; no gradients are needed for inference
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_masks=attention_mask)

# Convert the logit to a score in (0, 1)
outputs = torch.sigmoid(outputs).item()

# Decide whether the two sentences are similar: 1 when the score exceeds
# 0.5, else 0. The original expression was inverted (it printed 0 for
# scores above 0.5).
out = 1 if outputs > 0.5 else 0
print(out)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。