import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1070
import pandas as pd

path = './中文文本情感分类/'
comments = pd.read_csv(path + '社交网站评论信息情感分类.csv')
# Mood labels: 0 = joy (喜悦), 1 = anger (愤怒), 2 = disgust (厌恶), 3 = low/depressed (低落).
moods = {0: '喜悦', 1: '愤怒', 2: '厌恶', 3: '低落'}

print('文本数量(总体):%d' % comments.shape[0])
for label, mood in moods.items():
    print('文本数量({}):{}'.format(mood, comments[comments.label==label].shape[0]))
文本数量(总体):361744
文本数量(喜悦):199496
文本数量(愤怒):51714
文本数量(厌恶):55267
文本数量(低落):55267
comments[0:5]
| | label | review |
|---|---|---|
| 0 | 0 | 啊呀呀!要死啦!么么么!只穿外套就好了,我认为里面那件很多余啊周小伦喜歡 你各種 五角星的... |
| 1 | 0 | 嗯……既然大姚通知了……那我也表示下收到……姚,你知道吗?假如外星人入侵地球,只要摧毁我们的... |
| 2 | 0 | 风格不一样嘛,都喜欢!最喜欢哪张? |
| 3 | 0 | 好呀,试试D .I .Y .去死皮面膜1.将燕麦片加水中浸泡6小时,加入木瓜牛奶搅拌。2.放... |
| 4 | 0 | 张老师,谢谢侬的信任!粉丝多少无所谓重在质地近日发现一个现象——他加了你关注,你回加后,他立... |
# 1,000 randomly sampled rows with label 0
df0 = comments.loc[comments.label == 0].sample(1000)[['review', 'label']]
# 1,000 rows with label 1
df1 = comments.loc[comments.label == 1].sample(1000)[['review', 'label']]
# 1,000 rows with label 2
df2 = comments.loc[comments.label == 2].sample(1000)[['review', 'label']]
# 1,000 rows with label 3
df3 = comments.loc[comments.label == 3].sample(1000)[['review', 'label']]
# Concatenate the four samples. Note that df3 is appended twice, so label 3 is
# over-represented (2,000 rows) and the combined set has 5,000 rows in total.
df0 = df0.append(df1)
df0 = df0.append(df2)
df0 = df0.append(df3)
df0 = df0.append(df3)
# Shuffle the combined rows.
df0 = df0.sample(frac=1)
len(df0)
5000
df0[0:5]
| | review | label |
|---|---|---|
| 355831 | 据最新消息,此次火灾造成9人死亡9人受伤。经初步勘察,火灾系售楼处一楼沙盘电路故障引发。 | 3 |
| 348348 | 上星期买的裙子,今晚穿了,不好看~ ~ | 3 |
| 357542 | 顺便说点什么吧...荷兰,阿姆斯特丹火车站,广播里突然传来耳熟能详的《音乐之声》里的123,... | 3 |
| 361326 | 午后,打扮的美美的和喜欢的人坐在咖啡厅,聊聊自己想说的话,或静静坐在一起,什么都不说,只是紧... | 3 |
| 278616 | 真的假的?!!! | 2 |
# Sentences and labels.
sentences = df0.review.values
labels = df0.label.values
from transformers import BertTokenizer
print('下载 BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
下载 BERT tokenizer...
print(' 原句: ', sentences[0])
print('Tokenize 后的句子: ', tokenizer.tokenize(sentences[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))
原句: 据最新消息,此次火灾造成9人死亡9人受伤。经初步勘察,火灾系售楼处一楼沙盘电路故障引发。
Tokenize 后的句子: ['据', '最', '新', '消', '息', ',', '此', '次', '火', '灾', '造', '成', '9', '人', '死', '亡', '9', '人', '受', '伤', '。', '经', '初', '步', '勘', '察', ',', '火', '灾', '系', '售', '楼', '处', '一', '楼', '沙', '盘', '电', '路', '故', '障', '引', '发', '。']
Token IDs: [2945, 3297, 3173, 3867, 2622, 8024, 3634, 3613, 4125, 4135, 6863, 2768, 130, 782, 3647, 767, 130, 782, 1358, 839, 511, 5307, 1159, 3635, 1242, 2175, 8024, 4125, 4135, 5143, 1545, 3517, 1905, 671, 3517, 3763, 4669, 4510, 6662, 3125, 7397, 2471, 1355, 511]
max_len = 0
lengthOfsentence = []
# Loop over every sentence...
for sent in sentences:
    # Note: len(sent) is the raw character length, not the BERT token length.
    lengthOfsentence.append(len(sent))
    # Track the maximum sentence length.
    max_len = max(max_len, len(sent))
print('最长的句子长度为: ', max_len)
最长的句子长度为: 284
type(sentences)
numpy.ndarray
sentences.shape
(5000,)
import matplotlib.pyplot as plt

# Plot the character length of each sentence.
plt.plot(lengthOfsentence)
plt.ylabel('sentence length (characters)')
plt.show()
<Figure size 640x480 with 1 Axes>
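Since the lengths above are character counts rather than BERT token counts, it can be worth double-checking the tokenized lengths before settling on max_length = 256 in the next step. A minimal sketch, assuming the tokenizer loaded above; the name token_lens is purely illustrative:

# Rough check of tokenized lengths; encode() adds [CLS] and [SEP],
# so this is the sequence length BERT actually sees.
token_lens = [len(tokenizer.encode(sent, add_special_tokens=True)) for sent in sentences]
print('Longest tokenized sentence:', max(token_lens))
print('Sentences longer than 256 tokens:', sum(l > 256 for l in token_lens))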
input_ids = []
attention_masks = []

for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`.
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                           # Sentence to encode.
                        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
                        max_length = 256,               # Pad & truncate all sentences.
                        pad_to_max_length = True,       # (newer transformers versions use padding='max_length', truncation=True)
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',          # Return pytorch tensors.
                   )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
print('原句: ', sentences[0])
print('Token IDs:', input_ids[0])
print('attention_masks:', attention_masks[0])
原句: 国家哀悼日,悲伤无边,情义永在这里有毁灭,有伤痛,有绝望,但还有希望。 Token IDs: tensor([ 101, 1744, 2157, 1500, 2656, 3189, 8024, 2650, 839, 3187, 6804, 8024, 2658, 721, 3719, 1762, 6821, 7027, 3300, 3673, 4127, 8024, 3300, 839, 4578, 8024, 3300, 5318, 3307, 8024, 852, 6820, 3300, 2361, 3307, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) attention_masks: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
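As a sanity check, the padded IDs can be mapped back to tokens to verify where [CLS], [SEP] and the [PAD] tokens end up. A short sketch using only the tokenizer API shown above:

# Map the first encoded sentence back to tokens.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
print(tokens[:40])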
from torch.utils.data import TensorDataset, random_split

# Wrap the inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Compute the sizes of the training and validation splits.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# 90% of the dataset becomes train_dataset, 10% becomes val_dataset.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} 训练数据'.format(train_size))
print('{:>5,} 验证数据'.format(val_size))
4,500 训练数据
500 验证数据
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# A batch size of 16 or 32 is recommended for fine-tuning BERT.
batch_size = 16

# Create DataLoaders for the training and validation sets.
train_dataloader = DataLoader(
            train_dataset,                           # training samples
            sampler = RandomSampler(train_dataset),  # shuffle the order
            batch_size = batch_size
        )
validation_dataloader = DataLoader(
            val_dataset,                             # validation samples
            sampler = RandomSampler(val_dataset),    # shuffled here; SequentialSampler is the more common choice for validation
            batch_size = batch_size
        )
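Before training, it can help to confirm what one batch looks like. A quick illustrative sketch (not part of the original pipeline): the ids and masks should be (batch_size, 256) and the labels (batch_size,):

# Peek at a single batch to confirm tensor shapes.
batch = next(iter(train_dataloader))
print(batch[0].shape, batch[1].shape, batch[2].shape)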
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",           # the 12-layer Chinese BERT model
    num_labels = 4,                # 4 output labels for this multi-class task
    output_attentions = False,     # do not return attention weights
    output_hidden_states = False,  # do not return all hidden states
)
model.cuda()
BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        ... (layers 1 through 11 have the same structure as layer 0) ...
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=4, bias=True)
)
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (21128, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (768,)
bert.encoder.layer.0.attention.output.LayerNorm.weight        (768,)
bert.encoder.layer.0.attention.output.LayerNorm.bias          (768,)
bert.encoder.layer.0.intermediate.dense.weight           (3072, 768)
bert.encoder.layer.0.intermediate.dense.bias                 (3072,)
bert.encoder.layer.0.output.dense.weight                 (768, 3072)
bert.encoder.layer.0.output.dense.bias                        (768,)
bert.encoder.layer.0.output.LayerNorm.weight                  (768,)
bert.encoder.layer.0.output.LayerNorm.bias                    (768,)

==== Output Layer ====

bert.pooler.dense.weight                                  (768, 768)
bert.pooler.dense.bias                                        (768,)
classifier.weight                                           (4, 768)
classifier.bias                                                 (4,)
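Beyond listing the named parameters, the total number of trainable weights can be summed directly. A small sketch (total_params is an illustrative name); for bert-base-chinese plus the 4-way classification head this comes to roughly 102M parameters:

# Total number of trainable parameters.
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable parameters: {:,}'.format(total_params))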
# AdamW is a class from the huggingface library; the 'W' stands for 'Weight Decay fix'.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,   # args.learning_rate - default is 5e-5
                  eps = 1e-8   # args.adam_epsilon  - default is 1e-8; keeps the update denominator away from zero
                  )
from transformers import get_linear_schedule_with_warmup

# The BERT authors recommend fine-tuning for 2 to 4 epochs.
epochs = 4

# Total number of training steps: [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # default value in run_glue.py
                                            num_training_steps = total_steps)
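With num_warmup_steps = 0 this scheduler simply decays the learning rate linearly from 2e-5 down to 0 over total_steps. A hedged sketch of the multiplier it applies (an illustration of the schedule, not code from the original tutorial):

# With zero warmup, the base lr is scaled by (total_steps - step) / total_steps.
for step in [0, total_steps // 2, total_steps - 1]:
    lr = 2e-5 * (total_steps - step) / total_steps
    print('step {:>4}: lr = {:.2e}'.format(step, lr))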
import numpy as np

def flat_accuracy(preds, labels):
    # Take the argmax over the class dimension and compare with the gold labels.
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
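flat_accuracy takes the raw logits of a batch, picks the highest-scoring class for each row, and reports the fraction that matches the gold labels. A tiny worked example (demo values only):

# Two sentences, four classes: the first is predicted as class 3, the second as class 0.
demo_logits = np.array([[0.1, 0.2, 0.1, 0.9],
                        [1.5, 0.3, 0.2, 0.1]])
demo_labels = np.array([3, 1])
print(flat_accuracy(demo_logits, demo_labels))  # 0.5 -- only the first prediction is correct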
import time
import datetime

def format_time(elapsed):
    # Round to the nearest whole second.
    elapsed_rounded = int(round(elapsed))
    # Return the elapsed time formatted as hh:mm:ss.
    return str(datetime.timedelta(seconds=elapsed_rounded))
import os
import random
import numpy as np
from transformers import WEIGHTS_NAME, CONFIG_NAME

output_dir = "./models/"
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
# Make sure the output directory exists before saving checkpoints.
os.makedirs(output_dir, exist_ok=True)

# Code adapted from https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the random seeds.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Record training loss, validation loss, validation accuracy and timings.
training_stats = []

# Measure the total training time.
total_t0 = time.time()
best_val_accuracy = 0

for epoch_i in range(0, epochs):
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))

    # ========================================
    #               Training
    # ========================================
    # Measure how long this epoch takes.
    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Report progress every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear any previously accumulated gradients.
        model.zero_grad()

        # Forward pass.
        # See https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        loss, logits = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        total_train_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()
        # Clip the gradient norm to 1.0 to prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update the model parameters.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
        # Accumulate the training accuracy.
        total_train_accuracy += flat_accuracy(logit, label_id)

    # Average training loss over all batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Training time for this epoch.
    training_time = format_time(time.time() - t0)
    # Average training accuracy.
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    print("  训练准确率: {0:.2f}".format(avg_train_accuracy))
    print("  平均训练损失 loss: {0:.2f}".format(avg_train_loss))
    print("  训练时间: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    t0 = time.time()
    # Put the model in evaluation mode -- dropout layers behave differently during evaluation.
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        # `batch` contains three tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed during evaluation.
        with torch.no_grad():
            # See https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            (loss, logits) = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()
        logit = logits.detach().cpu().numpy()
        label_id = b_labels.to('cpu').numpy()
        # Accumulate the validation accuracy.
        total_eval_accuracy += flat_accuracy(logit, label_id)

    # Average validation accuracy.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("")
    print("  测试准确率: {0:.2f}".format(avg_val_accuracy))

    # Save the model whenever the validation accuracy improves.
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        torch.save(model.state_dict(), output_model_file)
        model.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(output_dir)

    # Average validation loss over all batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Validation time.
    validation_time = format_time(time.time() - t0)
    print("  平均测试损失 Loss: {0:.2f}".format(avg_val_loss))
    print("  测试时间: {:}".format(validation_time))

    # Record the statistics for this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("训练一共用了 {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
Epoch 1 / 4
  Batch    40  of    282.    Elapsed: 0:00:30.
  Batch    80  of    282.    Elapsed: 0:01:00.
  Batch   120  of    282.    Elapsed: 0:01:30.
  Batch   160  of    282.    Elapsed: 0:02:00.
  Batch   200  of    282.    Elapsed: 0:02:30.
  Batch   240  of    282.    Elapsed: 0:03:00.
  Batch   280  of    282.    Elapsed: 0:03:30.
  训练准确率: 0.43
  平均训练损失 loss: 1.27
  训练时间: 0:03:31

  测试准确率: 0.49
  平均测试损失 Loss: 1.20
  测试时间: 0:00:08
Epoch 2 / 4
  Batch    40  of    282.    Elapsed: 0:00:30.
  Batch    80  of    282.    Elapsed: 0:01:00.
  Batch   120  of    282.    Elapsed: 0:01:30.
  Batch   160  of    282.    Elapsed: 0:02:01.
  Batch   200  of    282.    Elapsed: 0:02:31.
  Batch   240  of    282.    Elapsed: 0:03:01.
  Batch   280  of    282.    Elapsed: 0:03:31.
  训练准确率: 0.54
  平均训练损失 loss: 1.09
  训练时间: 0:03:32

  测试准确率: 0.50
  平均测试损失 Loss: 1.20
  测试时间: 0:00:08
Epoch 3 / 4
  Batch    40  of    282.    Elapsed: 0:00:30.
  Batch    80  of    282.    Elapsed: 0:01:01.
  Batch   120  of    282.    Elapsed: 0:01:31.
  Batch   160  of    282.    Elapsed: 0:02:01.
  Batch   200  of    282.    Elapsed: 0:02:31.
  Batch   240  of    282.    Elapsed: 0:03:01.
  Batch   280  of    282.    Elapsed: 0:03:31.
  训练准确率: 0.68
  平均训练损失 loss: 0.81
  训练时间: 0:03:32

  测试准确率: 0.53
  平均测试损失 Loss: 1.18
  测试时间: 0:00:08
Epoch 4 / 4
  Batch    40  of    282.    Elapsed: 0:00:30.
  Batch    80  of    282.    Elapsed: 0:01:00.
  Batch   120  of    282.    Elapsed: 0:01:30.
  Batch   160  of    282.    Elapsed: 0:02:01.
  Batch   200  of    282.    Elapsed: 0:02:31.
  Batch   240  of    282.    Elapsed: 0:03:01.
  Batch   280  of    282.    Elapsed: 0:03:31.
  训练准确率: 0.77
  平均训练损失 loss: 0.61
  训练时间: 0:03:32

  测试准确率: 0.56
  平均测试损失 Loss: 1.13
  测试时间: 0:00:08
训练一共用了 0:14:40 (h:mm:ss)
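training_stats now holds one dictionary per epoch, so it can be viewed as a table for a quick overview. A convenience sketch (not part of the original code), assuming pandas as imported earlier:

# Summarize the per-epoch statistics collected above.
df_stats = pd.DataFrame(training_stats).set_index('epoch')
print(df_stats)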
# Run the fine-tuned model on the last 20 encoded sentences.
# Note: these are drawn from the full 5,000-sentence set, so some may have been seen during training.
(_, logits) = model(input_ids[-20:].to(device),
                    token_type_ids=None,
                    attention_mask=attention_masks[-20:].to(device),
                    labels=labels[-20:].to(device))
logits = logits.detach().cpu().numpy()
label_ids = labels[-20:].to('cpu').numpy()
acc = flat_accuracy(logits, label_ids)
acc
0.95
pred_flat = np.argmax(logits, axis=1).flatten()
pred_flat
array([3, 1, 3, 3, 1, 2, 3, 3, 3, 3, 3, 3, 1, 3, 2, 3, 0, 2, 0, 0],
dtype=int64)
label_ids
array([3, 1, 3, 3, 1, 2, 3, 3, 3, 3, 3, 2, 1, 3, 2, 3, 0, 2, 0, 0],
dtype=int64)
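Since the best-performing weights, config and vocabulary were written to output_dir during training, they can be reloaded later for inference on new text. A minimal sketch, assuming the files saved above (WEIGHTS_NAME / CONFIG_NAME under ./models/) exist and that the saved config retains num_labels = 4; the predict_mood helper is illustrative and not part of the original code:

from transformers import BertConfig, BertForSequenceClassification

# Rebuild the model from the saved config and weights (paths defined during training).
config = BertConfig.from_json_file(output_config_file)
loaded_model = BertForSequenceClassification(config)
loaded_model.load_state_dict(torch.load(output_model_file))
loaded_model.to(device)
loaded_model.eval()

def predict_mood(text):
    # Encode a single sentence and map the predicted label back to its mood name.
    encoded = tokenizer.encode_plus(text, add_special_tokens=True, max_length=256,
                                    pad_to_max_length=True, return_attention_mask=True,
                                    return_tensors='pt')
    with torch.no_grad():
        outputs = loaded_model(encoded['input_ids'].to(device),
                               attention_mask=encoded['attention_mask'].to(device))
    pred = int(np.argmax(outputs[0].detach().cpu().numpy(), axis=1)[0])
    return moods[pred]

print(predict_mood('今天心情特别好!'))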