当前位置:   article > 正文

Python BiLSTM_CRF医学文本标注,医学命名实体识别,NER,双向长短记忆神经网络和条件随机场应用实例,BiLSTM_CRF实现代码_bilstm-crf代码

bilstm-crf代码

具体参考的哪一位大神的代码记不清了,在此表示感谢一下:

进入正题

  1. import torch
  2. import torch.autograd as autograd
  3. import torch.nn as nn
  4. import torch.optim as optim

1.设置随机种子 

# Seed PyTorch's random number generator so that parameter initialisation
# (and therefore the training run below) is reproducible.
torch.manual_seed(1)

2.torch.max(input, dim) 函数

output = torch.max(input, dim)

输入

  • input是softmax函数输出的一个tensor
  • dim是max函数索引的维度,取值为0或1:0是每列的最大值,1是每行的最大值

输出

  • 函数会返回两个tensor:第一个tensor是每行的最大值(dim=1时),由于softmax输出的每个元素都在0到1之间,这些最大值都不超过1(但并不一定恰好等于1);第二个tensor是每行最大值的索引。
  def argmax(vec):
      """Return the index of the largest entry of a single-row score tensor.

      Args:
          vec: 2-D tensor with exactly one row, i.e. shape (1, n).

      Returns:
          The column index of the maximum value as a Python int.
      """
      # torch.max along dim 1 returns (values, indices); only the index is used.
      _, idx = torch.max(vec, 1)
      # .item() only works on a one-element tensor, hence the single-row input.
      return idx.item()

3.将 idxs(位置下标)的列表转化为tensor格式

  def prepare_sequence(seq, to_ix):
      """Convert a sequence of tokens into a tensor of integer ids.

      Args:
          seq: iterable of tokens (here: characters or tag names).
          to_ix: mapping from token to integer id.

      Returns:
          A 1-D ``torch.long`` tensor holding the id of every token in order.

      Raises:
          KeyError: if a token of ``seq`` is missing from ``to_ix``.
      """
      idxs = [to_ix[w] for w in seq]
      return torch.tensor(idxs, dtype=torch.long)

4. LogSumExp(LSE)技巧,主要解决计算Softmax或CrossEntropy时出现的上溢(overflow)或下溢(underflow)问题。

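 LSE被定义为参数指数之和的对数:$\mathrm{LSE}(x_1, \dots, x_n) = \log\sum_{i=1}^{n} \exp(x_i)$。为避免溢出,实际计算时先减去最大值 $c = \max_i x_i$,即 $\mathrm{LSE}(x_1, \dots, x_n) = c + \log\sum_{i=1}^{n} \exp(x_i - c)$。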

输入可以看成是一个n维的向量,输出是一个标量。 

  def log_sum_exp(vec):
      """Compute log(sum(exp(vec))) for a single-row tensor in a stable way.

      Args:
          vec: 2-D tensor with exactly one row, i.e. shape (1, n).

      Returns:
          A 0-dim tensor holding the log-sum-exp of all entries of ``vec``.
      """
      # torch.logsumexp applies the max-shift trick internally, so large
      # positive or negative scores neither overflow nor underflow.
      # squeeze(0) turns the (1,) result into a scalar (0-dim) tensor.
      return torch.logsumexp(vec, dim=1).squeeze(0)

5.双向长短记忆神经网络和条件随机场 

  class BiLSTM_CRF(nn.Module):
      """Bidirectional LSTM encoder with a CRF output layer for sequence tagging.

      The BiLSTM produces one emission score per tag for every token; the CRF
      layer adds learned tag-to-tag transition scores. Training minimises the
      negative log-likelihood of the gold tag sequence, decoding uses Viterbi.

      The class relies on the module-level names ``START_TAG``, ``STOP_TAG``,
      ``argmax`` and ``log_sum_exp``; both special tags must be keys of
      ``tag_to_ix``. Sentences are processed one at a time (batch size 1).
      """

      def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
          """Build the embedding, BiLSTM, projection and transition parameters.

          Args:
              vocab_size: number of distinct input tokens.
              tag_to_ix: mapping from tag name to integer id (including the
                  start and stop tags).
              embedding_dim: size of the token embeddings.
              hidden_dim: total BiLSTM output size; each direction uses
                  ``hidden_dim // 2`` units, so the value must be even.

          Raises:
              ValueError: if ``hidden_dim`` is odd.
          """
          super().__init__()
          if hidden_dim % 2 != 0:
              # With an odd value the two directions only produce
              # hidden_dim - 1 features and the reshape in
              # _get_lstm_features would fail with an obscure error.
              raise ValueError("hidden_dim must be even, got %r" % (hidden_dim,))

          self.embedding_dim = embedding_dim
          self.hidden_dim = hidden_dim
          self.vocab_size = vocab_size
          self.tag_to_ix = tag_to_ix
          self.tagset_size = len(tag_to_ix)

          self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
          self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                              num_layers=1, bidirectional=True)

          # Maps the output of the LSTM into tag space.
          self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

          # Matrix of transition parameters. Entry i,j is the score of
          # transitioning *to* i *from* j.
          self.transitions = nn.Parameter(
              torch.randn(self.tagset_size, self.tagset_size))

          # These two statements enforce the constraint that we never transfer
          # to the start tag and we never transfer from the stop tag.
          with torch.no_grad():
              self.transitions[tag_to_ix[START_TAG], :] = -10000
              self.transitions[:, tag_to_ix[STOP_TAG]] = -10000

          self.hidden = self.init_hidden()

      def init_hidden(self):
          """Return a fresh random (h_0, c_0) pair for the BiLSTM.

          Each tensor has shape (2, 1, hidden_dim // 2): two directions, batch
          size one. The tensors are created on the device of the parameters.
          """
          device = self.transitions.device
          return (torch.randn(2, 1, self.hidden_dim // 2, device=device),
                  torch.randn(2, 1, self.hidden_dim // 2, device=device))

      def _forward_alg(self, feats):
          """Run the forward algorithm to compute the log partition function.

          Args:
              feats: emission scores, one row of ``tagset_size`` scores per
                  token.

          Returns:
              The log-sum-exp of the scores of all possible tag sequences.
          """
          init_alphas = torch.full((1, self.tagset_size), -10000.,
                                   device=feats.device)
          # START_TAG has all of the score.
          init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

          forward_var = init_alphas

          # Iterate through the sentence
          for feat in feats:
              alphas_t = []  # The forward tensors at this timestep
              for next_tag in range(self.tagset_size):
                  # broadcast the emission score: it is the same regardless of
                  # the previous tag
                  emit_score = feat[next_tag].view(
                      1, -1).expand(1, self.tagset_size)
                  # the ith entry of trans_score is the score of transitioning
                  # to next_tag from i
                  trans_score = self.transitions[next_tag].view(1, -1)
                  # The ith entry of next_tag_var is the value for the
                  # edge (i -> next_tag) before we do log-sum-exp
                  next_tag_var = forward_var + trans_score + emit_score
                  # The forward variable for this tag is log-sum-exp of all
                  # the scores.
                  alphas_t.append(log_sum_exp(next_tag_var).view(1))
              forward_var = torch.cat(alphas_t).view(1, -1)
          terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
          alpha = log_sum_exp(terminal_var)
          return alpha

      def _get_lstm_features(self, sentence):
          """Compute the emission scores of a sentence with the BiLSTM.

          Args:
              sentence: 1-D tensor of token ids.

          Returns:
              Tensor of shape (len(sentence), tagset_size).
          """
          # Start every sentence from a new random hidden state.
          self.hidden = self.init_hidden()
          # Reshape to (seq_len, batch=1, embedding_dim) as expected by nn.LSTM.
          embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
          lstm_out, self.hidden = self.lstm(embeds, self.hidden)
          lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
          lstm_feats = self.hidden2tag(lstm_out)
          return lstm_feats

      def _score_sentence(self, feats, tags):
          """Return the score of a provided tag sequence.

          Args:
              feats: emission scores as returned by ``_get_lstm_features``.
              tags: 1-D ``torch.long`` tensor with the gold tag id per token.

          Returns:
              Tensor of shape (1,) with the summed transition and emission
              scores of the path, including the final transition to STOP_TAG.
          """
          score = torch.zeros(1, device=feats.device)
          # Prepend the start tag so that tags[i] is the predecessor of
          # tags[i + 1] for every token i.
          start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long,
                               device=tags.device)
          tags = torch.cat([start, tags])
          for i, feat in enumerate(feats):
              score = score + \
                  self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
          score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
          return score

      def _viterbi_decode(self, feats):
          """Find the highest scoring tag sequence with the Viterbi algorithm.

          Args:
              feats: emission scores as returned by ``_get_lstm_features``.

          Returns:
              A tuple ``(path_score, best_path)`` where ``best_path`` is a list
              of tag ids, one per token.
          """
          backpointers = []

          # Initialize the viterbi variables in log space
          init_vvars = torch.full((1, self.tagset_size), -10000.,
                                  device=feats.device)
          init_vvars[0][self.tag_to_ix[START_TAG]] = 0

          # forward_var at step i holds the viterbi variables for step i-1
          forward_var = init_vvars
          for feat in feats:
              bptrs_t = []  # holds the backpointers for this step
              viterbivars_t = []  # holds the viterbi variables for this step

              for next_tag in range(self.tagset_size):
                  # next_tag_var[i] holds the viterbi variable for tag i at the
                  # previous step, plus the score of transitioning
                  # from tag i to next_tag.
                  # We don't include the emission scores here because the max
                  # does not depend on them (we add them in below)
                  next_tag_var = forward_var + self.transitions[next_tag]
                  best_tag_id = argmax(next_tag_var)
                  bptrs_t.append(best_tag_id)
                  viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
              # Now add in the emission scores, and assign forward_var to the
              # set of viterbi variables we just computed
              forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
              backpointers.append(bptrs_t)

          # Transition to STOP_TAG
          terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
          best_tag_id = argmax(terminal_var)
          path_score = terminal_var[0][best_tag_id]

          # Follow the back pointers to decode the best path.
          best_path = [best_tag_id]
          for bptrs_t in reversed(backpointers):
              best_tag_id = bptrs_t[best_tag_id]
              best_path.append(best_tag_id)
          # Pop off the start tag (we dont want to return that to the caller)
          start = best_path.pop()
          assert start == self.tag_to_ix[START_TAG]  # Sanity check
          best_path.reverse()
          return path_score, best_path

      def neg_log_likelihood(self, sentence, tags):
          """Return the training loss for one sentence.

          The loss is the log partition function minus the score of the gold
          tag sequence.

          Args:
              sentence: 1-D tensor of token ids.
              tags: 1-D ``torch.long`` tensor of gold tag ids.
          """
          feats = self._get_lstm_features(sentence)
          forward_score = self._forward_alg(feats)
          gold_score = self._score_sentence(feats, tags)
          return forward_score - gold_score

      def forward(self, sentence):  # dont confuse this with _forward_alg above.
          """Predict the best tag sequence for a sentence.

          Args:
              sentence: 1-D tensor of token ids.

          Returns:
              List of predicted tag ids; the path score is discarded.
          """
          # Get the emission scores from the BiLSTM
          lstm_feats = self._get_lstm_features(sentence)

          # Find the best path, given the features.
          score, tag_seq = self._viterbi_decode(lstm_feats)
          return tag_seq

 6.基础参数设置:标注的起始标签和结束标签,词向量的维度(5)和隐藏层(4)设置

  # Special tags marking the beginning and the end of every tag sequence.
  START_TAG = "<START>"
  STOP_TAG = "<STOP>"
  # Size of the token embeddings.
  EMBEDDING_DIM = 5
  # Total BiLSTM output size (split evenly between the two directions).
  HIDDEN_DIM = 4

7.序列标注的实例,示例就是两句医学文本,采用的BIO标注方法,B表示实体名词的起始位置,I表示实体名词的非起始位置的字符,O表示其它字符,这是最简单的标注样例。还有比较复杂的标注样例,如下所示: 

  # Make up some training data: each item is a (characters, BIO tags) pair
  # with one tag per character.
  training_data = [(
      "高 血 压 容 易 造 成 心 脏 病".split(),
      "B I I O O O O B I I".split()
  ), (
      "心 脏 病 易 造 成 心 肌 梗 死".split(),
      "B I I O O O B I I I".split()
  )]
  print(training_data)

8.数据清洗过程, 建立字典-id

  # Build the character -> id vocabulary; ids are assigned in order of first
  # appearance in the training data.
  word_to_ix = {}
  for sentence, tags in training_data:
      for word in sentence:
          # setdefault leaves the id of an already known character untouched.
          word_to_ix.setdefault(word, len(word_to_ix))
  print(word_to_ix)

 

 9.标签设置,将标签进行id化,导入模型,开始BiLSTM+CRF模型的训练

  # Tag vocabulary: the three BIO labels plus the two special CRF tags.
  tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}
  model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
  optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

  # Check predictions before training
  with torch.no_grad():
      precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
      precheck_tags = prepare_sequence(training_data[0][1], tag_to_ix)
      print('样本一的真实标签:'+ str(precheck_tags.tolist()))
      print('未训练模型的预测:'+ str(model(precheck_sent)))

  print('=============开始BiLSTM+CRF模型的训练=============')
  # Normally you would NOT need 200 epochs; this is toy data.
  for epoch in range(200):
      for sentence, tags in training_data:
          # Step 1. Remember that Pytorch accumulates gradients.
          # We need to clear them out before each instance
          model.zero_grad()

          # Step 2. Get our inputs ready for the network, that is,
          # turn them into Tensors of word indices.
          sentence_in = prepare_sequence(sentence, word_to_ix)
          targets = prepare_sequence(tags, tag_to_ix)

          # Step 3. Run our forward pass.
          loss = model.neg_log_likelihood(sentence_in, targets)

          # Step 4. Compute the loss, gradients, and update the parameters by
          # calling optimizer.step()
          loss.backward()
          optimizer.step()
      if epoch % 50 == 0:
          # Report the loss of the last sentence of this epoch as a plain
          # number; .item() avoids printing the tensor repr with its grad_fn.
          print(f'模型训练第{epoch}轮的Loss值为:{loss.item()}')

10.保存训练好的模型 

  # Save the trained model.
  # NOTE: torch.save(model, ...) pickles the whole module object, so loading
  # it later requires the BiLSTM_CRF class definition to be importable. The
  # file is a PyTorch pickle despite the ".h5" extension (it is not HDF5).
  output_path = 'ner_trained_model.h5'
  torch.save(model, output_path)
  print('=============训练结束,保存训练好的模型=============\n\n')

生成如下文件 

 

11.加载训练好的模型,进行文本标注,举了一个例子:

test_data = ['高', '血', '压', '容', '易', '造', '成', '心', '脏', '病']

  # Load the trained model and tag a test sentence.
  print('=============加载训练好的模型,进行测试=============')
  test_data = ['高', '血', '压', '容', '易', '造', '成', '心', '脏', '病']
  model_path = 'ner_trained_model.h5'
  # NOTE(security): torch.load unpickles the file, so only load checkpoints
  # from a trusted source. Recent PyTorch releases default to
  # weights_only=True, in which case loading a fully pickled module like this
  # one needs torch.load(model_path, weights_only=False).
  trained_ner_model = torch.load(model_path)
  with torch.no_grad():
      precheck_sent = prepare_sequence(test_data, word_to_ix)
      # Predict with the model that was just loaded from disk, not with the
      # in-memory `model` left over from training.
      result = trained_ner_model(precheck_sent)
  print('训练后模型的预测:' ,result)

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小小林熬夜学编程/article/detail/368015?site=
推荐阅读
相关标签
  

闽ICP备14008679号