当前位置:   article > 正文

[文本语义相似] 基于bert的余弦距离(bert4keras实现)_set_gelu('tanh')

set_gelu('tanh')

文本相似度在问答系统中有很重要的应用,如基于知识的问答系统(Knowledge-based QA)、基于文档的问答系统(Document-based QA),以及基于FAQ的问答系统(Community QA)等。在这些系统中,需要把用户的问题与候选内容进行相似度匹配,从而选择出与问题最接近、同时最合理的答案。本节介绍基于bert句向量的余弦距离来计算文本相似度。

学习bert可以看这里:https://blog.csdn.net/u014365862/article/details/104412737

 

训练/预测(第一段代码用bert4keras微调ALBERT句对分类模型;第二段代码用ALBERT计算归一化句向量,并通过向量点积得到余弦相似度):

  1. # 绘图案例 an example of matplotlib
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. from scipy.special import jn
  5. from IPython.display import display, clear_output
  6. import time
  7. from sklearn.model_selection import KFold, train_test_split, GridSearchCV
  8. '''
  9. x = np.linspace(0,5)
  10. f, ax = plt.subplots()
  11. ax.set_title("Bessel functions")
  12. for n in range(1,10):
  13. time.sleep(1)
  14. ax.plot(x, jn(x,n))
  15. clear_output(wait=True)
  16. display(f)
  17. # close the figure at the end, so we don't get a duplicate
  18. # of the last plot
  19. plt.close()
  20. '''
  21. from keras.layers import *
  22. from bert4keras.backend import keras, set_gelu
  23. from bert4keras.bert import build_bert_model
  24. from bert4keras.optimizers import Adam
  25. from bert4keras.snippets import sequence_padding, DataGenerator
  26. from bert4keras.tokenizer import Tokenizer
  27. import pandas as pd
  28. import numpy as np
  # Switch the GELU implementation used by bert4keras to its 'tanh' version.
  set_gelu('tanh')

  # Training hyper-parameters.
  maxlen, batch_size = 32, 16
  num_classes, epochs = 2, 20
  learning_rate = 2e-5

  # Pretrained model files.
  # NOTE(review): the original comment here read "sim roeberta_zh", but every
  # path points into an albert_tiny directory - confirm which model is meant.
  _pretrained_dir = 'albert_tiny_google_zh_489k'
  config_path = _pretrained_dir + '/albert_config.json'
  checkpoint_path = _pretrained_dir + '/albert_model.ckpt'
  dict_path = _pretrained_dir + '/vocab.txt'
  def load_data(filename):
      """Read a CSV file of labelled sentence pairs.

      Rows containing any missing value are dropped. The first two columns are
      returned unchanged and the third is converted with ``int``.

      Args:
          filename: path (or anything ``pandas.read_csv`` accepts) of the CSV.

      Returns:
          A list of ``(column0, column1, int(column2))`` tuples, one per
          remaining row, in file order.
      """
      frame = pd.read_csv(filename).dropna(axis=0, how='any')
      return [(row[0], row[1], int(row[2])) for row in frame.values.tolist()]
  # Load the labelled sentence pairs used for both training and validation.
  train_val_data = load_data('train_data.csv')
  # test_data = load_data('dev.csv')

  # Show one sample and the total size as a sanity check.
  print('train>>>>', train_val_data[0])
  print('训练验证集数量:', len(train_val_data))

  # Shuffle the row indices before splitting. np.random.shuffle works in place,
  # so it must receive the very list that is iterated below; shuffling a
  # temporary ``list(range(...))`` copy would leave the order untouched.
  random_order = list(range(len(train_val_data)))
  np.random.shuffle(random_order)

  # Every fifth shuffled position (i % 5 == 1) goes to validation, the rest to
  # training.
  train_data = [train_val_data[j] for i, j in enumerate(random_order) if i % 5 != 1]
  valid_data = [train_val_data[j] for i, j in enumerate(random_order) if i % 5 == 1]
  # NOTE(review): the test set is the validation set itself, so the reported
  # test accuracy is not measured on held-out data.
  test_data = valid_data
  print('训练集数量:', len(train_data))
  print('验证集数量:', len(valid_data))
  print('测试集数量:', len(test_data))

  # Build the tokenizer from the vocabulary file.
  tokenizer = Tokenizer(dict_path, do_lower_case=True)
  class data_generator(DataGenerator):
      """Batch generator for labelled sentence pairs.

      Each item of ``self.data`` is unpacked as ``(text1, text2, label)``. Every
      yielded batch is ``([token_ids, segment_ids], labels)``, with all three
      arrays produced by ``sequence_padding``.
      """

      def __iter__(self, random=False):
          order = list(range(len(self.data)))
          if random:
              np.random.shuffle(order)
          # The final index always flushes the batch, even a partial one.
          last_index = order[-1] if order else None

          token_rows, segment_rows, label_rows = [], [], []
          for index in order:
              first_text, second_text, label = self.data[index]
              token_ids, segment_ids = tokenizer.encode(
                  first_text, second_text, max_length=maxlen)
              token_rows.append(token_ids)
              segment_rows.append(segment_ids)
              label_rows.append([label])

              batch_is_full = len(token_rows) == self.batch_size
              if not (batch_is_full or index == last_index):
                  continue

              padded_tokens = sequence_padding(token_rows)
              padded_segments = sequence_padding(segment_rows)
              padded_labels = sequence_padding(label_rows)
              yield [padded_tokens, padded_segments], padded_labels
              token_rows, segment_rows, label_rows = [], [], []
  # Load the pretrained ALBERT encoder; with_pool=True is requested so that the
  # classification head below is attached to bert.model.output.
  _encoder_options = dict(
      model='albert',
      config_path=config_path,
      checkpoint_path=checkpoint_path,
      with_pool=True,
      return_keras_model=False,
  )
  bert = build_bert_model(**_encoder_options)

  # Classification head: dropout followed by a softmax layer over num_classes.
  _dropout_layer = Dropout(rate=0.1)
  _softmax_layer = Dense(
      units=num_classes,
      activation='softmax',
      kernel_initializer=bert.initializer,
  )
  output = _softmax_layer(_dropout_layer(bert.model.output))

  model = keras.models.Model(bert.model.input, output)
  model.summary()

  # Labels are integer class ids, hence the sparse cross-entropy loss.
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=Adam(learning_rate),  # use a sufficiently small learning rate
      # optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}),
      metrics=['accuracy'],
  )

  # Wrap each data split in a batch generator.
  train_generator, valid_generator, test_generator = (
      data_generator(split, batch_size)
      for split in (train_data, valid_data, test_data)
  )
  def evaluate(data):
      """Return the accuracy of the module-level ``model`` over ``data``.

      Args:
          data: iterable of ``(inputs, labels)`` batches; the class id is read
              from column 0 of ``labels``.

      Returns:
          Number of correct argmax predictions divided by the number of samples.
      """
      seen = correct = 0.
      for inputs, labels in data:
          predicted = model.predict(inputs).argmax(axis=1)
          gold = labels[:, 0]
          seen += len(gold)
          correct += (gold == predicted).sum()
      return correct / seen
  class Evaluator(keras.callbacks.Callback):
      """Keras callback that keeps the weights with the best validation accuracy.

      After every epoch the validation generator is evaluated; whenever the
      accuracy improves, the weights are written to 'best_model.weights'. The
      test accuracy is evaluated and printed alongside.
      """

      def __init__(self):
          # Run the base-class initialiser so the attributes Keras sets up on
          # every callback exist before training starts.
          super(Evaluator, self).__init__()
          self.best_val_acc = 0.

      def on_epoch_end(self, epoch, logs=None):
          val_acc = evaluate(valid_generator)
          if val_acc > self.best_val_acc:
              self.best_val_acc = val_acc
              model.save_weights('best_model.weights')
          test_acc = evaluate(test_generator)
          print(u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n'
                % (val_acc, self.best_val_acc, test_acc))
  # Train, evaluating (and checkpointing the best weights) after each epoch.
  evaluator = Evaluator()
  model.fit_generator(
      train_generator.forfit(),
      steps_per_epoch=len(train_generator),
      epochs=epochs,
      callbacks=[evaluator],
  )

  # Restore the best checkpoint and report its accuracy on the test generator.
  model.load_weights('best_model.weights')
  final_test_acc = evaluate(test_generator)
  print(u'final test acc: %05f\n' % final_test_acc)

 

  1. import numpy as np
  2. import tensorflow as tf
  3. from holmes.models.model_base.model_base import ModelBase
  4. from bert4keras.backend import keras, set_gelu
  5. from bert4keras.bert import build_bert_model
  6. from bert4keras.tokenizer import Tokenizer
  7. from bert4keras.snippets import sequence_padding
  8. set_gelu('tanh')
  class ALBertEmbedding():
      """Sentence vectors from ALBERT, scored by cosine similarity.

      Typical use: construct, call ``init(words_list)`` once to encode the
      candidate sentences, then call ``predict(sentence)`` to get the similarity
      of ``sentence`` to every candidate.

      NOTE(review): the default arguments read ``albert_config_path``,
      ``albert_checkpoint_path`` and ``albert_dict_path``, which must already be
      defined at module level; they are not defined in this listing.
      """

      def __init__(self, words_list=None,
                   config_path=albert_config_path,
                   checkpoint_path=albert_checkpoint_path,
                   dict_path=albert_dict_path,
                   albert_checkpoint_path=albert_checkpoint_path):
          """Build the encoder in a dedicated session and load its weights.

          Args:
              words_list: accepted for compatibility but not used here; pass the
                  candidates to ``init`` instead.
              config_path: ALBERT config file handed to ``build_bert_model``.
              checkpoint_path: currently unused; the model is built without a
                  checkpoint and weights come from ``albert_checkpoint_path``.
              dict_path: vocabulary file for the tokenizer.
              albert_checkpoint_path: weights file loaded by name into the
                  encoder.
          """
          self.session = tf.Session()
          keras.backend.set_session(self.session)
          self.bert = build_bert_model(
              model='albert',
              config_path=config_path,
              # checkpoint_path=checkpoint_path,
              with_pool=True,
              return_keras_model=False,
          )
          # Only the first model output is used as the sentence vector.
          self.encoder = keras.models.Model(self.bert.model.inputs,
                                            self.bert.model.outputs[0])
          self.encoder.load_weights(albert_checkpoint_path, by_name=True)
          self.tokenizer = Tokenizer(dict_path, do_lower_case=True)
          # Normalised candidate vectors; filled in by init().
          self.words_list_pre = None

      def init(self, words_list=None, update=True):
          """Encode and cache the candidate sentences.

          Args:
              words_list: non-empty sequence of sentences to compare against.
              update: accepted for compatibility; currently unused.

          Returns:
              ``self``, so the call can be chained.

          Raises:
              ValueError: if ``words_list`` is ``None`` or empty.
          """
          if words_list is None or len(words_list) == 0:
              raise ValueError('words_list must be a non-empty sequence of sentences')
          encoded = [self.tokenizer.encode(words) for words in words_list]
          token_ids_list = sequence_padding([tokens for tokens, _ in encoded])
          segment_ids_list = sequence_padding([segments for _, segments in encoded])
          vectors = self._encode(token_ids_list, segment_ids_list)
          self.words_list_pre = self._normalize(vectors)
          return self

      def _encode(self, token_ids, segment_ids):
          """Run the encoder on a batch inside this instance's session and graph."""
          with self.session.as_default():
              with self.session.graph.as_default():
                  return self.encoder.predict([token_ids, segment_ids])

      def _normalize(self, x):
          """Return ``x`` with every row scaled to unit L2 norm.

          The input is not modified. All-zero rows are returned unchanged rather
          than turned into NaN.
          """
          x = np.asarray(x)
          norms = np.sqrt((x ** 2).sum(axis=1, keepdims=True))
          norms[norms == 0] = 1
          return x / norms

      def _predict(self, words):
          """Return the normalised vector of one sentence as a one-row array."""
          token_ids, segment_ids = self.tokenizer.encode(words)
          pre = self._encode(np.array([token_ids]), np.array([segment_ids]))
          return self._normalize(pre)

      # Sentence-vector similarity
      def predict(self, words):
          """Return the cosine similarity of ``words`` to every cached candidate.

          Both sides are unit-normalised, so the dot product is the cosine
          similarity. The result has one score per sentence passed to ``init``.

          Raises:
              RuntimeError: if ``init`` has not been called yet.
          """
          if self.words_list_pre is None:
              raise RuntimeError('call init(words_list) before predict()')
          return np.dot(self.words_list_pre, self._predict(words)[0])

 

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/article/detail/42277
推荐阅读
相关标签
  

闽ICP备14008679号