
BERT Pre-train Code

The pre-training script raises "ValueError: At least one of `do_train` or `do_eval` must be True." when neither the do_train flag nor the do_eval flag is enabled; the check appears at the top of main() below.

Contents

Pre-train main code structure

Initial Config

Build model

Masked LM prediction

Next Sentence prediction

BERT model body

Input_fn

Initial Estimator

Train


Pre-train main code structure

    def main(_):
      tf.logging.set_verbosity(tf.logging.INFO)

      if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

      bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

      tf.gfile.MakeDirs(FLAGS.output_dir)

      input_files = []
      for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

      tf.logging.info("*** Input Files ***")
      for input_file in input_files:
        tf.logging.info("  %s" % input_file)

      tpu_cluster_resolver = None
      if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

      is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
      run_config = tf.contrib.tpu.RunConfig(
          cluster=tpu_cluster_resolver,
          master=FLAGS.master,
          model_dir=FLAGS.output_dir,
          save_checkpoints_steps=FLAGS.save_checkpoints_steps,
          tpu_config=tf.contrib.tpu.TPUConfig(
              iterations_per_loop=FLAGS.iterations_per_loop,
              num_shards=FLAGS.num_tpu_cores,
              per_host_input_for_training=is_per_host))

      model_fn = model_fn_builder(
          bert_config=bert_config,
          init_checkpoint=FLAGS.init_checkpoint,
          learning_rate=FLAGS.learning_rate,
          num_train_steps=FLAGS.num_train_steps,
          num_warmup_steps=FLAGS.num_warmup_steps,
          use_tpu=FLAGS.use_tpu,
          use_one_hot_embeddings=FLAGS.use_tpu)

      # If TPU is not available, this will fall back to normal Estimator on CPU
      # or GPU.
      estimator = tf.contrib.tpu.TPUEstimator(
          use_tpu=FLAGS.use_tpu,
          model_fn=model_fn,
          config=run_config,
          train_batch_size=FLAGS.train_batch_size,
          eval_batch_size=FLAGS.eval_batch_size)

      if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)

      if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
          tf.logging.info("***** Eval results *****")
          for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

Initial Config

The BERT model config, the session config, and the distribution strategy are built first; these configs are then passed into the RunConfig.

    # config code
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Creates session config. allow_soft_placement = True is required for
    # multi-GPU and is not harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
        allow_soft_placement=True)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        get_num_gpus(FLAGS), FLAGS.all_reduce_alg)

    # Creates a `RunConfig` that checkpoints every `save_checkpoints_steps`
    # steps. (For a temporary test, save_checkpoints_secs = 60 * 60 would
    # checkpoint once per hour instead.)
    run_config = tf.estimator.RunConfig(
        train_distribute=distribution_strategy,
        session_config=session_config,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps)

Build model

model_fn = model_fn_builder()

The input features are fed into the BERT body (a 12-layer Transformer encoder for BERT-Base). For the next-sentence prediction task the model's [CLS] (pooled) output is used; for the masked-word prediction task the final sequence output is used. A loss is computed for each task and the two losses are summed, as sketched below.
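A condensed sketch of how model_fn wires the two heads together (following the pre-training script; `model` is the BertModel instance and the masked_lm_* / next_sentence_labels tensors come from the input features):

    # Masked LM head: uses the full sequence output plus the embedding table
    # (the output weights are tied to the input embeddings).
    (masked_lm_loss, masked_lm_example_loss,
     masked_lm_log_probs) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)

    # Next-sentence head: uses the pooled [CLS] output.
    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_log_probs) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    # The pre-training objective is simply the sum of the two losses.
    total_loss = masked_lm_loss + next_sentence_loss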

  • Masked LM prediction

Takes the final-layer encoder tensors of the BERT model and returns the loss and the log-probability matrix for the masked-word prediction task.

    # MLM
    def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                             label_ids, label_weights):
      """Get loss and log probs for the masked LM."""
      input_tensor = gather_indexes(input_tensor, positions)

      with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
          input_tensor = tf.layers.dense(
              input_tensor,
              units=bert_config.hidden_size,
              activation=modeling.get_activation(bert_config.hidden_act),
              kernel_initializer=modeling.create_initializer(
                  bert_config.initializer_range))
          input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

      return (loss, per_example_loss, log_probs)
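The gather_indexes helper used above is not shown in the excerpt; it flattens the batch and gathers the encoder outputs at the masked positions. A sketch following the same script:

    def gather_indexes(sequence_tensor, positions):
      """Gathers the vectors at the specific positions over a minibatch."""
      sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
      batch_size = sequence_shape[0]
      seq_length = sequence_shape[1]
      width = sequence_shape[2]

      # Convert per-example positions into indices into the flattened
      # [batch_size * seq_length, width] tensor, then gather those rows.
      flat_offsets = tf.reshape(
          tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
      flat_positions = tf.reshape(positions + flat_offsets, [-1])
      flat_sequence_tensor = tf.reshape(sequence_tensor,
                                        [batch_size * seq_length, width])
      output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
      return output_tensor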

 

  • Next Sentence prediction

Takes the output at the [CLS] position (the pooled output), adds a fully connected layer, and follows it with a softmax classification layer over the two labels.

    # Next Sentence
    def get_next_sentence_output(bert_config, input_tensor, labels):
      """Get loss and log probs for the next sentence prediction."""

      # Simple binary classification. Note that 0 is "next sentence" and 1 is
      # "random sentence". This weight matrix is not used after pre-training.
      with tf.variable_scope("cls/seq_relationship"):
        output_weights = tf.get_variable(
            "output_weights",
            shape=[2, bert_config.hidden_size],
            initializer=modeling.create_initializer(bert_config.initializer_range))
        output_bias = tf.get_variable(
            "output_bias", shape=[2], initializer=tf.zeros_initializer())

        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        labels = tf.reshape(labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, log_probs)

BERT model body

    # Bert Model
    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]

      # The "pooler" converts the encoded sequence tensor of shape
      # [batch_size, seq_length, hidden_size] to a tensor of shape
      # [batch_size, hidden_size]. This is necessary for segment-level
      # (or segment-pair-level) classification tasks where we need a fixed
      # dimensional representation of the segment.
      with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained.
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(config.initializer_range))
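In model_fn the body above is built through the BertModel class. A condensed sketch of that call (feature names match the pre-training input pipeline):

    # Build the Transformer encoder from the batched input features; the
    # sequence output feeds the masked-LM head and the pooled output feeds
    # the next-sentence head.
    model = modeling.BertModel(
        config=bert_config,
        is_training=(mode == tf.estimator.ModeKeys.TRAIN),
        input_ids=features["input_ids"],
        input_mask=features["input_mask"],
        token_type_ids=features["segment_ids"],
        use_one_hot_embeddings=use_one_hot_embeddings)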

Input_fn

Standardizes the input data: builds the tf.data pipeline that parses the pre-training TFRecord examples into fixed-length feature tensors and batches them. A condensed sketch follows.
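A simplified sketch of input_fn_builder (condensed from the pre-training script; the feature names are the ones written by the data-creation step):

    def input_fn_builder(input_files, max_seq_length, max_predictions_per_seq,
                         is_training, num_cpu_threads=4):
      """Creates an `input_fn` closure to be passed to the (TPU)Estimator."""

      def input_fn(params):
        batch_size = params["batch_size"]

        # Every example is a fixed-length record, so no dynamic padding is
        # needed at batching time.
        name_to_features = {
            "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
            "masked_lm_positions":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_ids":
                tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
            "masked_lm_weights":
                tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
            "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
        }

        def _decode_record(record):
          # tf.Example only supports int64, so cast ids down to int32.
          example = tf.parse_single_example(record, name_to_features)
          for name in list(example.keys()):
            if example[name].dtype == tf.int64:
              example[name] = tf.to_int32(example[name])
          return example

        d = tf.data.TFRecordDataset(input_files)
        if is_training:
          d = d.repeat()
          d = d.shuffle(buffer_size=100)
        d = d.map(_decode_record, num_parallel_calls=num_cpu_threads)
        d = d.batch(batch_size, drop_remainder=True)
        return d

      return input_fn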

Initial Estimator

Create the Estimator. The TPU path builds a tf.contrib.tpu.TPUEstimator as shown in the main code structure above; a GPU-only sketch using the run_config from the Initial Config section is given below.
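A minimal sketch, assuming the GPU-oriented run_config built in the Initial Config section and a model_fn that returns a plain tf.estimator.EstimatorSpec on this path:

    # Minimal sketch: standard Estimator for the non-TPU path. `params` is
    # passed through to input_fn, which reads params["batch_size"].
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params={"batch_size": FLAGS.train_batch_size})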

Train

Training is driven by estimator.train(...), as shown in the main code structure above.

Optimizer: the loss is fed to the optimizer, which computes gradients and updates the weights. The training op is created as follows.

    # train op
    def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
      """Creates an optimizer training op."""
      global_step = tf.train.get_or_create_global_step()

      learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

      # Implements linear decay of the learning rate.
      learning_rate = tf.train.polynomial_decay(
          learning_rate,
          global_step,
          num_train_steps,
          end_learning_rate=0.0,
          power=1.0,
          cycle=False)

      # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
      # learning rate will be `global_step/num_warmup_steps * init_lr`.
      if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = (
            (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

      # create tensor name for logging
      tf.identity(learning_rate, name='learning_rate')
      tf.summary.scalar('learning_rate', learning_rate)

      # It is recommended that you use this optimizer for fine tuning, since this
      # is how the model was trained (note that the Adam m/v variables are NOT
      # loaded from init_checkpoint.)
      optimizer = AdamWeightDecayOptimizer(
          learning_rate=learning_rate,
          weight_decay_rate=0.01,
          beta_1=0.9,
          beta_2=0.999,
          epsilon=1e-6,
          exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

      if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

      tvars = tf.trainable_variables()
      grads = tf.gradients(loss, tvars)

      # This is how the model was pre-trained.
      (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

      train_op = optimizer.apply_gradients(
          zip(grads, tvars), global_step=global_step)

      new_global_step = global_step + 1
      train_op = tf.group(train_op, [global_step.assign(new_global_step)])
      return train_op
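Inside model_fn, the returned train_op is wrapped into the estimator spec roughly as follows (condensed sketch; scaffold_fn handles init_checkpoint restoration on TPU):

    # Create the training op from the combined pre-training loss, then hand
    # it to the TPUEstimatorSpec for TRAIN mode.
    train_op = optimization.create_optimizer(
        total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        scaffold_fn=scaffold_fn)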

 
