# run_pretraining.py entry point
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Input Files ***")
  for input_file in input_files:
    tf.logging.info("  %s" % input_file)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True)
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)

  if FLAGS.do_eval:
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    eval_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False)

    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
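Both the training and evaluation branches call input_fn_builder, which is not listed above. Below is a condensed sketch of what it does, assuming the feature schema written by create_pretraining_data.py; the real pipeline's parallel TFRecord interleaving and buffer sizes are omitted here.

def input_fn_builder(input_files, max_seq_length, max_predictions_per_seq,
                     is_training):
  """Condensed sketch: builds an `input_fn` for the (TPU)Estimator."""

  def input_fn(params):
    batch_size = params["batch_size"]

    # Fixed-length features produced by create_pretraining_data.py.
    name_to_features = {
        "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights":
            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
    }

    d = tf.data.TFRecordDataset(input_files)
    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=1000)

    def _decode_record(record):
      # Parse one serialized tf.Example into a dict of dense tensors.
      example = tf.parse_single_example(record, name_to_features)
      # tf.Example only supports int64; cast ids down to int32 for the model.
      for name in list(example.keys()):
        if example[name].dtype == tf.int64:
          example[name] = tf.to_int32(example[name])
      return example

    d = d.map(_decode_record)
    d = d.batch(batch_size, drop_remainder=True)
    return d

  return input_fn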

BERT model config, session config, and distribution strategy: these configs are then passed into the RunConfig.
# config code
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

# Creates session config. allow_soft_placement = True is required for
# multi-GPU and is not harmful for other modes.
session_config = tf.ConfigProto(
    inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
    intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
    allow_soft_placement=True)

distribution_strategy = distribution_utils.get_distribution_strategy(
    get_num_gpus(FLAGS), FLAGS.all_reduce_alg)

# Creates a `RunConfig` that checkpoints every 24 hours, which essentially
# results in checkpoints determined only by `epochs_between_evals`.
# For a temporary test, set save_checkpoints_secs = 60 * 60 (once per hour).
run_config = tf.estimator.RunConfig(
    train_distribute=distribution_strategy,
    session_config=session_config,
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps)
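The listing above does not show how this run_config is consumed. Presumably, assuming this multi-GPU code path uses a plain Estimator rather than TPUEstimator, the wiring looks roughly like the hypothetical snippet below; the params dict is an assumption matching the input_fn(params["batch_size"]) convention.

# Hypothetical wiring for the multi-GPU path (not shown in the listing above).
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": FLAGS.train_batch_size})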

model_fn = model_fn_builder()
Fetch the input features and feed them into the BERT body (12 Transformer layers). For the next-sentence-prediction task, take the model's [CLS] output; for the masked-word-prediction task, take the model's final sequence output. Compute a loss for each task, then add the two losses together.
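Inside model_fn_builder, the returned model_fn does roughly the following. This is a condensed sketch of the pre-training model_fn; checkpoint initialization, eval metrics, and the TPU scaffold are omitted.

# Condensed sketch of the model_fn built by model_fn_builder.
def model_fn(features, labels, mode, params):
  model = modeling.BertModel(
      config=bert_config,
      is_training=(mode == tf.estimator.ModeKeys.TRAIN),
      input_ids=features["input_ids"],
      input_mask=features["input_mask"],
      token_type_ids=features["segment_ids"],
      use_one_hot_embeddings=use_one_hot_embeddings)

  # Masked-LM head: needs the full sequence output and the embedding table.
  (masked_lm_loss, masked_lm_example_loss,
   masked_lm_log_probs) = get_masked_lm_output(
       bert_config, model.get_sequence_output(), model.get_embedding_table(),
       features["masked_lm_positions"], features["masked_lm_ids"],
       features["masked_lm_weights"])

  # Next-sentence head: needs only the pooled [CLS] vector.
  (next_sentence_loss, next_sentence_example_loss,
   next_sentence_log_probs) = get_next_sentence_output(
       bert_config, model.get_pooled_output(),
       features["next_sentence_labels"])

  # The two pre-training losses are simply added.
  total_loss = masked_lm_loss + next_sentence_loss
  # (Construction of the train/eval spec continues; see create_optimizer below.)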
Take the encoder tensors from the last layer of the BERT sequence output and compute the masked-word-prediction loss and log-probability matrix.
# MLM
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
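get_masked_lm_output relies on the helper gather_indexes to pull the hidden states at the masked positions out of the [batch_size, seq_length, hidden_size] sequence tensor. For reference, a sketch consistent with the helper of the same name in run_pretraining.py:

def gather_indexes(sequence_tensor, positions):
  """Gathers the vectors at the specific positions over a minibatch."""
  sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
  batch_size = sequence_shape[0]
  seq_length = sequence_shape[1]
  width = sequence_shape[2]

  # Flatten [batch_size, seq_length, width] -> [batch_size * seq_length, width]
  # and convert per-example positions into flat indices before gathering.
  flat_offsets = tf.reshape(
      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
  flat_positions = tf.reshape(positions + flat_offsets, [-1])
  flat_sequence_tensor = tf.reshape(sequence_tensor,
                                    [batch_size * seq_length, width])
  output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
  return output_tensor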

Take the output at the [CLS] position, pass it through a fully connected layer, and then through a softmax classification layer.
# Next Sentence
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)

# Bert Model
with tf.variable_scope(scope, default_name="bert"):
  with tf.variable_scope("embeddings"):
    # Perform embedding lookup on the word ids.
    (self.embedding_output, self.embedding_table) = embedding_lookup(
        input_ids=input_ids,
        vocab_size=config.vocab_size,
        embedding_size=config.hidden_size,
        initializer_range=config.initializer_range,
        word_embedding_name="word_embeddings",
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Add positional embeddings and token type embeddings, then layer
    # normalize and perform dropout.
    self.embedding_output = embedding_postprocessor(
        input_tensor=self.embedding_output,
        use_token_type=True,
        token_type_ids=token_type_ids,
        token_type_vocab_size=config.type_vocab_size,
        token_type_embedding_name="token_type_embeddings",
        use_position_embeddings=True,
        position_embedding_name="position_embeddings",
        initializer_range=config.initializer_range,
        max_position_embeddings=config.max_position_embeddings,
        dropout_prob=config.hidden_dropout_prob)

  with tf.variable_scope("encoder"):
    # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
    # mask of shape [batch_size, seq_length, seq_length] which is used
    # for the attention scores.
    attention_mask = create_attention_mask_from_input_mask(
        input_ids, input_mask)

    # Run the stacked transformer.
    # `sequence_output` shape = [batch_size, seq_length, hidden_size].
    self.all_encoder_layers = transformer_model(
        input_tensor=self.embedding_output,
        attention_mask=attention_mask,
        hidden_size=config.hidden_size,
        num_hidden_layers=config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)

  self.sequence_output = self.all_encoder_layers[-1]
  # The "pooler" converts the encoded sequence tensor of shape
  # [batch_size, seq_length, hidden_size] to a tensor of shape
  # [batch_size, hidden_size]. This is necessary for segment-level
  # (or segment-pair-level) classification tasks where we need a fixed
  # dimensional representation of the segment.
  with tf.variable_scope("pooler"):
    # We "pool" the model by simply taking the hidden state corresponding
    # to the first token. We assume that this has been pre-trained.
    first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
    self.pooled_output = tf.layers.dense(
        first_token_tensor,
        config.hidden_size,
        activation=tf.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
Data normalization.
Create a new Estimator.
Train: Estimator.train.
Optimizer: the loss is fed to the optimizer, which minimizes it and updates the weights. The training flow is as follows.
# train op
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = (
        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

  # Create a tensor name for logging.
  tf.identity(learning_rate, name='learning_rate')
  tf.summary.scalar('learning_rate', learning_rate)

  # It is recommended that you use this optimizer for fine tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint.)
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if use_tpu:
    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  train_op = optimizer.apply_gradients(
      zip(grads, tvars), global_step=global_step)

  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
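Back in model_fn, the returned train_op is what gets handed to the Estimator. In the TPU pre-training path this looks roughly like the sketch below; the scaffold_fn used for checkpoint initialization is omitted.

# Sketch of how the train op is wired into the spec returned by model_fn.
if mode == tf.estimator.ModeKeys.TRAIN:
  train_op = optimization.create_optimizer(
      total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

  output_spec = tf.contrib.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op)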
