Continuing with the pre-training part that I didn’t cover before: in the previous article (BERT Source Analysis (PART II)) we finished processing the input data. Now let’s see how BERT performs “Masked LM” and “Next Sentence Prediction”.

  • run_pretraining[1]

Task #1: Masked LM

The get_masked_lm_output function is used to calculate the training loss for “Task #1”. Its input is the sequence_output ([batch_size, seq_length, hidden_size]) of the last layer of BertModel: because predicting the [MASK] tokens of a sequence is a labeling problem, the output state of the entire sequence is required.

def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM.

  Args:
    bert_config: Configuration object; `hidden_size`, `hidden_act`,
      `initializer_range` and `vocab_size` are read from it.
    input_tensor: Sequence output of the last BertModel layer (per the
      accompanying text, [batch_size, seq_length, hidden_size]).
    output_weights: Matrix multiplied (transposed) with the transformed
      hidden states to produce the logits; `model_fn` passes the model's
      embedding table here.
    positions: Positions of the masked tokens, forwarded to
      `gather_indexes`.
    label_ids: Ids of the original tokens at the masked positions.
    label_weights: Per-prediction weights used to average the loss.

  Returns:
    A tuple `(loss, per_example_loss, log_probs)` where `loss` is the
    weighted mean of `per_example_loss`.
  """
  # `gather_indexes` is defined elsewhere in the script; presumably it picks
  # out the hidden vectors at the masked positions -- confirm against it.
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # Add a non-linear transform before the output layer. It is only used
    # in the pre-training phase.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # `output_weights` is supplied by the caller; only an output bias with
    # one entry per vocabulary token is created here.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Flatten labels and weights to one entry per prediction slot.
    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # A sequence may contain fewer masked tokens than the maximum (e.g. 18
    # instead of 20). The unused slots are padding: their label id is 0 and
    # their weight is 0.0 (real predictions have weight 1.0), so they must
    # not contribute to the loss.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    # The small constant avoids dividing by zero when all weights are zero.
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
Task #2: Next Sentence Prediction

The get_next_sentence_output function is used to calculate the training loss for “Task #2”. The input is the pooled_output ([batch_size, hidden_size]) of the last layer of BertModel. Since this task is a binary classification problem, only the output for the first token, [CLS], of each sequence is required.

def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction.

  Args:
    bert_config: Configuration object; `hidden_size` and
      `initializer_range` are read from it.
    input_tensor: Pooled output of BertModel (per the accompanying text,
      [batch_size, hidden_size]).
    labels: Next-sentence labels; reshaped to a flat vector before use.

  Returns:
    A tuple `(loss, per_example_loss, log_probs)` where `loss` is the mean
    of `per_example_loss`.
  """
  # Binary classification: label 0 means the second sentence really is the
  # next sentence, label 1 means it is not. The parameters of this
  # classifier are discarded in the actual fine-tuning stage.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    # Cross-entropy: negative log probability of the true class.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
Custom model

The model_fn_builder function is used to construct the model_fn used by the Estimator. With the above two training tasks defined, you can write out the training process and then pass in the training set for automatic training.

def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
  """Returns a `model_fn` closure for TPUEstimator.

  Args:
    bert_config: Configuration object passed to `modeling.BertModel` and to
      the two loss functions.
    init_checkpoint: Checkpoint to restore variables from; skipped if falsy.
    learning_rate: Forwarded to `optimization.create_optimizer`.
    num_train_steps: Forwarded to `optimization.create_optimizer`.
    num_warmup_steps: Forwarded to `optimization.create_optimizer`.
    use_tpu: Whether to run on TPU; selects how the checkpoint is restored
      and is forwarded to the optimizer.
    use_one_hot_embeddings: Forwarded to `modeling.BertModel`.

  Returns:
    A function `model_fn(features, labels, mode, params)` that builds a
    `TPUEstimatorSpec` for the TRAIN and EVAL modes.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the pre-training graph and returns its TPUEstimatorSpec.

    Raises:
      ValueError: If `mode` is neither TRAIN nor EVAL.
    """

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    next_sentence_labels = features["next_sentence_labels"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Create the Transformer model instance.
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Masked LM task: mean loss, per-example loss and log probabilities.
    (masked_lm_loss,
     masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)

    # Next sentence prediction task: mean loss, per-example loss and log
    # probabilities.
    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_log_probs) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    # The total loss is defined as the sum of the two task losses.
    total_loss = masked_lm_loss + next_sentence_loss

    # Get all trainable variables.
    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    # Restore the previously saved model, if any.
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        # On TPU the restore is deferred into the scaffold function handed to
        # TPUEstimatorSpec instead of being executed here.
        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
      # Spec for the validation process.

      def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                    masked_lm_weights, next_sentence_example_loss,
                    next_sentence_log_probs, next_sentence_labels):
        """Calculating losses and accuracy."""
        masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                         [-1, masked_lm_log_probs.shape[-1]])
        masked_lm_predictions = tf.argmax(
            masked_lm_log_probs, axis=-1, output_type=tf.int32)
        masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
        # The weights exclude padded prediction slots from both metrics.
        masked_lm_accuracy = tf.metrics.accuracy(
            labels=masked_lm_ids,
            predictions=masked_lm_predictions,
            weights=masked_lm_weights)
        masked_lm_mean_loss = tf.metrics.mean(
            values=masked_lm_example_loss, weights=masked_lm_weights)

        next_sentence_log_probs = tf.reshape(
            next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
        next_sentence_predictions = tf.argmax(
            next_sentence_log_probs, axis=-1, output_type=tf.int32)
        next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
        next_sentence_accuracy = tf.metrics.accuracy(
            labels=next_sentence_labels, predictions=next_sentence_predictions)
        next_sentence_mean_loss = tf.metrics.mean(
            values=next_sentence_example_loss)

        return {
            "masked_lm_accuracy": masked_lm_accuracy,
            "masked_lm_loss": masked_lm_mean_loss,
            "next_sentence_accuracy": next_sentence_accuracy,
            "next_sentence_loss": next_sentence_mean_loss,
        }

      # The metric function and its argument tensors are passed as a pair.
      eval_metrics = (metric_fn, [
          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
          masked_lm_weights, next_sentence_example_loss,
          next_sentence_log_probs, next_sentence_labels
      ])
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

    return output_spec

  return model_fn
The main function

The training process is implemented on top of the functions above.

def main(_):
  """Runs BERT pre-training and/or evaluation as selected by the flags.

  Builds the run configuration and a TPUEstimator, trains when
  `FLAGS.do_train` is set, and evaluates when `FLAGS.do_eval` is set,
  writing the evaluation metrics to `eval_results.txt` in
  `FLAGS.output_dir`.

  Raises:
    ValueError: If neither `do_train` nor `do_eval` is set.
  """
  # The progress and result messages below are logged at INFO level.
  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  # Make sure the output directory exists before results are written to it.
  tf.gfile.MakeDirs(FLAGS.output_dir)

  # `input_file` is a comma-separated list of file patterns.
  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Input Files ***")
  for input_file in input_files:
    tf.logging.info(" %s" % input_file)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # Will be automatically converted into a CPU/GPU Estimator when no TPU is
  # used.
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
    # NOTE(review): argument list follows the reference script; confirm it
    # against input_fn_builder's signature in this file.
    train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True)
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)

  if FLAGS.do_eval:
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)

    eval_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False)

    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)

    # Log every metric and also persist it next to the checkpoints.
    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info(" %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
The test code

The script for running pre-training:

# Short demo run: train for 20 steps starting from the checkpoint in
# $BERT_BASE_DIR, then evaluate; outputs go to /tmp/pretraining_output.
python run_pretraining.py \
  --input_file=/tmp/tf_examples.tfrecord \
  --output_dir=/tmp/pretraining_output \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --train_batch_size=32 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=20 \
  --num_warmup_steps=10 \
  --learning_rate=2e-5
You can then get output logs like the following:

***** Eval results *****
  global_step = 20
  loss = 0.0979674
  masked_lm_accuracy = 0.985479
  masked_lm_loss = 0.0979328
  next_sentence_accuracy = 1.0
  next_sentence_loss = 3.45724e-05
Finally, here are some tips on the pre-training process [I can’t run pre-training myself anyway, so just take a look]

That’s all~ The BERT source code series ends here.

PS. BERT has since been updated in many ways, such as Whole Word Masking, so please point out any mistakes in the earlier articles so that I can correct them in time.

