# nlp/bert/tensorflow2/benchmarks.yml

---
# --- Pretraining ---
# Shared settings for the pretraining benchmarks: an environment variable,
# the regexps that pick metrics out of the run log, and the metric columns
# listed under `output`.
pretrain_options: &pretrain_options
  env:
    # Points the Poplar executable cache at ./tf_cache/.
    TF_POPLAR_FLAGS: "--executable_cache_path=./tf_cache/"
  data:
    # Each entry captures one value with the first group of its regexp.
    throughput:
      regexp: 'throughput: *(.*?) samples\/sec'
      # presumably the number of leading matches to discard -- confirm
      # against the benchmark runner
      skip: 1
    mlm_loss:
      regexp: 'mlm_loss: *(\d*\.\d*)'
      # presumably keeps only the last matched value -- confirm
      reduction_type: 'final'
    nsp_loss:
      regexp: 'nsp_loss: *(\d*\.\d*)'
      reduction_type: 'final'
  # Pairs of [column label, key under `data`].
  output:
    - [samples/sec, throughput]
    - [mlm_loss, mlm_loss]
    - [nsp_loss, nsp_loss]

# Settings pulled into the benchmark entries through their merge keys.
config_options: &config_options
  requirements_path: 'requirements.txt'


# POD4
# BERT Large pretraining throughput using the *_pod4 config files.
tf2_bert_large_pretrain_real_pod4:
  <<:
    - *pretrain_options
    - *config_options
  description: |
    This benchmark measures the throughput of Bert Large phase 1
    and phase 2 pretraining.
  # The first row names the {placeholders} that appear in `cmd`; each later
  # row is one set of values (presumably one run per row -- confirm).
  # Values are quoted so they stay strings.
  parameters:
    - [seqlen, phase]
    - ["128", "1"]
    - ["384", "2"]
  cmd: >-
    python3 run_pretraining.py
      --config configs/pretrain_large_{seqlen}_phase{phase}_pod4.json
      --dataset-dir $DATASETS_DIR/wikipedia/{seqlen}/
      --total-num-train-samples 300000
      --enable-wandb false

# BERT Base pretraining throughput using the *_pod4 config files.
tf2_bert_base_pretrain_real_pod4:
  <<:
    - *pretrain_options
    - *config_options
  description: |
    This benchmark measures the throughput of Bert Base phase 1
    and phase 2 pretraining.
  # The first row names the {placeholders} that appear in `cmd`; each later
  # row is one set of values (presumably one run per row -- confirm).
  # Values are quoted so they stay strings.
  parameters:
    - [seqlen, phase]
    - ["128", "1"]
    - ["384", "2"]
  cmd: >-
    python3 run_pretraining.py
      --config configs/pretrain_base_{seqlen}_phase{phase}_pod4.json
      --dataset-dir $DATASETS_DIR/wikipedia/{seqlen}/
      --total-num-train-samples 300000
      --enable-wandb false


# POD16
# BERT Large pretraining throughput using the config files that carry no
# pod suffix.
tf2_bert_large_pretrain_real_pod16:
  <<:
    - *pretrain_options
    - *config_options
  description: |
    This benchmark measures the throughput of Bert Large phase 1
    and phase 2 pretraining.
  # The first row names the {placeholders} that appear in `cmd`; each later
  # row is one set of values (presumably one run per row -- confirm).
  # Values are quoted so they stay strings.
  parameters:
    - [seqlen, phase]
    - ["128", "1"]
    - ["384", "2"]
  cmd: >-
    python3 run_pretraining.py
      --config configs/pretrain_large_{seqlen}_phase{phase}.json
      --dataset-dir $DATASETS_DIR/wikipedia/{seqlen}/
      --total-num-train-samples 300000
      --enable-wandb false

# BERT Base pretraining throughput using the config files that carry no
# pod suffix.
tf2_bert_base_pretrain_real_pod16:
  <<:
    - *pretrain_options
    - *config_options
  description: |
    This benchmark measures the throughput of Bert Base phase 1
    and phase 2 pretraining.
  # The first row names the {placeholders} that appear in `cmd`; each later
  # row is one set of values (presumably one run per row -- confirm).
  # Values are quoted so they stay strings.
  parameters:
    - [seqlen, phase]
    - ["128", "1"]
    - ["384", "2"]
  cmd: >-
    python3 run_pretraining.py
      --config configs/pretrain_base_{seqlen}_phase{phase}.json
      --dataset-dir $DATASETS_DIR/wikipedia/{seqlen}/
      --total-num-train-samples 300000
      --enable-wandb false

# POD64
# Convergence run for BERT Large phase 1 (sequence length 128 dataset).
# Passes checkpoint/phase1/ as the checkpoint save path and turns wandb
# logging on.
tf2_bert_large_sl128_pretrain_real_pod64_conv:
  <<:
    - *pretrain_options
    - *config_options
  description: |
    Tests convergence of Bert Large reference config for
    phase 1 pretraining on 64 IPUs.
  cmd: >-
    python3 run_pretraining.py
      --config configs/pretrain_large_128_phase1_POD64.json
      --dataset-dir $DATASETS_DIR/wikipedia/128/
      --save-ckpt-path checkpoint/phase1/
      --enable-wandb true
      --wandb-entity apps-benchmarking

# Convergence run for BERT Large phase 2 (sequence length 384 dataset).
# Reads its starting checkpoint from checkpoint/phase1/ and passes
# checkpoint/phase2/ as the save path, so it presumably needs the phase 1
# convergence run to have completed first -- confirm with the runner setup.
# The wandb entity flag is spelled --wandb-entity, matching the other
# wandb-enabled commands in this file.
tf2_bert_large_sl384_pretrain_real_pod64_conv:
  <<: [*pretrain_options, *config_options]
  description: |
    Tests convergence of Bert Large reference config for
    phase 2 pretraining on 64 IPUs.
  cmd: >-
    python3 run_pretraining.py
      --config configs/pretrain_large_384_phase2_POD64.json
      --dataset-dir $DATASETS_DIR/wikipedia/384/
      --save-ckpt-path checkpoint/phase2/
      --pretrained-ckpt-path checkpoint/phase1/
      --enable-wandb true
      --wandb-entity apps-benchmarking


# --- SQuAD training ---
# Shared metric extraction for the SQuAD benchmarks: throughput only.
squad_options: &squad_options
  data:
    throughput:
      # The first capture group holds the samples/sec figure.
      regexp: 'throughput: *(.*?) samples\/sec'
      # presumably the number of leading matches to discard -- confirm
      # against the benchmark runner
      skip: 2
  # Pairs of [column label, key under `data`].
  output:
    - [samples/sec, throughput]

# BERT Base SQuAD fine-tuning throughput; validation is switched off and
# the run is limited to 4000 training samples.
tf2_bert_squad_base_train_real_pod16:
  <<:
    - *squad_options
    - *config_options
  description: >-
    This benchmark measures the throughput of Bert Base SQuAD fine tuning.
  cmd: >-
    python3 run_squad.py
      --config configs/squad_base_checkpoint_path.json
      --do-validation false
      --total-num-train-samples 4000

# BERT Large SQuAD fine-tuning throughput; validation is switched off and
# the run is limited to 2000 training samples.
tf2_bert_squad_large_train_real_pod16:
  <<:
    - *squad_options
    - *config_options
  description: >-
    This benchmark measures the throughput of Bert Large SQuAD fine tuning.
  cmd: >-
    python3 run_squad.py
      --config configs/squad_large_checkpoint_path.json
      --do-validation false
      --total-num-train-samples 2000

# BERT Large SQuAD fine-tuning run to convergence, with validation and
# wandb logging switched on. Starts from ./checkpoint/phase2/ (presumably
# the output of a phase 2 pretraining run -- confirm).
tf2_bert_squad_large_train_real_pod16_conv:
  <<:
    - *squad_options
    - *config_options
  description: >-
    This benchmark trains Bert Large SQuAD fine tuning to convergence.
  cmd: >-
    python3 run_squad.py
      --config configs/squad_large_checkpoint_path.json
      --do-validation true
      --pretrained-ckpt-path ./checkpoint/phase2/
      --enable-wandb true
      --wandb-entity apps-benchmarking

# --- GLUE training ---
# Shared metric extraction for the GLUE benchmarks: throughput only.
glue_options: &glue_options
  data:
    throughput:
      # The first capture group holds the samples/sec figure.
      regexp: 'throughput: *(.*?) samples\/sec'
      # presumably the number of leading matches to discard -- confirm
      # against the benchmark runner
      skip: 2
  # Pairs of [column label, key under `data`].
  output:
    - [samples/sec, throughput]

# BERT Base GLUE (MRPC task) fine-tuning throughput; validation and test
# are switched off and the run is limited to 300 training samples.
# NOTE(review): wandb logging is enabled for this short throughput run,
# whereas the pretraining throughput benchmarks pass --enable-wandb false;
# confirm this is intended.
tf2_glue_base_train_real_pod16:
  <<:
    - *glue_options
    - *config_options
  description: >-
    This benchmark measures the throughput of Bert Base GLUE fine tuning.
  cmd: >-
    python3 run_seq_classification.py
      --config configs/glue_base_checkpoint_path.json
      --glue-task mrpc
      --do-validation false
      --do-test false
      --total-num-train-samples 300
      --enable-wandb true
      --wandb-entity apps-benchmarking