Merge pull request #340 from NVIDIA/add_more_nlp_ci_tests
Add more nlp ci tests
ekmb authored Feb 7, 2020
2 parents 4f299f4 + adb82a0 commit c6a3cdd
Showing 17 changed files with 118 additions and 6,564 deletions.
63 changes: 60 additions & 3 deletions Jenkinsfile
@@ -29,6 +29,7 @@ pipeline {
sh './reinstall.sh && python -m unittest tests/*.py'
}
}

stage('Unittests ASR') {
steps {
sh 'python -m unittest tests/asr/*.py'
@@ -61,6 +62,26 @@ pipeline {
}
}

stage('Parallel NLP-BERT pretraining') {
failFast true
parallel {
stage('BERT on the fly preprocessing') {
steps {
sh 'cd examples/nlp/language_modeling && CUDA_VISIBLE_DEVICES=0 python bert_pretraining.py --amp_opt_level O1 --data_dir /home/mrjenkins/TestData/nlp/wikitext-2 --dataset_name wikitext-2 --work_dir outputs/bert_lm/wikitext2 --batch_size 64 --lr 0.01 --lr_policy CosineAnnealing --lr_warmup_proportion 0.05 --tokenizer sentence-piece --vocab_size 3200 --hidden_size 768 --intermediate_size 3072 --num_hidden_layers 6 --num_attention_heads 12 --hidden_act "gelu" --save_step_freq 200 --sample_size 10000000 --mask_probability 0.15 --short_seq_prob 0.1 --max_steps=300'
sh 'cd examples/nlp/language_modeling && LOSS=$(cat outputs/bert_lm/wikitext2/log_globalrank-0_localrank-0.txt | grep "Loss" |tail -n 1| awk \'{print \$7}\' | egrep -o "[0-9.]+" ) && echo $LOSS && if [ $(echo "$LOSS < 8.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/language_modeling/outputs/wikitext2'
}
}
stage('BERT offline preprocessing') {
steps {
sh 'cd examples/nlp/language_modeling && CUDA_VISIBLE_DEVICES=1 python bert_pretraining.py --amp_opt_level O1 --data_dir /home/mrjenkins/TestData/nlp/wiki_book_mini --work_dir outputs/bert_lm/wiki_book --batch_size 8 --config_file /home/mrjenkins/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json --save_step_freq 200 --max_steps 300 --num_gpus 1 --batches_per_step 1 --lr_policy SquareRootAnnealing --beta2 0.999 --beta1 0.9 --lr_warmup_proportion 0.01 --optimizer adam_w --weight_decay 0.01 --lr 0.875e-4 --preprocessed_data '
sh 'cd examples/nlp/language_modeling && LOSS=$(cat outputs/bert_lm/wiki_book/log_globalrank-0_localrank-0.txt | grep "Loss" |tail -n 1| awk \'{print \$7}\' | egrep -o "[0-9.]+" ) && echo $LOSS && if [ $(echo "$LOSS < 15.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/language_modeling/outputs/wiki_book'
}
}
}
}
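
Both pretraining branches follow the same gating pattern: the run is capped with --max_steps, the final "Loss" entry is read back from the rank-0 log, and the stage fails unless the value is below a fixed threshold (8.0 for the on-the-fly branch, 15.0 for the offline one). A minimal standalone sketch of that check, assuming the same log layout as above (the loss sits in the seventh whitespace-separated field of the last "Loss" line); the log path and threshold are placeholders:

# Hypothetical helper reproducing the CI loss gate above.
LOG=outputs/bert_lm/wikitext2/log_globalrank-0_localrank-0.txt   # rank-0 training log
THRESHOLD=8.0                                                    # stage-specific upper bound
LOSS=$(grep "Loss" "$LOG" | tail -n 1 | awk '{print $7}' | egrep -o "[0-9.]+")
echo "final loss: $LOSS"
if [ "$(echo "$LOSS < $THRESHOLD" | bc -l)" -eq 1 ]; then
  echo "SUCCESS"; exit 0
else
  echo "FAILURE"; exit 1
fi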

stage('Parallel NLP Examples 1') {
failFast true
parallel {
@@ -85,6 +106,7 @@ pipeline {
}
}


stage('Parallel NLP Examples 2') {
failFast true
parallel {
@@ -105,7 +127,42 @@ pipeline {
}
}

stage('Intent Detection/SLot Tagging Examples - Multi-GPU') {
stage('Parallel NLP-Squad') {
failFast true
parallel {
stage('Squad v1.1') {
steps {
sh 'cd examples/nlp/question_answering && CUDA_VISIBLE_DEVICES=0 python question_answering_squad.py --amp_opt_level O1 --train_file /home/mrjenkins/TestData/nlp/squad_mini/v1.1/train-v1.1.json --dev_file /home/mrjenkins/TestData/nlp/squad_mini/v1.1/dev-v1.1.json --work_dir outputs/squadv1 --batch_size 8 --save_step_freq 300 --num_epochs 3 --lr_policy WarmupAnnealing --lr 3e-5 --do_lower_case'
sh 'cd examples/nlp/question_answering && FSCORE=$(cat outputs/squadv1/log_globalrank-0_localrank-0.txt | grep "f1" |tail -n 1 |egrep -o "[0-9.]+"|tail -n 1 ) && echo $FSCORE && if [ $(echo "$FSCORE > 50.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/question_answering/outputs/squadv1 && rm -rf /home/mrjenkins/TestData/nlp/squad_mini/v1.1/*cache*'
}
}
stage('Squad v2.0') {
steps {
sh 'cd examples/nlp/question_answering && CUDA_VISIBLE_DEVICES=1 python question_answering_squad.py --amp_opt_level O1 --train_file /home/mrjenkins/TestData/nlp/squad_mini/v2.0/train-v2.0.json --dev_file /home/mrjenkins/TestData/nlp/squad_mini/v2.0/dev-v2.0.json --work_dir outputs/squadv2 --batch_size 8 --save_step_freq 300 --num_epochs 3 --lr_policy WarmupAnnealing --lr 3e-5 --do_lower_case --version_2_with_negative'
sh 'cd examples/nlp/question_answering && FSCORE=$(cat outputs/squadv2/log_globalrank-0_localrank-0.txt | grep "f1" |tail -n 1 |egrep -o "[0-9.]+"|tail -n 1 ) && echo $FSCORE && if [ $(echo "$FSCORE > 50.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/question_answering/outputs/squadv2 && rm -rf /home/mrjenkins/TestData/nlp/squad_mini/v2.0/*cache*'
}
}
}
}
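
The two SQuAD branches use the same gate, but on the F1 score and with the comparison reversed, since a higher F1 is better; both also delete the work directory and the cached preprocessed features so the next run on the mini dataset starts clean. A sketch of the F1 check under the same assumptions about the log format, with a placeholder log path:

# Hypothetical F1 gate: the last number on the last "f1" line must exceed the floor.
LOG=outputs/squadv1/log_globalrank-0_localrank-0.txt
FSCORE=$(grep "f1" "$LOG" | tail -n 1 | egrep -o "[0-9.]+" | tail -n 1)
echo "final F1: $FSCORE"
if [ "$(echo "$FSCORE > 50.0" | bc -l)" -eq 1 ]; then
  echo "SUCCESS"; exit 0
else
  echo "FAILURE"; exit 1
fi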



stage('NLP-ASR processing') {
failFast true
parallel {
stage('asr_processing') {
steps {
sh 'cd examples/nlp/asr_postprocessor && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 asr_postprocessor.py --data_dir=/home/mrjenkins/TestData/nlp/asr_postprocessor/pred_real --restore_from=/home/mrjenkins/TestData/nlp/asr_postprocessor/bert-base-uncased_decoder.pt --max_steps=50 --batch_size=512'
sh 'cd examples/nlp/asr_postprocessor && WER=$(cat outputs/asr_postprocessor/log_globalrank-0_localrank-0.txt | grep "Validation WER" | tail -n 1 | egrep -o "[0-9.]+" | tail -n 1) && echo $WER && if [ $(echo "$WER < 2.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/asr_postprocessor/outputs'
}
}
}
}
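
The ASR-postprocessor test, like the intent/slot and Tacotron 2 stages further down, exercises the two-GPU path by launching the example through torch.distributed.launch with one process per visible GPU. The same command as in the stage above, reflowed for readability (data and checkpoint paths are specific to the CI machine):

# Two-GPU smoke run of the ASR postprocessor, mirroring the CI stage above.
cd examples/nlp/asr_postprocessor
CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 \
  asr_postprocessor.py \
  --data_dir=/home/mrjenkins/TestData/nlp/asr_postprocessor/pred_real \
  --restore_from=/home/mrjenkins/TestData/nlp/asr_postprocessor/bert-base-uncased_decoder.pt \
  --max_steps=50 --batch_size=512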

stage('NLP-Intent Detection/SLot Tagging Examples - Multi-GPU') {
failFast true
steps {
sh 'cd examples/nlp/intent_detection_slot_tagging && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 joint_intent_slot_with_bert.py --num_gpus=2 --num_epochs=1 --max_seq_length=50 --dataset_name=jarvis-retail --data_dir=/home/mrjenkins/TestData/nlp/retail/ --eval_file_prefix=eval --batch_size=10 --num_train_samples=-1 --do_lower_case --shuffle_data --work_dir=outputs'
@@ -115,7 +172,7 @@ pipeline {
}
}

stage('NMT Example') {
stage('NLP-NMT Example') {
failFast true
steps {
sh 'cd examples/nlp/neural_machine_translation/ && CUDA_VISIBLE_DEVICES=0 python machine_translation_tutorial.py --max_steps 100'
@@ -170,7 +227,7 @@ pipeline {
failFast true
steps {
sh 'cd examples/tts && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tacotron2.py --max_steps=51 --model_config=configs/tacotron2.yaml --train_dataset=/home/mrjenkins/TestData/an4_dataset/an4_train.json --amp_opt_level=O1 --eval_freq=50'
sh 'cd examples/tts && TTS_CHECKPOINT_DIR=$(ls | grep "Tacotron2") && echo $TTS_CHECKPOINT_DIR && LOSS=$(cat $TTS_CHECKPOINT_DIR/log_globalrank-0_localrank-0.txt | grep -o -E "Loss[ :0-9.]+" | grep -o -E "[0-9.]+" | tail -n 1) && echo $LOSS && if [ $(echo "$LOSS > 3.0" | bc -l) -eq 1 ]; then echo "FAILURE" && exit 1; else echo "SUCCESS"; fi'
sh 'cd examples/tts && TTS_CHECKPOINT_DIR=$(ls | grep "Tacotron2") && echo $TTS_CHECKPOINT_DIR && LOSS=$(cat $TTS_CHECKPOINT_DIR/log_globalrank-0_localrank-0.txt | grep -o -E "Loss[ :0-9.]+" | grep -o -E "[0-9.]+" | tail -n 1) && echo $LOSS && if [ $(echo "$LOSS < 3.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
// sh 'cd examples/tts && TTS_CHECKPOINT_DIR=$(ls | grep "Tacotron2") && cp ../asr/multi_gpu/checkpoints/* $TTS_CHECKPOINT_DIR/checkpoints'
// sh 'CUDA_VISIBLE_DEVICES=0 python tacotron2_an4_test.py --model_config=configs/tacotron2.yaml --eval_dataset=/home/mrjenkins/TestData/an4_dataset/an4_train.json --jasper_model_config=../asr/configs/jasper_an4.yaml --load_dir=$TTS_CHECKPOINT_DIR/checkpoints'
}
(another changed file)
@@ -66,7 +66,7 @@
parser.add_argument("--beam_size", default=4, type=int)
parser.add_argument("--len_pen", default=0.0, type=float)
parser.add_argument(
"--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt"
"--restore_from", dest="restore_from", type=str, default="../../../scripts/bert-base-uncased_decoder.pt"
)
args = parser.parse_args()

@@ -207,6 +207,11 @@ def print_loss(x):
callbacks=callbacks,
optimizer=args.optimizer,
lr_policy=lr_policy,
optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay},
optimization_params={
"num_epochs": 300,
"max_steps": args.max_steps,
"lr": args.lr,
"weight_decay": args.weight_decay,
},
batches_per_step=args.iter_per_step,
)
(another changed file)
@@ -51,7 +51,7 @@
See the list of pretrained models, call:
nemo_nlp.huggingface.BERT.list_pretrained_models()
"""
pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(
pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(
pretrained_model_name=args.pretrained_bert_model
)
hidden_size = pretrained_bert_model.hidden_size
(another changed file)
@@ -87,12 +87,10 @@
nemo_nlp.huggingface.BERT.list_pretrained_models()
"""
if args.bert_checkpoint and args.bert_config:
pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(
config_filename=args.bert_config
)
pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(config_filename=args.bert_config)
pretrained_bert_model.restore_from(args.bert_checkpoint)
else:
pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(
pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(
pretrained_model_name=args.pretrained_bert_model
)

3 changes: 1 addition & 2 deletions examples/nlp/language_modeling/bert_pretraining.py
@@ -146,7 +146,7 @@
log_dir=args.work_dir,
create_tb_writer=True,
files_to_copy=[__file__],
add_time_to_log_dir=True,
add_time_to_log_dir=False,
)

if args.config_file is not None:
@@ -318,7 +318,6 @@ def create_pipeline(data_file, batch_size, preprocessed_data=False, batches_per_
optimization_params['num_epochs'] = args.num_epochs
else:
optimization_params['max_steps'] = args.max_steps

nf.train(
tensors_to_optimize=[train_loss],
lr_policy=lr_policy_fn,
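
The hunk above shows how bert_pretraining.py sizes a run: optimization_params gets either a num_epochs entry or a max_steps entry, never both. The CI stages rely on the step-based mode so the smoke test finishes within 300 steps. Two hedged example invocations; flags other than the budget are taken from the Jenkins stage above, the data path is a placeholder, and --num_epochs is assumed from args.num_epochs in the hunk:

# Step-bounded smoke run (what the CI uses).
python bert_pretraining.py --data_dir /path/to/wikitext-2 --dataset_name wikitext-2 \
  --tokenizer sentence-piece --vocab_size 3200 --batch_size 64 --max_steps 300

# Epoch-bounded run for longer training (hypothetical epoch count).
python bert_pretraining.py --data_dir /path/to/wikitext-2 --dataset_name wikitext-2 \
  --tokenizer sentence-piece --vocab_size 3200 --batch_size 64 --num_epochs 2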
42 changes: 22 additions & 20 deletions examples/nlp/question_answering/question_answering_squad.py
@@ -24,7 +24,8 @@
To finetune Squad v1.1 on pretrained BERT large uncased on 1 GPU:
python question_answering_squad.py
--data_dir /path_to_data_dir/squad/v1.1
--train_file /path_to_data_dir/squad/v1.1/train-v1.1.json
--dev_file /path_to_data_dir/squad/v1.1/dev-v1.1.json
--work_dir /path_to_output_folder
--bert_checkpoint /path_to_bert_checkpoint
--amp_opt_level "O1"
@@ -43,7 +44,8 @@
To finetune Squad v1.1 on pretrained BERT large uncased on 8 GPU:
python -m torch.distributed.launch --nproc_per_node=8 question_answering_squad.py
--amp_opt_level "O1"
--data_dir /path_to_data_dir/squad/v1.1
--train_file /path_to_data_dir/squad/v1.1/train-v1.1.json
--dev_file /path_to_data_dir/squad/v1.1/dev-v1.1.json
--bert_checkpoint /path_to_bert_checkpoint
--batch_size 3
--num_gpus 8
@@ -74,12 +76,10 @@
def parse_args():
parser = argparse.ArgumentParser(description="Squad_with_pretrained_BERT")
parser.add_argument(
"--data_dir",
type=str,
required=True,
help="The input data dir. Should contain "
"train.*.json, dev.*.json files "
"(or other data files) for the task.",
"--train_file", type=str, help="The training data file. Should be *.json",
)
parser.add_argument(
"--dev_file", type=str, required=True, help="The evaluation data file. Should be *.json",
)
parser.add_argument(
"--pretrained_bert_model", default="bert-base-uncased", type=str, help="Name of the pre-trained model"
@@ -202,7 +202,7 @@ def parse_args():


def create_pipeline(
data_dir,
data_file,
model,
head,
loss_fn,
@@ -220,7 +220,7 @@
version_2_with_negative=version_2_with_negative,
batch_size=batch_size,
tokenizer=tokenizer,
data_dir=data_dir,
data_file=data_file,
max_query_length=max_query_length,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
@@ -248,13 +248,14 @@

if __name__ == "__main__":
args = parse_args()
if not os.path.exists(args.data_dir):
raise FileNotFoundError("SQUAD datasets not found. Datasets can be " "obtained using scripts/get_squad.py")

if not args.version_2_with_negative:
args.work_dir = f'{args.work_dir}/squad1.1'
else:
args.work_dir = f'{args.work_dir}/squad2.0'
if not os.path.exists(args.dev_file):
raise FileNotFoundError(
"eval data not found. Datasets can be " "obtained using examples/nlp/scripts/get_squad.py"
)
if not args.evaluation_only and not os.path.exists(args.train_file):
raise FileNotFoundError(
"train data not found. Datasets can be " "obtained using examples/nlp/scripts/get_squad.py"
)

# Instantiate neural factory with supported backend
nf = nemo_core.NeuralModuleFactory(
@@ -264,7 +265,7 @@ def create_pipeline(
log_dir=args.work_dir,
create_tb_writer=True,
files_to_copy=[__file__],
add_time_to_log_dir=True,
add_time_to_log_dir=False,
)

if args.tokenizer == "sentencepiece":
@@ -303,7 +304,7 @@

if not args.evaluation_only:
train_loss, train_steps_per_epoch, _, _ = create_pipeline(
data_dir=args.data_dir,
data_file=args.train_file,
model=model,
head=qa_head,
loss_fn=squad_loss,
@@ -316,8 +317,9 @@
batches_per_step=args.batches_per_step,
mode="train",
)
logging.info(f"training step per epoch: {train_steps_per_epoch}")
_, _, eval_output, eval_data_layer = create_pipeline(
data_dir=args.data_dir,
data_file=args.dev_file,
model=model,
head=qa_head,
loss_fn=squad_loss,
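
With --data_dir replaced by explicit --train_file and --dev_file arguments, a single-GPU fine-tuning run now looks like the following; this mirrors the updated docstring and the SQuAD v1.1 Jenkins stage, with placeholder paths:

# SQuAD v1.1 fine-tuning with the new file-based arguments (paths are placeholders).
cd examples/nlp/question_answering
python question_answering_squad.py \
  --train_file /path/to/squad/v1.1/train-v1.1.json \
  --dev_file /path/to/squad/v1.1/dev-v1.1.json \
  --work_dir /path/to/output_folder \
  --amp_opt_level "O1" \
  --batch_size 8 --num_epochs 3 --lr_policy WarmupAnnealing --lr 3e-5 --do_lower_case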