Add more NLP CI tests #340

Merged · 8 commits · Feb 7, 2020
63 changes: 60 additions & 3 deletions Jenkinsfile
@@ -29,6 +29,7 @@ pipeline {
sh './reinstall.sh && python -m unittest tests/*.py'
}
}

stage('Unittests ASR') {
steps {
sh 'python -m unittest tests/asr/*.py'
@@ -61,6 +62,26 @@ pipeline {
}
}

stage('Parallel NLP-BERT pretraining') {
failFast true
parallel {
stage('BERT on the fly preprocessing') {
steps {
sh 'cd examples/nlp/language_modeling && CUDA_VISIBLE_DEVICES=0 python bert_pretraining.py --amp_opt_level O1 --data_dir /home/mrjenkins/TestData/nlp/wikitext-2 --dataset_name wikitext-2 --work_dir outputs/bert_lm/wikitext2 --batch_size 64 --lr 0.01 --lr_policy CosineAnnealing --lr_warmup_proportion 0.05 --tokenizer sentence-piece --vocab_size 3200 --hidden_size 768 --intermediate_size 3072 --num_hidden_layers 6 --num_attention_heads 12 --hidden_act "gelu" --save_step_freq 200 --sample_size 10000000 --mask_probability 0.15 --short_seq_prob 0.1 --max_steps=300'
sh 'cd examples/nlp/language_modeling && LOSS=$(cat outputs/bert_lm/wikitext2/log_globalrank-0_localrank-0.txt | grep "Loss" |tail -n 1| awk \'{print \$7}\' | egrep -o "[0-9.]+" ) && echo $LOSS && if [ $(echo "$LOSS < 8.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/language_modeling/outputs/wikitext2'
}
}
stage('BERT offline preprocessing') {
steps {
sh 'cd examples/nlp/language_modeling && CUDA_VISIBLE_DEVICES=1 python bert_pretraining.py --amp_opt_level O1 --data_dir /home/mrjenkins/TestData/nlp/wiki_book_mini --work_dir outputs/bert_lm/wiki_book --batch_size 8 --config_file /home/mrjenkins/TestData/nlp/bert_configs/uncased_L-12_H-768_A-12.json --save_step_freq 200 --max_steps 300 --num_gpus 1 --batches_per_step 1 --lr_policy SquareRootAnnealing --beta2 0.999 --beta1 0.9 --lr_warmup_proportion 0.01 --optimizer adam_w --weight_decay 0.01 --lr 0.875e-4 --preprocessed_data '
sh 'cd examples/nlp/language_modeling && LOSS=$(cat outputs/bert_lm/wiki_book/log_globalrank-0_localrank-0.txt | grep "Loss" |tail -n 1| awk \'{print \$7}\' | egrep -o "[0-9.]+" ) && echo $LOSS && if [ $(echo "$LOSS < 15.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/language_modeling/outputs/wiki_book'
}
}
}
}

stage('Parallel NLP Examples 1') {
failFast true
parallel {
@@ -85,6 +106,7 @@ pipeline {
}
}


stage('Parallel NLP Examples 2') {
failFast true
parallel {
@@ -105,7 +127,42 @@ pipeline {
}
}

stage('Intent Detection/SLot Tagging Examples - Multi-GPU') {
stage('Parallel NLP-Squad') {
failFast true
parallel {
stage('Squad v1.1') {
steps {
sh 'cd examples/nlp/question_answering && CUDA_VISIBLE_DEVICES=0 python question_answering_squad.py --amp_opt_level O1 --train_file /home/mrjenkins/TestData/nlp/squad_mini/v1.1/train-v1.1.json --dev_file /home/mrjenkins/TestData/nlp/squad_mini/v1.1/dev-v1.1.json --work_dir outputs/squadv1 --batch_size 8 --save_step_freq 300 --num_epochs 3 --lr_policy WarmupAnnealing --lr 3e-5 --do_lower_case'
sh 'cd examples/nlp/question_answering && FSCORE=$(cat outputs/squadv1/log_globalrank-0_localrank-0.txt | grep "f1" |tail -n 1 |egrep -o "[0-9.]+"|tail -n 1 ) && echo $FSCORE && if [ $(echo "$FSCORE > 50.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/question_answering/outputs/squadv1 && rm -rf /home/mrjenkins/TestData/nlp/squad_mini/v1.1/*cache*'
}
}
stage('Squad v2.0') {
steps {
sh 'cd examples/nlp/question_answering && CUDA_VISIBLE_DEVICES=1 python question_answering_squad.py --amp_opt_level O1 --train_file /home/mrjenkins/TestData/nlp/squad_mini/v2.0/train-v2.0.json --dev_file /home/mrjenkins/TestData/nlp/squad_mini/v2.0/dev-v2.0.json --work_dir outputs/squadv2 --batch_size 8 --save_step_freq 300 --num_epochs 3 --lr_policy WarmupAnnealing --lr 3e-5 --do_lower_case --version_2_with_negative'
sh 'cd examples/nlp/question_answering && FSCORE=$(cat outputs/squadv2/log_globalrank-0_localrank-0.txt | grep "f1" |tail -n 1 |egrep -o "[0-9.]+"|tail -n 1 ) && echo $FSCORE && if [ $(echo "$FSCORE > 50.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/question_answering/outputs/squadv2 && rm -rf /home/mrjenkins/TestData/nlp/squad_mini/v2.0/*cache*'
}
}
}
}



stage('NLP-ASR processing') {
failFast true
parallel {
stage('asr_processing') {
steps {
sh 'cd examples/nlp/asr_postprocessor && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 asr_postprocessor.py --data_dir=/home/mrjenkins/TestData/nlp/asr_postprocessor/pred_real --restore_from=/home/mrjenkins/TestData/nlp/asr_postprocessor/bert-base-uncased_decoder.pt --max_steps=50 --batch_size=512'
sh 'cd examples/nlp/asr_postprocessor && WER=$(cat outputs/asr_postprocessor/log_globalrank-0_localrank-0.txt | grep "Validation WER" | tail -n 1 | egrep -o "[0-9.]+" | tail -n 1) && echo $WER && if [ $(echo "$WER < 2.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
sh 'rm -rf examples/nlp/asr_postprocessor/outputs'
}
}
}
}
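Each of the new NLP stages above uses the same pass/fail idiom: run a short job, pull the last reported metric (training Loss, SQuAD f1, or validation WER) out of the rank-0 log, and compare it against a fixed threshold with bc, exiting non-zero on failure. A rough Python rendering of that shell one-liner, for readability only; the log path, metric keyword, and threshold below are placeholders, not values taken from the PR:

# Rough Python equivalent of the shell threshold checks above; illustration only.
# Log path, metric keyword, and threshold are placeholders, not part of the PR.
import re
import sys

log_path = "outputs/bert_lm/wikitext2/log_globalrank-0_localrank-0.txt"
metric_keyword = "Loss"   # the stages grep for "Loss", "f1", or "Validation WER"
threshold = 8.0           # 8.0 / 15.0 for loss, 50.0 for f1, 2.0 for WER

# Mirror `grep "Loss" | tail -n 1 | egrep -o "[0-9.]+" | tail -n 1`:
# keep the last matching line and take the last number on it.
with open(log_path) as log:
    matching = [line for line in log if metric_keyword in line]
value = float(re.findall(r"[0-9.]+", matching[-1])[-1])
print(value)

# Loss and WER stages require the value to be below the threshold;
# the SQuAD stages flip the comparison and require f1 above it.
sys.exit(0 if value < threshold else 1)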

stage('NLP-Intent Detection/SLot Tagging Examples - Multi-GPU') {
failFast true
steps {
sh 'cd examples/nlp/intent_detection_slot_tagging && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 joint_intent_slot_with_bert.py --num_gpus=2 --num_epochs=1 --max_seq_length=50 --dataset_name=jarvis-retail --data_dir=/home/mrjenkins/TestData/nlp/retail/ --eval_file_prefix=eval --batch_size=10 --num_train_samples=-1 --do_lower_case --shuffle_data --work_dir=outputs'
@@ -115,7 +172,7 @@
}
}

stage('NMT Example') {
stage('NLP-NMT Example') {
failFast true
steps {
sh 'cd examples/nlp/neural_machine_translation/ && CUDA_VISIBLE_DEVICES=0 python machine_translation_tutorial.py --max_steps 100'
@@ -170,7 +227,7 @@ pipeline {
failFast true
steps {
sh 'cd examples/tts && CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 tacotron2.py --max_steps=51 --model_config=configs/tacotron2.yaml --train_dataset=/home/mrjenkins/TestData/an4_dataset/an4_train.json --amp_opt_level=O1 --eval_freq=50'
sh 'cd examples/tts && TTS_CHECKPOINT_DIR=$(ls | grep "Tacotron2") && echo $TTS_CHECKPOINT_DIR && LOSS=$(cat $TTS_CHECKPOINT_DIR/log_globalrank-0_localrank-0.txt | grep -o -E "Loss[ :0-9.]+" | grep -o -E "[0-9.]+" | tail -n 1) && echo $LOSS && if [ $(echo "$LOSS > 3.0" | bc -l) -eq 1 ]; then echo "FAILURE" && exit 1; else echo "SUCCESS"; fi'
sh 'cd examples/tts && TTS_CHECKPOINT_DIR=$(ls | grep "Tacotron2") && echo $TTS_CHECKPOINT_DIR && LOSS=$(cat $TTS_CHECKPOINT_DIR/log_globalrank-0_localrank-0.txt | grep -o -E "Loss[ :0-9.]+" | grep -o -E "[0-9.]+" | tail -n 1) && echo $LOSS && if [ $(echo "$LOSS < 3.0" | bc -l) -eq 1 ]; then echo "SUCCESS" && exit 0; else echo "FAILURE" && exit 1; fi'
// sh 'cd examples/tts && TTS_CHECKPOINT_DIR=$(ls | grep "Tacotron2") && cp ../asr/multi_gpu/checkpoints/* $TTS_CHECKPOINT_DIR/checkpoints'
// sh 'CUDA_VISIBLE_DEVICES=0 python tacotron2_an4_test.py --model_config=configs/tacotron2.yaml --eval_dataset=/home/mrjenkins/TestData/an4_dataset/an4_train.json --jasper_model_config=../asr/configs/jasper_an4.yaml --load_dir=$TTS_CHECKPOINT_DIR/checkpoints'
}

@@ -66,7 +66,7 @@
parser.add_argument("--beam_size", default=4, type=int)
parser.add_argument("--len_pen", default=0.0, type=float)
parser.add_argument(
"--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt"
"--restore_from", dest="restore_from", type=str, default="../../../scripts/bert-base-uncased_decoder.pt"
)
args = parser.parse_args()

@@ -207,6 +207,11 @@ def print_loss(x):
callbacks=callbacks,
optimizer=args.optimizer,
lr_policy=lr_policy,
optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay},
optimization_params={
"num_epochs": 300,
"max_steps": args.max_steps,
"lr": args.lr,
"weight_decay": args.weight_decay,
},
batches_per_step=args.iter_per_step,
)

@@ -51,7 +51,7 @@
See the list of pretrained models, call:
nemo_nlp.huggingface.BERT.list_pretrained_models()
"""
pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(
pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(
pretrained_model_name=args.pretrained_bert_model
)
hidden_size = pretrained_bert_model.hidden_size

@@ -87,12 +87,10 @@
nemo_nlp.huggingface.BERT.list_pretrained_models()
"""
if args.bert_checkpoint and args.bert_config:
pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(
config_filename=args.bert_config
)
pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(config_filename=args.bert_config)
pretrained_bert_model.restore_from(args.bert_checkpoint)
else:
pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(
pretrained_bert_model = nemo.collections.nlp.nm.trainables.huggingface.BERT(
pretrained_model_name=args.pretrained_bert_model
)

3 changes: 1 addition & 2 deletions examples/nlp/language_modeling/bert_pretraining.py
@@ -146,7 +146,7 @@
log_dir=args.work_dir,
create_tb_writer=True,
files_to_copy=[__file__],
add_time_to_log_dir=True,
add_time_to_log_dir=False,
)

if args.config_file is not None:
@@ -318,7 +318,6 @@ def create_pipeline(data_file, batch_size, preprocessed_data=False, batches_per_
optimization_params['num_epochs'] = args.num_epochs
else:
optimization_params['max_steps'] = args.max_steps

nf.train(
tensors_to_optimize=[train_loss],
lr_policy=lr_policy_fn,
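For context on the bert_pretraining.py hunk: training length is driven through optimization_params, and the script fills in either num_epochs or max_steps depending on how it was invoked, which is what lets the CI stages above cap their runs with --max_steps=300. A self-contained sketch of that switch; the argparse defaults and the exact selecting condition are assumptions, since the condition sits outside the visible hunk, and the resulting dict is then handed to nf.train as shown in the diff:

# Self-contained sketch of the num_epochs / max_steps switch (names mirror bert_pretraining.py;
# the real condition is outside the visible hunk, so "max_steps given" is an assumption).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num_epochs", type=int, default=10)
parser.add_argument("--max_steps", type=int, default=None)
args = parser.parse_args()

optimization_params = {}
if args.max_steps is None:
    optimization_params["num_epochs"] = args.num_epochs   # epoch-based run
else:
    optimization_params["max_steps"] = args.max_steps      # step-capped run, e.g. --max_steps=300 in CI
print(optimization_params)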
42 changes: 22 additions & 20 deletions examples/nlp/question_answering/question_answering_squad.py
@@ -24,7 +24,8 @@

To finetune Squad v1.1 on pretrained BERT large uncased on 1 GPU:
python question_answering_squad.py
--data_dir /path_to_data_dir/squad/v1.1
--train_file /path_to_data_dir/squad/v1.1/train-v1.1.json
--dev_file /path_to_data_dir/squad/v1.1/dev-v1.1.json
--work_dir /path_to_output_folder
--bert_checkpoint /path_to_bert_checkpoint
--amp_opt_level "O1"
@@ -43,7 +44,8 @@
To finetune Squad v1.1 on pretrained BERT large uncased on 8 GPU:
python -m torch.distributed.launch --nproc_per_node=8 question_answering_squad.py
--amp_opt_level "O1"
--data_dir /path_to_data_dir/squad/v1.1
--train_file /path_to_data_dir/squad/v1.1/train-v1.1.json
--dev_file /path_to_data_dir/squad/v1.1/dev-v1.1.json
--bert_checkpoint /path_to_bert_checkpoint
--batch_size 3
--num_gpus 8
@@ -74,12 +76,10 @@
def parse_args():
parser = argparse.ArgumentParser(description="Squad_with_pretrained_BERT")
parser.add_argument(
"--data_dir",
type=str,
required=True,
help="The input data dir. Should contain "
"train.*.json, dev.*.json files "
"(or other data files) for the task.",
"--train_file", type=str, help="The training data file. Should be *.json",
)
parser.add_argument(
"--dev_file", type=str, required=True, help="The evaluation data file. Should be *.json",
)
parser.add_argument(
"--pretrained_bert_model", default="bert-base-uncased", type=str, help="Name of the pre-trained model"
@@ -202,7 +202,7 @@ def parse_args():


def create_pipeline(
data_dir,
data_file,
model,
head,
loss_fn,
@@ -220,7 +220,7 @@ def create_pipeline(
version_2_with_negative=version_2_with_negative,
batch_size=batch_size,
tokenizer=tokenizer,
data_dir=data_dir,
data_file=data_file,
max_query_length=max_query_length,
max_seq_length=max_seq_length,
doc_stride=doc_stride,
@@ -248,13 +248,14 @@

if __name__ == "__main__":
args = parse_args()
if not os.path.exists(args.data_dir):
raise FileNotFoundError("SQUAD datasets not found. Datasets can be " "obtained using scripts/get_squad.py")

if not args.version_2_with_negative:
args.work_dir = f'{args.work_dir}/squad1.1'
else:
args.work_dir = f'{args.work_dir}/squad2.0'
if not os.path.exists(args.dev_file):
raise FileNotFoundError(
"eval data not found. Datasets can be " "obtained using examples/nlp/scripts/get_squad.py"
)
if not args.evaluation_only and not os.path.exists(args.train_file):
raise FileNotFoundError(
"train data not found. Datasets can be " "obtained using examples/nlp/scripts/get_squad.py"
)

# Instantiate neural factory with supported backend
nf = nemo_core.NeuralModuleFactory(
@@ -264,7 +265,7 @@ def create_pipeline(
log_dir=args.work_dir,
create_tb_writer=True,
files_to_copy=[__file__],
add_time_to_log_dir=True,
add_time_to_log_dir=False,
)

if args.tokenizer == "sentencepiece":
@@ -303,7 +304,7 @@

if not args.evaluation_only:
train_loss, train_steps_per_epoch, _, _ = create_pipeline(
data_dir=args.data_dir,
data_file=args.train_file,
model=model,
head=qa_head,
loss_fn=squad_loss,
@@ -316,8 +317,9 @@
batches_per_step=args.batches_per_step,
mode="train",
)
logging.info(f"training step per epoch: {train_steps_per_epoch}")
_, _, eval_output, eval_data_layer = create_pipeline(
data_dir=args.data_dir,
data_file=args.dev_file,
model=model,
head=qa_head,
loss_fn=squad_loss,
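Taken together, the question_answering_squad.py changes swap the single --data_dir argument for explicit --train_file / --dev_file paths, which is what lets the new CI stages point the v1.1 and v2.0 runs at separate JSON files. A condensed, self-contained sketch of just that argument handling; the --evaluation_only flag definition here is an assumption, and the full script adds many more options:

# Condensed sketch of the new SQuAD file arguments and checks; not the full script.
import argparse
import os

parser = argparse.ArgumentParser(description="Squad_with_pretrained_BERT")
parser.add_argument("--train_file", type=str, help="The training data file. Should be *.json")
parser.add_argument("--dev_file", type=str, required=True, help="The evaluation data file. Should be *.json")
parser.add_argument("--evaluation_only", action="store_true")  # assumed definition; the real flag is defined elsewhere in the script
args = parser.parse_args()

# The dev file is always required; the train file only when actually training.
if not os.path.exists(args.dev_file):
    raise FileNotFoundError("eval data not found. Datasets can be obtained using examples/nlp/scripts/get_squad.py")
if not args.evaluation_only and not os.path.exists(args.train_file):
    raise FileNotFoundError("train data not found. Datasets can be obtained using examples/nlp/scripts/get_squad.py")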