Merge branch 'master' into neural_type_system2
okuchaiev committed Feb 5, 2020
2 parents 0cec895 + ad32363 commit a296fc2
Showing 133 changed files with 8,599 additions and 5,659 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
@@ -84,7 +84,15 @@ To release a new version, please update the changelog as follows:
([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel
- Major cleanup of Neural Module constructors (init), aimed at increasing framework robustness: cleaned up NeuralModule initialization logic, refactored trainer/actions (getting rid of local_params), fixed several examples and unit tests, and extracted and stored initial parameters (init_params).
([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia

- Refactoring of `nemo_nlp` collections (the import-path changes are sketched below):
([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb
- Renamed files and restructured folders in `nemo_nlp`
- Updated licenses
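
As a rough illustration of the restructuring, here is a minimal sketch of how import paths move under the new layout; the mapping is inferred from the notebook diffs in this commit and is not exhaustive.

```python
# Minimal sketch of the nemo_nlp import-path changes (inferred from the
# notebook diffs in this commit; illustrative, not an exhaustive mapping).
import nemo.collections.nlp as nemo_nlp

# Old path                                 New path
# nemo_nlp.huggingface.BERT             -> nemo_nlp.nm.trainables.huggingface.BERT
# nemo_nlp.TokenClassifier              -> nemo_nlp.nm.trainables.TokenClassifier
# nemo_nlp.MaskedLanguageModelingLossNM -> nemo_nlp.nm.losses.MaskedLanguageModelingLossNM
# nemo_nlp.BertPretrainingDataLayer     -> nemo_nlp.nm.data_layers.BertPretrainingDataLayer
# nemo.collections.nlp.utils.callbacks  -> nemo.collections.nlp.callbacks

# For example, loading a pretrained BERT module now reads:
bert_model = nemo_nlp.nm.trainables.huggingface.BERT(
    pretrained_model_name="bert-base-cased")
```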
- Updated NeMo's use of the logging library: `from nemo import logging` is now the recommended way of using the NeMo logger (see the sketch below). `neural_factory.logger` and all other instances of `logger` are now deprecated and planned for removal in the next version. Please see PR #267 for complete change information.
([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc
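
A minimal usage sketch of the new logging pattern; the `logging.info` call shape assumes the NeMo logger exposes the standard `info`/`warning`/`error` methods (see PR #267 for the authoritative interface).

```python
from nemo import logging  # recommended from this release onward

logging.info("Training started")         # assumed standard logger method
logging.warning("Checkpoint not found")  # assumed standard logger method

# Deprecated and planned for removal in the next version:
# neural_factory.logger.info("Training started")
```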

- Added TRADE (a dialogue state tracking model) on the MultiWOZ dataset
([PR #322](https://github.com/NVIDIA/NeMo/pull/322)) - @chiphuyen, @VahidooX

### Dependencies Update
- Added dependency on `wrapt` (the new version of the `deprecated` warning) - @tkornuta-nvidia, @DEKHTIARJonathan
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -60,7 +60,7 @@ pipeline {
}
stage ('NMT test') {
steps {
-sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python nmt_tutorial.py'
+sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python machine_translation_tutorial.py'
}
}
}
1 change: 0 additions & 1 deletion examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb
@@ -173,7 +173,6 @@
"data_layer = AudioDataLayer()\n",
"\n",
"data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(\n",
" factory=neural_factory,\n",
" **model_definition['AudioToMelSpectrogramPreprocessor'])\n",
"\n",
"jasper_encoder = nemo_asr.JasperEncoder(\n",
26 changes: 17 additions & 9 deletions examples/nlp/BERTPretrainingTutorial.ipynb
@@ -58,8 +58,8 @@
"from nemo.utils.lr_policies import CosineAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_iter_callback, \\\n",
"from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.callbacks.lm_bert_callback import eval_iter_callback, \\\n",
" eval_epochs_done_callback\n",
"\n",
"BATCHES_PER_STEP = 1\n",
@@ -126,7 +126,7 @@
"metadata": {},
"outputs": [],
"source": [
"bert_model = nemo_nlp.huggingface.BERT(\n",
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n",
" vocab_size=tokenizer.vocab_size,\n",
" num_hidden_layers=NUM_LAYERS,\n",
" hidden_size=D_MODEL,\n",
@@ -144,21 +144,21 @@
"outputs": [],
"source": [
"# Masked Language Modeling Loss\n",
"mlm_classifier = nemo_nlp.BertTokenClassifier(D_MODEL,\n",
"mlm_classifier = nemo_nlp.nm.trainables.BertTokenClassifier(D_MODEL,\n",
" num_classes=tokenizer.vocab_size,\n",
" activation=HIDDEN_ACT,\n",
" log_softmax=True)\n",
"mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM()\n",
"mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()\n",
"\n",
"# Next Sentence Prediciton Loss\n",
"nsp_classifier = nemo_nlp.SequenceClassifier(D_MODEL,\n",
"nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n",
" num_classes=2,\n",
" num_layers=2,\n",
" activation='tanh',\n",
" log_softmax=False)\n",
"nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()\n",
"\n",
"bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)"
"bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)"
]
},
{
@@ -167,15 +167,16 @@
"metadata": {},
"outputs": [],
"source": [
"train_data_layer = nemo_nlp.BertPretrainingDataLayer(\n",
"import os\n",
"train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
" tokenizer=tokenizer,\n",
" dataset=os.path.join(\"data/lm/wikitext-2\", \"train.txt\"),\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" mask_probability=MASK_PROBABILITY,\n",
" batch_size=BATCH_SIZE\n",
")\n",
"\n",
"eval_data_layer = nemo_nlp.BertPretrainingDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
" tokenizer=tokenizer,\n",
" dataset=os.path.join(\"data/lm/wikitext-2\", \"valid.txt\"),\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
@@ -282,6 +283,13 @@
" \"grad_norm_clip\": None\n",
" })"
]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
}
],
"metadata": {
27 changes: 14 additions & 13 deletions examples/nlp/NERWithBERT.ipynb
@@ -13,16 +13,18 @@
"from nemo.utils.lr_policies import WarmupAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.utils.callbacks.token_classification import \\\n",
" eval_iter_callback, eval_epochs_done_callback"
"from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.callbacks.token_classification_callback import \\\n",
" eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.nm.losses import TokenClassificationLoss\n",
"from nemo.collections.nlp.nm.trainables import TokenClassifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it."
"You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it."
]
},
{
@@ -78,7 +80,7 @@
"# If you're using a standard BERT model, you should do it like this. To see the full\n",
"# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n",
"tokenizer = NemoBertTokenizer(pretrained_model=\"bert-base-cased\")\n",
"bert_model = nemo_nlp.huggingface.BERT(\n",
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n",
" pretrained_model_name=\"bert-base-cased\")"
]
},
@@ -89,7 +91,7 @@
"outputs": [],
"source": [
"# Describe training DAG\n",
"train_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n",
"train_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n",
@@ -99,13 +101,12 @@
"label_ids = train_data_layer.dataset.label_ids\n",
"num_classes = len(label_ids)\n",
"\n",
"ner_classifier = nemo_nlp.TokenClassifier(hidden_size=bert_model.hidden_size,\n",
"hidden_size = bert_model.hidden_size\n",
"ner_classifier = TokenClassifier(hidden_size=hidden_size,\n",
" num_classes=num_classes,\n",
" dropout=CLASSIFICATION_DROPOUT)\n",
"\n",
"ner_loss = nemo_nlp.TokenClassificationLoss(d_model=hidden_size,\n",
" num_classes=len(label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT)\n",
"ner_loss = TokenClassificationLoss(num_classes=len(label_ids))\n",
"\n",
"input_ids, input_type_ids, input_mask, loss_mask, _, labels = train_data_layer()\n",
"\n",
@@ -124,7 +125,7 @@
"outputs": [],
"source": [
"# Describe evaluation DAG\n",
"eval_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n",
@@ -203,9 +204,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.4 64-bit",
"display_name": "Python 3",
"language": "python",
"name": "python37464bitc56e562f54084a24b5afed5459c99218"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
38 changes: 21 additions & 17 deletions examples/nlp/PunctuationWithBERT.ipynb
@@ -11,12 +11,15 @@
"import os\n",
"\n",
"import nemo\n",
"from nemo import logging\n",
"from nemo.utils.lr_policies import WarmupAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, TokenClassifier, TokenClassificationLoss\n",
"from nemo.collections.nlp.data.datasets import utils\n",
"from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.data import NemoBertTokenizer\n",
"from nemo.collections.nlp.nm.trainables import TokenClassifier\n",
"from nemo.collections.nlp.nm.losses import TokenClassificationLoss, LossAggregatorNM\n",
"from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights\n",
"\n",
"DATA_DIR = \"PATH_TO_WHERE_THE_DATA_IS\"\n",
"WORK_DIR = \"PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS\"\n",
@@ -47,7 +50,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data."
"In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/nemo/collections/nlp/data/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data."
]
},
{
@@ -57,7 +60,8 @@
"outputs": [],
"source": [
"# This should take about a minute since the data is already downloaded in the previous step\n",
"! python ../../scripts/get_tatoeba_data.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES"
"\n",
"! python ../../nemo/collections/nlp/data/scripts/get_tatoeba.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES"
]
},
{
@@ -116,7 +120,7 @@
"# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n",
"\n",
"tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)\n",
"bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)"
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)"
]
},
{
@@ -132,7 +136,7 @@
"metadata": {},
"outputs": [],
"source": [
"train_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n",
"train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n",
@@ -144,14 +148,14 @@
"\n",
"\n",
"# Define classifier for Punctuation and Capitalization tasks\n",
"punct_classifier = nemo_nlp.TokenClassifier(\n",
"punct_classifier = TokenClassifier(\n",
" hidden_size=bert_model.hidden_size,\n",
" num_classes=len(punct_label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT,\n",
" num_layers=PUNCT_NUM_FC_LAYERS,\n",
" name='Punctuation')\n",
"\n",
"capit_classifier = nemo_nlp.TokenClassifier(\n",
"capit_classifier = TokenClassifier(\n",
" hidden_size=bert_model.hidden_size,\n",
" num_classes=len(capit_label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT,\n",
@@ -160,14 +164,14 @@
"\n",
"# If you don't want to use weighted loss for Punctuation task, use class_weights=None\n",
"punct_label_freqs = train_data_layer.dataset.punct_label_frequencies\n",
"class_weights = utils.calc_class_weights(punct_label_freqs)\n",
"class_weights = calc_class_weights(punct_label_freqs)\n",
"\n",
"# define loss\n",
"punct_loss = nemo_nlp.TokenClassificationLoss(\n",
"punct_loss = TokenClassificationLoss(\n",
" num_classes=len(punct_label_ids),\n",
" class_weights=class_weights)\n",
"capit_loss = nemo_nlp.TokenClassificationLoss(num_classes=len(capit_label_ids))\n",
"task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)"
"capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids))\n",
"task_loss = LossAggregatorNM(num_inputs=2)"
]
},
{
@@ -218,7 +222,7 @@
"# during creation of the train_data_layer to make sure that the mapping is correct in case some of the labels from\n",
"# the train set are missing in the dev set.\n",
"\n",
"eval_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n",
@@ -361,7 +365,7 @@
"metadata": {},
"outputs": [],
"source": [
"infer_data_layer = nemo_nlp.BertTokenClassificationInferDataLayer(\n",
"infer_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(\n",
" queries=queries,\n",
" tokenizer=tokenizer,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
@@ -399,7 +403,7 @@
"capit_preds = np.argmax(capit_logits, axis=2)\n",
"\n",
"for i, query in enumerate(queries):\n",
" nf.logger.info(f'Query: {query}')\n",
" logging(f'Query: {query}')\n",
"\n",
" punct_pred = punct_preds[i][subtokens_mask[i] > 0.5]\n",
" capit_pred = capit_preds[i][subtokens_mask[i] > 0.5]\n",
@@ -419,7 +423,7 @@
" if punct_label != 'O':\n",
" output += punct_label\n",
" output += ' '\n",
" nf.logger.info(f'Combined: {output.strip()}\\n')"
" logging(f'Combined: {output.strip()}\\n')"
]
},
{