diff --git a/CHANGELOG.md b/CHANGELOG.md index b0c3a818beb4..1fbf6d6ac532 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -84,7 +84,15 @@ To release a new version, please update the changelog as followed: ([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel - Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). ([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia - +- Refactoring of `nemo_nlp` collections: +([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb + - renaming of files and restructuring of folder in `nemo_nlp` + - Updated licenses +- Updated nemo's use of the logging library. from nemo import logging is now the reccomended way of using the nemo logger. neural_factory.logger and all other instances of logger are now deprecated and planned for removal in the next version. Please see PR 267 for complete change information. +([PR #267](https://github.com/NVIDIA/NeMo/pull/267), [PR #283](https://github.com/NVIDIA/NeMo/pull/283), [PR #305](https://github.com/NVIDIA/NeMo/pull/305), [PR #311](https://github.com/NVIDIA/NeMo/pull/311)) - @blisc + +- Added TRADE (dialogue state tracking model) on MultiWOZ dataset +([PR #322](https://github.com/NVIDIA/NeMo/pull/322)) - @chiphuyen, @VahidooX ### Dependencies Update - Added dependency on `wrapt` (the new version of the `deprecated` warning) - @tkornuta-nvidia, @DEKHTIARJonathan diff --git a/Jenkinsfile b/Jenkinsfile index c43f67d59ca5..d0d2b0eaa5b1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -60,7 +60,7 @@ pipeline { } stage ('NMT test') { steps { - sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python nmt_tutorial.py' + sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python machine_translation_tutorial.py' } } } diff --git a/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb b/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb index 4a842b3a4365..0a4a5842f0b8 100644 --- a/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb +++ b/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb @@ -173,7 +173,6 @@ "data_layer = AudioDataLayer()\n", "\n", "data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(\n", - " factory=neural_factory,\n", " **model_definition['AudioToMelSpectrogramPreprocessor'])\n", "\n", "jasper_encoder = nemo_asr.JasperEncoder(\n", diff --git a/examples/nlp/BERTPretrainingTutorial.ipynb b/examples/nlp/BERTPretrainingTutorial.ipynb index 6c62a495db50..f33887452dbc 100644 --- a/examples/nlp/BERTPretrainingTutorial.ipynb +++ b/examples/nlp/BERTPretrainingTutorial.ipynb @@ -58,8 +58,8 @@ "from nemo.utils.lr_policies import CosineAnnealing\n", "\n", "import nemo.collections.nlp as nemo_nlp\n", - "from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n", - "from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_iter_callback, \\\n", + "from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n", + "from nemo.collections.nlp.callbacks.lm_bert_callback import eval_iter_callback, \\\n", " eval_epochs_done_callback\n", "\n", "BATCHES_PER_STEP = 1\n", @@ -126,7 +126,7 @@ "metadata": {}, "outputs": [], "source": [ - "bert_model = nemo_nlp.huggingface.BERT(\n", + "bert_model = 
nemo_nlp.nm.trainables.huggingface.BERT(\n", " vocab_size=tokenizer.vocab_size,\n", " num_hidden_layers=NUM_LAYERS,\n", " hidden_size=D_MODEL,\n", @@ -144,21 +144,21 @@ "outputs": [], "source": [ "# Masked Language Modeling Loss\n", - "mlm_classifier = nemo_nlp.BertTokenClassifier(D_MODEL,\n", + "mlm_classifier = nemo_nlp.nm.trainables.BertTokenClassifier(D_MODEL,\n", " num_classes=tokenizer.vocab_size,\n", " activation=HIDDEN_ACT,\n", " log_softmax=True)\n", - "mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM()\n", + "mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()\n", "\n", "# Next Sentence Prediciton Loss\n", - "nsp_classifier = nemo_nlp.SequenceClassifier(D_MODEL,\n", + "nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n", " num_classes=2,\n", " num_layers=2,\n", " activation='tanh',\n", " log_softmax=False)\n", "nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()\n", "\n", - "bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)" + "bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)" ] }, { @@ -167,7 +167,8 @@ "metadata": {}, "outputs": [], "source": [ - "train_data_layer = nemo_nlp.BertPretrainingDataLayer(\n", + "import os\n", + "train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", " dataset=os.path.join(\"data/lm/wikitext-2\", \"train.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", @@ -175,7 +176,7 @@ " batch_size=BATCH_SIZE\n", ")\n", "\n", - "eval_data_layer = nemo_nlp.BertPretrainingDataLayer(\n", + "eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", " dataset=os.path.join(\"data/lm/wikitext-2\", \"valid.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", @@ -282,6 +283,13 @@ " \"grad_norm_clip\": None\n", " })" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/nlp/NERWithBERT.ipynb b/examples/nlp/NERWithBERT.ipynb index 19cf18f8389b..9d993fd4c3a8 100644 --- a/examples/nlp/NERWithBERT.ipynb +++ b/examples/nlp/NERWithBERT.ipynb @@ -13,16 +13,18 @@ "from nemo.utils.lr_policies import WarmupAnnealing\n", "\n", "import nemo.collections.nlp as nemo_nlp\n", - "from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n", - "from nemo.collections.nlp.utils.callbacks.token_classification import \\\n", - " eval_iter_callback, eval_epochs_done_callback" + "from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n", + "from nemo.collections.nlp.callbacks.token_classification_callback import \\\n", + " eval_iter_callback, eval_epochs_done_callback\n", + "from nemo.collections.nlp.nm.losses import TokenClassificationLoss\n", + "from nemo.collections.nlp.nm.trainables import TokenClassifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it." + "You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it." ] }, { @@ -78,7 +80,7 @@ "# If you're using a standard BERT model, you should do it like this. 
To see the full\n", "# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n", "tokenizer = NemoBertTokenizer(pretrained_model=\"bert-base-cased\")\n", - "bert_model = nemo_nlp.huggingface.BERT(\n", + "bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n", " pretrained_model_name=\"bert-base-cased\")" ] }, @@ -89,7 +91,7 @@ "outputs": [], "source": [ "# Describe training DAG\n", - "train_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n", + "train_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n", @@ -99,13 +101,12 @@ "label_ids = train_data_layer.dataset.label_ids\n", "num_classes = len(label_ids)\n", "\n", - "ner_classifier = nemo_nlp.TokenClassifier(hidden_size=bert_model.hidden_size,\n", + "hidden_size = bert_model.hidden_size\n", + "ner_classifier = TokenClassifier(hidden_size=hidden_size,\n", " num_classes=num_classes,\n", " dropout=CLASSIFICATION_DROPOUT)\n", "\n", - "ner_loss = nemo_nlp.TokenClassificationLoss(d_model=hidden_size,\n", - " num_classes=len(label_ids),\n", - " dropout=CLASSIFICATION_DROPOUT)\n", + "ner_loss = TokenClassificationLoss(num_classes=len(label_ids))\n", "\n", "input_ids, input_type_ids, input_mask, loss_mask, _, labels = train_data_layer()\n", "\n", @@ -124,7 +125,7 @@ "outputs": [], "source": [ "# Describe evaluation DAG\n", - "eval_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n", + "eval_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n", @@ -203,9 +204,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.4 64-bit", + "display_name": "Python 3", "language": "python", - "name": "python37464bitc56e562f54084a24b5afed5459c99218" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/examples/nlp/PunctuationWithBERT.ipynb b/examples/nlp/PunctuationWithBERT.ipynb index 58d0f57f8edb..de8c77eec5bc 100644 --- a/examples/nlp/PunctuationWithBERT.ipynb +++ b/examples/nlp/PunctuationWithBERT.ipynb @@ -11,12 +11,15 @@ "import os\n", "\n", "import nemo\n", + "from nemo import logging\n", "from nemo.utils.lr_policies import WarmupAnnealing\n", "\n", "import nemo.collections.nlp as nemo_nlp\n", - "from nemo.collections.nlp import NemoBertTokenizer, TokenClassifier, TokenClassificationLoss\n", - "from nemo.collections.nlp.data.datasets import utils\n", - "from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import eval_iter_callback, eval_epochs_done_callback\n", + "from nemo.collections.nlp.data import NemoBertTokenizer\n", + "from nemo.collections.nlp.nm.trainables import TokenClassifier\n", + "from nemo.collections.nlp.nm.losses import TokenClassificationLoss, LossAggregatorNM\n", + "from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback\n", + "from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights\n", "\n", "DATA_DIR = \"PATH_TO_WHERE_THE_DATA_IS\"\n", "WORK_DIR = \"PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS\"\n", @@ -47,7 +50,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and 
consider including other datasets to improve the performance of the model. Use [NeMo/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data." + "In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/nemo/collections/nlp/data/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data." ] }, { @@ -57,7 +60,8 @@ "outputs": [], "source": [ "# This should take about a minute since the data is already downloaded in the previous step\n", - "! python ../../scripts/get_tatoeba_data.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES" + "\n", + "! python ../../nemo/collections/nlp/data/scripts/get_tatoeba.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES" ] }, { @@ -116,7 +120,7 @@ "# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n", "\n", "tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)\n", - "bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)" + "bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)" ] }, { @@ -132,7 +136,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n", + "train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n", @@ -144,14 +148,14 @@ "\n", "\n", "# Define classifier for Punctuation and Capitalization tasks\n", - "punct_classifier = nemo_nlp.TokenClassifier(\n", + "punct_classifier = TokenClassifier(\n", " hidden_size=bert_model.hidden_size,\n", " num_classes=len(punct_label_ids),\n", " dropout=CLASSIFICATION_DROPOUT,\n", " num_layers=PUNCT_NUM_FC_LAYERS,\n", " name='Punctuation')\n", "\n", - "capit_classifier = nemo_nlp.TokenClassifier(\n", + "capit_classifier = TokenClassifier(\n", " hidden_size=bert_model.hidden_size,\n", " num_classes=len(capit_label_ids),\n", " dropout=CLASSIFICATION_DROPOUT,\n", @@ -160,14 +164,14 @@ "\n", "# If you don't want to use weighted loss for Punctuation task, use class_weights=None\n", "punct_label_freqs = train_data_layer.dataset.punct_label_frequencies\n", - "class_weights = utils.calc_class_weights(punct_label_freqs)\n", + "class_weights = calc_class_weights(punct_label_freqs)\n", "\n", "# define loss\n", - "punct_loss = nemo_nlp.TokenClassificationLoss(\n", + "punct_loss = TokenClassificationLoss(\n", " num_classes=len(punct_label_ids),\n", " class_weights=class_weights)\n", - "capit_loss = nemo_nlp.TokenClassificationLoss(num_classes=len(capit_label_ids))\n", - "task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)" + "capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids))\n", + "task_loss = LossAggregatorNM(num_inputs=2)" ] }, { @@ -218,7 +222,7 @@ "# during creation of the train_data_layer to make sure that the mapping is correct in case some of the labels from\n", "# the train set are missing in the dev set.\n", "\n", - "eval_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n", + "eval_data_layer = 
nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n", @@ -361,7 +365,7 @@ "metadata": {}, "outputs": [], "source": [ - "infer_data_layer = nemo_nlp.BertTokenClassificationInferDataLayer(\n", + "infer_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(\n", " queries=queries,\n", " tokenizer=tokenizer,\n", " max_seq_length=MAX_SEQ_LENGTH,\n", @@ -399,7 +403,7 @@ "capit_preds = np.argmax(capit_logits, axis=2)\n", "\n", "for i, query in enumerate(queries):\n", - " nf.logger.info(f'Query: {query}')\n", + " logging(f'Query: {query}')\n", "\n", " punct_pred = punct_preds[i][subtokens_mask[i] > 0.5]\n", " capit_pred = capit_preds[i][subtokens_mask[i] > 0.5]\n", @@ -419,7 +423,7 @@ " if punct_label != 'O':\n", " output += punct_label\n", " output += ' '\n", - " nf.logger.info(f'Combined: {output.strip()}\\n')" + " logging(f'Combined: {output.strip()}\\n')" ] }, { diff --git a/examples/nlp/asr_postprocessor.py b/examples/nlp/asr_postprocessor.py index f65de6e8becc..483516621de8 100644 --- a/examples/nlp/asr_postprocessor.py +++ b/examples/nlp/asr_postprocessor.py @@ -1,13 +1,32 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import math import os import torch -import nemo import nemo.collections.nlp as nemo_nlp +import nemo.collections.nlp.nm.data_layers.machine_translation_datalayer +from nemo import logging +from nemo.collections.nlp.callbacks.machine_translation_callback import ( + eval_epochs_done_callback_wer, + eval_iter_callback, +) from nemo.collections.nlp.data.tokenizers.bert_tokenizer import NemoBertTokenizer -from nemo.collections.nlp.utils.callbacks.translation import eval_epochs_done_callback_wer, eval_iter_callback from nemo.core.callbacks import CheckpointCallback from nemo.utils.lr_policies import SquareAnnealing @@ -47,7 +66,7 @@ parser.add_argument("--beam_size", default=4, type=int) parser.add_argument("--len_pen", default=0.0, type=float) parser.add_argument( - "--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt", + "--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt" ) args = parser.parse_args() @@ -66,14 +85,14 @@ tokens_to_add = vocab_size - tokenizer.vocab_size zeros_transform = nemo.backends.pytorch.common.ZerosLikeNM() -encoder = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_model) +encoder = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_model) device = encoder.bert.embeddings.word_embeddings.weight.get_device() zeros = torch.zeros((tokens_to_add, args.d_model)).to(device=device) encoder.bert.embeddings.word_embeddings.weight.data = torch.cat( (encoder.bert.embeddings.word_embeddings.weight.data, zeros) ) -decoder = nemo_nlp.TransformerDecoderNM( +decoder = nemo_nlp.nm.trainables.TransformerDecoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -90,11 +109,13 @@ decoder.restore_from(args.restore_from, local_rank=args.local_rank) -t_log_softmax = nemo_nlp.TokenClassifier(args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True) +t_log_softmax = nemo_nlp.nm.trainables.TokenClassifier( + args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True +) -loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), label_smoothing=0.1) +loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), label_smoothing=0.1) -beam_search = nemo_nlp.BeamSearchTranslatorNM( +beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM( decoder=decoder, log_softmax=t_log_softmax, max_seq_length=args.max_seq_length, @@ -114,7 +135,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): dataset_src = os.path.join(args.data_dir, dataset + "." + args.src_lang) dataset_tgt = os.path.join(args.data_dir, dataset + "." 
+ args.tgt_lang) - data_layer = nemo_nlp.TranslationDataLayer( + data_layer = nemo_nlp.nm.data_layers.machine_translation_datalayer.TranslationDataLayer( tokenizer_src=tokenizer, tokenizer_tgt=tokenizer, dataset_src=dataset_src, @@ -126,7 +147,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): input_type_ids = zeros_transform(input_type_ids=src) src_hiddens = encoder(input_ids=src, token_type_ids=input_type_ids, attention_mask=src_mask) tgt_hiddens = decoder( - input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask, + input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) log_softmax = t_log_softmax(hidden_states=tgt_hiddens) loss = loss_fn(logits=log_softmax, target_ids=labels) @@ -150,7 +171,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): def print_loss(x): loss = x[0].item() - nemo.logging.info("Training loss: {:.4f}".format(loss)) + logging.info("Training loss: {:.4f}".format(loss)) # callbacks @@ -186,6 +207,6 @@ def print_loss(x): callbacks=callbacks, optimizer=args.optimizer, lr_policy=lr_policy, - optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay}, batches_per_step=args.iter_per_step, ) diff --git a/examples/nlp/bert_pretraining.py b/examples/nlp/bert_pretraining.py index 2207fe5184fa..046814231296 100644 --- a/examples/nlp/bert_pretraining.py +++ b/examples/nlp/bert_pretraining.py @@ -1,5 +1,18 @@ -#!/usr/bin/env python3 -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= """ @@ -62,14 +75,13 @@ import math import os -import torch -from pytorch_transformers import BertConfig +from transformers import BertConfig -import nemo +import nemo.backends.pytorch.common as nemo_common import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import BERTPretrainingDataDesc -from nemo.collections.nlp.transformer.utils import gelu -from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_epochs_done_callback, eval_iter_callback +import nemo.core as nemo_core +from nemo import logging +from nemo.collections.nlp.data.datasets.lm_bert_dataset import BERTPretrainingDataDesc from nemo.utils.lr_policies import get_lr_policy parser = argparse.ArgumentParser(description='BERT pretraining') @@ -86,9 +98,7 @@ parser.add_argument("--beta2", default=0.25, type=float) parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--weight_decay", default=0.0, type=float) -parser.add_argument( - "--tokenizer", default="sentence-piece", type=str, choices=["sentence-piece", "nemo-bert"], -) +parser.add_argument("--tokenizer", default="sentence-piece", type=str, choices=["sentence-piece", "nemo-bert"]) parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--sample_size", default=1e7, type=int) parser.add_argument("--mask_probability", default=0.15, type=float) @@ -108,14 +118,10 @@ ) parser.add_argument("--data_dir", default="data/lm/wikitext-2", type=str) parser.add_argument( - "--preprocessed_data", action="store_true", default=False, help="specify if using preprocessed data", -) -parser.add_argument( - "--gradient_predivide", action="store_true", default=False, help="use gradient predivide", -) -parser.add_argument( - "--only_mlm_loss", action="store_true", default=False, help="use only masked language model loss", + "--preprocessed_data", action="store_true", default=False, help="specify if using preprocessed data" ) +parser.add_argument("--gradient_predivide", action="store_true", default=False, help="use gradient predivide") +parser.add_argument("--only_mlm_loss", action="store_true", default=False, help="use only masked language model loss") parser.add_argument( "--max_steps", default=-1, @@ -125,9 +131,7 @@ ) parser.add_argument("--dataset_name", default="wikitext-2", type=str) parser.add_argument("--load_dir", default=None, type=str) -parser.add_argument( - "--bert_checkpoint", default=None, type=str, help="specify path to pretrained BERT weights", -) +parser.add_argument("--bert_checkpoint", default=None, type=str, help="specify path to pretrained BERT weights") parser.add_argument("--work_dir", default="outputs/bert_lm", type=str) parser.add_argument("--save_epoch_freq", default=1, type=int) parser.add_argument("--save_step_freq", default=100, type=int) @@ -135,8 +139,8 @@ parser.add_argument("--config_file", default=None, type=str, help="The BERT model config") args = parser.parse_args() -nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, +nf = nemo_core.NeuralModuleFactory( + backend=nemo_core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=args.work_dir, @@ -158,23 +162,23 @@ if not args.preprocessed_data: special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] data_desc = BERTPretrainingDataDesc( - args.dataset_name, args.data_dir, args.vocab_size, args.sample_size, special_tokens, 'train.txt', + 
args.dataset_name, args.data_dir, args.vocab_size, args.sample_size, special_tokens, 'train.txt' ) if args.tokenizer == "sentence-piece": - nemo.logging.info("To use SentencePieceTokenizer.") - tokenizer = nemo_nlp.SentencePieceTokenizer(model_path=data_desc.tokenizer_model) + logging.info("To use SentencePieceTokenizer.") + tokenizer = nemo_nlp.data.SentencePieceTokenizer(model_path=data_desc.tokenizer_model) tokenizer.add_special_tokens(special_tokens) elif args.tokenizer == "nemo-bert": - nemo.logging.info("To use NemoBertTokenizer.") + logging.info("To use NemoBertTokenizer.") vocab_file = os.path.join(args.data_dir, 'vocab.txt') # To train on a Chinese dataset, use NemoBertTokenizer - tokenizer = nemo_nlp.NemoBertTokenizer(vocab_file=vocab_file) + tokenizer = nemo_nlp.data.NemoBertTokenizer(vocab_file=vocab_file) else: raise ValueError("Please add your tokenizer " "or use sentence-piece or nemo-bert.") args.vocab_size = tokenizer.vocab_size print(vars(args)) -bert_model = nemo_nlp.huggingface.BERT( +bert_model = nemo_nlp.nm.trainables.huggingface.BERT( vocab_size=args.vocab_size, num_hidden_layers=args.num_hidden_layers, hidden_size=args.hidden_size, @@ -191,17 +195,17 @@ data layers, BERT encoder, and MLM and NSP loss functions """ -mlm_classifier = nemo_nlp.BertTokenClassifier( - args.hidden_size, num_classes=args.vocab_size, activation=args.hidden_act, log_softmax=True, +mlm_classifier = nemo_nlp.nm.trainables.token_classification_nm.BertTokenClassifier( + args.hidden_size, num_classes=args.vocab_size, activation=args.hidden_act, log_softmax=True ) -mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM() +mlm_loss_fn = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM() if not args.only_mlm_loss: - nsp_classifier = nemo_nlp.SequenceClassifier( - args.hidden_size, num_classes=2, num_layers=2, activation='tanh', log_softmax=False, + nsp_classifier = nemo_nlp.nm.trainables.sequence_classification_nm.SequenceClassifier( + args.hidden_size, num_classes=2, num_layers=2, activation='tanh', log_softmax=False ) - nsp_loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss() + nsp_loss_fn = nemo_common.CrossEntropyLoss() - bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2) + bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) # tie weights of MLM softmax layer and embedding layer of the encoder if mlm_classifier.mlp.last_linear_layer.weight.shape != bert_model.bert.embeddings.word_embeddings.weight.shape: @@ -209,31 +213,26 @@ mlm_classifier.mlp.last_linear_layer.weight = bert_model.bert.embeddings.word_embeddings.weight -def create_pipeline( - data_file, batch_size, preprocessed_data=False, batches_per_step=1, **kwargs, -): +def create_pipeline(data_file, batch_size, preprocessed_data=False, batches_per_step=1, **kwargs): if not preprocessed_data: max_seq_length, mask_probability, short_seq_prob = ( kwargs['max_seq_length'], kwargs['mask_probability'], kwargs['short_seq_prob'], ) - data_layer = nemo_nlp.BertPretrainingDataLayer( - tokenizer, data_file, max_seq_length, mask_probability, short_seq_prob, batch_size=batch_size, + data_layer = nemo_nlp.nm.data_layers.lm_bert_datalayer.BertPretrainingDataLayer( + tokenizer, data_file, max_seq_length, mask_probability, short_seq_prob, batch_size=batch_size ) else: - training, max_predictions_per_seq = ( - kwargs['training'], - kwargs['max_predictions_per_seq'], - ) - data_layer = nemo_nlp.BertPretrainingPreprocessedDataLayer( - data_file, max_predictions_per_seq, batch_size=batch_size, training=training, + training, max_predictions_per_seq 
= (kwargs['training'], kwargs['max_predictions_per_seq']) + data_layer = nemo_nlp.nm.data_layers.lm_bert_datalayer.BertPretrainingPreprocessedDataLayer( + data_file, max_predictions_per_seq, batch_size=batch_size, training=training ) steps_per_epoch = math.ceil(len(data_layer) / (batch_size * args.num_gpus * batches_per_step)) - (input_ids, input_type_ids, input_mask, output_ids, output_mask, nsp_labels,) = data_layer() - hidden_states = bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + (input_ids, input_type_ids, input_mask, output_ids, output_mask, nsp_labels) = data_layer() + hidden_states = bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) mlm_logits = mlm_classifier(hidden_states=hidden_states) mlm_loss = mlm_loss_fn(logits=mlm_logits, output_ids=output_ids, output_mask=output_mask) if not args.only_mlm_loss: @@ -275,15 +274,15 @@ def create_pipeline( else: log_tensors = [train_loss] print_msg = "Loss: {:.3f}" -train_callback = nemo.core.SimpleLossLoggerCallback( +train_callback = nemo_core.SimpleLossLoggerCallback( tensors=log_tensors, step_freq=args.print_step_freq, - print_func=lambda x: nemo.logging.info(print_msg.format(*[y.item() for y in x])), + print_func=lambda x: logging.info(print_msg.format(*[y.item() for y in x])), get_tb_values=lambda x: [["loss", x[0]]], tb_writer=nf.tb_writer, ) -ckpt_callback = nemo.core.CheckpointCallback( +ckpt_callback = nemo_core.CheckpointCallback( folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, load_from_folder=args.load_dir, @@ -294,11 +293,11 @@ def create_pipeline( if args.lr_policy is not None: if args.max_steps < 0: lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) else: lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.max_steps, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.max_steps, warmup_ratio=args.lr_warmup_proportion ) else: lr_policy_fn = None diff --git a/examples/nlp/dialogue_state_tracking_trade.py b/examples/nlp/dialogue_state_tracking_trade.py new file mode 100644 index 000000000000..996e0195d721 --- /dev/null +++ b/examples/nlp/dialogue_state_tracking_trade.py @@ -0,0 +1,226 @@ +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +""" An implementation of the paper "Transferable Multi-Domain State Generator +for Task-Oriented Dialogue Systems" (Wu et al., 2019 - ACL 2019) +Adopted from: https://github.com/jasonwu0731/trade-dst +""" + +import argparse +import math +import os + +import numpy as np + +import nemo.collections.nlp as nemo_nlp +import nemo.core as nemo_core +from nemo import logging +from nemo.backends.pytorch.common import EncoderRNN +from nemo.collections.nlp.callbacks.state_tracking_trade_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data.datasets.state_tracking_trade_dataset import MultiWOZDataDesc +from nemo.utils.lr_policies import get_lr_policy + +parser = argparse.ArgumentParser(description='Dialog state tracking with TRADE model on MultiWOZ dataset') +parser.add_argument("--local_rank", default=None, type=int) +parser.add_argument("--batch_size", default=16, type=int) +parser.add_argument("--eval_batch_size", default=16, type=int) +parser.add_argument("--num_gpus", default=1, type=int) +parser.add_argument("--num_epochs", default=10, type=int) +parser.add_argument("--lr_warmup_proportion", default=0.0, type=float) +parser.add_argument("--lr", default=0.001, type=float) +parser.add_argument("--lr_policy", default=None, type=str) +parser.add_argument("--min_lr", default=1e-4, type=float) +parser.add_argument("--weight_decay", default=0.0, type=float) +parser.add_argument("--emb_dim", default=400, type=int) +parser.add_argument("--hid_dim", default=400, type=int) +parser.add_argument("--n_layers", default=1, type=int) +parser.add_argument("--dropout", default=0.2, type=float) +parser.add_argument("--input_dropout", default=0.2, type=float) +parser.add_argument("--data_dir", default='data/statetracking/multiwoz2.1', type=str) +parser.add_argument("--train_file_prefix", default='train', type=str) +parser.add_argument("--eval_file_prefix", default='test', type=str) +parser.add_argument("--work_dir", default='outputs', type=str) +parser.add_argument("--save_epoch_freq", default=-1, type=int) +parser.add_argument("--save_step_freq", default=-1, type=int) +parser.add_argument("--optimizer_kind", default="adam", type=str) +parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) +parser.add_argument("--shuffle_data", action='store_true') +parser.add_argument("--num_train_samples", default=-1, type=int) +parser.add_argument("--num_eval_samples", default=-1, type=int) +parser.add_argument("--grad_norm_clip", type=float, default=10, help="gradient clipping") +parser.add_argument("--teacher_forcing", default=0.5, type=float) +args = parser.parse_args() + +# List of the domains to be considered +domains = {"attraction": 0, "restaurant": 1, "taxi": 2, "train": 3, "hotel": 4} + +if not os.path.exists(args.data_dir): + raise ValueError(f'Data not found at {args.data_dir}') + +work_dir = f'{args.work_dir}/DST_TRADE' + +data_desc = MultiWOZDataDesc(args.data_dir, domains) + +nf = nemo_core.NeuralModuleFactory( + backend=nemo_core.Backend.PyTorch, + local_rank=args.local_rank, + optimization_level=args.amp_opt_level, + log_dir=work_dir, + create_tb_writer=True, + files_to_copy=[__file__], + add_time_to_log_dir=True, +) + +vocab_size = len(data_desc.vocab) +encoder = EncoderRNN(vocab_size, args.emb_dim, args.hid_dim, args.dropout, args.n_layers) + +decoder = nemo_nlp.nm.trainables.TRADEGenerator( + data_desc.vocab, + encoder.embedding, + args.hid_dim, + 
args.dropout, + data_desc.slots, + len(data_desc.gating_dict), + teacher_forcing=args.teacher_forcing, +) + +gate_loss_fn = nemo_nlp.nm.losses.CrossEntropyLoss3D(num_classes=len(data_desc.gating_dict)) +ptr_loss_fn = nemo_nlp.nm.losses.TRADEMaskedCrossEntropy() +total_loss_fn = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) + + +def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefix, is_training): + logging.info(f"Loading {data_prefix} data...") + shuffle = args.shuffle_data if is_training else False + + data_layer = nemo_nlp.nm.data_layers.MultiWOZDataLayer( + args.data_dir, + data_desc.domains, + all_domains=data_desc.all_domains, + vocab=data_desc.vocab, + slots=data_desc.slots, + gating_dict=data_desc.gating_dict, + num_samples=num_samples, + shuffle=shuffle, + num_workers=0, + batch_size=batch_size, + mode=data_prefix, + is_training=is_training, + input_dropout=input_dropout, + ) + + src_ids, src_lens, tgt_ids, tgt_lens, gate_labels, turn_domain = data_layer() + + data_size = len(data_layer) + logging.info(f'The length of data layer is {data_size}') + + if data_size < batch_size: + logging.warning("Batch_size is larger than the dataset size") + logging.warning("Reducing batch_size to dataset size") + batch_size = data_size + + steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) + logging.info(f"Steps_per_epoch = {steps_per_epoch}") + + outputs, hidden = encoder(inputs=src_ids, input_lens=src_lens) + + point_outputs, gate_outputs = decoder( + encoder_hidden=hidden, encoder_outputs=outputs, input_lens=src_lens, src_ids=src_ids, targets=tgt_ids + ) + + gate_loss = gate_loss_fn(logits=gate_outputs, labels=gate_labels) + ptr_loss = ptr_loss_fn(logits=point_outputs, targets=tgt_ids, loss_mask=tgt_lens) + total_loss = total_loss_fn(loss_1=gate_loss, loss_2=ptr_loss) + + if is_training: + tensors_to_evaluate = [total_loss, gate_loss, ptr_loss] + else: + tensors_to_evaluate = [total_loss, point_outputs, gate_outputs, gate_labels, turn_domain, tgt_ids, tgt_lens] + + return tensors_to_evaluate, total_loss, ptr_loss, gate_loss, steps_per_epoch, data_layer + + +( + tensors_train, + total_loss_train, + ptr_loss_train, + gate_loss_train, + steps_per_epoch_train, + data_layer_train, +) = create_pipeline( + args.num_train_samples, + batch_size=args.batch_size, + num_gpus=args.num_gpus, + input_dropout=args.input_dropout, + data_prefix=args.train_file_prefix, + is_training=True, +) + +tensors_eval, total_loss_eval, ptr_loss_eval, gate_loss_eval, steps_per_epoch_eval, data_layer_eval = create_pipeline( + args.num_eval_samples, + batch_size=args.eval_batch_size, + num_gpus=args.num_gpus, + input_dropout=0.0, + data_prefix=args.eval_file_prefix, + is_training=False, +) + +# Create callbacks for train and eval modes +train_callback = nemo_core.SimpleLossLoggerCallback( + tensors=[total_loss_train, gate_loss_train, ptr_loss_train], + print_func=lambda x: logging.info( + f'Loss:{str(np.round(x[0].item(), 3))}, ' + f'Gate Loss:{str(np.round(x[1].item(), 3))}, ' + f'Pointer Loss:{str(np.round(x[2].item(), 3))}' + ), + tb_writer=nf.tb_writer, + get_tb_values=lambda x: [["loss", x[0]], ["gate_loss", x[1]], ["pointer_loss", x[2]]], + step_freq=steps_per_epoch_train, +) + +eval_callback = nemo_core.EvaluatorCallback( + eval_tensors=tensors_eval, + user_iter_callback=lambda x, y: eval_iter_callback(x, y, data_desc), + user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, data_desc), + tb_writer=nf.tb_writer, + eval_step=steps_per_epoch_train, +) + +ckpt_callback 
= nemo_core.CheckpointCallback( + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq +) + +if args.lr_policy is not None: + total_steps = args.num_epochs * steps_per_epoch_train + lr_policy_fn = get_lr_policy( + args.lr_policy, total_steps=total_steps, warmup_ratio=args.lr_warmup_proportion, min_lr=args.min_lr + ) +else: + lr_policy_fn = None + +grad_norm_clip = args.grad_norm_clip if args.grad_norm_clip > 0 else None +nf.train( + tensors_to_optimize=[total_loss_train], + callbacks=[eval_callback, train_callback, ckpt_callback], + lr_policy=lr_policy_fn, + optimizer=args.optimizer_kind, + optimization_params={ + "num_epochs": args.num_epochs, + "lr": args.lr, + "grad_norm_clip": grad_norm_clip, + "weight_decay": args.weight_decay, + }, +) diff --git a/examples/nlp/glue_with_BERT.py b/examples/nlp/glue_benchmark_with_bert.py similarity index 72% rename from examples/nlp/glue_with_BERT.py rename to examples/nlp/glue_benchmark_with_bert.py index d7dcc8bc87b7..a7d909d93247 100644 --- a/examples/nlp/glue_with_BERT.py +++ b/examples/nlp/glue_benchmark_with_bert.py @@ -24,14 +24,14 @@ https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e To run this example on 1 GPU: -python glue_with_BERT.py \ +python glue_benchmark_with_bert.py \ --data_dir /path_to_data_dir/MRPC \ --task_name mrpc \ --work_dir /path_to_output_folder \ To run this example on 4 GPUs with mixed precision: python -m torch.distributed.launch \ ---nproc_per_node=4 glue_with_BERT.py \ +--nproc_per_node=4 glue_benchmark_with_bert.py \ --data_dir=/path_to_data/MNLI \ --task_name mnli \ --work_dir /path_to_output_folder \ @@ -64,17 +64,15 @@ import json import os -import nemo import nemo.collections.nlp as nemo_nlp +import nemo.core as nemo_core +from nemo import logging from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss -from nemo.collections.nlp import ( - GlueDataLayerClassification, - GlueDataLayerRegression, - NemoBertTokenizer, - SentencePieceTokenizer, -) -from nemo.collections.nlp.data.datasets.utils import output_modes, processors -from nemo.collections.nlp.utils.callbacks.glue import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors +from nemo.collections.nlp.nm.data_layers import GlueClassificationDataLayer, GlueRegressionDataLayer +from nemo.collections.nlp.nm.trainables import SequenceClassifier, SequenceRegression from nemo.utils.lr_policies import get_lr_policy parser = argparse.ArgumentParser(description="GLUE_with_pretrained_BERT") @@ -85,94 +83,71 @@ default='COLA', type=str, required=True, - help="The input data dir. Should contain the .tsv \ - files (or other data files) for the task.", + help="The input data dir. 
Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--task_name", default="CoLA", type=str, required=True, - choices=['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli',], - help="GLUE task name, MNLI includes both matched and \ - mismatched tasks", -) -parser.add_argument( - "--dataset_type", default="GLUEDataset", type=str, help='Type of dataset to create datalayers', -) -parser.add_argument( - "--pretrained_bert_model", default="bert-base-cased", type=str, help="Name of the pre-trained model", -) -parser.add_argument( - "--bert_checkpoint", default=None, type=str, help="Path to model checkpoint", + choices=['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli'], + help="GLUE task name, MNLI includes both matched and mismatched tasks", ) parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", + "--pretrained_bert_model", default="bert-base-cased", type=str, help="Name of the pre-trained model" ) +parser.add_argument("--bert_checkpoint", default=None, type=str, help="Path to model checkpoint") +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( "--tokenizer_model", default="tokenizer.model", type=str, - help="Path to pretrained tokenizer model, \ - only used if --tokenizer is sentencepiece", + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", ) parser.add_argument( "--tokenizer", default="nemobert", type=str, choices=["nemobert", "sentencepiece"], - help="tokenizer to use, \ - only relevant when using custom pretrained checkpoint.", + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) parser.add_argument( "--max_seq_length", default=128, type=int, choices=range(1, 513), - help="The maximum total input sequence length after \ - tokenization.Sequences longer than this will be \ + help="The maximum total input sequence length after tokenization.Sequences longer than this will be \ truncated, sequences shorter will be padded.", ) parser.add_argument("--optimizer_kind", default="adam", type=str, help="Optimizer kind") parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--lr", default=5e-5, type=float, help="The initial learning rate.") parser.add_argument("--lr_warmup_proportion", default=0.1, type=float) -parser.add_argument( - "--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.", -) -parser.add_argument( - "--num_epochs", default=3, type=int, help="Total number of training epochs to perform.", -) -parser.add_argument( - "--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.", -) +parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") +parser.add_argument("--num_epochs", default=3, type=int, help="Total number of training epochs to perform.") +parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.") parser.add_argument("--num_gpus", default=1, type=int, help="Number of GPUs") parser.add_argument( - "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed precision", -) -parser.add_argument( - "--local_rank", type=int, default=None, help="For distributed training: local_rank", + "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed 
precision" ) +parser.add_argument("--local_rank", type=int, default=None, help="For distributed training: local_rank") parser.add_argument( "--work_dir", default='output_glue', type=str, - help="The output directory where the model predictions \ - and checkpoints will be written.", + help="The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument( "--save_epoch_freq", default=1, type=int, - help="Frequency of saving checkpoint \ - '-1' - epoch checkpoint won't be saved", + help="Frequency of saving checkpoint '-1' - epoch checkpoint won't be saved", ) parser.add_argument( "--save_step_freq", default=-1, type=int, - help="Frequency of saving checkpoint \ - '-1' - step checkpoint won't be saved", + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", ) parser.add_argument("--loss_step_freq", default=25, type=int, help="Frequency of printing loss") @@ -181,8 +156,7 @@ if not os.path.exists(args.data_dir): raise FileNotFoundError( "GLUE datasets not found. Datasets can be " - "obtained at https://gist.github.com/W4ngatang/ \ - 60c2bdb54d156a41194446737ce03e2e" + "obtained at https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e" ) args.work_dir = f'{args.work_dir}/{args.task_name.upper()}' @@ -203,8 +177,8 @@ output_mode = output_modes[args.task_name] # Instantiate neural factory with supported backend -nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, +nf = nemo_core.NeuralModuleFactory( + backend=nemo_core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=args.work_dir, @@ -216,10 +190,10 @@ if args.bert_checkpoint is None: """ Use this if you're using a standard BERT model. To see the list of pretrained models, call: - nemo_nlp.huggingface.BERT.list_pretrained_models() + nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models() """ tokenizer = NemoBertTokenizer(args.pretrained_bert_model) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) else: """ Use this if you're using a BERT model that you pre-trained yourself. Replace BERT-STEP-150000.pt with the path to your checkpoint. 
@@ -234,9 +208,9 @@ if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) @@ -244,10 +218,10 @@ # uses [CLS] token for classification (the first token) if args.task_name == 'sts-b': - pooler = nemo_nlp.SequenceRegression(hidden_size=hidden_size) + pooler = SequenceRegression(hidden_size=hidden_size) glue_loss = MSELoss() else: - pooler = nemo_nlp.SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) + pooler = SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) glue_loss = CrossEntropyLoss() @@ -259,12 +233,11 @@ def create_pipeline( evaluate=False, processor=task_processors[0], ): - data_layer = GlueDataLayerClassification + data_layer = GlueClassificationDataLayer if output_mode == 'regression': - data_layer = GlueDataLayerRegression + data_layer = GlueRegressionDataLayer data_layer = data_layer( - dataset_type=args.dataset_type, processor=processor, evaluate=evaluate, batch_size=batch_size, @@ -278,7 +251,7 @@ def create_pipeline( input_ids, input_type_ids, input_mask, labels = data_layer() - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) """ For STS-B (regressiont tast), the pooler_output represents a is single @@ -296,18 +269,13 @@ def create_pipeline( return loss, steps_per_epoch, data_layer, [pooler_output, labels] -token_params = { - 'bos_token': None, - 'eos_token': '[SEP]', - 'pad_token': '[PAD]', - 'cls_token': '[CLS]', -} +token_params = {'bos_token': None, 'eos_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]'} train_loss, steps_per_epoch, _, _ = create_pipeline() _, _, eval_data_layer, eval_tensors = create_pipeline(evaluate=True) callbacks_eval = [ - nemo.core.EvaluatorCallback( + nemo_core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, args.work_dir, eval_task_names[0]), @@ -323,7 +291,7 @@ def create_pipeline( if args.task_name == 'mnli': _, _, eval_data_layer_mm, eval_tensors_mm = create_pipeline(evaluate=True, processor=task_processors[1]) callbacks_eval.append( - nemo.core.EvaluatorCallback( + nemo_core.EvaluatorCallback( eval_tensors=eval_tensors_mm, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, args.work_dir, eval_task_names[1]), @@ -332,8 +300,8 @@ def create_pipeline( ) ) -nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") -callback_train = nemo.core.SimpleLossLoggerCallback( +logging.info(f"steps_per_epoch = {steps_per_epoch}") +callback_train = nemo_core.SimpleLossLoggerCallback( tensors=[train_loss], print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], @@ -341,12 +309,12 @@ def create_pipeline( tb_writer=nf.tb_writer, ) -ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, +ckpt_callback = 
nemo_core.CheckpointCallback( + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( diff --git a/examples/nlp/joint_intent_slot_infer.py b/examples/nlp/joint_intent_slot_infer.py index d2f3efaf8c68..942d1c98bd0d 100644 --- a/examples/nlp/joint_intent_slot_infer.py +++ b/examples/nlp/joint_intent_slot_infer.py @@ -1,13 +1,29 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import os import numpy as np -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import classification_report from transformers import BertTokenizer -import nemo -import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc +import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm +from nemo import logging +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') @@ -28,22 +44,24 @@ raise ValueError(f'Data not found at {args.data_dir}') nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=None ) """ Load the pretrained BERT parameters See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + pretrained_model_name=args.pretrained_bert_model +) hidden_size = pretrained_bert_model.hidden_size tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) data_desc = JointIntentSlotDataDesc(args.data_dir, args.do_lower_case, args.dataset_name) # Evaluation pipeline -nemo.logging.info("Loading eval data...") -data_layer = nemo_nlp.BertJointIntentSlotDataLayer( +logging.info("Loading eval data...") +data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotDataLayer( input_file=f'{data_desc.data_dir}/{args.eval_file_prefix}.tsv', slot_file=f'{data_desc.data_dir}/{args.eval_file_prefix}_slots.tsv', pad_label=data_desc.pad_label, @@ -51,15 +69,13 @@ max_seq_length=args.max_seq_length, shuffle=False, batch_size=args.batch_size, - # num_workers=0, - # local_rank=args.local_rank, ) 
-classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, +classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots ) -(ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots,) = data_layer() +(ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots) = data_layer() hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) intent_logits, slot_logits = classifier(hidden_states=hidden_states) @@ -69,7 +85,7 @@ # Instantiate an optimizer to perform `infer` action evaluated_tensors = nf.infer( - tensors=[intent_logits, slot_logits, loss_mask, subtokens_mask, intents, slots,], checkpoint_dir=args.work_dir, + tensors=[intent_logits, slot_logits, loss_mask, subtokens_mask, intents, slots], checkpoint_dir=args.work_dir ) @@ -86,13 +102,13 @@ def get_preds(logits): ] pred_intents = np.argmax(intent_logits, 1) -nemo.logging.info('Intent prediction results') +logging.info('Intent prediction results') intents = np.asarray(intents) pred_intents = np.asarray(pred_intents) intent_accuracy = sum(intents == pred_intents) / len(pred_intents) -nemo.logging.info(f'Intent accuracy: {intent_accuracy}') -nemo.logging.info(classification_report(intents, pred_intents)) +logging.info(f'Intent accuracy: {intent_accuracy}') +logging.info(classification_report(intents, pred_intents)) slot_preds = np.argmax(slot_logits, axis=2) slot_preds_list, slot_labels_list = [], [] @@ -101,9 +117,9 @@ def get_preds(logits): slot_preds_list.extend(list(slot_preds[i][subtokens_mask[i]])) slot_labels_list.extend(list(slot_labels[i][subtokens_mask[i]])) -nemo.logging.info('Slot prediction results') +logging.info('Slot prediction results') slot_labels_list = np.asarray(slot_labels_list) slot_preds_list = np.asarray(slot_preds_list) slot_accuracy = sum(slot_labels_list == slot_preds_list) / len(slot_labels_list) -nemo.logging.info(f'Slot accuracy: {slot_accuracy}') -nemo.logging.info(classification_report(slot_labels_list, slot_preds_list)) +logging.info(f'Slot accuracy: {slot_accuracy}') +logging.info(classification_report(slot_labels_list, slot_preds_list)) diff --git a/examples/nlp/joint_intent_slot_infer_b1.py b/examples/nlp/joint_intent_slot_infer_b1.py index 089a2c06820e..55c467f3f5ea 100644 --- a/examples/nlp/joint_intent_slot_infer_b1.py +++ b/examples/nlp/joint_intent_slot_infer_b1.py @@ -1,12 +1,28 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import argparse import numpy as np from transformers import BertTokenizer -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc -from nemo.collections.nlp.utils.nlp_utils import read_intent_slot_outputs +import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') @@ -23,14 +39,14 @@ args = parser.parse_args() nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None ) """ Load the pretrained BERT parameters See the list of pretrained models, call: -nemo_nlp.huggingface.BERT.list_pretrained_models() +nemo_nlp.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size @@ -40,13 +56,13 @@ if args.do_lower_case: query = query.lower() -data_layer = nemo_nlp.BertJointIntentSlotInferDataLayer( - queries=[query], tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotInferDataLayer( + queries=[query], tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1 ) # Create sentence classification loss on top -classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout, +classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout ) ids, type_ids, input_mask, loss_mask, subtokens_mask = data_layer() @@ -58,7 +74,7 @@ ########################################################################### -evaluated_tensors = nf.infer(tensors=[intent_logits, slot_logits, subtokens_mask], checkpoint_dir=args.work_dir,) +evaluated_tensors = nf.infer(tensors=[intent_logits, slot_logits, subtokens_mask], checkpoint_dir=args.work_dir) def concatenate(lists): @@ -68,5 +84,5 @@ def concatenate(lists): intent_logits, slot_logits, subtokens_mask = [concatenate(tensors) for tensors in evaluated_tensors] read_intent_slot_outputs( - [query], data_desc.intent_dict_file, data_desc.slot_dict_file, intent_logits, slot_logits, subtokens_mask, + [query], data_desc.intent_dict_file, data_desc.slot_dict_file, intent_logits, slot_logits, subtokens_mask ) diff --git a/examples/nlp/joint_intent_slot_with_bert.py b/examples/nlp/joint_intent_slot_with_bert.py index 8e0d5874f226..f700a21f7943 100644 --- a/examples/nlp/joint_intent_slot_with_bert.py +++ b/examples/nlp/joint_intent_slot_with_bert.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import math import os @@ -5,10 +21,12 @@ import numpy as np from transformers import BertTokenizer -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc -from nemo.collections.nlp.utils.callbacks.joint_intent_slot import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer +import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm +from nemo import logging +from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -44,9 +62,7 @@ parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--shuffle_data", action='store_true') parser.add_argument("--intent_loss_weight", default=0.6, type=float) -parser.add_argument( - "--class_balancing", default="regular", type=str, choices=["regular", "weighted_loss"], -) +parser.add_argument("--class_balancing", default="regular", type=str, choices=["regular", "weighted_loss"]) args = parser.parse_args() @@ -71,43 +87,47 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config, factory=nf) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + config_filename=args.bert_config + ) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model, factory=nf) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + pretrained_model_name=args.pretrained_bert_model + ) hidden_size = pretrained_bert_model.hidden_size data_desc = JointIntentSlotDataDesc( - args.data_dir, args.do_lower_case, args.dataset_name, args.none_slot_label, args.pad_label, + args.data_dir, args.do_lower_case, args.dataset_name, args.none_slot_label, args.pad_label ) # Create sentence classification loss on top -classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout, +classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout ) if args.class_balancing == 'weighted_loss': # Using weighted loss will enable weighted loss for both intents and slots # Use the intent_loss_weight hyperparameter to adjust intent loss to # prevent overfitting or underfitting. 
- loss_fn = nemo_nlp.JointIntentSlotLoss( + loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss( num_slots=data_desc.num_slots, slot_classes_loss_weights=data_desc.slot_weights, intent_classes_loss_weights=data_desc.intent_weights, intent_loss_weight=args.intent_loss_weight, ) else: - loss_fn = nemo_nlp.JointIntentSlotLoss(num_slots=data_desc.num_slots) + loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss(num_slots=data_desc.num_slots) def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' slot_file = f'{data_desc.data_dir}/{mode}_slots.tsv' shuffle = args.shuffle_data if mode == 'train' else False - data_layer = nemo_nlp.BertJointIntentSlotDataLayer( + data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotDataLayer( input_file=data_file, slot_file=slot_file, pad_label=data_desc.pad_label, @@ -116,43 +136,35 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod num_samples=num_samples, shuffle=shuffle, batch_size=batch_size, - num_workers=0, - local_rank=local_rank, ignore_extra_tokens=args.ignore_extra_tokens, ignore_start_end=args.ignore_start_end, ) - (ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots,) = data_layer() + (ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots) = data_layer() data_size = len(data_layer) print(f'The length of data layer is {data_size}') if data_size < batch_size: - nemo.logging.warning("Batch_size is larger than the dataset size") - nemo.logging.warning("Reducing batch_size to dataset size") + logging.warning("Batch_size is larger than the dataset size") + logging.warning("Reducing batch_size to dataset size") batch_size = data_size steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) - nemo.logging.info(f"Steps_per_epoch = {steps_per_epoch}") + logging.info(f"Steps_per_epoch = {steps_per_epoch}") hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) intent_logits, slot_logits = classifier(hidden_states=hidden_states) loss = loss_fn( - intent_logits=intent_logits, slot_logits=slot_logits, loss_mask=loss_mask, intents=intents, slots=slots, + intent_logits=intent_logits, slot_logits=slot_logits, loss_mask=loss_mask, intents=intents, slots=slots ) if mode == 'train': tensors_to_evaluate = [loss, intent_logits, slot_logits] else: - tensors_to_evaluate = [ - intent_logits, - slot_logits, - intents, - slots, - subtokens_mask, - ] + tensors_to_evaluate = [intent_logits, slot_logits, intents, slots, subtokens_mask] return tensors_to_evaluate, loss, steps_per_epoch, data_layer @@ -191,11 +203,11 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod # Create callback to save checkpoints ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( @@ -203,5 +215,5 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod callbacks=[train_callback, eval_callback, 
ckpt_callback], lr_policy=lr_policy_fn, optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay}, ) diff --git a/examples/nlp/transformer_lm.py b/examples/nlp/language_modeling_transformer.py similarity index 74% rename from examples/nlp/transformer_lm.py rename to examples/nlp/language_modeling_transformer.py index 41ca2e960ffb..9d2b08be9080 100644 --- a/examples/nlp/transformer_lm.py +++ b/examples/nlp/language_modeling_transformer.py @@ -1,10 +1,27 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import math import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import LanguageModelDataDesc -from nemo.collections.nlp.utils.callbacks.language_modeling import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.nm.data_layers.lm_transformer_datalayer +import nemo.collections.nlp.nm.trainables.common.token_classification_nm +from nemo.collections.nlp.callbacks.lm_transformer_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import LanguageModelDataDesc from nemo.utils.lr_policies import CosineAnnealing parser = nemo.utils.NemoArgParser(description='LM Transformer') @@ -67,14 +84,14 @@ # define tokenizer, in this example we use word-level tokenizer # we also adjust the vocabulary size to make it multiple of 8 to accelerate # training in fp16 mode with the use of Tensor Cores -tokenizer = nemo_nlp.WordTokenizer(f"{args.data_dir}/{args.tokenizer_model}") +tokenizer = nemo_nlp.data.WordTokenizer(f"{args.data_dir}/{args.tokenizer_model}") vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8) # instantiate necessary modules for the whole translation pipeline, namely # data layers, encoder, decoder, output log_softmax, beam_search_translator # and loss function -encoder = nemo_nlp.TransformerEncoderNM( +encoder = nemo_nlp.nm.trainables.TransformerEncoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -88,19 +105,23 @@ max_seq_length=args.max_seq_length, ) -log_softmax = nemo_nlp.TokenClassifier(args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True) +log_softmax = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True +) -loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), label_smoothing=args.label_smoothing) +loss = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( + pad_id=tokenizer.pad_id(), label_smoothing=args.label_smoothing +) 
# tie weight of embedding and log_softmax layers log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight def create_pipeline( - dataset, max_seq_length=args.max_seq_length, batch_step=args.max_seq_length, batch_size=args.batch_size, + dataset, max_seq_length=args.max_seq_length, batch_step=args.max_seq_length, batch_size=args.batch_size ): - data_layer = nemo_nlp.LanguageModelingDataLayer( - dataset, tokenizer, max_seq_length, batch_step, batch_size=batch_size + data_layer = nemo.collections.nlp.nm.data_layers.lm_transformer_datalayer.LanguageModelingDataLayer( + dataset, tokenizer, max_seq_length, batch_size, batch_step ) src, src_mask, labels = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) @@ -141,7 +162,7 @@ def create_pipeline( # callback which saves checkpoints once in a while callback_ckpt = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=-1, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=-1 ) # define learning rate decay policy diff --git a/examples/nlp/nmt_tutorial.py b/examples/nlp/machine_translation_tutorial.py similarity index 84% rename from examples/nlp/nmt_tutorial.py rename to examples/nlp/machine_translation_tutorial.py index 49775c187ce3..5ca3cc4a3ca5 100644 --- a/examples/nlp/nmt_tutorial.py +++ b/examples/nlp/machine_translation_tutorial.py @@ -1,4 +1,20 @@ -""" Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +""" See the tutorial and download the data here: https://nvidia.github.io/NeMo/nlp/ neural-machine-translation.html#translation-with-pretrained-model @@ -7,7 +23,7 @@ import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.utils.callbacks.translation import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.callbacks.machine_translation_callback import eval_epochs_done_callback, eval_iter_callback from nemo.utils.lr_policies import get_lr_policy parser = nemo.utils.NemoArgParser(description='Transformer for Neural Machine Translation') @@ -76,14 +92,14 @@ We use YouTokenToMe tokenizer trained on joint English & German data for both source and target languages. 
""" - src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") + src_tokenizer = nemo_nlp.data.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") src_vocab_size = src_tokenizer.vocab_size if args.src_tokenizer_model == args.tgt_tokenizer_model: tgt_tokenizer = src_tokenizer # source and target use the same tokenizer, set tie_weight to True tie_weight = True else: - tgt_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") + tgt_tokenizer = nemo_nlp.data.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") # source and target use different tokenizers, set tie_weight to False tie_weight = False tgt_vocab_size = tgt_tokenizer.vocab_size @@ -92,9 +108,9 @@ We use YouTokenToMeTokenizer for src since the src contains English words and CharTokenizer for tgt since the tgt contains Chinese characters. """ - src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") + src_tokenizer = nemo_nlp.data.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") src_vocab_size = src_tokenizer.vocab_size - tgt_tokenizer = nemo_nlp.CharTokenizer(vocab_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") + tgt_tokenizer = nemo_nlp.data.CharTokenizer(vocab_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") tgt_vocab_size = tgt_tokenizer.vocab_size # source and target use different tokenizers, set tie_weight to False tie_weight = False @@ -104,7 +120,7 @@ # instantiate necessary modules for the whole translation pipeline, namely # data layers, encoder, decoder, output log_softmax, beam_search_translator # and loss function -encoder = nemo_nlp.TransformerEncoderNM( +encoder = nemo_nlp.nm.trainables.TransformerEncoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -117,7 +133,7 @@ max_seq_length=args.max_seq_length, ) -decoder = nemo_nlp.TransformerDecoderNM( +decoder = nemo_nlp.nm.trainables.TransformerDecoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -130,11 +146,11 @@ max_seq_length=args.max_seq_length, ) -log_softmax = nemo_nlp.TokenClassifier( - args.d_model, num_classes=tgt_tokenizer.vocab_size, num_layers=1, log_softmax=True, +log_softmax = nemo_nlp.nm.trainables.token_classification_nm.TokenClassifier( + args.d_model, num_classes=tgt_tokenizer.vocab_size, num_layers=1, log_softmax=True ) -beam_search = nemo_nlp.BeamSearchTranslatorNM( +beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM( decoder=decoder, log_softmax=log_softmax, max_seq_length=args.max_seq_length, @@ -144,7 +160,7 @@ eos_token=tgt_tokenizer.eos_id(), ) -loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM( +loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( pad_id=tgt_tokenizer.pad_id(), label_smoothing=args.label_smoothing ) @@ -154,7 +170,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, training=True): - data_layer = nemo_nlp.TranslationDataLayer( + data_layer = nemo_nlp.nm.data_layers.machine_translation_datalayer.TranslationDataLayer( tokenizer_src=src_tokenizer, tokenizer_tgt=tgt_tokenizer, dataset_src=dataset_src, @@ -165,7 +181,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) tgt_hiddens = decoder( - input_ids_tgt=tgt, 
hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask, + input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) logits = log_softmax(hidden_states=tgt_hiddens) loss = loss_fn(logits=logits, target_ids=labels) @@ -207,7 +223,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai # callback which saves checkpoints once in a while ckpt_dir = nf.checkpoint_dir if not args.interactive else args.restore_checkpoint_from ckpt_callback = nemo.core.CheckpointCallback( - folder=ckpt_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=1, + folder=ckpt_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=1 ) # define learning rate decay policy @@ -228,7 +244,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai callbacks=[train_callback, eval_callback, ckpt_callback], optimizer=args.optimizer, lr_policy=lr_policy_fn, - optimization_params={**stop_training_condition, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={**stop_training_condition, "lr": args.lr, "weight_decay": args.weight_decay}, batches_per_step=args.iter_per_step, ) else: diff --git a/examples/nlp/punctuation_capitalization.py b/examples/nlp/punctuation_capitalization.py index cf2a2d20cda6..abd67203ffd4 100644 --- a/examples/nlp/punctuation_capitalization.py +++ b/examples/nlp/punctuation_capitalization.py @@ -1,18 +1,34 @@ -# pylint: disable=invalid-name +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import argparse import json import os -import sys -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, TokenClassificationLoss, TokenClassifier -from nemo.collections.nlp.data.datasets import utils -from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import ( +import nemo.collections.nlp.utils.common_nlp_utils +from nemo import logging +from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import ( eval_epochs_done_callback, eval_iter_callback, ) +from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer +from nemo.collections.nlp.nm.data_layers import PunctuationCapitalizationDataLayer +from nemo.collections.nlp.nm.losses.token_classification_loss import TokenClassificationLoss +from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -40,9 +56,7 @@ parser.add_argument("--shuffle_data", action='store_true') parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", -) +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument("--punct_classifier_checkpoint", default=None, type=str) parser.add_argument("--capit_classifier_checkpoint", default=None, type=str) parser.add_argument( @@ -67,9 +81,7 @@ help="The output directory where the model prediction\ and checkpoints will be written.", ) -parser.add_argument( - "--use_cache", action='store_true', help="Whether to cache preprocessed data", -) +parser.add_argument("--use_cache", action='store_true', help="Whether to cache preprocessed data") parser.add_argument( "--save_epoch_freq", default=1, @@ -84,9 +96,7 @@ help="Frequency of saving checkpoint \ '-1' - step checkpoint won't be saved", ) -parser.add_argument( - "--loss_step_freq", default=250, type=int, help="Frequency of printing loss", -) +parser.add_argument("--loss_step_freq", default=250, type=int, help="Frequency of printing loss") parser.add_argument( "--use_weighted_loss_punct", action='store_true', @@ -109,7 +119,7 @@ add_time_to_log_dir=True, ) -nemo.logging.info(args) +logging.info(args) output_file = f'{nf.work_dir}/output.txt' @@ -119,7 +129,7 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ tokenizer = NemoBertTokenizer(args.pretrained_bert_model) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) else: """ Use this if you're using a BERT model that you pre-trained yourself. 
""" @@ -133,29 +143,20 @@ if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) - nemo.logging.info(f"Model restored from {args.bert_checkpoint}") + logging.info(f"Model restored from {args.bert_checkpoint}") hidden_size = model.hidden_size -punct_classifier = "TokenClassifier" -punct_loss = "TokenClassificationLoss" - -capit_classifier = "TokenClassifier" -capit_loss = "TokenClassificationLoss" -task_loss = None - def create_pipeline( - num_samples=-1, pad_label=args.none_label, max_seq_length=args.max_seq_length, batch_size=args.batch_size, - local_rank=args.local_rank, num_gpus=args.num_gpus, mode='train', punct_label_ids=None, @@ -165,10 +166,11 @@ def create_pipeline( use_cache=args.use_cache, dropout=args.fc_dropout, punct_num_layers=args.punct_num_fc_layers, + punct_classifier=TokenClassifier, + capit_classifier=TokenClassifier, ): - global punct_classifier, punct_loss, capit_classifier, capit_loss, task_loss - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") shuffle = args.shuffle_data if mode == 'train' else False text_file = f'{args.data_dir}/text_{mode}.txt' @@ -187,7 +189,7 @@ def create_pipeline( [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' ) - data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer( + data_layer = PunctuationCapitalizationDataLayer( tokenizer=tokenizer, text_file=text_file, label_file=label_file, @@ -196,15 +198,13 @@ def create_pipeline( capit_label_ids=capit_label_ids, max_seq_length=max_seq_length, batch_size=batch_size, - num_workers=0, - local_rank=local_rank, shuffle=shuffle, ignore_extra_tokens=ignore_extra_tokens, ignore_start_end=ignore_start_end, use_cache=use_cache, ) - (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, punct_labels, capit_labels,) = data_layer() + (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, punct_labels, capit_labels) = data_layer() if mode == 'train': punct_label_ids = data_layer.dataset.punct_label_ids @@ -212,12 +212,11 @@ def create_pipeline( class_weights = None if args.use_weighted_loss_punct: - nemo.logging.info(f"Using weighted loss for punctuation task") + logging.info(f"Using weighted loss for punctuation task") punct_label_freqs = data_layer.dataset.punct_label_frequencies - class_weights = utils.calc_class_weights(punct_label_freqs) + class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(punct_label_freqs) # Initialize punctuation loss - punct_classifier = getattr(sys.modules[__name__], punct_classifier) punct_classifier = punct_classifier( hidden_size=hidden_size, num_classes=len(punct_label_ids), @@ -226,20 +225,17 @@ def create_pipeline( name='Punctuation', ) - punct_loss = getattr(sys.modules[__name__], punct_loss) - punct_loss = punct_loss(num_classes=len(punct_label_ids), class_weights=class_weights) + punct_loss = TokenClassificationLoss(num_classes=len(punct_label_ids), class_weights=class_weights) # Initialize capitalization loss - capit_classifier = getattr(sys.modules[__name__], capit_classifier) capit_classifier = capit_classifier( - hidden_size=hidden_size, num_classes=len(capit_label_ids), 
dropout=dropout, name='Capitalization', + hidden_size=hidden_size, num_classes=len(capit_label_ids), dropout=dropout, name='Capitalization' ) - capit_loss = getattr(sys.modules[__name__], capit_loss) - capit_loss = capit_loss(num_classes=len(capit_label_ids)) + capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids)) - task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2) + task_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) punct_logits = punct_classifier(hidden_states=hidden_states) capit_logits = capit_classifier(hidden_states=hidden_states) @@ -253,31 +249,31 @@ def create_pipeline( losses = [task_loss, punct_loss, capit_loss] logits = [punct_logits, capit_logits] - return ( - losses, - logits, - steps_per_epoch, - punct_label_ids, - capit_label_ids, - ) + return losses, logits, steps_per_epoch, punct_label_ids, capit_label_ids, punct_classifier, capit_classifier else: - tensors_to_evaluate = [ - punct_logits, - capit_logits, - punct_labels, - capit_labels, - subtokens_mask, - ] + tensors_to_evaluate = [punct_logits, capit_logits, punct_labels, capit_labels, subtokens_mask] return tensors_to_evaluate, data_layer -(losses, train_logits, steps_per_epoch, punct_label_ids, capit_label_ids,) = create_pipeline() +( + losses, + train_logits, + steps_per_epoch, + punct_label_ids, + capit_label_ids, + punct_classifier, + capit_classifier, +) = create_pipeline() eval_tensors, data_layer = create_pipeline( - mode='dev', punct_label_ids=punct_label_ids, capit_label_ids=capit_label_ids, + mode='dev', + punct_label_ids=punct_label_ids, + capit_label_ids=capit_label_ids, + punct_classifier=punct_classifier, + capit_classifier=capit_classifier, ) -nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") +logging.info(f"steps_per_epoch = {steps_per_epoch}") # Create trainer and execute training action train_callback = nemo.core.SimpleLossLoggerCallback( @@ -298,11 +294,11 @@ def create_pipeline( ) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( diff --git a/examples/nlp/punctuation_capitalization_infer.py b/examples/nlp/punctuation_capitalization_infer.py index 2456e64408f2..2d18fcda82fd 100644 --- a/examples/nlp/punctuation_capitalization_infer.py +++ b/examples/nlp/punctuation_capitalization_infer.py @@ -1,16 +1,33 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import os import numpy as np -from sklearn.metrics import classification_report import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer -from nemo.collections.nlp.utils.nlp_utils import get_vocab +from nemo import logging +from nemo.collections.nlp.data import NemoBertTokenizer +from nemo.collections.nlp.nm.data_layers import BertTokenClassificationInferDataLayer +from nemo.collections.nlp.utils.common_nlp_utils import get_vocab # Parsing arguments -parser = argparse.ArgumentParser(description='NER with pretrained BERT') +parser = argparse.ArgumentParser(description='Punctuation and capitalization detection inference') parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--fc_dropout", default=0, type=float) parser.add_argument("--punct_num_fc_layers", default=3, type=int) @@ -26,8 +43,7 @@ 'how are you', 'how\'s the weather today', 'okay', - 'we bought four shirts one mug and ten ' - + 'thousand titan rtx graphics cards the more ' + 'we bought four shirts one mug and ten thousand titan rtx graphics cards the more ' + 'you buy the more you save', ], help="Example: --queries 'san francisco' --queries 'la'", @@ -66,7 +82,7 @@ ) nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None ) punct_labels_dict = get_vocab(args.punct_labels_dict) @@ -75,17 +91,17 @@ """ Load the pretrained BERT parameters See the list of pretrained models, call: -nemo_nlp.huggingface.BERT.list_pretrained_models() +nemo.collections.nlp.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size tokenizer = NemoBertTokenizer(args.pretrained_bert_model) -data_layer = nemo_nlp.BertTokenClassificationInferDataLayer( - queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +data_layer = BertTokenClassificationInferDataLayer( + queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1 ) -punct_classifier = nemo_nlp.TokenClassifier( +punct_classifier = nemo_nlp.nm.trainables.TokenClassifier( hidden_size=hidden_size, num_classes=len(punct_labels_dict), dropout=args.fc_dropout, @@ -93,13 +109,13 @@ name='Punctuation', ) -capit_classifier = nemo_nlp.TokenClassifier( - hidden_size=hidden_size, num_classes=len(capit_labels_dict), dropout=args.fc_dropout, name='Capitalization', +capit_classifier = nemo_nlp.nm.trainables.TokenClassifier( + hidden_size=hidden_size, num_classes=len(capit_labels_dict), dropout=args.fc_dropout, name='Capitalization' ) input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) +hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) punct_logits = punct_classifier(hidden_states=hidden_states) capit_logits = capit_classifier(hidden_states=hidden_states) @@ -107,26 
+123,20 @@ ########################################################################### # Instantiate an optimizer to perform `infer` action -evaluated_tensors = nf.infer( - tensors=[punct_logits, capit_logits, subtokens_mask], checkpoint_dir=args.checkpoints_dir, -) +evaluated_tensors = nf.infer(tensors=[punct_logits, capit_logits, subtokens_mask], checkpoint_dir=args.checkpoints_dir) def concatenate(lists): return np.concatenate([t.cpu() for t in lists]) -def get_preds(logits): - return np.argmax(logits, 1) - - punct_logits, capit_logits, subtokens_mask = [concatenate(tensors) for tensors in evaluated_tensors] punct_preds = np.argmax(punct_logits, axis=2) capit_preds = np.argmax(capit_logits, axis=2) for i, query in enumerate(args.queries): - nemo.logging.info(f'Query: {query}') + logging.info(f'Query: {query}') punct_pred = punct_preds[i][subtokens_mask[i] > 0.5] capit_pred = capit_preds[i][subtokens_mask[i] > 0.5] @@ -145,4 +155,4 @@ def get_preds(logits): if punct_label != args.none_label: output += punct_label output += ' ' - nemo.logging.info(f'Combined: {output.strip()}\n') + logging.info(f'Combined: {output.strip()}\n') diff --git a/examples/nlp/squad.py b/examples/nlp/question_answering_squad.py similarity index 80% rename from examples/nlp/squad.py rename to examples/nlp/question_answering_squad.py index 627b8bd00300..44b737d57cab 100755 --- a/examples/nlp/squad.py +++ b/examples/nlp/question_answering_squad.py @@ -16,12 +16,14 @@ Some transformer of this code were adapted from the HuggingFace library at https://github.com/huggingface/transformers +""" +""" Download the Squad data by running the script: -examples/nlp/scripts/download_squad.py +examples/nlp/scripts/get_squad.py To finetune Squad v1.1 on pretrained BERT large uncased on 1 GPU: -python squad.py +python question_answering_squad.py --data_dir /path_to_data_dir/squad/v1.1 --work_dir /path_to_output_folder --bert_checkpoint /path_to_bert_checkpoint @@ -39,7 +41,7 @@ Huggingface pretrained checkpoints. 
To finetune Squad v1.1 on pretrained BERT large uncased on 8 GPU: -python -m torch.distributed.launch --nproc_per_node=8 squad.py +python -m torch.distributed.launch --nproc_per_node=8 question_answering_squad.py --amp_opt_level "O1" --data_dir /path_to_data_dir/squad/v1.1 --bert_checkpoint /path_to_bert_checkpoint @@ -62,9 +64,10 @@ import json import os -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.utils.callbacks.squad import eval_epochs_done_callback, eval_iter_callback +import nemo.core as nemo_core +from nemo import logging +from nemo.collections.nlp.callbacks.qa_squad_callback import eval_epochs_done_callback, eval_iter_callback from nemo.utils.lr_policies import get_lr_policy @@ -79,17 +82,13 @@ def parse_args(): "(or other data files) for the task.", ) parser.add_argument( - "--pretrained_bert_model", default="bert-base-uncased", type=str, help="Name of the pre-trained model", - ) - parser.add_argument( - "--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.", - ) - parser.add_argument( - "--bert_checkpoint", default=None, type=str, help="Path to BERT model checkpoint for finetuning.", + "--pretrained_bert_model", default="bert-base-uncased", type=str, help="Name of the pre-trained model" ) + parser.add_argument("--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.") parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", + "--bert_checkpoint", default=None, type=str, help="Path to BERT model checkpoint for finetuning." ) + parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( "--tokenizer_model", default="tokenizer.model", @@ -107,23 +106,15 @@ def parse_args(): parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--lr", default=3e-5, type=float, help="The initial learning rate.") parser.add_argument("--lr_warmup_proportion", default=0.0, type=float) - parser.add_argument( - "--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.", - ) - parser.add_argument( - "--num_epochs", default=2, type=int, help="Total number of training epochs to perform.", - ) - parser.add_argument( - "--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.", - ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--num_epochs", default=2, type=int, help="Total number of training epochs to perform.") + parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.") parser.add_argument( "--do_lower_case", action='store_true', help="Whether to lower case the input text. 
" "True for uncased models, False for cased models.", ) - parser.add_argument( - "--evaluation_only", action='store_true', help="Whether to only do evaluation.", - ) + parser.add_argument("--evaluation_only", action='store_true', help="Whether to only do evaluation.") parser.add_argument( "--doc_stride", default=128, @@ -149,11 +140,9 @@ def parse_args(): ) parser.add_argument("--num_gpus", default=1, type=int, help="Number of GPUs") parser.add_argument( - "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed precision", - ) - parser.add_argument( - "--local_rank", type=int, default=None, help="For distributed training: local_rank", + "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed precision" ) + parser.add_argument("--local_rank", type=int, default=None, help="For distributed training: local_rank") parser.add_argument( "--work_dir", default='output_squad', @@ -172,12 +161,8 @@ def parse_args(): type=int, help="Frequency of saving checkpoint " "'-1' - step checkpoint won't be saved", ) - parser.add_argument( - "--loss_step_freq", default=100, type=int, help="Frequency of printing loss", - ) - parser.add_argument( - "--eval_step_freq", default=500, type=int, help="Frequency of evaluation on dev data", - ) + parser.add_argument("--loss_step_freq", default=100, type=int, help="Frequency of printing loss") + parser.add_argument("--eval_step_freq", default=500, type=int, help="Frequency of evaluation on dev data") parser.add_argument( "--version_2_with_negative", action="store_true", @@ -195,9 +180,7 @@ def parse_args(): type=int, help="The total number of n-best predictions to " "generate in the nbest_predictions.json output file.", ) - parser.add_argument( - "--batches_per_step", default=1, type=int, help="Number of iterations per step.", - ) + parser.add_argument("--batches_per_step", default=1, type=int, help="Number of iterations per step.") parser.add_argument( "--max_answer_length", default=30, @@ -232,7 +215,7 @@ def create_pipeline( batches_per_step=1, mode="train", ): - data_layer = nemo_nlp.BertQuestionAnsweringDataLayer( + data_layer = nemo_nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode=mode, version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -246,19 +229,19 @@ def create_pipeline( input_data = data_layer() hidden_states = model( - input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask, + input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask ) qa_output = head(hidden_states=hidden_states) loss_output = loss_fn( - logits=qa_output, start_positions=input_data.start_positions, end_positions=input_data.end_positions, + logits=qa_output, start_positions=input_data.start_positions, end_positions=input_data.end_positions ) steps_per_epoch = len(data_layer) // (batch_size * num_gpus * batches_per_step) return ( loss_output.loss, steps_per_epoch, - [loss_output.start_logits, loss_output.end_logits, input_data.unique_ids,], + [loss_output.start_logits, loss_output.end_logits, input_data.unique_ids], data_layer, ) @@ -266,9 +249,7 @@ def create_pipeline( if __name__ == "__main__": args = parse_args() if not os.path.exists(args.data_dir): - raise FileNotFoundError( - "SQUAD datasets not found. Datasets can be " "obtained using scripts/download_squad.py" - ) + raise FileNotFoundError("SQUAD datasets not found. 
Datasets can be " "obtained using scripts/get_squad.py") if not args.version_2_with_negative: args.work_dir = f'{args.work_dir}/squad1.1' @@ -276,8 +257,8 @@ def create_pipeline( args.work_dir = f'{args.work_dir}/squad2.0' # Instantiate neural factory with supported backend - nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, + nf = nemo_core.NeuralModuleFactory( + backend=nemo_core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=args.work_dir, @@ -288,7 +269,7 @@ def create_pipeline( if args.tokenizer == "sentencepiece": try: - tokenizer = nemo_nlp.SentencePieceTokenizer(model_path=args.tokenizer_model) + tokenizer = nemo_nlp.data.utilsSentencePieceTokenizer(model_path=args.tokenizer_model) except Exception: raise ValueError( "Using --tokenizer=sentencepiece \ @@ -296,25 +277,27 @@ def create_pipeline( ) tokenizer.add_special_tokens(["[CLS]", "[SEP]"]) elif args.tokenizer == "nemobert": - tokenizer = nemo_nlp.NemoBertTokenizer(args.pretrained_bert_model) + tokenizer = nemo_nlp.data.NemoBertTokenizer(args.pretrained_bert_model) else: raise ValueError(f"received unexpected tokenizer '{args.tokenizer}'") if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: """ Use this if you're using a standard BERT model. To see the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = model.hidden_size - qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False) - squad_loss = nemo_nlp.QuestionAnsweringLoss() + qa_head = nemo_nlp.nm.trainables.token_classification_nm.TokenClassifier( + hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False + ) + squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss() if args.bert_checkpoint is not None: model.restore_from(args.bert_checkpoint) @@ -349,8 +332,8 @@ def create_pipeline( ) if not args.evaluation_only: - nemo.logging.info(f"steps_per_epoch = {train_steps_per_epoch}") - callback_train = nemo.core.SimpleLossLoggerCallback( + logging.info(f"steps_per_epoch = {train_steps_per_epoch}") + callback_train = nemo_core.SimpleLossLoggerCallback( tensors=[train_loss], print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], @@ -358,10 +341,10 @@ def create_pipeline( tb_writer=nf.tb_writer, ) - ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + ckpt_callback = nemo_core.CheckpointCallback( + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) - callbacks_eval = nemo.core.EvaluatorCallback( + callbacks_eval = nemo_core.EvaluatorCallback( eval_tensors=eval_output, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback( @@ -378,9 +361,7 @@ def create_pipeline( ) lr_policy_fn = get_lr_policy( - args.lr_policy, - total_steps=args.num_epochs * train_steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * train_steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) 
nf.train( @@ -416,7 +397,7 @@ def create_pipeline( null_score_diff_threshold=args.null_score_diff_threshold, do_lower_case=args.do_lower_case, ) - nemo.logging.info(f"exact_match: {exact_match}, f1: {f1}") + logging.info(f"exact_match: {exact_match}, f1: {f1}") if args.output_prediction_file is not None: with open(args.output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") diff --git a/scripts/convert_iob_format_to_token_classification_format.py b/examples/nlp/scripts/convert_iob_format_to_token_classification_format.py similarity index 83% rename from scripts/convert_iob_format_to_token_classification_format.py rename to examples/nlp/scripts/convert_iob_format_to_token_classification_format.py index e30345e547d8..0e95f62aa186 100644 --- a/scripts/convert_iob_format_to_token_classification_format.py +++ b/examples/nlp/scripts/convert_iob_format_to_token_classification_format.py @@ -1,21 +1,24 @@ -# Copyright (C) NVIDIA CORPORATION. All Rights Reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the “License”); +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License.**** +# limitations under the License. +# ============================================================================= import argparse -import logging import os +from nemo import logging + def __convert_data(in_file, out_text, out_labels): """ diff --git a/nemo/collections/nlp/utils/download_squad.py b/examples/nlp/scripts/get_squad.py similarity index 79% rename from nemo/collections/nlp/utils/download_squad.py rename to examples/nlp/scripts/get_squad.py index 80c4739e7b62..037d1b3d3fbb 100755 --- a/nemo/collections/nlp/utils/download_squad.py +++ b/examples/nlp/scripts/get_squad.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,11 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# ============================================================================= import argparse import os import urllib.request +from nemo import logging class SquadDownloader: def __init__(self, save_path): @@ -32,12 +37,8 @@ def __init__(self, save_path): self.download_urls = { 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v1.1.json': 'v1.1/train-v1.1.json', 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v1.1.json': 'v1.1/dev-v1.1.json', - 'https://worksheets.codalab.org/rest/bundles' - '/0xbcd57bee090b421c982906709c8c27e1/contents/blob/': 'v1.1/evaluate-v1.1.py', 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v2.0.json': 'v2.0/train-v2.0.json', 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v2.0.json': 'v2.0/dev-v2.0.json', - 'https://worksheets.codalab.org/rest/bundles' - '/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/': 'v2.0/evaluate-v2.0.py', } def download(self): @@ -45,9 +46,9 @@ def download(self): url = item file = self.download_urls[item] - print('Downloading:', url) + logging.info(f'Downloading: {url}') if os.path.isfile(self.save_path + '/' + file): - print('** Download file already exists, skipping download') + logging.info('** Download file already exists, skipping download') else: response = urllib.request.urlopen(url) with open(self.save_path + '/' + file, "wb") as handle: @@ -61,8 +62,9 @@ def download(self): type=str, required=False, help='directory to store data', - default=os.path.split(os.path.abspath(__file__))[0] + '/../data/lm', + default=os.path.split(os.path.abspath(__file__))[0] + '../../../../../../examples/data/lm', ) args = parser.parse_args() + logging.info(args.destDir) squad_dl = SquadDownloader(args.destDir) squad_dl.download() diff --git a/scripts/get_tatoeba_data.py b/examples/nlp/scripts/get_tatoeba.py similarity index 89% rename from scripts/get_tatoeba_data.py rename to examples/nlp/scripts/get_tatoeba.py index 47cb09791b72..0da3137e54ee 100644 --- a/scripts/get_tatoeba_data.py +++ b/examples/nlp/scripts/get_tatoeba.py @@ -1,16 +1,18 @@ -# Copyright (C) NVIDIA CORPORATION. All Rights Reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the “License”); +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License.**** +# limitations under the License. 
+# ============================================================================= import argparse import logging @@ -19,7 +21,8 @@ import re import string import urllib.request -from collections import Counter + +from nemo import logging URL = {'tatoeba': 'https://downloads.tatoeba.org/exports/sentences.csv'} @@ -182,18 +185,12 @@ def __delete_file(file_to_del): parser = argparse.ArgumentParser(description='Prepare tatoeba dataset') parser.add_argument("--data_dir", required=True, type=str) parser.add_argument("--dataset", default='tatoeba', type=str) + parser.add_argument("--num_samples", default=-1, type=int, help='-1 to use the whole dataset') + parser.add_argument("--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle') parser.add_argument( - "--num_samples", default=-1, type=int, help='-1 to use the whole dataset', - ) - parser.add_argument( - "--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle', - ) - parser.add_argument( - "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example', - ) - parser.add_argument( - "--percent_dev", default=0.2, type=float, help='Size of the dev set, float', + "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example' ) + parser.add_argument("--percent_dev", default=0.2, type=float, help='Size of the dev set, float') parser.add_argument("--clean_dir", action='store_true') args = parser.parse_args() @@ -210,7 +207,7 @@ def __delete_file(file_to_del): logging.info(f'Processing English sentences...') clean_eng_sentences = os.path.join(args.data_dir, 'clean_eng_sentences.txt') __process_english_sentences( - tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples, + tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples ) train_file = os.path.join(args.data_dir, 'train.txt') diff --git a/examples/nlp/scripts/multiwoz/process_multiwoz.py b/examples/nlp/scripts/multiwoz/process_multiwoz.py new file mode 100644 index 000000000000..bcdeec21bc0b --- /dev/null +++ b/examples/nlp/scripts/multiwoz/process_multiwoz.py @@ -0,0 +1,400 @@ +7 #!/usr/bin/python + +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2019 Salesforce Research and Paweł Budzianowski. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom +# the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# ============================================================================= + +""" +Dataset: http://dialogue.mi.eng.cam.ac.uk/index.php/corpus/ + +Code based on: +https://github.com/jasonwu0731/trade-dst +https://github.com/budzianowski/multiwoz +""" + +import argparse +import json +import os +import re +import shutil + +from nemo.collections.nlp.data.datasets.datasets_utils import if_exist + +parser = argparse.ArgumentParser(description='Process MultiWOZ dataset') +parser.add_argument("--data_dir", default='../../data/statetracking/MULTIWOZ2.1', type=str) +parser.add_argument("--out_dir", default='../../data/statetracking/multiwoz', type=str) +args = parser.parse_args() + +if not os.path.exists(args.data_dir): + raise FileNotFoundError(f"{args.data_dir} doesn't exist.") + +DOMAINS = ['restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital', 'police'] +PHONE_NUM_TMPL = '\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4,5})' +POSTCODE_TMPL = ( + '([a-z]{1}[\. ]?[a-z]{1}[\. ]?\d{1,2}[, ]+\d{1}[\. ]?' + '[a-z]{1}[\. 
]?[a-z]{1}|[a-z]{2}\d{2}[a-z]{2})' +) + +REPLACEMENTS = {} +with open('replacements.txt', 'r') as f: + for line in f: + word1, word2 = line.strip().split('\t') + REPLACEMENTS[word1] = word2 +REPLACEMENTS['-'] = ' ' +REPLACEMENTS[';'] = ',' +REPLACEMENTS['/'] = ' and ' + +DONT_CARES = set(['dont care', 'dontcare', "don't care", "do not care"]) + + +def is_ascii(text): + return all(ord(c) < 128 for c in text) + + +def normalize(text): + text = text.lower().strip() + + # hotel domain pfb30 + text = re.sub(r"b&b", "bed and breakfast", text) + text = re.sub(r"b and b", "bed and breakfast", text) + text = re.sub('[\"\<>@\(\)]', '', text) # remove brackets + text = re.sub(u"(\u2018|\u2019)", "'", text) # weird unicode bug + # add space around punctuations + text = re.sub('(\D)([?.,!])', r'\1 \2 ', text) + + clean_tokens = [] + + for token in text.split(): + token = token.strip() + if not token: + continue + if token in REPLACEMENTS: + clean_tokens.append(REPLACEMENTS[token]) + else: + clean_tokens.append(token) + + text = ' '.join(clean_tokens) # remove extra spaces + text = re.sub('(\d) (\d)', r'\1\2', text) # concatenate numbers + + return text + + +def get_goal(idx, log, goals, last_goal): + if idx == 1: # first system's response + active_goals = get_summary_belief_state(log[idx]["metadata"], True) + return active_goals[0] if len(active_goals) != 0 else goals[0] + else: + new_goals = get_new_goal(log[idx - 2]["metadata"], log[idx]["metadata"]) + return last_goal if not new_goals else new_goals[0] + + +def get_summary_belief_state(bstate, get_goal=False): + """Based on the mturk annotations we form multi-domain belief state + TODO: Figure out why this script has hotel-name but jason's script doesn't + (see val_dialogs.json) + """ + summary_bstate, summary_bvalue, active_domain = [], [], [] + for domain in DOMAINS: + domain_active = False + booking = [] + + for slot in sorted(bstate[domain]['book'].keys()): + if slot == 'booked': + booking.append(int(len(bstate[domain]['book']['booked']) != 0)) + else: + if bstate[domain]['book'][slot]: + booking.append(1) + curr_bvalue = [f"{domain}-book {slot.strip().lower()}", normalize(bstate[domain]['book'][slot])] + summary_bvalue.append(curr_bvalue) + else: + booking.append(0) + if domain == 'train': + if 'people' not in bstate[domain]['book']: + booking.append(0) + if 'ticket' not in bstate[domain]['book']: # TODO: possibly elif + booking.append(0) + summary_bstate += booking + + for slot in bstate[domain]['semi']: + slot_enc = [0, 0, 0] # not mentioned, dontcare, filled + if bstate[domain]['semi'][slot] == 'not mentioned': + slot_enc[0] = 1 + elif bstate[domain]['semi'][slot] in DONT_CARES: + slot_enc[1] = 1 + summary_bvalue.append([f"{domain}-{slot.strip().lower()}", "dontcare"]) + elif bstate[domain]['semi'][slot]: + curr_bvalue = [f"{domain}-{slot.strip().lower()}", normalize(bstate[domain]['semi'][slot])] + summary_bvalue.append(curr_bvalue) + if sum(slot_enc) > 0: + domain_active = True + summary_bstate += slot_enc + + if domain_active: # quasi domain-tracker + summary_bstate += [1] + active_domain.append(domain) + else: + summary_bstate += [0] + + assert len(summary_bstate) == 94 + if get_goal: + return active_domain + return summary_bstate, summary_bvalue + + +def get_new_goal(prev_turn, curr_turn): + """ If multiple domains are updated between turns, + return all of them + """ + new_goals = [] + # Sometimes, metadata is an empty dictionary, bug? 
+ if not prev_turn or not curr_turn: + return new_goals + + for domain in prev_turn: + if curr_turn[domain] != prev_turn[domain]: + new_goals.append(domain) + return new_goals + + +def get_dialog_act(curr_dialog_acts, act_idx): + """Given system dialogue acts fix automatic delexicalization.""" + acts = [] + if not act_idx in curr_dialog_acts: + return acts + + turn = curr_dialog_acts[act_idx] + + if isinstance(turn, dict): # it's annotated: + for key in turn: + key_acts = turn[key] + key = key.strip().lower() + if key.endswith('request'): + for act in key_acts: + acts.append(act[0].lower()) + elif key.endswith('inform'): + for act in key_acts: + acts.append([act[0].lower(), normalize(act[1])]) + return acts + + +def fix_delex(curr_dialog_acts, act_idx, text): + """Given system dialogue acts fix automatic delexicalization.""" + if not act_idx in curr_dialog_acts: + return text + + turn = curr_dialog_acts[act_idx] + + if isinstance(turn, dict): # it's annotated: + for key in turn: + if 'Attraction' in key: + if 'restaurant_' in text: + text = text.replace("restaurant", "attraction") + if 'hotel_' in text: + text = text.replace("hotel", "attraction") + if 'Hotel' in key: + if 'attraction_' in text: + text = text.replace("attraction", "hotel") + if 'restaurant_' in text: + text = text.replace("restaurant", "hotel") + if 'Restaurant' in key: + if 'attraction_' in text: + text = text.replace("attraction", "restaurant") + if 'hotel_' in text: + text = text.replace("hotel", "restaurant") + + return text + + +def create_data(data_dir): + data = json.load(open(f'{data_dir}/data.json', 'r')) + dialog_acts = json.load(open(f'{data_dir}/dialogue_acts.json', 'r')) + + delex_data = {} + + for dialog_id in data: + dialog = data[dialog_id] + curr_dialog_acts = dialog_acts[dialog_id.strip('.json')] + goals = [key for key in dialog['goal'].keys() if key in DOMAINS and dialog['goal'][key]] + + last_goal, act_idx = '', 1 + for idx, turn in enumerate(dialog['log']): + dialog['log'][idx]['text'] = normalize(turn['text']) + + if idx % 2 == 1: # system's turn + cur_goal = get_goal(idx, dialog['log'], goals, last_goal) + last_goal = cur_goal + + dialog['log'][idx - 1]['domain'] = cur_goal # human's domain + dialog['log'][idx]['dialogue_acts'] = get_dialog_act(curr_dialog_acts, str(act_idx)) + act_idx += 1 + + dialog['log'][idx]['text'] = fix_delex(curr_dialog_acts, str(act_idx), dialog['log'][idx]['text']) + + delex_data[dialog_id] = dialog + return delex_data + + +def analyze_dialogue(dialog, max_length): + """Cleaning procedure for all kinds of errors in text and annotation.""" + if len(dialog['log']) % 2 == 1: + print('Odd number of turns. Wrong dialogue.') + return None + + clean_dialog = {} + clean_dialog['goal'] = dialog['goal'] # for now we just copy the goal + usr_turns, sys_turns = [], [] + + for idx in range(len(dialog['log'])): + text = dialog['log'][idx]['text'] + if len(text.split()) > max_length or not is_ascii(text): + return None # sequence corrupted. 
discard + + if idx % 2 == 0: # usr turn + usr_turns.append(dialog['log'][idx]) + else: # sys turn + belief_summary, belief_value_summary = get_summary_belief_state(dialog['log'][idx]['metadata']) + + dialog['log'][idx]['belief_summary'] = str(belief_summary) + dialog['log'][idx]['belief_value_summary'] = belief_value_summary + sys_turns.append(dialog['log'][idx]) + + clean_dialog['usr_log'] = usr_turns + clean_dialog['sys_log'] = sys_turns + + return clean_dialog + + +def get_dialog(dialog, max_length=50): + """Extract a dialogue from the file""" + dialog = analyze_dialogue(dialog, max_length) + if dialog is None: + return None + + dialogs = [] + for idx in range(len(dialog['usr_log'])): + dialogs.append( + { + 'usr': dialog['usr_log'][idx]['text'], + 'sys': dialog['sys_log'][idx]['text'], + 'sys_a': dialog['sys_log'][idx]['dialogue_acts'], + 'domain': dialog['usr_log'][idx]['domain'], + 'bvs': dialog['sys_log'][idx]['belief_value_summary'], + } + ) + + return dialogs + + +def partition_data(data, infold, outfold): + """Partition the data into train, valid, and test sets + based on the list of val and test specified in the dataset. + """ + if if_exist( + outfold, ['trainListFile.json', 'val_dialogs.json', 'test_dialogs.json', 'train_dialogs.json', 'ontology.json'] + ): + print(f'Data is already processed and stored at {outfold}') + return + os.makedirs(outfold, exist_ok=True) + shutil.copyfile(f'{infold}/ontology.json', f'{outfold}/ontology.json') + + with open(f'{infold}/testListFile.json', 'r') as fin: + test_files = [line.strip() for line in fin.readlines()] + + with open(f'{infold}/valListFile.json', 'r') as fin: + val_files = [line.strip() for line in fin.readlines()] + + train_list_files = open(f'{outfold}/trainListFile.json', 'w') + + train_dialogs, val_dialogs, test_dialogs = [], [], [] + count_train, count_val, count_test = 0, 0, 0 + + for dialog_id in data: + dialog = data[dialog_id] + domains = [key for key in dialog['goal'].keys() if key in DOMAINS and dialog['goal'][key]] + + dial = get_dialog(dialog) + if dial: + dialogue = {} + dialogue['dialog_idx'] = dialog_id + dialogue['domains'] = list(set(domains)) + last_bs = [] + dialogue['dialog'] = [] + + for idx, turn in enumerate(dial): + turn_dl = { + 'sys_transcript': dial[idx - 1]['sys'] if idx > 0 else "", + 'turn_idx': idx, + 'transcript': turn['usr'], + 'sys_acts': dial[idx - 1]['sys_a'] if idx > 0 else [], + 'domain': turn['domain'], + } + turn_dl['belief_state'] = [{"slots": [s], "act": "inform"} for s in turn['bvs']] + turn_dl['turn_label'] = [bs["slots"][0] for bs in turn_dl['belief_state'] if bs not in last_bs] + last_bs = turn_dl['belief_state'] + dialogue['dialog'].append(turn_dl) + + if dialog_id in test_files: + test_dialogs.append(dialogue) + count_test += 1 + elif dialog_id in val_files: + val_dialogs.append(dialogue) + count_val += 1 + else: + train_list_files.write(dialog_id + '\n') + train_dialogs.append(dialogue) + count_train += 1 + + print(f"Dialogs: {count_train} train, {count_val} val, {count_test} test.") + + # save all dialogues + with open(f'{outfold}/val_dialogs.json', 'w') as fout: + json.dump(val_dialogs, fout, indent=4) + + with open(f'{outfold}/test_dialogs.json', 'w') as fout: + json.dump(test_dialogs, fout, indent=4) + + with open(f'{outfold}/train_dialogs.json', 'w') as fout: + json.dump(train_dialogs, fout, indent=4) + + train_list_files.close() + + +def process_woz(): + delex_data = create_data(args.data_dir) + partition_data(delex_data, args.data_dir, args.out_dir) + + +process_woz() diff 
--git a/examples/nlp/scripts/multiwoz/replacements.txt b/examples/nlp/scripts/multiwoz/replacements.txt new file mode 100644 index 000000000000..34df41d01e93 --- /dev/null +++ b/examples/nlp/scripts/multiwoz/replacements.txt @@ -0,0 +1,83 @@ +it's it is +don't do not +doesn't does not +didn't did not +you'd you would +you're you are +you'll you will +i'm i am +they're they are +that's that is +what's what is +couldn't could not +i've i have +we've we have +can't cannot +i'd i would +i'd i would +aren't are not +isn't is not +wasn't was not +weren't were not +won't will not +there's there is +there're there are +. . . +restaurants restaurant -s +hotels hotel -s +laptops laptop -s +cheaper cheap -er +dinners dinner -s +lunches lunch -s +breakfasts breakfast -s +expensively expensive -ly +moderately moderate -ly +cheaply cheap -ly +prices price -s +places place -s +venues venue -s +ranges range -s +meals meal -s +locations location -s +areas area -s +policies policy -s +children child -s +kids kid -s +kidfriendly kid friendly +cards card -s +upmarket expensive +inpricey cheap +inches inch -s +uses use -s +dimensions dimension -s +driverange drive range +includes include -s +computers computer -s +machines machine -s +families family -s +ratings rating -s +constraints constraint -s +pricerange price range +batteryrating battery rating +requirements requirement -s +drives drive -s +specifications specification -s +weightrange weight range +harddrive hard drive +batterylife battery life +businesses business -s +hours hour -s +one 1 +two 2 +three 3 +four 4 +five 5 +six 6 +seven 7 +eight 8 +nine 9 +ten 10 +eleven 11 +twelve 12 +anywhere any where +good bye goodbye diff --git a/examples/nlp/scripts/process_wiki_zh.py b/examples/nlp/scripts/process_wiki_zh.py index a7f195fbb9c0..58d944a5c727 100755 --- a/examples/nlp/scripts/process_wiki_zh.py +++ b/examples/nlp/scripts/process_wiki_zh.py @@ -1,6 +1,7 @@ #!/usr/bin/env python + # ============================================================================= -# Copyright 2019 NVIDIA Corporation. All Rights Reserved. +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,9 +24,7 @@ from functools import partial -def create_vocab( - lines, vocab_file, min_frequency=3, special_symbols=["[PAD]", "[SEP]", "[CLS]", "[MASK]", "[UNK]"], -): +def create_vocab(lines, vocab_file, min_frequency=3, special_symbols=["[PAD]", "[SEP]", "[CLS]", "[MASK]", "[UNK]"]): """Create vocabulary from lines""" # Count word occurency vocab = {} @@ -140,11 +139,9 @@ def process(data_dir, output_dir=None, min_frequency=3, max_files=-1): parser.add_argument("--data_dir", default="/raid/data/wiki_zh", type=str) parser.add_argument("--output_dir", default="./", type=str) parser.add_argument( - "--min_frequency", default=0, type=int, help="Characters occuring less frequently " "will be filtered out", - ) - parser.add_argument( - "--max_files", default=-1, type=int, help="Max number of dirs to process", + "--min_frequency", default=0, type=int, help="Characters occuring less frequently " "will be filtered out" ) + parser.add_argument("--max_files", default=-1, type=int, help="Max number of dirs to process") args = parser.parse_args() process(args.data_dir, args.output_dir, args.min_frequency, args.max_files) diff --git a/examples/nlp/sentence_classification_with_bert.py b/examples/nlp/text_classification_with_bert.py similarity index 72% rename from examples/nlp/sentence_classification_with_bert.py rename to examples/nlp/text_classification_with_bert.py index 2cd622e65ac3..4dd8535e2347 100644 --- a/examples/nlp/sentence_classification_with_bert.py +++ b/examples/nlp/text_classification_with_bert.py @@ -1,15 +1,30 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import argparse import math import numpy as np -import torch -from torch import nn from transformers import BertTokenizer -import nemo -import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import SentenceClassificationDataDesc -from nemo.collections.nlp.utils.callbacks.sentence_classification import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.nm.data_layers.text_classification_datalayer +import nemo.collections.nlp.nm.trainables.common.sequence_classification_nm +from nemo import logging +from nemo.collections.nlp.callbacks.text_classification_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data.datasets.text_classification_dataset import SentenceClassificationDataDesc from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -45,9 +60,7 @@ parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--shuffle_data", action='store_true') -parser.add_argument( - "--class_balancing", default="None", type=str, choices=["None", "weighted_loss"], -) +parser.add_argument("--class_balancing", default="None", type=str, choices=["None", "weighted_loss"]) args = parser.parse_args() @@ -68,10 +81,14 @@ """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + config_filename=args.bert_config + ) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + pretrained_model_name=args.pretrained_bert_model + ) hidden_size = pretrained_bert_model.hidden_size tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) @@ -79,8 +96,8 @@ data_desc = SentenceClassificationDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) # Create sentence classification loss on top -classifier = nemo_nlp.SequenceClassifier( - hidden_size=hidden_size, num_classes=data_desc.num_labels, dropout=args.fc_dropout, +classifier = nemo.collections.nlp.nm.trainables.common.sequence_classification_nm.SequenceClassifier( + hidden_size=hidden_size, num_classes=data_desc.num_labels, dropout=args.fc_dropout ) if args.class_balancing == 'weighted_loss': @@ -91,31 +108,29 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' shuffle = args.shuffle_data if mode == 'train' else False - data_layer = nemo_nlp.BertSentenceClassificationDataLayer( + data_layer = nemo.collections.nlp.nm.data_layers.text_classification_datalayer.BertSentenceClassificationDataLayer( input_file=data_file, tokenizer=tokenizer, max_seq_length=args.max_seq_length, num_samples=num_samples, shuffle=shuffle, batch_size=batch_size, - # num_workers=0, - # local_rank=local_rank, ) ids, type_ids, input_mask, labels = data_layer() data_size = len(data_layer) if data_size < batch_size: - nemo.logging.warning("Batch_size is larger than the dataset size") - nemo.logging.warning("Reducing batch_size to dataset size") + 
logging.warning("Batch_size is larger than the dataset size") + logging.warning("Reducing batch_size to dataset size") batch_size = data_size steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) - nemo.logging.info(f"Steps_per_epoch = {steps_per_epoch}") + logging.info(f"Steps_per_epoch = {steps_per_epoch}") hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) @@ -164,11 +179,11 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod # Create callback to save checkpoints ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( @@ -176,5 +191,5 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod callbacks=[train_callback, eval_callback, ckpt_callback], lr_policy=lr_policy_fn, optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay}, ) diff --git a/examples/nlp/token_classification.py b/examples/nlp/token_classification.py index 43749c299e05..3a88bad1a958 100644 --- a/examples/nlp/token_classification.py +++ b/examples/nlp/token_classification.py @@ -1,22 +1,35 @@ -# pylint: disable=invalid-name +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import argparse import json import os -import sys -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, TokenClassificationLoss, TokenClassifier -from nemo.collections.nlp.data.datasets import utils -from nemo.collections.nlp.utils.callbacks.token_classification import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.utils.common_nlp_utils +from nemo import logging +from nemo.collections.nlp.callbacks.token_classification_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer +from nemo.collections.nlp.nm.data_layers import BertTokenClassificationDataLayer +from nemo.collections.nlp.nm.losses import TokenClassificationLoss +from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser( - description="Token classification\ - with pretrained BERT" -) +parser = argparse.ArgumentParser(description="Token classification with pretrained BERT") parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=8, type=int) parser.add_argument("--max_seq_length", default=128, type=int) @@ -37,54 +50,41 @@ parser.add_argument("--shuffle_data", action='store_false') parser.add_argument("--pretrained_bert_model", default="bert-base-cased", type=str) parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", -) +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( "--tokenizer_model", default="tokenizer.model", type=str, - help="Path to pretrained tokenizer model, \ - only used if --tokenizer is sentencepiece", + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", ) parser.add_argument( "--tokenizer", default="nemobert", type=str, choices=["nemobert", "sentencepiece"], - help="tokenizer to use, \ - only relevant when using custom pretrained checkpoint.", + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) parser.add_argument( "--work_dir", default='output', type=str, - help="The output directory where the model prediction\ - and checkpoints will be written.", -) -parser.add_argument( - "--use_cache", action='store_true', help="Whether to cache preprocessed data", + help="The output directory where the model prediction and checkpoints will be written.", ) +parser.add_argument("--use_cache", action='store_true', help="Whether to cache preprocessed data") parser.add_argument( "--save_epoch_freq", default=1, type=int, - help="Frequency of saving checkpoint\ - '-1' - step checkpoint won't be saved", + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", ) parser.add_argument( "--save_step_freq", default=-1, type=int, - help="Frequency of saving checkpoint \ - '-1' - step checkpoint won't be saved", -) -parser.add_argument( - "--loss_step_freq", default=250, type=int, help="Frequency of printing loss", -) -parser.add_argument( - "--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss", + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", ) 
+parser.add_argument("--loss_step_freq", default=250, type=int, help="Frequency of printing loss") +parser.add_argument("--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss") args = parser.parse_args() @@ -106,17 +106,17 @@ add_time_to_log_dir=True, ) -nemo.logging.info(args) +logging.info(args) output_file = f'{nf.work_dir}/output.txt' if args.bert_checkpoint is None: """ Use this if you're using a standard BERT model. To see the list of pretrained models, call: - nemo_nlp.huggingface.BERT.list_pretrained_models() + nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models() """ tokenizer = NemoBertTokenizer(args.pretrained_bert_model) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) else: """ Use this if you're using a BERT model that you pre-trained yourself. """ @@ -130,25 +130,20 @@ if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) - nemo.logging.info(f"Model restored from {args.bert_checkpoint}") + logging.info(f"Model restored from {args.bert_checkpoint}") hidden_size = model.hidden_size -classifier = "TokenClassifier" -task_loss = "TokenClassificationLoss" - def create_pipeline( - num_samples=-1, pad_label=args.none_label, max_seq_length=args.max_seq_length, batch_size=args.batch_size, - local_rank=args.local_rank, num_gpus=args.num_gpus, mode='train', label_ids=None, @@ -157,10 +152,10 @@ def create_pipeline( use_cache=args.use_cache, dropout=args.fc_dropout, num_layers=args.num_fc_layers, + classifier=TokenClassifier, ): - global classifier, task_loss - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") shuffle = args.shuffle_data if mode == 'train' else False text_file = f'{args.data_dir}/text_{mode}.txt' @@ -179,7 +174,7 @@ def create_pipeline( [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' 
) - data_layer = nemo_nlp.BertTokenClassificationDataLayer( + data_layer = BertTokenClassificationDataLayer( tokenizer=tokenizer, text_file=text_file, label_file=label_file, @@ -187,54 +182,49 @@ def create_pipeline( label_ids=label_ids, max_seq_length=max_seq_length, batch_size=batch_size, - num_workers=0, - local_rank=local_rank, shuffle=shuffle, ignore_extra_tokens=ignore_extra_tokens, ignore_start_end=ignore_start_end, use_cache=use_cache, ) - (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels,) = data_layer() + (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels) = data_layer() if mode == 'train': label_ids = data_layer.dataset.label_ids class_weights = None if args.use_weighted_loss: - nemo.logging.info(f"Using weighted loss") + logging.info(f"Using weighted loss") label_freqs = data_layer.dataset.label_frequencies - class_weights = utils.calc_class_weights(label_freqs) + class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(label_freqs) - nemo.logging.info(f"class_weights: {class_weights}") + logging.info(f"class_weights: {class_weights}") - classifier = getattr(sys.modules[__name__], classifier) classifier = classifier( - hidden_size=hidden_size, num_classes=len(label_ids), dropout=dropout, num_layers=num_layers, + hidden_size=hidden_size, num_classes=len(label_ids), dropout=dropout, num_layers=num_layers ) - task_loss = getattr(sys.modules[__name__], task_loss) - task_loss = task_loss(num_classes=len(label_ids), class_weights=class_weights) - - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + task_loss = TokenClassificationLoss(num_classes=len(label_ids), class_weights=class_weights) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) - loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask) - - steps_per_epoch = len(data_layer) // (batch_size * num_gpus) if mode == 'train': + loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask) + steps_per_epoch = len(data_layer) // (batch_size * num_gpus) tensors_to_evaluate = [loss, logits] + return tensors_to_evaluate, loss, steps_per_epoch, label_ids, classifier else: tensors_to_evaluate = [logits, labels, subtokens_mask] - return tensors_to_evaluate, loss, steps_per_epoch, label_ids, data_layer + return tensors_to_evaluate, data_layer -train_tensors, train_loss, steps_per_epoch, label_ids, _ = create_pipeline() +train_tensors, train_loss, steps_per_epoch, label_ids, classifier = create_pipeline() -eval_tensors, _, _, _, data_layer = create_pipeline(mode='dev', label_ids=label_ids) +eval_tensors, data_layer = create_pipeline(mode='dev', label_ids=label_ids, classifier=classifier) -nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") +logging.info(f"steps_per_epoch = {steps_per_epoch}") # Create trainer and execute training action train_callback = nemo.core.SimpleLossLoggerCallback( @@ -253,11 +243,11 @@ def create_pipeline( ) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) 
nf.train( diff --git a/examples/nlp/token_classification_infer.py b/examples/nlp/token_classification_infer.py index ae272f86d210..642be6d149b1 100644 --- a/examples/nlp/token_classification_infer.py +++ b/examples/nlp/token_classification_infer.py @@ -1,13 +1,30 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import os import numpy as np -from sklearn.metrics import classification_report import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer -from nemo.collections.nlp.utils.nlp_utils import get_vocab +from nemo import logging +from nemo.collections.nlp.data import NemoBertTokenizer +from nemo.collections.nlp.nm.trainables import TokenClassifier +from nemo.collections.nlp.utils.common_nlp_utils import get_vocab # Parsing arguments parser = argparse.ArgumentParser(description='NER with pretrained BERT') @@ -46,7 +63,7 @@ raise ValueError(f'Dictionary with ids to labels not found at {args.labels_dict}') nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None ) labels_dict = get_vocab(args.labels_dict) @@ -55,35 +72,31 @@ See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size tokenizer = NemoBertTokenizer(args.pretrained_bert_model) -data_layer = nemo_nlp.BertTokenClassificationInferDataLayer( - queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer( + queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1 ) -classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=len(labels_dict), dropout=args.fc_dropout,) +classifier = TokenClassifier(hidden_size=hidden_size, num_classes=len(labels_dict), dropout=args.fc_dropout) input_ids, input_type_ids, input_mask, _, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) +hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) ########################################################################### # Instantiate an optimizer to perform `infer` action -evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask], 
checkpoint_dir=args.work_dir,) +evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask], checkpoint_dir=args.work_dir) def concatenate(lists): return np.concatenate([t.cpu() for t in lists]) -def get_preds(logits): - return np.argmax(logits, 1) - - def add_brackets(text, add=args.add_brackets): return '[' + text + ']' if add else text @@ -93,7 +106,7 @@ def add_brackets(text, add=args.add_brackets): preds = np.argmax(logits, axis=2) for i, query in enumerate(args.queries): - nemo.logging.info(f'Query: {query}') + logging.info(f'Query: {query}') pred = preds[i][subtokens_mask[i] > 0.5] words = query.strip().split() @@ -108,4 +121,4 @@ def add_brackets(text, add=args.add_brackets): label = add_brackets(label) output += label output += ' ' - nemo.logging.info(f'Combined: {output.strip()}') + logging.info(f'Combined: {output.strip()}') diff --git a/examples/tts/tacotron2.py b/examples/tts/tacotron2.py index 2980ddf3e701..332da22e0be5 100644 --- a/examples/tts/tacotron2.py +++ b/examples/tts/tacotron2.py @@ -164,7 +164,7 @@ def create_train_dag( # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( - tensors=[loss_t, spec_target, mel_postnet, gate, gate_target, alignments,], + tensors=[loss_t, spec_target, mel_postnet, gate, gate_target, alignments], print_func=lambda x: nemo.logging.info(f"Loss: {x[0].data}"), log_to_tb_func=partial(tacotron2_log_to_tb_func, log_images=True, log_images_freq=log_freq), tb_writer=neural_factory.tb_writer, diff --git a/nemo/backends/pytorch/common/rnn.py b/nemo/backends/pytorch/common/rnn.py index dd0c3c9474ca..4d87a2cca196 100644 --- a/nemo/backends/pytorch/common/rnn.py +++ b/nemo/backends/pytorch/common/rnn.py @@ -1,4 +1,18 @@ -__all__ = ['DecoderRNN'] +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import random @@ -11,6 +25,8 @@ from nemo.core import AxisType from nemo.utils.misc import pad_to +__all__ = ['DecoderRNN', 'EncoderRNN'] + class DecoderRNN(TrainableNM): """Simple RNN-based decoder with attention. @@ -203,3 +219,95 @@ def forward_cl(self, targets, encoder_outputs=None): attention_weights = None return log_probs, attention_weights + + +class EncoderRNN(TrainableNM): + """ Simple RNN-based encoder using GRU cells """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + targets: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + encoder_outputs: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return { + 'inputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + 'input_lens': NeuralType({0: AxisType(BatchTag),}, optional=True), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + log_probs: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + attention_weights: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(TimeTag) + """ + return { + 'outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + 'hidden': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + } + + def __init__( + self, input_dim, emb_dim, hid_dim, dropout, n_layers=1, pad_idx=1, embedding_to_load=None, sum_hidden=True + ): + super().__init__() + self.dropout = nn.Dropout(dropout) + self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx) + if embedding_to_load is not None: + self.embedding.weight.data.copy_(embedding_to_load) + else: + self.embedding.weight.data.normal_(0, 0.1) + self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True, dropout=dropout, bidirectional=True) + self.sum_hidden = sum_hidden + self.to(self._device) + + def forward(self, inputs, input_lens=None): + embedded = self.embedding(inputs) + embedded = self.dropout(embedded) + if input_lens is not None: + embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lens, batch_first=True) + + outputs, hidden = self.rnn(embedded) + # outputs of shape (seq_len, batch, num_directions * hidden_size) + # hidden of shape (num_layers * num_directions, batch, hidden_size) + if input_lens is not None: + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) + else: + outputs = outputs.transpose(0, 1) + # outputs of shape: (batch, seq_len, num_directions * hidden_size) + + batch_size = hidden.size()[1] + + # separate final hidden states by layer and direction + hidden = hidden.view(self.rnn.num_layers, 2 if self.rnn.bidirectional else 1, batch_size, self.rnn.hidden_size) + hidden = hidden.transpose(2, 0).transpose(1, 2) + # hidden shape: batch x num_layer x num_directions x hidden_size + if self.sum_hidden and self.rnn.bidirectional: + hidden = hidden[:, :, 0, :] + hidden[:, :, 1, :] + outputs = outputs[:, :, : self.rnn.hidden_size] + outputs[:, :, self.rnn.hidden_size :] + else: + hidden = hidden.reshape(batch_size, self.rnn.num_layers, -1) + # hidden is now of shape (batch, num_layer, [num_directions] * hidden_size) + + return outputs, hidden diff --git a/nemo/backends/pytorch/nm.py b/nemo/backends/pytorch/nm.py index 0a92cfe5cdc9..e759035f6a9d 100644 --- a/nemo/backends/pytorch/nm.py +++ b/nemo/backends/pytorch/nm.py @@ -205,7 +205,7 @@ def __init__(self): # (when the time for that will come;)) self._batch_size = 1 self._num_workers = os.cpu_count() # Use all CPUs by default. - self._shuffle = True # Shuffle by default. + self._shuffle = False # Don't shuffle by default. @property def input_ports(self): diff --git a/nemo/collections/nlp/__init__.py b/nemo/collections/nlp/__init__.py index 33c4a8aea2b2..06f6cd875da6 100644 --- a/nemo/collections/nlp/__init__.py +++ b/nemo/collections/nlp/__init__.py @@ -1,4 +1,5 @@ -# Copyright 2019 NVIDIA. All Rights Reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +14,9 @@ # limitations under the License. 
# ============================================================================= -import nemo -from .data import * -from .huggingface import * -from .modules import * -from .transformer import * +import nemo.collections.nlp.callbacks +import nemo.collections.nlp.data +import nemo.collections.nlp.nm +import nemo.collections.nlp.utils backend = nemo.core.Backend.PyTorch diff --git a/nemo/collections/nlp/callbacks/__init__.py b/nemo/collections/nlp/callbacks/__init__.py new file mode 100644 index 000000000000..ada8ad45abe2 --- /dev/null +++ b/nemo/collections/nlp/callbacks/__init__.py @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.callbacks.glue_benchmark_callback import * +from nemo.collections.nlp.callbacks.joint_intent_slot_callback import * +from nemo.collections.nlp.callbacks.lm_bert_callback import * +from nemo.collections.nlp.callbacks.lm_transformer_callback import * +from nemo.collections.nlp.callbacks.machine_translation_callback import * +from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import * +from nemo.collections.nlp.callbacks.qa_squad_callback import * +from nemo.collections.nlp.callbacks.text_classification_callback import * +from nemo.collections.nlp.callbacks.token_classification_callback import * diff --git a/nemo/collections/nlp/utils/callbacks/glue.py b/nemo/collections/nlp/callbacks/glue_benchmark_callback.py similarity index 80% rename from nemo/collections/nlp/utils/callbacks/glue.py rename to nemo/collections/nlp/callbacks/glue_benchmark_callback.py index 3edb95fe6ea9..1368284d66fd 100644 --- a/nemo/collections/nlp/utils/callbacks/glue.py +++ b/nemo/collections/nlp/callbacks/glue_benchmark_callback.py @@ -19,8 +19,6 @@ Some transformer of this code were adapted from the HuggingFace library at https://github.com/huggingface/transformers """ -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - import os import random @@ -28,7 +26,10 @@ from scipy.stats import pearsonr, spearmanr from sklearn.metrics import f1_score, matthews_corrcoef -import nemo +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -46,16 +47,16 @@ def eval_iter_callback(tensors, global_vars): if 'logits' in kv: for v_tensor in v: for logit_tensor in v_tensor: - logits_lists.append(logit_tensor.detach().cpu().tolist()) + logits_lists.append(tensor2list(logit_tensor)) # for GLUE STS-B task (regression) elif 'preds' in kv: for v_tensor in v: for pred_tensor in v_tensor: - preds_lists.append(pred_tensor.detach().cpu().tolist()) + preds_lists.append(tensor2list(pred_tensor)) if 'labels' in kv: for v_tensor in v: for label_tensor 
in v_tensor: - labels_lists.append(label_tensor.detach().cpu().tolist()) + labels_lists.append(tensor2list(label_tensor)) if len(logits_lists) > 0: preds = list(np.argmax(np.asarray(logits_lists), 1)) @@ -66,21 +67,19 @@ def eval_iter_callback(tensors, global_vars): global_vars["all_labels"].extend(labels_lists) -def list2str(l): - return ' '.join([str(j) for j in l]) - - def eval_epochs_done_callback(global_vars, output_dir, task_name): labels = np.asarray(global_vars['all_labels']) preds = np.asarray(global_vars['all_preds']) + # print predictions and labels for a small random subset of data + sample_size = 20 i = 0 - if preds.shape[0] > 21: - i = random.randint(0, preds.shape[0] - 21) + if preds.shape[0] > sample_size + 1: + i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Task name: %s" % task_name.upper()) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + 20])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + 20])) + logging.info("Task name: %s" % task_name.upper()) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) results = compute_metrics(task_name, preds, labels) @@ -89,7 +88,7 @@ def eval_epochs_done_callback(global_vars, output_dir, task_name): f.write('labels\t' + list2str(labels) + '\n') f.write('preds\t' + list2str(preds) + '\n') - nemo.logging.info(results) + logging.info(results) return results @@ -111,11 +110,7 @@ def mcc(preds, labels): def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] - return { - "pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": (pearson_corr + spearman_corr) / 2, - } + return {"pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2} def compute_metrics(task_name, preds, labels): diff --git a/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py b/nemo/collections/nlp/callbacks/joint_intent_slot_callback.py similarity index 63% rename from nemo/collections/nlp/utils/callbacks/joint_intent_slot.py rename to nemo/collections/nlp/callbacks/joint_intent_slot_callback.py index 79db8a709f20..b3f49c5e33fb 100644 --- a/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py +++ b/nemo/collections/nlp/callbacks/joint_intent_slot_callback.py @@ -1,23 +1,30 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= -import os import random -import time -import matplotlib import numpy as np -from matplotlib import pyplot as plt -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import classification_report -import nemo +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list __all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] -def tensor2list(tensor): - return tensor.detach().cpu().tolist() - - def eval_iter_callback(tensors, global_vars, eval_data_layer): if "all_intent_preds" not in global_vars.keys(): global_vars["all_intent_preds"] = [] @@ -68,10 +75,6 @@ def eval_iter_callback(tensors, global_vars, eval_data_layer): global_vars["all_subtokens_mask"].extend(all_subtokens_mask) -def list2str(l): - return ' '.join([str(j) for j in l]) - - def eval_epochs_done_callback(global_vars, graph_fold): intent_labels = np.asarray(global_vars['all_intent_labels']) intent_preds = np.asarray(global_vars['all_intent_preds']) @@ -83,38 +86,31 @@ def eval_epochs_done_callback(global_vars, graph_fold): slot_labels = slot_labels[subtokens_mask] slot_preds = slot_preds[subtokens_mask] + # print predictions and labels for a small random subset of data + sample_size = 20 i = 0 - if intent_preds.shape[0] > 21: - i = random.randint(0, intent_preds.shape[0] - 21) - nemo.logging.info("Sampled i_preds: [%s]" % list2str(intent_preds[i : i + 20])) - nemo.logging.info("Sampled intents: [%s]" % list2str(intent_labels[i : i + 20])) - nemo.logging.info("Sampled s_preds: [%s]" % list2str(slot_preds[i : i + 20])) - nemo.logging.info("Sampled slots: [%s]" % list2str(slot_labels[i : i + 20])) - cm = confusion_matrix(intent_labels, intent_preds) - nemo.logging.info(f'Confusion matrix:\n{cm}') - fig = plt.figure() - ax = fig.add_subplot(111) - cax = ax.matshow(cm) - plt.title('Confusion matrix of the classifier') - fig.colorbar(cax) - plt.xlabel('Predicted') - plt.ylabel('True') - os.makedirs(graph_fold, exist_ok=True) - plt.savefig(os.path.join(graph_fold, time.strftime('%Y%m%d-%H%M%S'))) - - nemo.logging.info('Intent prediction results') + if intent_preds.shape[0] > sample_size + 1: + i = random.randint(0, intent_preds.shape[0] - sample_size - 1) + logging.info("Sampled i_preds: [%s]" % list2str(intent_preds[i : i + sample_size])) + logging.info("Sampled intents: [%s]" % list2str(intent_labels[i : i + sample_size])) + logging.info("Sampled s_preds: [%s]" % list2str(slot_preds[i : i + sample_size])) + logging.info("Sampled slots: [%s]" % list2str(slot_labels[i : i + sample_size])) + + plot_confusion_matrix(intent_labels, intent_preds, graph_fold) + + logging.info('Intent prediction results') correct_preds = sum(intent_labels == intent_preds) intent_accuracy = correct_preds / intent_labels.shape[0] - nemo.logging.info(f'Intent accuracy: {intent_accuracy}') - nemo.logging.info( + logging.info(f'Intent accuracy: {intent_accuracy}') + logging.info( f'Classification report:\n \ {classification_report(intent_labels, intent_preds)}' ) - nemo.logging.info('Slot prediction results') + logging.info('Slot prediction results') slot_accuracy = sum(slot_labels == slot_preds) / slot_labels.shape[0] - nemo.logging.info(f'Slot accuracy: {slot_accuracy}') - nemo.logging.info( + logging.info(f'Slot accuracy: {slot_accuracy}') + logging.info( f'Classification report:\n \ {classification_report(slot_labels[:-2], slot_preds[:-2])}' ) diff --git 
a/nemo/collections/nlp/utils/callbacks/bert_pretraining.py b/nemo/collections/nlp/callbacks/lm_bert_callback.py similarity index 52% rename from nemo/collections/nlp/utils/callbacks/bert_pretraining.py rename to nemo/collections/nlp/callbacks/lm_bert_callback.py index baeaabe2d701..e31f964a22da 100644 --- a/nemo/collections/nlp/utils/callbacks/bert_pretraining.py +++ b/nemo/collections/nlp/callbacks/lm_bert_callback.py @@ -1,9 +1,24 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import numpy as np -import nemo +from nemo import logging + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -24,14 +39,14 @@ def eval_iter_callback(tensors, global_vars): def eval_epochs_done_callback(global_vars): if 'dev_mlm_loss' in global_vars: mlm_loss = np.mean(global_vars["dev_mlm_loss"]) - nemo.logging.info("Dev MLM perplexity: {0}".format(np.round(np.exp(mlm_loss), 3))) + logging.info("Dev MLM perplexity: {0}".format(np.round(np.exp(mlm_loss), 3))) global_vars["dev_mlm_loss"] = [] else: mlm_loss = -123.0 if 'dev_nsp_loss' in global_vars: nsp_loss = np.mean(global_vars["dev_nsp_loss"]) - nemo.logging.info("Dev NSP perplexity: {0}".format(np.round(np.exp(nsp_loss), 3))) + logging.info("Dev NSP perplexity: {0}".format(np.round(np.exp(nsp_loss), 3))) global_vars["dev_nsp_loss"] = [] else: nsp_loss = -123.0 diff --git a/nemo/collections/nlp/callbacks/lm_transformer_callback.py b/nemo/collections/nlp/callbacks/lm_transformer_callback.py new file mode 100644 index 000000000000..344873c216d0 --- /dev/null +++ b/nemo/collections/nlp/callbacks/lm_transformer_callback.py @@ -0,0 +1,46 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import numpy as np + +from nemo import logging + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +GLOBAL_KEYS = ["eval_loss", "sys"] + + +def eval_iter_callback(tensors, global_vars): + for key in GLOBAL_KEYS: + if key not in global_vars.keys(): + global_vars[key] = [] + + for kv, v in tensors.items(): + if "loss" in kv: + for eval_loss in v: + global_vars["eval_loss"].append(eval_loss.item()) + + +def eval_epochs_done_callback(global_vars): + eval_loss = np.mean(global_vars["eval_loss"]) + eval_ppl = np.exp(eval_loss) + + logging.info("------------------------------------------------------") + logging.info("Eval loss: {0}".format(np.round(eval_loss, 3))) + logging.info("Eval ppl: {0}".format(np.round(eval_ppl, 3))) + logging.info("------------------------------------------------------") + for key in GLOBAL_KEYS: + global_vars[key] = [] + return dict({"Eval_loss": eval_loss, "Eval_ppl": eval_ppl}) diff --git a/nemo/collections/nlp/utils/callbacks/translation.py b/nemo/collections/nlp/callbacks/machine_translation_callback.py similarity index 64% rename from nemo/collections/nlp/utils/callbacks/translation.py rename to nemo/collections/nlp/callbacks/machine_translation_callback.py index 02f168de00c1..e0a885f3bf4c 100644 --- a/nemo/collections/nlp/utils/callbacks/translation.py +++ b/nemo/collections/nlp/callbacks/machine_translation_callback.py @@ -1,10 +1,26 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import numpy as np -from ..metrics.sacrebleu import corpus_bleu +from nemo import logging from nemo.collections.asr.metrics import word_error_rate +from nemo.collections.nlp.metrics.sacrebleu import corpus_bleu + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] GLOBAL_KEYS = ["eval_loss", "ref", "sys", "sent_ids", "nonpad_tokens"] @@ -64,19 +80,19 @@ def eval_epochs_done_callback(global_vars, validation_dataset=None): for i in range(3): sent_id = np.random.randint(len(all_sys)) - print("Ground truth: {0}\n".format(all_ref[0][sent_id])) - print("Translation: {0}\n".format(all_sys[sent_id])) + logging.info("Ground truth: {0}\n".format(all_ref[0][sent_id])) + logging.info("Translation: {0}\n".format(all_sys[sent_id])) - print("------------------------------------------------------------") - print("Validation loss: {0}".format(np.round(eval_loss, 3))) - print("TokenBLEU: {0}".format(np.round(token_bleu, 2))) - print("SacreBLEU: {0}".format(np.round(sacre_bleu, 2))) - print("------------------------------------------------------------") + logging.info("------------------------------------------------------------") + logging.info("Validation loss: {0}".format(np.round(eval_loss, 3))) + logging.info("TokenBLEU: {0}".format(np.round(token_bleu, 2))) + logging.info("SacreBLEU: {0}".format(np.round(sacre_bleu, 2))) + logging.info("------------------------------------------------------------") for key in GLOBAL_KEYS: global_vars[key] = [] - metrics = dict({"eval_loss": eval_loss, "token_bleu": token_bleu, "sacre_bleu": sacre_bleu,}) + metrics = dict({"eval_loss": eval_loss, "token_bleu": token_bleu, "sacre_bleu": sacre_bleu}) return metrics @@ -94,11 +110,11 @@ def eval_epochs_done_callback_wer(global_vars): eval_wer = word_error_rate(ref, sys) for i in range(3): sent_id = np.random.randint(len(sys)) - print("Ground truth: {0}\n".format(ref[sent_id])) - print("Translation: {0}\n".format(sys[sent_id])) + logging.info("Ground truth: {0}\n".format(ref[sent_id])) + logging.info("Translation: {0}\n".format(sys[sent_id])) - print("Validation loss: {0}".format(np.round(eval_loss, 3))) - print("Validation WER: {0}".format(eval_wer)) + logging.info("Validation loss: {0}".format(np.round(eval_loss, 3))) + logging.info("Validation WER: {0}".format(eval_wer)) global_vars["eval_loss"] = [] global_vars["ref"] = [] global_vars["sys"] = [] diff --git a/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py b/nemo/collections/nlp/callbacks/punctuation_capitalization_callback.py similarity index 70% rename from nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py rename to nemo/collections/nlp/callbacks/punctuation_capitalization_callback.py index a3f8d01add15..dc76015d7363 100644 --- a/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py +++ b/nemo/collections/nlp/callbacks/punctuation_capitalization_callback.py @@ -1,14 +1,28 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import random import numpy as np from sklearn.metrics import classification_report -import nemo -from nemo.collections.nlp.data.datasets.utils import list2str, tensor2list -from nemo.collections.nlp.utils.nlp_utils import plot_confusion_matrix +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -64,9 +78,7 @@ def eval_iter_callback(tensors, global_vars): global_vars["all_subtokens_mask"].extend(all_subtokens_mask) -def eval_epochs_done_callback( - global_vars, punct_label_ids, capit_label_ids, graph_fold=None, normalize_cm=True, -): +def eval_epochs_done_callback(global_vars, punct_label_ids, capit_label_ids, graph_fold=None, normalize_cm=True): ''' Args: graph_fold (str): path to output folder @@ -78,10 +90,7 @@ def eval_epochs_done_callback( capit_accuracy = _eval_epochs_done_callback('capit', global_vars, capit_label_ids, graph_fold, normalize_cm) - return { - "Punctuation_task_accuracy": punct_accuracy, - "Capitalization_task_accuracy": capit_accuracy, - } + return {"Punctuation_task_accuracy": punct_accuracy, "Capitalization_task_accuracy": capit_accuracy} def _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold=None, normalize_cm=True): @@ -93,25 +102,23 @@ def _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold=Non preds = preds[subtokens_mask] accuracy = sum(labels == preds) / labels.shape[0] - nemo.logging.info(f'Accuracy for task {task_name}: {accuracy}') + logging.info(f'Accuracy for task {task_name}: {accuracy}') # print predictions and labels for a small random subset of data sample_size = 20 i = 0 if preds.shape[0] > sample_size + 1: i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) # remove labels from label_ids that don't appear in the dev set used_labels = set(labels) | set(preds) label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - nemo.logging.info(classification_report(labels, preds, target_names=label_ids)) + logging.info(classification_report(labels, preds, target_names=label_ids)) # calculate and plot confusion_matrix if graph_fold: - plot_confusion_matrix( - label_ids, labels, preds, graph_fold, normalize=normalize_cm, prefix=task_name, - ) + plot_confusion_matrix(labels, preds, graph_fold, label_ids, normalize=normalize_cm, prefix=task_name) return accuracy diff --git a/nemo/collections/nlp/utils/callbacks/squad.py b/nemo/collections/nlp/callbacks/qa_squad_callback.py similarity index 67% rename from nemo/collections/nlp/utils/callbacks/squad.py rename to 
nemo/collections/nlp/callbacks/qa_squad_callback.py index 5f87132bc7e4..321999d902ba 100644 --- a/nemo/collections/nlp/utils/callbacks/squad.py +++ b/nemo/collections/nlp/callbacks/qa_squad_callback.py @@ -1,18 +1,22 @@ -""" -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +from nemo import logging - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -63,7 +67,7 @@ def eval_epochs_done_callback( do_lower_case=do_lower_case, ) - print(f"Exact_match = {exact_match}, f1 = {f1}") + logging.info(f"Exact_match = {exact_match}, f1 = {f1}") global_vars["eval_unique_ids"] = [] global_vars["eval_start_logits"] = [] diff --git a/nemo/collections/nlp/callbacks/state_tracking_trade_callback.py b/nemo/collections/nlp/callbacks/state_tracking_trade_callback.py new file mode 100644 index 000000000000..01cc43047d36 --- /dev/null +++ b/nemo/collections/nlp/callbacks/state_tracking_trade_callback.py @@ -0,0 +1,103 @@ +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import numpy as np +import torch + +from nemo import logging + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] + + +def eval_iter_callback(tensors, global_vars, data_desc): + + if 'loss' not in global_vars: + global_vars['loss'] = [] + if 'comp_res' not in global_vars: + global_vars['comp_res'] = [] + if 'gating_labels' not in global_vars: + global_vars['gating_labels'] = [] + if 'gating_preds' not in global_vars: + global_vars['gating_preds'] = [] + + for kv, v in tensors.items(): + if kv.startswith('loss'): + loss_numpy = v[0].cpu().numpy() + global_vars['loss'].append(loss_numpy) + if kv.startswith('point_outputs'): + point_outputs = v[0] + if kv.startswith('gate_outputs'): + gate_outputs = v[0] + if kv.startswith('gating_labels'): + gating_labels = v[0].cpu().numpy() + global_vars['gating_labels'].extend(gating_labels) + if kv.startswith('tgt_ids'): + tgt_ids = v[0] + + point_outputs_max = torch.argmax(point_outputs, dim=-1) + mask_paddings = tgt_ids == data_desc.vocab.pad_id + comp_res = (point_outputs_max == tgt_ids) | mask_paddings + comp_res = torch.all(comp_res, axis=-1, keepdims=False) + + global_vars['comp_res'].extend(comp_res.cpu().numpy()) + global_vars['gating_preds'].extend(torch.argmax(gate_outputs, axis=-1).cpu().numpy()) + + +def eval_epochs_done_callback(global_vars, data_desc): + joint_acc, turn_acc = evaluate_metrics( + global_vars['comp_res'], + global_vars['gating_labels'], + global_vars['gating_preds'], + data_desc.gating_dict["ptr"], + ) + + gating_comp_flatten = (np.asarray(global_vars['gating_labels']) == np.asarray(global_vars['gating_preds'])).ravel() + gating_acc = np.sum(gating_comp_flatten) / len(gating_comp_flatten) + + evaluation_metrics = {"Joint_Goal_Acc": joint_acc, "Turn_Acc": turn_acc, "Gate_Acc": gating_acc} + logging.info(evaluation_metrics) + + return evaluation_metrics + + +def evaluate_metrics(comp_res, gating_labels, gating_preds, ptr_code): + # TODO: Calculate precision, recall, and F1 + total_slots = 0 + correct_slots = 0 + total_turns = 0 + correct_turns = 0 + for result_idx, result in enumerate(comp_res): + turn_wrong = False + total_turns += 1 + for slot_idx, slot_eq in enumerate(result): + total_slots += 1 + if gating_labels[result_idx][slot_idx] == ptr_code: + if slot_eq: + correct_slots += 1 + else: + turn_wrong = True + elif gating_labels[result_idx][slot_idx] == gating_preds[result_idx][slot_idx] or ( + slot_eq and gating_preds[result_idx][slot_idx] == ptr_code + ): + correct_slots += 1 + else: + turn_wrong = True + if not turn_wrong: + correct_turns += 1 + + turn_acc = correct_slots / float(total_slots) if total_slots != 0 else 0 + joint_acc = correct_turns / float(total_turns) if total_turns != 0 else 0 + return joint_acc, turn_acc diff --git a/nemo/collections/nlp/callbacks/text_classification_callback.py b/nemo/collections/nlp/callbacks/text_classification_callback.py new file mode 100644 index 000000000000..14b89d8e57e7 --- /dev/null +++ b/nemo/collections/nlp/callbacks/text_classification_callback.py @@ -0,0 +1,68 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import random + +import numpy as np +from sklearn.metrics import classification_report + +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] + + +def eval_iter_callback(tensors, global_vars, eval_data_layer): + if "all_preds" not in global_vars.keys(): + global_vars["all_preds"] = [] + if "all_labels" not in global_vars.keys(): + global_vars["all_labels"] = [] + + logits_lists = [] + labels_lists = [] + + for kv, v in tensors.items(): + if 'logits' in kv: + for v_tensor in v: + for logit_tensor in v_tensor: + logits_lists.append(tensor2list(logit_tensor)) + + if 'labels' in kv: + for v_tensor in v: + for label_tensor in v_tensor: + labels_lists.append(tensor2list(label_tensor)) + + preds = list(np.argmax(np.asarray(logits_lists), 1)) + global_vars["all_preds"].extend(preds) + global_vars["all_labels"].extend(labels_lists) + + +def eval_epochs_done_callback(global_vars, graph_fold): + labels = np.asarray(global_vars['all_labels']) + preds = np.asarray(global_vars['all_preds']) + accuracy = sum(labels == preds) / labels.shape[0] + logging.info(f'Accuracy: {accuracy}') + + # print predictions and labels for a small random subset of data + sample_size = 20 + i = 0 + if preds.shape[0] > sample_size + 1: + i = random.randint(0, preds.shape[0] - sample_size - 1) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) + plot_confusion_matrix(labels, preds, graph_fold) + logging.info(classification_report(labels, preds)) + return dict({"accuracy": accuracy}) diff --git a/nemo/collections/nlp/utils/callbacks/token_classification.py b/nemo/collections/nlp/callbacks/token_classification_callback.py similarity index 64% rename from nemo/collections/nlp/utils/callbacks/token_classification.py rename to nemo/collections/nlp/callbacks/token_classification_callback.py index 20d3036118f1..0f4d3c545622 100644 --- a/nemo/collections/nlp/utils/callbacks/token_classification.py +++ b/nemo/collections/nlp/callbacks/token_classification_callback.py @@ -1,14 +1,28 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import random import numpy as np from sklearn.metrics import classification_report -import nemo -from nemo.collections.nlp.data.datasets.utils import list2str, tensor2list -from nemo.collections.nlp.utils.nlp_utils import plot_confusion_matrix +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -52,24 +66,24 @@ def eval_epochs_done_callback(global_vars, label_ids, graph_fold=None, none_labe preds = preds[subtokens_mask] accuracy = sum(labels == preds) / labels.shape[0] - nemo.logging.info(f'Accuracy: {accuracy}') + logging.info(f'Accuracy: {accuracy}') # print predictions and labels for a small random subset of data sample_size = 20 i = 0 if preds.shape[0] > sample_size + 1: i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) # remove labels from label_ids that don't appear in the dev set used_labels = set(labels) | set(preds) label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - nemo.logging.info(classification_report(labels, preds, target_names=label_ids)) + logging.info(classification_report(labels, preds, target_names=label_ids)) # calculate and plot confusion_matrix if graph_fold: - plot_confusion_matrix(label_ids, labels, preds, graph_fold, normalize=normalize_cm) + plot_confusion_matrix(labels, preds, graph_fold, label_ids, normalize=normalize_cm) return dict({'Accuracy': accuracy}) diff --git a/nemo/collections/nlp/data/__init__.py b/nemo/collections/nlp/data/__init__.py index 6e6bf8956b48..87a10d8803c8 100644 --- a/nemo/collections/nlp/data/__init__.py +++ b/nemo/collections/nlp/data/__init__.py @@ -1,3 +1,18 @@ -from .data_layers import * -from .datasets import * -from .tokenizers import * +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +from nemo.collections.nlp.data.datasets import * +from nemo.collections.nlp.data.tokenizers import * diff --git a/nemo/collections/nlp/data/data_layers.py b/nemo/collections/nlp/data/data_layers.py deleted file mode 100644 index 36dac97ec98d..000000000000 --- a/nemo/collections/nlp/data/data_layers.py +++ /dev/null @@ -1,1128 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation - -# If you want to add your own data layer, you should put its name in -# __all__ so that it can be imported with 'from text_data_layers import *' - - -__all__ = [ - 'GlueDataLayerClassification', - 'GlueDataLayerRegression', - 'BertJointIntentSlotDataLayer', - 'BertJointIntentSlotInferDataLayer', - 'BertPunctuationCapitalizationDataLayer', - 'BertPunctuationCapitalizationInferDataLayer', - 'BertPretrainingDataLayer', - 'BertPretrainingPreprocessedDataLayer', - 'BertSentenceClassificationDataLayer', - 'BertTokenClassificationDataLayer', - 'BertTokenClassificationInferDataLayer', - 'BertQuestionAnsweringDataLayer', - 'LanguageModelingDataLayer', - 'TextDataLayer', - 'TranslationDataLayer', -] - -import os -import random -import sys - -import h5py -import numpy as np -import torch -from torch.utils import data as pt_data - -import nemo -from .datasets import * -from nemo.backends.pytorch.nm import DataLayerNM -from nemo.core.neural_types import * - - -class TextDataLayer(DataLayerNM): - """ - Generic Text Data Layer NM which wraps PyTorch's dataset - - Args: - dataset_type: type of dataset used for this datalayer - dataset_params (dict): all the params for the dataset - batch_size: size of batch - """ - - def __init__(self, dataset_type, dataset_params, batch_size): - super().__init__() - if isinstance(dataset_type, str): - dataset_type = getattr(sys.modules[__name__], dataset_type) - self._dataset = dataset_type(**dataset_params) - self._batch_size = batch_size - - def __len__(self): - return len(self._dataset) - - @property - def dataset(self): - return self._dataset - - @property - def data_iterator(self): - return None - - -class BertSentenceClassificationDataLayer(TextDataLayer): - """ - Creates the data layer to use for the task of sentence classification - with pretrained model. - - All the data processing is done BertSentenceClassificationDataset. - - Args: - dataset (BertSentenceClassificationDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag),}), - } - - def __init__( - self, - input_file, - tokenizer, - max_seq_length, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=BertSentenceClassificationDataset, - ): - dataset_params = { - 'input_file': input_file, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'num_samples': num_samples, - 'shuffle': shuffle, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertJointIntentSlotDataLayer(TextDataLayer): - """ - Creates the data layer to use for the task of joint intent - and slot classification with pretrained model. - - All the data processing is done in BertJointIntentSlotDataset. - - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - - Args: - dataset (BertJointIntentSlotDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - intents: - 0: AxisType(BatchTag) - - slots: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "intents": NeuralType({0: AxisType(BatchTag),}), - "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, - input_file, - slot_file, - pad_label, - tokenizer, - max_seq_length, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - dataset_type=BertJointIntentSlotDataset, - ): - dataset_params = { - 'input_file': input_file, - 'slot_file': slot_file, - 'pad_label': pad_label, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertJointIntentSlotInferDataLayer(TextDataLayer): - """ - Creates the data layer to use for the task of joint intent - and slot classification with pretrained model. This is for - - All the data processing is done in BertJointIntentSlotInferDataset. 
- - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - - Args: - dataset (BertJointIntentSlotInferDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__(self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertJointIntentSlotInferDataset): - dataset_params = { - 'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class LanguageModelingDataLayer(TextDataLayer): - """ - Data layer for standard language modeling task. - - Args: - dataset (str): path to text document with data - tokenizer (TokenizerSpec): tokenizer - max_seq_length (int): maximum allowed length of the text segments - batch_step (int): how many tokens to skip between two successive - segments of text when constructing batches - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of tokens which should be predicted from each of the - corresponding tokens in input_ids; for left-to-right language - modeling equals to input_ids shifted by 1 to the right - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, dataset, tokenizer, max_seq_length, batch_size, batch_step=128, dataset_type=LanguageModelingDataset - ): - dataset_params = { - 'dataset': dataset, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'batch_step': batch_step, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertTokenClassificationDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, - text_file, - label_file, - tokenizer, - max_seq_length, - pad_label='O', - label_ids=None, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False, - dataset_type=BertTokenClassificationDataset, - ): - dataset_params = { - 'text_file': text_file, - 'label_file': label_file, - 'max_seq_length': max_seq_length, - 'tokenizer': tokenizer, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'pad_label': pad_label, - 'label_ids': label_ids, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - 'use_cache': use_cache, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertTokenClassificationInferDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset - ): - dataset_params = { - 'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertPunctuationCapitalizationDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - punct_labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - capit_labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "punct_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "capit_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, - text_file, - label_file, - tokenizer, - max_seq_length, - pad_label='O', - punct_label_ids=None, - capit_label_ids=None, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False, - dataset_type=BertPunctuationCapitalizationDataset, - ): - dataset_params = { - 'text_file': text_file, - 'label_file': label_file, - 'max_seq_length': max_seq_length, - 'tokenizer': tokenizer, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'pad_label': pad_label, - 'punct_label_ids': punct_label_ids, - 'capit_label_ids': capit_label_ids, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - 'use_cache': use_cache, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertPunctuationCapitalizationInferDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset, - ): - dataset_params = { - 'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertQuestionAnsweringDataLayer(TextDataLayer): - """ - Creates the data layer to use for Question Answering classification task. - - Args: - data_dir (str): Directory that contains train.*.json and dev.*.json. - tokenizer (obj): Tokenizer object, e.g. NemoBertTokenizer. - version_2_with_negative (bool): True if training should allow - unanswerable questions. - doc_stride (int): When splitting up a long document into chunks, - how much stride to take between chunks. 
- max_query_length (iny): All training files which have a duration less - than min_duration are dropped. Can't be used if the `utt2dur` file - does not exist. Defaults to None. - max_seq_length (int): All training files which have a duration more - than max_duration are dropped. Can't be used if the `utt2dur` file - does not exist. Defaults to None. - mode (str): Use "train" or "dev" to define between - training and evaluation. - batch_size (int): Batch size. Defaults to 64. - dataset_type (class): Question Answering class. - Defaults to SquadDataset. - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - start_positions: - 0: AxisType(BatchTag) - - end_positions: - 0: AxisType(BatchTag) - - unique_ids: - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "start_positions": NeuralType({0: AxisType(BatchTag)}), - "end_positions": NeuralType({0: AxisType(BatchTag)}), - "unique_ids": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__( - self, - data_dir, - tokenizer, - version_2_with_negative, - doc_stride, - max_query_length, - max_seq_length, - mode="train", - batch_size=64, - dataset_type=SquadDataset, - ): - dataset_params = { - 'data_dir': data_dir, - 'mode': mode, - 'tokenizer': tokenizer, - 'version_2_with_negative': version_2_with_negative, - 'max_query_length': max_query_length, - 'max_seq_length': max_seq_length, - 'doc_stride': doc_stride, - } - - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertPretrainingDataLayer(TextDataLayer): - """ - Data layer for masked language modeling task. - - Args: - tokenizer (TokenizerSpec): tokenizer - dataset (str): directory or a single file with dataset documents - max_seq_length (int): maximum allowed length of the text segments - mask_probability (float): probability of masking input sequence tokens - batch_size (int): batch size in segments - short_seeq_prob (float): Probability of creating sequences which are - shorter than the maximum length. - Defualts to 0.1. - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: indices of token types (e.g., sentences A & B in BERT) - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_ids: indices of output tokens which should be predicted - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_mask: bool tensor with 0s in place of tokens to be excluded - from loss calculation - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of classes to be predicted from [CLS] token of text - segments (e.g, 0 or 1 in next sentence prediction task) - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64): - dataset_params = { - 'tokenizer': tokenizer, - 'dataset': dataset, - 'max_seq_length': max_seq_length, - 'mask_probability': mask_probability, - 'short_seq_prob': short_seq_prob, - } - super().__init__(BertPretrainingDataset, dataset_params, batch_size) - - -class BertPretrainingPreprocessedDataLayer(DataLayerNM): - """ - Data layer for masked language modeling task. - - Args: - tokenizer (TokenizerSpec): tokenizer - dataset (str): directory or a single file with dataset documents - max_seq_length (int): maximum allowed length of the text segments - mask_probability (float): probability of masking input sequence tokens - batch_size (int): batch size in segments - short_seeq_prob (float): Probability of creating sequences which are - shorter than the maximum length. - Defualts to 0.1. - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: indices of token types (e.g., sentences A & B in BERT) - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_ids: indices of output tokens which should be predicted - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_mask: bool tensor with 0s in place of tokens to be excluded - from loss calculation - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of classes to be predicted from [CLS] token of text - segments (e.g, 0 or 1 in next sentence prediction task) - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__(self, dataset, max_pred_length, batch_size=64, training=True): - super().__init__() - self._batch_size = batch_size - - if os.path.isdir(dataset): - self.files = [ - os.path.join(dataset, f) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f)) - ] - else: - self.files = [dataset] - self.files.sort() - self.num_files = len(self.files) - self.max_pred_length = max_pred_length - self.training = training - total_length = 0 - for f in self.files: - fp = h5py.File(f, 'r') - total_length += len(fp['input_ids']) - fp.close() - self.total_length = total_length - - def _collate_fn(self, x): - num_components = len(x[0]) - components = [[] for _ in range(num_components)] - batch_size = len(x) - for i in range(batch_size): - for j in range(num_components): - components[j].append(x[i][j]) - src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids = [np.stack(x, axis=0) for x in components] - src_ids = torch.Tensor(src_ids).long().to(self._device) - src_segment_ids = torch.Tensor(src_segment_ids).long().to(self._device) - src_mask = torch.Tensor(src_mask).long().to(self._device) - tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) - tgt_mask = torch.Tensor(tgt_mask).long().to(self._device) - sent_ids = torch.Tensor(sent_ids).long().to(self._device) - return src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids - - def __len__(self): - return self.total_length - - @property - def dataset(self): - return None - - @property - def data_iterator(self): - while True: - if self.training: - random.shuffle(self.files) - for f_id in range(self.num_files): - data_file = self.files[f_id] - train_data = BertPretrainingPreprocessedDataset( - input_file=data_file, max_pred_length=self.max_pred_length - ) - train_sampler = pt_data.RandomSampler(train_data) - train_dataloader = pt_data.DataLoader( - dataset=train_data, - batch_size=self.batch_size, - collate_fn=self._collate_fn, - shuffle=train_sampler is None, - sampler=train_sampler, - ) - for x in train_dataloader: - yield x - - -class TranslationDataLayer(TextDataLayer): - """ - Data layer for neural machine translation from source (src) language to - target (tgt) language. 
- - Args: - tokenizer_src (TokenizerSpec): source language tokenizer - tokenizer_tgt (TokenizerSpec): target language tokenizer - dataset_src (str): path to source data - dataset_tgt (str): path to target data - tokens_in_batch (int): maximum allowed number of tokens in batches, - batches will be constructed to minimize the use of tokens - clean (bool): whether to use parallel data cleaning such as removing - pairs with big difference in sentences length, removing pairs with - the same tokens in src and tgt, etc; useful for training data layer - and should not be used in evaluation data layer - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - src_ids: indices of tokens which correspond to source sentences - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - src_mask: bool tensor with 0s in place of source tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - tgt_ids: indices of tokens which correspond to target sentences - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - tgt_mask: bool tensor with 0s in place of target tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of tokens which should be predicted from each of the - corresponding target tokens in tgt_ids; for standard neural - machine translation equals to tgt_ids shifted by 1 to the right - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - sent_ids: indices of the sentences in a batch; important for - evaluation with external metrics, such as SacreBLEU - 0: AxisType(BatchTag) - - """ - return { - "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "src_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "tgt_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "sent_ids": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__( - self, - tokenizer_src, - tokenizer_tgt, - dataset_src, - dataset_tgt, - batch_size=64, - tokens_in_batch=1024, - clean=False, - dataset_type=TranslationDataset, - ): - dataset_params = { - 'tokenizer_src': tokenizer_src, - 'tokenizer_tgt': tokenizer_tgt, - 'dataset_src': dataset_src, - 'dataset_tgt': dataset_tgt, - 'tokens_in_batch': tokens_in_batch, - 'clean': clean, - } - super().__init__(dataset_type, dataset_params, batch_size) - - if self._placement == nemo.core.DeviceType.AllGpu: - sampler = pt_data.distributed.DistributedSampler(self._dataset) - else: - sampler = None - - self._dataloader = pt_data.DataLoader( - dataset=self._dataset, batch_size=1, collate_fn=self._collate_fn, shuffle=sampler is None, sampler=sampler, - ) - - def _collate_fn(self, x): - src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids = x[0] - src_ids = torch.Tensor(src_ids).long().to(self._device) - src_mask = torch.Tensor(src_mask).float().to(self._device) - tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) - tgt_mask = torch.Tensor(tgt_mask).float().to(self._device) - labels = torch.Tensor(labels).long().to(self._device) - sent_ids = torch.Tensor(sent_ids).long().to(self._device) - return src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids - - @property - def dataset(self): - return None - - @property - def data_iterator(self): - return self._dataloader - - -class GlueDataLayerClassification(TextDataLayer): - """ - Creates the data layer to use for the GLUE classification tasks, - more details here: 
https://gluebenchmark.com/tasks - - All the data processing is done in GLUEDataset. - - Args: - dataset_type (GLUEDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(CategoricalTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(CategoricalTag),}), - } - - def __init__( - self, - data_dir, - tokenizer, - max_seq_length, - processor, - evaluate=False, - token_params={}, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=GLUEDataset, - ): - dataset_params = { - 'data_dir': data_dir, - 'output_mode': 'classification', - 'processor': processor, - 'evaluate': evaluate, - 'token_params': token_params, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - - super().__init__(dataset_type, dataset_params, batch_size) - - -class GlueDataLayerRegression(TextDataLayer): - """ - Creates the data layer to use for the GLUE STS-B regression task, - more details here: https://gluebenchmark.com/tasks - - All the data processing is done in GLUEDataset. - - Args: - dataset_type (GLUEDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(RegressionTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(RegressionTag),}), - } - - def __init__( - self, - data_dir, - tokenizer, - max_seq_length, - processor, - evaluate=False, - token_params={}, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=GLUEDataset, - ): - dataset_params = { - 'data_dir': data_dir, - 'output_mode': 'regression', - 'processor': processor, - 'evaluate': evaluate, - 'token_params': token_params, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - - super().__init__(dataset_type, dataset_params, batch_size) diff --git a/nemo/collections/nlp/data/datasets/__init__.py b/nemo/collections/nlp/data/datasets/__init__.py index 3244c1266b19..c2decfb1c855 100644 --- a/nemo/collections/nlp/data/datasets/__init__.py +++ b/nemo/collections/nlp/data/datasets/__init__.py @@ -1,9 +1,38 @@ -from .bert_pretraining import BertPretrainingDataset, BertPretrainingPreprocessedDataset -from .glue import GLUEDataset -from .joint_intent_slot import BertJointIntentSlotDataset, BertJointIntentSlotInferDataset -from .language_modeling import LanguageModelingDataset -from .punctuation_capitalization import BertPunctuationCapitalizationDataset, BertPunctuationCapitalizationInferDataset -from .sentence_classification import BertSentenceClassificationDataset -from .squad import SquadDataset -from 
.token_classification import BertTokenClassificationDataset, BertTokenClassificationInferDataset -from .translation import TranslationDataset +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import ( + BertJointIntentSlotDataset, + BertJointIntentSlotInferDataset, +) +from nemo.collections.nlp.data.datasets.lm_bert_dataset import ( + BertPretrainingDataset, + BertPretrainingPreprocessedDataset, +) +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import LanguageModelingDataset +from nemo.collections.nlp.data.datasets.machine_translation_dataset import TranslationDataset +from nemo.collections.nlp.data.datasets.punctuation_capitalization_dataset import ( + BertPunctuationCapitalizationDataset, + BertPunctuationCapitalizationInferDataset, +) +from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset +from nemo.collections.nlp.data.datasets.state_tracking_trade_dataset import * +from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset +from nemo.collections.nlp.data.datasets.token_classification_dataset import ( + BertTokenClassificationDataset, + BertTokenClassificationInferDataset, +) diff --git a/nemo/collections/nlp/data/datasets/datasets_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils.py new file mode 100644 index 000000000000..f2851dd0cdce --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils.py @@ -0,0 +1,988 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import csv +import glob +import json +import os +import random +import re +import shutil +import string +import subprocess +from collections import Counter + +import numpy as np +from tqdm import tqdm + +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str +from nemo.collections.nlp.utils.common_nlp_utils import ( + get_vocab, + ids2text, + if_exist, + write_vocab, + write_vocab_in_order, +) + +__all__ = [ + 'get_label_stats', + 'process_sst_2', + 'process_imdb', + 'process_thucnews', + 'process_nlu', + 'process_twitter_airline', + 'process_atis', + 'process_jarvis_datasets', + 'process_mturk', + 'process_intent_slot_mturk', + 'get_intents_mturk', + 'get_slot_labels', + 'merge', + 'get_intent_query_files_dialogflow', + 'get_intents_slots_dialogflow', + 'get_slots_dialogflow', + 'partition_data', + 'write_files', + 'process_dialogflow', + 'write_data', + 'create_dataset', + 'read_csv', + 'process_snips', + 'get_dataset', + 'partition', + 'map_entities', + 'get_entities', + 'get_data', + 'reverse_dict', + 'get_intent_labels', + 'download_wkt2', + 'normalize_answer', + 'get_tokens', +] + +DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' +MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' + + +def get_label_stats(labels, outfile='stats.tsv'): + labels = Counter(labels) + total = sum(labels.values()) + out = open(outfile, 'w') + i = 0 + label_frequencies = labels.most_common() + for k, v in label_frequencies: + out.write(f'{k}\t{v / total}\n') + if i < 3: + logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') + i += 1 + return total, label_frequencies + + +def process_sst_2(data_dir): + if not os.path.exists(data_dir): + link = 'https://gluebenchmark.com/tasks' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.') + logging.info('Keep in mind that SST-2 is only available in lower case.') + return data_dir + + +def process_imdb(data_dir, uncased, modes=['train', 'test']): + if not os.path.exists(data_dir): + link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) + return outfold + logging.info(f'Processing IMDB dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + for sent in ['neg', 'pos']: + if sent == 'neg': + label = 0 + else: + label = 1 + files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') + for file in files: + with open(file, 'r') as f: + review = f.read().strip() + if uncased: + review = review.lower() + review = review.replace("
<br />
", "") + outfiles[mode].write(f'{review}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_thucnews(data_dir): + modes = ['train', 'test'] + train_size = 0.8 + if not os.path.exists(data_dir): + link = 'thuctc.thunlp.org/' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') + + outfold = f'{data_dir}/nemo-processed-thucnews' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) + return outfold + logging.info(f'Processing THUCNews dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') + outfiles[mode].write('sentence\tlabel\n') + categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] + for category in categories: + label = categories.index(category) + category_files = glob.glob(f'{data_dir}/{category}/*.txt') + test_num = int(len(category_files) * (1 - train_size)) + test_files = category_files[:test_num] + train_files = category_files[test_num:] + for mode in modes: + logging.info(f'Processing {mode} data of the category {category}') + if mode == 'test': + files = test_files + else: + files = train_files + for file in tqdm(files): + with open(file, 'r', encoding='utf-8') as f: + news = f.read().strip().replace('\r', '') + news = news.replace('\n', '').replace('\t', ' ') + outfiles[mode].write(f'{news}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): + """ Dataset has to be of: + - ubuntu + - chat + - web + """ + + if not os.path.exists(filename): + link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' + raise ValueError(f'Data not found at {filename}. 
' f'Please download IMDB from {link}.') + + if dataset_name == 'nlu-ubuntu': + INTENT = {'makeupdate': 1, 'setupprinter': 2, 'shutdowncomputer': 3, 'softwarerecommendation': 4, 'none': 0} + elif dataset_name == 'nlu-chat': + INTENT = {'departuretime': 0, 'findconnection': 1} + elif dataset_name == 'nlu-web': + INTENT = { + 'changepassword': 1, + 'deleteaccount': 2, + 'downloadvideo': 3, + 'exportdata': 4, + 'filterspam': 5, + 'findalternative': 6, + 'syncaccounts': 7, + 'none': 0, + } + else: + raise ValueError(f'{dataset_name}: Invalid dataset name') + + infold = filename[: filename.rfind('/')] + outfold = f'{infold}/{dataset_name}-nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) + return outfold + logging.info(f'Processing data and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + + with open(filename, 'r') as f: + data = json.load(f) + + for obj in data['sentences']: + sentence = obj['text'].strip() + if uncased: + sentence = sentence.lower() + intent = obj['intent'].lower().replace(' ', '') + label = INTENT[intent] + txt = f'{sentence}\t{label}\n' + if obj['training']: + outfiles['train'].write(txt) + else: + outfiles['test'].write(txt) + for mode in modes: + outfiles[mode].close() + return outfold + + +def process_twitter_airline(filename, uncased, modes=['train', 'test']): + """ Dataset from Kaggle: + https://www.kaggle.com/crowdflower/twitter-airline-sentiment + """ + pass + + +def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): + """ MSFT's dataset, processed by Kaggle + https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk + """ + outfold = f'{infold}/nemo-processed' + vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') + + if uncased: + outfold = f'{outfold}-uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) + return outfold + logging.info(f'Processing ATIS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() + intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() + slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() + + for i, query in enumerate(queries): + sentence = ids2text(query.strip().split()[1:-1], vocab) + outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') + slot = ' '.join(slots[i].strip().split()[1:-1]) + outfiles[mode + '_slots'].write(slot + '\n') + + shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') + shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_jarvis_datasets(infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False): + """ process and convert Jarvis datasets into NeMo's BIO format + """ + outfold = f'{infold}/{dataset_name}-nemo-processed' + infold = f'{infold}/' + + if uncased: + outfold = f'{outfold}-uncased' + + if 
if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) + return outfold + + logging.info(f'Processing {dataset_name} dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + intents_list = {} + slots_list = {} + slots_list_all = {} + + outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') + outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') + + outfiles['dict_slots'].write('O\n') + slots_list["O"] = 0 + slots_list_all["O"] = 0 + + for mode in modes: + if if_exist(outfold, [f'{mode}.tsv']): + logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) + continue + + if not if_exist(infold, [f'{mode}.tsv']): + logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') + continue + + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/{mode}.tsv', 'r').readlines() + + for i, query in enumerate(queries): + line_splits = query.strip().split("\t") + if len(line_splits) == 3: + intent_str, slot_tags_str, sentence = line_splits + else: + intent_str, sentence = line_splits + slot_tags_str = "" + + if intent_str not in intents_list: + intents_list[intent_str] = len(intents_list) + outfiles['dict_intents'].write(f'{intent_str}\n') + + if ignore_prev_intent: + start_token = 2 + else: + start_token = 1 + sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) + outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') + + slot_tags_list = [] + if slot_tags_str.strip(): + slot_tags = slot_tags_str.strip().split(",") + for st in slot_tags: + if not st.strip(): + continue + [start_i, end_i, slot_name] = st.strip().split(":") + slot_tags_list.append([int(start_i), int(end_i), slot_name]) + if slot_name not in slots_list: + slots_list[slot_name] = len(slots_list) + slots_list_all[f'B-{slot_name}'] = len(slots_list_all) + slots_list_all[f'I-{slot_name}'] = len(slots_list_all) + outfiles['dict_slots'].write(f'B-{slot_name}\n') + outfiles['dict_slots'].write(f'I-{slot_name}\n') + + slot_tags_list.sort(key=lambda x: x[0]) + slots = [] + processed_index = 0 + for tag_start, tag_end, tag_str in slot_tags_list: + if tag_start > processed_index: + words_list = sentence[processed_index:tag_start].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + words_list = sentence[tag_start:tag_end].strip().split() + slots.append(str(slots_list_all[f'B-{tag_str}'])) + slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) + processed_index = tag_end + + if processed_index < len(sentence): + words_list = sentence[processed_index:].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + + slots = slots[1:-1] + slot = ' '.join(slots) + outfiles[mode + '_slots'].write(slot + '\n') + + outfiles[mode + '_slots'].close() + outfiles[mode].close() + + outfiles['dict_slots'].close() + outfiles['dict_intents'].close() + + return outfold + + +def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.mturk.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your mturk data from' f'{link} and unzip at {data_dir}.' 
+ ) + + outfold = f'{data_dir}/nemo-processed' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) + return outfold + + logging.info(f'Processing dataset from mturk and storing at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + classification_data_file = f'{data_dir}/classification.csv' + annotation_data_file = f'{data_dir}/annotation.manifest' + + if not os.path.exists(classification_data_file): + raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') + + if not os.path.exists(annotation_data_file): + raise FileNotFoundError(f'File not found at {annotation_data_file}') + + utterances = [] + utterances = read_csv(classification_data_file) + + # This function assumes that the intent classification data has been + # reviewed and cleaned and only one label per utterance is present. + agreed_all, intent_names = get_intents_mturk(utterances, outfold) + + with open(annotation_data_file, 'r') as f: + slot_annotations = f.readlines() + + # This function assumes that the preprocess step would have made + # the task_name of all the annotations generic + task_name = 'retail-combined' + + # It is assumed that every utterances will have corresponding + # slot annotation information + if len(slot_annotations) < len(agreed_all): + raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') + + slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( + slot_annotations, agreed_all, intent_names, task_name + ) + + assert len(slot_tags) == len(intent_queries) + + dev_split = 0.1 + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold + + +def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): + slot_tags = [] + inorder_utterances = [] + all_labels = get_slot_labels(slot_annotations, task_name) + logging.info(f'agreed_all - {len(agreed_all)}') + logging.info(f'Slot annotations - {len(slot_annotations)}') + + for annotation in slot_annotations[0:]: + an = json.loads(annotation) + utterance = an['source'] + if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): + utterance = utterance[1:-1] + + if utterance in agreed_all: + entities = {} + annotated_entities = an[task_name]['annotations']['entities'] + for i, each_anno in enumerate(annotated_entities): + entities[int(each_anno['startOffset'])] = i + + lastptr = 0 + slotlist = [] + # sorting annotations by the start offset + for i in sorted(entities.keys()): + annotated_entities = an[task_name]['annotations']['entities'] + tags = annotated_entities[entities.get(i)] + untagged_words = utterance[lastptr : tags['startOffset']] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + anno_words = utterance[tags['startOffset'] : tags['endOffset']] + # tagging with the IOB format. 
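# A brief illustration of the IOB convention applied below (hypothetical
# annotation for illustration only, not taken from any actual MTurk export):
# for the utterance "play jazz in the kitchen" with "jazz" labelled 'genre'
# and "kitchen" labelled 'room', the first word of each annotated span gets
# 'B-<label>', any following words of the span get 'I-<label>', and every
# unannotated word gets 'O':
#
#   play  jazz     in  the  kitchen
#   O     B-genre  O   O    B-room
#
# A two-word span such as "living room" would be tagged "B-room I-room".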
+ for j, _ in enumerate(anno_words.split()): + if j == 0: + b_slot = 'B-' + tags['label'] + slotlist.append(all_labels.get(b_slot)) + else: + i_slot = 'I-' + tags['label'] + slotlist.append(all_labels.get(i_slot)) + lastptr = tags['endOffset'] + + untagged_words = utterance[lastptr : len(utterance)] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + + slotstr = ' '.join(slotlist) + slotstr = f'{slotstr.strip()}\n' + + slot_tags.append(slotstr) + intent_num = intent_names.get(agreed_all.get(utterance)) + query_text = f'{utterance.strip()}\t{intent_num}\n' + inorder_utterances.append(query_text) + # else: + # logging.warning(utterance) + + logging.info(f'inorder utterances - {len(inorder_utterances)}') + + return all_labels, inorder_utterances, slot_tags + + +def get_intents_mturk(utterances, outfold): + intent_names = {} + intent_count = 0 + + agreed_all = {} + + logging.info('Printing all intent_labels') + intent_dict = f'{outfold}/dict.intents.csv' + if os.path.exists(intent_dict): + with open(intent_dict, 'r') as f: + for intent_name in f.readlines(): + intent_names[intent_name.strip()] = intent_count + intent_count += 1 + logging.info(intent_names) + + for i, utterance in enumerate(utterances[1:]): + + if utterance[1] not in agreed_all: + agreed_all[utterance[0]] = utterance[1] + + if utterance[1] not in intent_names: + intent_names[utterance[1]] = intent_count + intent_count += 1 + + logging.info(f'Total number of utterance samples: {len(agreed_all)}') + + return agreed_all, intent_names + + +def get_slot_labels(slot_annotations, task_name): + slot_labels = json.loads(slot_annotations[0]) + + all_labels = {} + count = 0 + # Generating labels with the IOB format. + for label in slot_labels[task_name]['annotations']['labels']: + b_slot = 'B-' + label['label'] + i_slot = 'I-' + label['label'] + all_labels[b_slot] = str(count) + count += 1 + all_labels[i_slot] = str(count) + count += 1 + all_labels['O'] = str(count) + + return all_labels + + +def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): + outfold = f'{data_dir}/{dataset_name}' + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) + slots = get_vocab(f'{outfold}/dict.slots.csv') + none_slot = 0 + for key in slots: + if slots[key] == 'O': + none_slot = key + break + return outfold, int(none_slot) + + os.makedirs(outfold, exist_ok=True) + + data_files, slot_files = {}, {} + for mode in modes: + data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') + data_files[mode].write('sentence\tlabel\n') + slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + intents, slots = {}, {} + intent_shift, slot_shift = 0, 0 + none_intent, none_slot = -1, -1 + + for subdir in subdirs: + curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') + curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') + + for key in curr_intents: + if intent_shift > 0 and curr_intents[key] == 'O': + continue + if curr_intents[key] == 'O' and intent_shift == 0: + none_intent = int(key) + intents[int(key) + intent_shift] = curr_intents[key] + + for key in curr_slots: + if slot_shift > 0 and curr_slots[key] == 'O': + continue + if slot_shift == 0 and curr_slots[key] == 'O': + none_slot = int(key) + slots[int(key) + slot_shift] = curr_slots[key] + + for mode in modes: + with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: + for line in f.readlines()[1:]: + text, label = line.strip().split('\t') + label = int(label) + if curr_intents[label] 
== 'O': + label = none_intent + else: + label = label + intent_shift + data_files[mode].write(f'{text}\t{label}\n') + + with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: + for line in f.readlines(): + labels = [int(label) for label in line.strip().split()] + shifted_labels = [] + for label in labels: + if curr_slots[label] == 'O': + shifted_labels.append(none_slot) + else: + shifted_labels.append(label + slot_shift) + slot_files[mode].write(list2str(shifted_labels) + '\n') + + intent_shift += len(curr_intents) + slot_shift += len(curr_slots) + + write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') + write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') + return outfold, none_slot + + +def get_intent_query_files_dialogflow(path): + fileslist = [] + for root, _, files in os.walk(path): + for file in files: + if '_usersays_en.json' in file: + fileslist.append(os.path.join(root, file)) + return fileslist + + +def get_intents_slots_dialogflow(files, slot_labels): + intent_names = [] + intent_queries = [] + slot_tags = [] + + for index, file in enumerate(files): + intent_names.append(os.path.basename(file).split('_usersays')[0]) + + with open(file) as json_file: + intent_data = json.load(json_file) + for query in intent_data: + query_text = "" + slots = "" + for segment in query['data']: + query_text = ''.join([query_text, segment['text']]) + if 'alias' in segment: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get(segment['alias'])]) + else: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get('O')]) + query_text = f'{query_text.strip()}\t{index}\n' + intent_queries.append(query_text) + slots = f'{slots.strip()}\n' + slot_tags.append(slots) + return intent_queries, intent_names, slot_tags + + +def get_slots_dialogflow(files): + slot_labels = {} + count = 0 + for file in files: + intent_head_file = ''.join([file.split('_usersays')[0], '.json']) + with open(intent_head_file) as json_file: + intent_meta_data = json.load(json_file) + for params in intent_meta_data['responses'][0]['parameters']: + if params['name'] not in slot_labels: + slot_labels[params['name']] = str(count) + count += 1 + slot_labels['O'] = str(count) + return slot_labels + + +def partition_data(intent_queries, slot_tags, split=0.1): + n = len(intent_queries) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] + + dev_intents.append('sentence\tlabel\n') + train_intents.append('sentence\tlabel\n') + + for i, item in enumerate(intent_queries): + if i in dev_idx: + dev_intents.append(item) + dev_slots.append(slot_tags[i]) + else: + train_intents.append(item) + train_slots.append(slot_tags[i]) + return train_intents, train_slots, dev_intents, dev_slots + + +def write_files(data, outfile): + with open(outfile, 'w') as f: + for item in data: + item = f'{item.strip()}\n' + f.write(item) + + +def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.dialogflow.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your dialogflow data from' f'{link} and unzip at {data_dir}.' + ) + + outfold = f'{data_dir}/dialogflow/nemo-processed' + + '''TO DO - check for nemo-processed directory + already exists. If exists, skip the entire creation steps below. 
''' + + os.makedirs(outfold, exist_ok=True) + + files = get_intent_query_files_dialogflow(data_dir) + + slot_labels = get_slots_dialogflow(files) + + intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold + + +def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): + intent_file = open(f'{outfold}/{mode}.tsv', 'w') + intent_file.write('sentence\tlabel\n') + slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') + for tokens, slots, intent in data: + text = ' '.join(tokens) + if uncased: + text = text.lower() + intent_file.write(f'{text}\t{intent_dict[intent]}\n') + slots = [str(slot_dict[slot]) for slot in slots] + slot_file.write(' '.join(slots) + '\n') + intent_file.close() + slot_file.close() + + +def create_dataset(train, dev, slots, intents, uncased, outfold): + os.makedirs(outfold, exist_ok=True) + if 'O' in slots: + slots.remove('O') + slots = sorted(list(slots)) + ['O'] + intents = sorted(list(intents)) + slots = write_vocab(slots, f'{outfold}/dict.slots.csv') + intents = write_vocab(intents, f'{outfold}/dict.intents.csv') + write_data(train, slots, intents, outfold, 'train', uncased) + write_data(dev, slots, intents, outfold, 'test', uncased) + + +def read_csv(file_path): + rows = [] + with open(file_path, 'r') as csvfile: + read_csv = csv.reader(csvfile, delimiter=',') + for row in read_csv: + rows.append(row) + return rows + + +def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.github.com/snipsco/spoken-language' + '-understanding-research-datasets' + raise ValueError(f'Data not found at {data_dir}. 
' f'Resquest to download the SNIPS dataset from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}-uncased' + + exist = True + for dataset in ['light', 'speak', 'all']: + if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) + else: + exist = False + if exist: + return outfold + + logging.info(f'Processing SNIPS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + speak_dir = 'smart-speaker-en-close-field' + light_dir = 'smart-lights-en-close-field' + + light_files = [f'{data_dir}/{light_dir}/dataset.json'] + speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] + speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') + + light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) + speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) + + create_dataset(light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light') + create_dataset(speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak') + create_dataset( + light_train + speak_train, + light_dev + speak_dev, + light_slots | speak_slots, + light_intents | speak_intents, + uncased, + f'{outfold}/all', + ) + + return outfold + + +def get_dataset(files, dev_split=0.1): + entity2value, value2entity = get_entities(files) + data, slots, intents = get_data(files, entity2value, value2entity) + if len(data) == 1: + train, dev = partition(data[0], split=dev_split) + else: + train, dev = data[0], data[1] + return train, dev, slots, intents + + +def partition(data, split=0.1): + n = len(data) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev, train = [], [] + + for i, item in enumerate(data): + if i in dev_idx: + dev.append(item) + else: + train.append(item) + return train, dev + + +def map_entities(entity2value, entities): + for key in entities: + if 'data' in entities[key]: + if key not in entity2value: + entity2value[key] = set([]) + + values = [] + for value in entities[key]['data']: + values.append(value['value']) + values.extend(value['synonyms']) + entity2value[key] = entity2value[key] | set(values) + + return entity2value + + +def get_entities(files): + entity2value = {} + for file in files: + with open(file, 'r') as json_file: + data = json.load(json_file) + entity2value = map_entities(entity2value, data['entities']) + + value2entity = reverse_dict(entity2value) + return entity2value, value2entity + + +def get_data(files, entity2value, value2entity): + all_data, all_slots, all_intents = [], set(['O']), set() + for file in files: + file_data = [] + with open(file, 'r') as json_file: + data = json.load(json_file) + for intent in data['intents']: + all_intents.add(intent) + utterances = data['intents'][intent]['utterances'] + for utterance in utterances: + tokens, slots = [], [] + for frag in utterance['data']: + frag_tokens = frag['text'].strip().split() + tokens.extend(frag_tokens) + if 'slot_name' not in frag: + slot = 'O' + else: + slot = frag['slot_name'] + all_slots.add(slot) + slots.extend([slot] * len(frag_tokens)) + file_data.append((tokens, slots, intent)) + all_data.append(file_data) + return all_data, all_slots, all_intents + + +def reverse_dict(entity2value): + value2entity = {} + for entity in entity2value: + for value in entity2value[entity]: + value2entity[value] = entity + return value2entity + + +def 
get_intent_labels(intent_file): + labels = {} + label = 0 + with open(intent_file, 'r') as f: + for line in f: + intent = line.strip() + labels[intent] = label + label += 1 + return labels + + +def download_wkt2(data_dir): + os.makedirs('data/lm', exist_ok=True) + logging.warning(f'Data not found at {data_dir}. ' f'Downloading wikitext-2 to data/lm') + data_dir = 'data/lm/wikitext-2' + subprocess.call('scripts/get_wkt2.sh') + return data_dir + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def get_stats(lengths): + lengths = np.asarray(lengths) + logging.info( + f'Min: {np.min(lengths)} | \ + Max: {np.max(lengths)} | \ + Mean: {np.mean(lengths)} | \ + Median: {np.median(lengths)}' + ) + logging.info(f'75 percentile: {np.percentile(lengths, 75)}') + logging.info(f'99 percentile: {np.percentile(lengths, 99)}') diff --git a/nemo/collections/nlp/data/datasets/glue.py b/nemo/collections/nlp/data/datasets/glue.py deleted file mode 100644 index 8893c5747c45..000000000000 --- a/nemo/collections/nlp/data/datasets/glue.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -Utility functions for GLUE tasks -Some transformer of this code were adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" - -import numpy as np -from torch.utils.data import Dataset - -import nemo - - -class GLUEDataset(Dataset): - def __init__( - self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params, - ): - self.tokenizer = tokenizer - self.label_list = processor.get_labels() - self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - self.features = convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - np.array(feature.label_id), - ) - - -def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - output_mode, - bos_token=None, - eos_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - sep_token_extra=None, - cls_token_at_end=False, - cls_token_segment_id=0, - pad_token_segment_id=0, - pad_on_left=False, - mask_padding_with_zero=True, - sequence_a_segment_id=0, - sequence_b_segment_id=1, -): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS - token (0 for BERT, 2 for XLNet) - The convention in BERT is: - (a) For sequence pairs: - tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - (b) For single sequences: - tokens: [CLS] the dog is hairy . [SEP] - type_ids: 0 0 0 0 0 0 0 - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - For NMT: - (a) For sequence pairs: - tokens: is this jack ##ville ? no it is not . - type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - (b) For single sequences: - tokens: the dog is hairy . 
- type_ids: 0 0 0 0 0 0 0 - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if ex_index % 10000 == 0: - nemo.logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - nemo.logging.info("*** Example ***") - nemo.logging.info("guid: %s" % (example.guid)) - nemo.logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - nemo.logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - nemo.logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - nemo.logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - nemo.logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id,) - ) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length. - - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py new file mode 100644 index 000000000000..26423c3aa549 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py @@ -0,0 +1,593 @@ +""" +Copyright 2018 The Google AI Language Team Authors and +The HuggingFace Inc. team. +Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +Utility functions for GLUE tasks +Some transformer of this code were adapted from the HuggingFace library at +https://github.com/huggingface/transformers +""" +import csv +import os + +import numpy as np +from torch.utils.data import Dataset + +from nemo import logging + +__all__ = ['GLUEDataset'] + + +class GLUEDataset(Dataset): + def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): + self.tokenizer = tokenizer + self.label_list = processor.get_labels() + self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + self.features = convert_examples_to_features( + self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params + ) + + def __len__(self): + return len(self.features) + + def __getitem__(self, idx): + feature = self.features[idx] + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask, dtype=np.long), + np.array(feature.label_id), + ) + + +def convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + output_mode, + bos_token=None, + eos_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + sep_token_extra=None, + cls_token_at_end=False, + cls_token_segment_id=0, + pad_token_segment_id=0, + pad_on_left=False, + mask_padding_with_zero=True, + sequence_a_segment_id=0, + sequence_b_segment_id=1, +): + """ Loads a data file into a list of `InputBatch`s + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS + token (0 for BERT, 2 for XLNet) + The convention in BERT is: + (a) For sequence pairs: + tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] + type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 + (b) For single sequences: + tokens: [CLS] the dog is hairy . [SEP] + type_ids: 0 0 0 0 0 0 0 + Where "type_ids" are used to indicate whether this is the first + sequence or the second sequence. The embedding vectors for `type=0` + and `type=1` were learned during pre-training and are added to the + wordpiece embedding vector (and position vector). This is + not *strictly* necessarysince the [SEP] token unambiguously separates + the sequences, but it makes it easier for the model to learn + the concept of sequences. + For classification tasks, the first vector (corresponding to [CLS]) + is used as as the "sentence vector". Note that this only makes sense + because the entire model is fine-tuned. + For NMT: + (a) For sequence pairs: + tokens: is this jack ##ville ? no it is not . + type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 + (b) For single sequences: + tokens: the dog is hairy . 
+ type_ids: 0 0 0 0 0 0 0 + """ + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for ex_index, example in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.text_to_tokens(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.text_to_tokens(example.text_b) + + special_tokens_count = 2 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 2 if bos_token else 0 + special_tokens_count += 1 if cls_token else 0 + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + else: + special_tokens_count = 1 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 1 if bos_token else 0 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[: max_seq_length - special_tokens_count] + # Add special tokens to sequence_a + tokens = tokens_a + if bos_token: + tokens = [bos_token] + tokens + if eos_token: + tokens += [eos_token] + segment_ids = [sequence_a_segment_id] * len(tokens) + + # Add sequence separator between sequences + if tokens_b and sep_token_extra: + tokens += [sep_token_extra] + segment_ids += [sequence_a_segment_id] + + # Add special tokens to sequence_b + if tokens_b: + if bos_token: + tokens += [bos_token] + segment_ids += [sequence_b_segment_id] + tokens += tokens_b + segment_ids += [sequence_b_segment_id] * (len(tokens_b)) + if eos_token: + tokens += [eos_token] + segment_ids += [sequence_b_segment_id] + + # Add classification token - for BERT models + if cls_token: + if cls_token_at_end: + tokens += [cls_token] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + segment_ids = [cls_token_segment_id] + segment_ids + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
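# An illustrative padding sketch (not part of the committed file), reusing the
# single-sequence example from the docstring above with max_seq_length = 8:
#
#   tokens:      [CLS]  the  dog  is  hairy  .  [SEP]  [PAD]
#   input_mask:    1     1    1    1    1    1    1      0
#   segment_ids:   0     0    0    0    0    0    0      0
#
# The seven real tokens leave padding_length = 1, so one pad id, a mask value
# of 0 and pad_token_segment_id are appended on the right (pad_on_left=False,
# the default); with pad_on_left=True they would be prepended instead.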
+ padding_length = max_seq_length - len(input_ids) + pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] + if pad_on_left: + input_ids = ([pad_token_id] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids = input_ids + ([pad_token_id] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + if len(input_ids) != max_seq_length: + raise ValueError("input_ids must be of length max_seq_length") + if len(input_mask) != max_seq_length: + raise ValueError("input_mask must be of length max_seq_length") + if len(segment_ids) != max_seq_length: + raise ValueError("segment_ids must be of length max_seq_length") + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = np.float32(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s" % (example.guid)) + logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) + logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) + logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) + logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) + logging.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) + ) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + + This will always truncate the longer sequence one token at a time. + This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +""" +Utility functions for GLUE tasks +This code was adapted from the HuggingFace library at +https://github.com/huggingface/transformers +""" + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. + For single sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second + sequence. Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + 
"""See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates 
examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} +output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py similarity index 54% rename from nemo/collections/nlp/data/datasets/joint_intent_slot.py rename to nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py index 5eae9c95a766..4abc70923226 100644 --- a/nemo/collections/nlp/data/datasets/joint_intent_slot.py +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py @@ -18,14 +18,26 @@ Some parts of this code were adapted from the HuggingFace library at https://github.com/huggingface/pytorch-pretrained-BERT """ - import itertools import random import numpy as np from torch.utils.data import Dataset -from . 
import utils +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import ( + get_label_stats, + get_stats, + merge, + process_atis, + process_dialogflow, + process_jarvis_datasets, + process_mturk, + process_snips, +) +from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, get_vocab, if_exist, label2idx + +__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset', 'JointIntentSlotDataDesc'] def get_features( @@ -84,8 +96,8 @@ def get_features( all_slots.append(slots) max_seq_length = min(max_seq_length, max(sent_lengths)) - nemo.logging.info(f'Max length: {max_seq_length}') - utils.get_stats(sent_lengths) + logging.info(f'Max length: {max_seq_length}') + get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -113,16 +125,9 @@ def get_features( all_segment_ids.append([0] * max_seq_length) - nemo.logging.info(f'{too_long_count} are longer than {max_seq_length}') + logging.info(f'{too_long_count} are longer than {max_seq_length}') - return ( - all_input_ids, - all_segment_ids, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_slots, - ) + return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) class BertJointIntentSlotDataset(Dataset): @@ -262,3 +267,139 @@ def __getitem__(self, idx): np.array(self.all_loss_mask[idx]), np.array(self.all_subtokens_mask[idx]), ) + + +class JointIntentSlotDataDesc: + """ Convert the raw data to the standard format supported by + JointIntentSlotDataset. + + By default, the None label for slots is 'O'. + + JointIntentSlotDataset requires two files: + + input_file: file with sentences and labels. + the first line is the header (sentence [tab] label) + each following line should be [sentence][tab][label] + + slot_file: file with slot labels, each line corresponding to + the slot labels for a sentence in input_file. No header. + + To keep the mapping from label index to label consistent during + training and inference, we require the following files: + dict.intents.csv: each line is an intent. The first line + corresponds to the 0 intent label, the second line + corresponds to the 1 intent label, and so on. + + dict.slots.csv: each line is a slot. The first line + corresponds to the 0 slot label, the second line + corresponds to the 1 slot label, and so on. + + Args: + data_dir (str): the directory of the dataset + do_lower_case (bool): whether to set your dataset to lowercase + dataset_name (str): the name of the dataset. If it's a dataset + that follows the standard JointIntentSlotDataset format, + you can set the name as 'default'. + none_slot_label (str): the label for slots that aren't identified; + defaults to 'O' + pad_label (int): the int used for padding. If set to -1, + it'll be set to the index of none_slot_label. 
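        A minimal illustration of the expected files (toy content for
        illustration only, not from a real dataset):

            train.tsv:
                sentence [tab] label
                what is the weather in paris [tab] 1
                turn on the kitchen lights [tab] 0

            train_slots.tsv:
                0 0 0 2 0 1
                0 0 0 3 4

        Each line of train_slots.tsv holds one integer per word of the
        corresponding sentence, and every integer is an index into
        dict.slots.csv, just as the trailing label in train.tsv is an index
        into dict.intents.csv.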
+ + """ + + def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): + if dataset_name == 'atis': + self.data_dir = process_atis(data_dir, do_lower_case) + elif dataset_name == 'snips-atis': + self.data_dir, self.pad_label = merge( + data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name + ) + elif dataset_name == 'dialogflow': + self.data_dir = process_dialogflow(data_dir, do_lower_case) + elif dataset_name == 'mturk-processed': + self.data_dir = process_mturk(data_dir, do_lower_case) + elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): + self.data_dir = process_snips(data_dir, do_lower_case) + if dataset_name.endswith('light'): + self.data_dir = f'{self.data_dir}/light' + elif dataset_name.endswith('speak'): + self.data_dir = f'{self.data_dir}/speak' + elif dataset_name.endswith('all'): + self.data_dir = f'{self.data_dir}/all' + elif dataset_name.startswith('jarvis'): + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False + ) + else: + if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): + raise FileNotFoundError( + "Make sure that your data follows the standard format " + "supported by JointIntentSlotDataset. Your data must " + "contain dict.intents.csv and dict.slots.csv." + ) + self.data_dir = data_dir + + self.intent_dict_file = self.data_dir + '/dict.intents.csv' + self.slot_dict_file = self.data_dir + '/dict.slots.csv' + self.num_intents = len(get_vocab(self.intent_dict_file)) + slots = label2idx(self.slot_dict_file) + self.num_slots = len(slots) + + for mode in ['train', 'test', 'eval']: + + if not if_exist(self.data_dir, [f'{mode}.tsv']): + logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') + continue + + slot_file = f'{self.data_dir}/{mode}_slots.tsv' + with open(slot_file, 'r') as f: + slot_lines = f.readlines() + + input_file = f'{self.data_dir}/{mode}.tsv' + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] # Skipping headers at index 0 + + if len(slot_lines) != len(input_lines): + raise ValueError( + "Make sure that the number of slot lines match the " + "number of intent lines. There should be a 1-1 " + "correspondence between every slot and intent lines." 
+ ) + + dataset = list(zip(slot_lines, input_lines)) + + raw_slots, queries, raw_intents = [], [], [] + for slot_line, input_line in dataset: + slot_list = [int(slot) for slot in slot_line.strip().split()] + raw_slots.append(slot_list) + parts = input_line.strip().split() + raw_intents.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + infold = input_file[: input_file.rfind('/')] + + logging.info(f'Three most popular intents during {mode}ing') + total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') + merged_slots = itertools.chain.from_iterable(raw_slots) + + logging.info(f'Three most popular slots during {mode}ing') + slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') + + if mode == 'train': + self.slot_weights = calc_class_weights(slots_label_freq) + logging.info(f'Slot weights are - {self.slot_weights}') + + self.intent_weights = calc_class_weights(intent_label_freq) + logging.info(f'Intent weights are - {self.intent_weights}') + + logging.info(f'Total intents - {total_intents}') + logging.info(f'Intent label frequency - {intent_label_freq}') + logging.info(f'Total Slots - {slots_total}') + logging.info(f'Slots label frequency - {slots_label_freq}') + + if pad_label != -1: + self.pad_label = pad_label + else: + if none_slot_label not in slots: + raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') + self.pad_label = slots[none_slot_label] diff --git a/nemo/collections/nlp/data/datasets/language_modeling.py b/nemo/collections/nlp/data/datasets/language_modeling.py deleted file mode 100644 index d8912da7f891..000000000000 --- a/nemo/collections/nlp/data/datasets/language_modeling.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Pytorch Dataset for training Neural Machine Translation.""" - -import numpy as np -from torch.utils.data import Dataset - -from .. 
import utils - - -class LanguageModelingDataset(Dataset): - def __init__(self, tokenizer, dataset, max_seq_length=512, batch_step=None): - self.tokenizer = tokenizer - self.max_seq_length = max_seq_length - self.batch_step = batch_step or self.max_seq_length - ids = utils.dataset_to_ids(dataset, tokenizer, add_bos_eos=False) - self.ids = np.array([j for i in ids for j in i]) - - def __len__(self): - return (len(self.ids) - self.max_seq_length) // self.batch_step - - def __getitem__(self, idx): - left = idx * self.batch_step - right = left + self.max_seq_length - src_ids = self.ids[left:right] - labels = self.ids[left + 1 : right + 1] - src_mask = (src_ids != self.tokenizer.pad_id()).astype(np.float32) - return src_ids, src_mask, labels diff --git a/nemo/collections/nlp/data/datasets/bert_pretraining.py b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py similarity index 91% rename from nemo/collections/nlp/data/datasets/bert_pretraining.py rename to nemo/collections/nlp/data/datasets/lm_bert_dataset.py index 25ded90cd6ea..1ff975d25025 100644 --- a/nemo/collections/nlp/data/datasets/bert_pretraining.py +++ b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= + """Pytorch Dataset for training BERT.""" import array @@ -26,6 +27,12 @@ from torch.utils.data import Dataset from tqdm import tqdm +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import download_wkt2 +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import create_vocab_mlm + +__all__ = ['BertPretrainingDataset', 'BertPretrainingPreprocessedDataset'] + class BertPretrainingDataset(Dataset): def __init__( @@ -187,7 +194,7 @@ def match_target_seq_length(document, target_seq_length, filename, line_idx, sen a_line_offset = self.sentence_indices[a_filename][a_line_idx] a_document = get_document(a_filename, a_line_offset) a_document, a_line_idx = match_target_seq_length( - a_document, target_seq_length_a, a_filename, a_line_idx, self.sentence_indices, + a_document, target_seq_length_a, a_filename, a_line_idx, self.sentence_indices ) is_last_line = a_line_idx >= (len(self.sentence_indices[a_filename]) - 1) @@ -221,7 +228,7 @@ def match_target_seq_length(document, target_seq_length, filename, line_idx, sen b_line_pos = self.sentence_indices[b_filename][b_line_idx] b_document = get_document(b_filename, b_line_pos) b_document, b_line_idx = match_target_seq_length( - b_document, target_seq_length_b, b_filename, b_line_idx, self.sentence_indices, + b_document, target_seq_length_b, b_filename, b_line_idx, self.sentence_indices ) def truncate_seq_pair(a, b, max_num_tokens): @@ -350,7 +357,7 @@ def __len__(self): return len(self.inputs[0]) def __getitem__(self, index): - [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels,] = [ + [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [ input[index].astype(np.int64) for input in self.inputs ] @@ -367,11 +374,24 @@ def __getitem__(self, index): input_mask = np.asarray(input_mask, dtype=np.float32) output_mask = np.asarray(output_mask, dtype=np.float32) - return ( - input_ids, - segment_ids, - input_mask, - output_ids, - output_mask, - next_sentence_labels, - ) + return (input_ids, segment_ids, input_mask, output_ids, output_mask, next_sentence_labels) + + +class 
BERTPretrainingDataDesc: + def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file=''): + if dataset_name == 'wikitext-2': + if not os.path.exists(data_dir): + data_dir = download_wkt2(data_dir) + self.data_dir, self.tokenizer_model = create_vocab_mlm( + data_dir, vocab_size, sample_size, special_tokens, train_file + ) + else: + logging.warning( + "Looks like you passed a dataset name that isn't " + "already supported by NeMo. Please make sure that " + "you build the preprocessing method for it." + ) + + self.train_file = f'{data_dir}/train.txt' + self.eval_file = f'{data_dir}/valid.txt' + self.test_file = f'{data_dir}/test.txt' diff --git a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py new file mode 100644 index 000000000000..e2a9717abf11 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py @@ -0,0 +1,187 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Pytorch Dataset for training Neural Machine Translation.""" +import glob +import os +import pickle +import re + +import numpy as np +from sentencepiece import SentencePieceTrainer as SPT +from torch.utils.data import Dataset +from tqdm import tqdm + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import DATABASE_EXISTS_TMP, download_wkt2 +from nemo.collections.nlp.utils.common_nlp_utils import if_exist + +__all__ = ['LanguageModelingDataset'] + + +class LanguageModelingDataset(Dataset): + def __init__(self, tokenizer, dataset, max_seq_length=512, batch_step=None): + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + self.batch_step = batch_step or self.max_seq_length + ids = dataset_to_ids(dataset, tokenizer, add_bos_eos=False) + self.ids = np.array([j for i in ids for j in i]) + + def __len__(self): + return (len(self.ids) - self.max_seq_length) // self.batch_step + + def __getitem__(self, idx): + left = idx * self.batch_step + right = left + self.max_seq_length + src_ids = self.ids[left:right] + labels = self.ids[left + 1 : right + 1] + src_mask = (src_ids != self.tokenizer.pad_id()).astype(np.float32) + return src_ids, src_mask, labels + + +class LanguageModelDataDesc: + def __init__(self, dataset_name, data_dir, do_lower_case): + if dataset_name == 'wikitext-2': + if not os.path.exists(data_dir): + data_dir = download_wkt2(data_dir) + self.vocab_size = create_vocab_lm(data_dir, do_lower_case) + self.data_dir = data_dir + else: + logging.warning( + "Looks like you passed a dataset name that isn't " + "already supported by NeMo. Please make sure that " + "you build the preprocessing method for it." 
+ ) + + +def create_vocab_mlm( + data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='' +): + vocab = special_tokens[:] + bert_dir = f'{data_dir}/bert' + if if_exist(bert_dir, ['tokenizer.model']): + logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) + return data_dir, f'{bert_dir}/tokenizer.model' + logging.info(f'Processing WikiText dataset and store at {bert_dir}') + os.makedirs(bert_dir, exist_ok=True) + + if not train_file: + files = glob.glob(f'{data_dir}/*.txt') + train_file = f'{bert_dir}/merged.txt' + logging.info(f"Merging {len(files)} txt files into {train_file}") + + with open(train_file, "w") as merged: + for file in tqdm(files): + with open(file, 'r') as inf: + content = inf.read().strip() + merged.write(content + '\n\n\n') + else: + train_file = f'{data_dir}/{train_file}' + + cmd = ( + f"--input={train_file} --model_prefix={bert_dir}/tokenizer " + f"--vocab_size={vocab_size - len(vocab)} " + f"--input_sentence_size={sample_size} " + f"--shuffle_input_sentence=true --hard_vocab_limit=false " + f"--bos_id=-1 --eos_id=-1" + ) + SPT.Train(cmd) + + # Add BERT control symbols + tokens = [] + + with open(f"{bert_dir}/tokenizer.vocab", "r") as f: + f.readline() # skip first token + + # Read tokens from each line and parse for vocab + for line in f: + piece = line.split("\t")[0] + token = piece[1:] if piece.startswith("▁") else f"##{piece}" + tokens.append(token) + + vocab.extend(tokens) + + # Save vocabulary to output file + with open(f'{bert_dir}/vocab.txt', "w") as f: + for token in vocab: + f.write(f"{token}\n".format()) + return data_dir, f'{bert_dir}/tokenizer.model' + + +def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): + """ + Reads dataset from file line by line, tokenizes each line with tokenizer, + and returns list of lists which corresponds to ids of tokenized strings. 
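    (Illustrative example with a hypothetical word-level tokenizer mapping
    'hello'->12, 'world'->27 and 'hi'->5: a two-line file containing
    "hello world" and "hi" is returned as [[12, 27], [5]] when add_bos_eos=False.)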
+ + Args: + dataset: path to dataset + tokenizer: tokenizer to convert text into ids + cache_ids: if True, ids are saved to disk as pickle file + with similar name (e.g., data.txt --> data.txt.pkl) + add_bos_eos: bool, whether to add <bos> and <eos> symbols (e.g., for NMT) + Returns: + ids: list of ids which correspond to tokenized strings of the dataset + """ + + cached_ids_dataset = dataset + str(".pkl") + if os.path.isfile(cached_ids_dataset): + logging.info("Loading cached tokenized dataset ...") + ids = pickle.load(open(cached_ids_dataset, "rb")) + else: + logging.info("Tokenizing dataset ...") + data = open(dataset, "rb").readlines() + ids = [] + for sentence in data: + sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) + if add_bos_eos: + sent_ids = [tokenizer.bos_id()] + sent_ids + [tokenizer.eos_id()] + ids.append(sent_ids) + if cache_ids: + logging.info("Caching tokenized dataset ...") + pickle.dump(ids, open(cached_ids_dataset, "wb")) + return ids + + +def create_vocab_lm(data_dir, do_lower_case): + if if_exist(data_dir, ['train.txt', 'vocab.txt']): + logging.info("Vocabulary has been created.") + with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: + vocab_size = len(f.readlines()) + return vocab_size + + logging.info(f'Creating vocabulary from training data at {data_dir}') + + with open(f'{data_dir}/train.txt', 'r') as f: + txt = f.read() + if do_lower_case: + txt = txt.lower() + lines = re.split(r'[\n]', txt) + sentences = [line.strip().split() for line in lines if line.strip()] + + vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} + idx = 4 + for sentence in sentences: + for word in sentence: + if word not in vocab: + vocab[word] = idx + idx += 1 + + with open(f'{data_dir}/vocab.txt', 'w') as f: + for word in sorted(vocab.keys()): + f.write(word + '\n') + logging.info(f"Created vocabulary of size {len(vocab)}") + + return len(vocab) diff --git a/nemo/collections/nlp/data/datasets/translation.py b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py similarity index 77% rename from nemo/collections/nlp/data/datasets/translation.py rename to nemo/collections/nlp/data/datasets/machine_translation_dataset.py index e9c1134e70e0..0b8b049840ca 100644 --- a/nemo/collections/nlp/data/datasets/translation.py +++ b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py @@ -1,3 +1,4 @@ +# ============================================================================= # Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,20 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
-# ============================================================================== +# ============================================================================= + """Pytorch Dataset for training Neural Machine Translation.""" + from collections import OrderedDict import numpy as np from torch.utils.data import Dataset -from ..utils import clean_src_and_target, dataset_to_ids +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import dataset_to_ids + +__all__ = ['TranslationDataset'] class TranslationDataset(Dataset): - def __init__( - self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, tokens_in_batch=1024, clean=False, - ): + def __init__(self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, tokens_in_batch=1024, clean=False): self.src_tokenizer = tokenizer_src self.tgt_tokenizer = tokenizer_tgt @@ -152,3 +155,36 @@ def pack_data_into_batches(self, src_ids, tgt_ids): batches.pop(-1) return batches + + +def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5): + """ + Cleans source and target sentences to get rid of noisy data. + Specifically, a pair of sentences is removed if + -- either source or target is longer than *max_tokens* + -- either source or target is shorter than *min_tokens* + -- absolute difference between source and target is larger than + *max_tokens_diff* + -- one sentence is *max_tokens_ratio* times longer than the other + """ + + if len(src_ids) != len(tgt_ids): + raise ValueError("Source and target corpora have different lengths!") + src_ids_, tgt_ids_ = [], [] + for i in range(len(src_ids)): + src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) + if ( + src_len > max_tokens + or tgt_len > max_tokens + or src_len < min_tokens + or tgt_len < min_tokens + or (src_ids[i] == tgt_ids[i]) + or np.abs(src_len - tgt_len) > max_tokens_diff + ): + continue + ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) + if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): + continue + src_ids_.append(src_ids[i]) + tgt_ids_.append(tgt_ids[i]) + return src_ids_, tgt_ids_ diff --git a/nemo/collections/nlp/data/datasets/punctuation_capitalization.py b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py similarity index 91% rename from nemo/collections/nlp/data/datasets/punctuation_capitalization.py rename to nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py index ecabfc64032f..b8d8bfcd728b 100644 --- a/nemo/collections/nlp/data/datasets/punctuation_capitalization.py +++ b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py @@ -19,6 +19,8 @@ https://github.com/huggingface/pytorch-pretrained-BERT """ +__all__ = ['BertPunctuationCapitalizationDataset', 'BertPunctuationCapitalizationInferDataset'] + import itertools import os import pickle @@ -27,8 +29,8 @@ import numpy as np from torch.utils.data import Dataset -import nemo -from . 
import utils +import nemo.collections.nlp.data.datasets.datasets_utils as utils +from nemo import logging def get_features( @@ -123,7 +125,7 @@ def get_features( capit_all_labels.append(capit_labels) max_seq_length = min(max_seq_length, max(sent_lengths)) - nemo.logging.info(f'Max length: {max_seq_length}') + logging.info(f'Max length: {max_seq_length}') utils.get_stats(sent_lengths) too_long_count = 0 @@ -154,18 +156,18 @@ def get_features( all_segment_ids.append([0] * max_seq_length) - nemo.logging.info(f'{too_long_count} are longer than {max_seq_length}') + logging.info(f'{too_long_count} are longer than {max_seq_length}') for i in range(min(len(all_input_ids), 5)): - nemo.logging.info("*** Example ***") - nemo.logging.info("i: %s" % (i)) - nemo.logging.info("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - nemo.logging.info("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - nemo.logging.info("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - nemo.logging.info("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) + logging.info("*** Example ***") + logging.info("i: %s" % (i)) + logging.info("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) + logging.info("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) + logging.info("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) + logging.info("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) if with_label: - nemo.logging.info("punct_labels: %s" % " ".join(list(map(str, punct_all_labels[i])))) - nemo.logging.info("capit_labels: %s" % " ".join(list(map(str, capit_all_labels[i])))) + logging.info("punct_labels: %s" % " ".join(list(map(str, punct_all_labels[i])))) + logging.info("capit_labels: %s" % " ".join(list(map(str, capit_all_labels[i])))) return ( all_input_ids, @@ -245,7 +247,7 @@ def __init__( if use_cache and os.path.exists(features_pkl): # If text_file was already processed, load from pickle features = pickle.load(open(features_pkl, 'rb')) - nemo.logging.info(f'features restored from {features_pkl}') + logging.info(f'features restored from {features_pkl}') else: if num_samples == 0: raise ValueError("num_samples has to be positive", num_samples) @@ -288,16 +290,16 @@ def __init__( # for dev/test sets use label mapping from training set if punct_label_ids: if len(punct_label_ids) != len(punct_unique_labels): - nemo.logging.info( + logging.info( 'Not all labels from the specified' + 'label_ids dictionary are present in the' + 'current dataset. Using the provided' + 'label_ids dictionary.' ) else: - nemo.logging.info('Using the provided label_ids dictionary.') + logging.info('Using the provided label_ids dictionary.') else: - nemo.logging.info( + logging.info( 'Creating a new label to label_id dictionary.' 
+ ' It\'s recommended to use label_ids generated' + ' during training for dev/test sets to avoid' @@ -332,7 +334,7 @@ def create_label_ids(unique_labels, pad_label=pad_label): if use_cache: pickle.dump(features, open(features_pkl, "wb")) - nemo.logging.info(f'features saved to {features_pkl}') + logging.info(f'features saved to {features_pkl}') self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -348,14 +350,14 @@ def create_label_ids(unique_labels, pad_label=pad_label): def get_stats_and_save(all_labels, label_ids, name): infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(all_labels) - nemo.logging.info('Three most popular labels') + logging.info('Three most popular labels') _, label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_count_' + name + '.tsv') out = open(os.path.join(infold, name + '_label_ids.csv'), 'w') labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) - nemo.logging.info(f'Labels: {label_ids}') - nemo.logging.info(f'Labels mapping saved to : {out.name}') + logging.info(f'Labels: {label_ids}') + logging.info(f'Labels mapping saved to : {out.name}') return label_frequencies diff --git a/nemo/collections/nlp/data/datasets/squad.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset.py similarity index 88% rename from nemo/collections/nlp/data/datasets/squad.py rename to nemo/collections/nlp/data/datasets/qa_squad_dataset.py index 01f99d3c5d89..bf68e7bb7a55 100644 --- a/nemo/collections/nlp/data/datasets/squad.py +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset.py @@ -26,9 +26,9 @@ from torch.utils.data import Dataset from tqdm import tqdm -import nemo -from ...utils.metrics.squad_metrics import ( - _compute_softmax, +from nemo import logging +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import DataProcessor +from nemo.collections.nlp.metrics.squad_metrics import ( _get_best_indexes, apply_no_ans_threshold, exact_match_score, @@ -39,9 +39,10 @@ merge_eval, normalize_answer, ) -from .utils import DataProcessor -from nemo.collections.nlp.utils.nlp_utils import _is_whitespace +from nemo.collections.nlp.utils.common_nlp_utils import _is_whitespace +from nemo.collections.nlp.utils.loss_utils import _compute_softmax +__all__ = ['SquadDataset'] """ Utility functions for Question Answering NLP tasks @@ -72,7 +73,7 @@ class SquadDataset(Dataset): """ def __init__( - self, data_dir, tokenizer, doc_stride, max_query_length, max_seq_length, version_2_with_negative, mode, + self, data_dir, tokenizer, doc_stride, max_query_length, max_seq_length, version_2_with_negative, mode ): self.tokenizer = tokenizer if not version_2_with_negative: @@ -90,7 +91,7 @@ def __init__( cached_train_features_file = ( data_dir + '/cache' - + '_{0}_{1}_{2}_{3}'.format(mode, str(max_seq_length), str(doc_stride), str(max_query_length),) + + '_{0}_{1}_{2}_{3}'.format(mode, str(max_seq_length), str(doc_stride), str(max_query_length)) ) if os.path.exists(cached_train_features_file): @@ -107,9 +108,7 @@ def __init__( ) master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 if master_device: - nemo.logging.info( - " Saving train features into cached file %s", cached_train_features_file, - ) + logging.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(self.features, writer) elif mode == "dev": @@ -159,7 +158,7 @@ def get_predictions( 
example_index_to_features[feature.example_index].append(feature) _PrelimPrediction = collections.namedtuple( - "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit",], + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] ) all_predictions = collections.OrderedDict() @@ -233,7 +232,7 @@ def get_predictions( end_logit=null_end_logit, ) ) - prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"]) @@ -268,21 +267,17 @@ def get_predictions( final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,) - ) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: - nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit,)) + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only # have single null pred. We just create a nonce prediction # in this case to avoid failure. if len(nbest) == 1: - nbest.insert( - 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0), - ) + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
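# Illustrative sketch (not part of this patch): how get_predictions() ranks candidate
# answer spans. Each preliminary prediction is scored by the sum of its start and end
# logits, and the highest-scoring (start_index, end_index) pair wins. The logit values
# below are made up for the example.
import collections

_PrelimPrediction = collections.namedtuple(
    "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
)
candidates = [
    _PrelimPrediction(feature_index=0, start_index=5, end_index=7, start_logit=2.1, end_logit=1.8),
    _PrelimPrediction(feature_index=0, start_index=5, end_index=9, start_logit=1.0, end_logit=0.4),
]
best = sorted(candidates, key=lambda x: (x.start_logit + x.end_logit), reverse=True)[0]
# best.start_index, best.end_index -> (5, 7)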
@@ -327,9 +322,7 @@ def get_predictions( return all_predictions, all_nbest_json, scores_diff_json - def evaluate_predictions( - self, all_predictions, no_answer_probs=None, no_answer_probability_threshold=1.0, - ): + def evaluate_predictions(self, all_predictions, no_answer_probs=None, no_answer_probability_threshold=1.0): qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in self.examples} has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] @@ -339,10 +332,10 @@ def evaluate_predictions( exact, f1 = self.get_raw_scores(all_predictions) exact_threshold = apply_no_ans_threshold( - exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold, + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold ) f1_threshold = apply_no_ans_threshold( - f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold, + f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold ) evaluation = make_eval_dict(exact_threshold, f1_threshold) @@ -356,9 +349,7 @@ def evaluate_predictions( merge_eval(evaluation, no_ans_eval, "NoAns") if no_answer_probs: - find_all_best_thresh( - evaluation, all_predictions, exact, f1, no_answer_probs, qas_id_to_has_answer, - ) + find_all_best_thresh(evaluation, all_predictions, exact, f1, no_answer_probs, qas_id_to_has_answer) return evaluation["best_exact"], evaluation["best_f1"] @@ -380,7 +371,7 @@ def get_raw_scores(self, preds): gold_answers = [""] if qas_id not in preds: - print("Missing prediction for %s" % qas_id) + logging.warning("Missing prediction for %s" % qas_id) continue prediction = preds[qas_id] @@ -401,7 +392,7 @@ def evaluate( null_score_diff_threshold, ): - (all_predictions, all_nbest_json, scores_diff_json,) = self.get_predictions( + (all_predictions, all_nbest_json, scores_diff_json) = self.get_predictions( unique_ids, start_logits, end_logits, @@ -417,9 +408,7 @@ def evaluate( return exact_match, f1, all_predictions -def convert_examples_to_features( - examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, -): +def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -459,7 +448,7 @@ def convert_examples_to_features( tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text, + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text ) # The -3 accounts for [CLS], [SEP] and [SEP] @@ -544,28 +533,28 @@ def convert_examples_to_features( end_position = 0 if example_index < 1: - nemo.logging.info("*** Example ***") - nemo.logging.info("unique_id: %s" % (unique_id)) - nemo.logging.info("example_index: %s" % (example_index)) - nemo.logging.info("doc_span_index: %s" % (doc_span_index)) - nemo.logging.info("tokens: %s" % " ".join(tokens)) - nemo.logging.info( + logging.info("*** Example ***") + logging.info("unique_id: %s" % (unique_id)) + logging.info("example_index: %s" % (example_index)) + logging.info("doc_span_index: %s" % (doc_span_index)) + logging.info("tokens: %s" % " ".join(tokens)) + logging.info( "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) ) - 
nemo.logging.info( + logging.info( "token_is_max_context: %s" % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) ) - nemo.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - nemo.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - nemo.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if has_groundtruth and example.is_impossible: - nemo.logging.info("impossible example") + logging.info("impossible example") if has_groundtruth and not example.is_impossible: answer_text = " ".join(tokens[start_position : (end_position + 1)]) - nemo.logging.info("start_position: %d" % (start_position)) - nemo.logging.info("end_position: %d" % (end_position)) - nemo.logging.info("answer: %s" % (answer_text)) + logging.info("start_position: %d" % (start_position)) + logging.info("end_position: %d" % (end_position)) + logging.info("answer: %s" % (answer_text)) features.append( InputFeatures( @@ -651,7 +640,7 @@ def get_train_examples(self, data_dir, filename=None): ) with open( - os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8", + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") @@ -676,7 +665,7 @@ def get_dev_examples(self, data_dir, filename=None): SquadV1Processor or SquadV2Processor" ) with open( - os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8", + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") @@ -797,7 +786,7 @@ def __init__( # start_position is index of word, end_position inclusive self.start_position = char_to_word_offset[start_position_character] self.end_position = char_to_word_offset[ - min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1,) + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) ] @@ -833,3 +822,45 @@ def _check_is_max_context(doc_spans, cur_span_index, position): best_span_index = span_index return cur_span_index == best_span_index + + +def check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token. + + Because of the sliding window approach taken to scoring documents, + a single token can appear in multiple documents. + + Example: + Doc: the man went to the store and bought a gallon of milk + Span A: the man went to the + Span B: to the store and bought + Span C: and bought a gallon of + ... + + Now the word 'bought' will have two scores from spans B and C. We only + want to consider the score with "maximum context", which we define as + the *minimum* of its left and right context (the *sum* of left and + right context will always be the same, of course). + + In the example the maximum context for 'bought' would be span C since + it has 1 left context and 3 right context, while span B has 4 left context + and 0 right context. + + Code adapted from the code by the Google AI and HuggingFace. 
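    Worked example of the scoring rule implemented below, i.e.
    score = min(num_left_context, num_right_context) + 0.01 * doc_span.length:
    for 'bought', span B (length 5) scores min(4, 0) + 0.05 = 0.05 and
    span C (length 5) scores min(1, 3) + 0.05 = 1.05, so span C is its
    max-context span.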
+ """ + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index diff --git a/nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py b/nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py new file mode 100644 index 000000000000..0995d7c14249 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/state_tracking_trade_dataset.py @@ -0,0 +1,428 @@ +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2019 Salesforce Research. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom +# the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# ============================================================================= + +import json +import os +import pickle +import random + +from torch.utils.data import Dataset + +from nemo import logging + +__all__ = ['MultiWOZDataset', 'MultiWOZDataDesc'] + + +class MultiWOZDataset(Dataset): + """ + By default, use only vocab from training data + Need to modify the code a little bit to create the vocab from all files + """ + + def __init__(self, data_dir, mode, domains, all_domains, vocab, gating_dict, slots, num_samples=-1, shuffle=False): + + logging.info(f'Processing {mode} data') + self.data_dir = data_dir + self.mode = mode + self.gating_dict = gating_dict + self.domains = domains + self.all_domains = all_domains + self.vocab = vocab + self.slots = slots + + self.features, self.max_len = self.get_features(num_samples, shuffle) + logging.info("Sample 0: " + str(self.features[0])) + + def get_features(self, num_samples, shuffle): + if num_samples == 0: + raise ValueError("num_samples has to be positive", num_samples) + + filename = f'{self.data_dir}/{self.mode}_dials.json' + logging.info(f'Reading from {filename}') + dialogs = json.load(open(filename, 'r')) + + domain_count = {} + data = [] + max_resp_len, max_value_len = 0, 0 + + for dialog_dict in dialogs: + if num_samples > 0 and len(data) >= num_samples: + break + + dialog_history = "" + for domain in dialog_dict['domains']: + if domain not in self.domains: + continue + if domain not in domain_count: + domain_count[domain] = 0 + domain_count[domain] += 1 + + for turn in dialog_dict['dialogue']: + if num_samples > 0 and len(data) >= num_samples: + break + + turn_uttr = turn['system_transcript'] + ' ; ' + turn['transcript'] + turn_uttr_strip = turn_uttr.strip() + dialog_history += turn["system_transcript"] + " ; " + turn["transcript"] + " ; " + source_text = dialog_history.strip() + + turn_beliefs = fix_general_label_error_multiwoz(turn['belief_state'], self.slots) + + turn_belief_list = [f'{k}-{v}' for k, v in turn_beliefs.items()] + + gating_label, responses = [], [] + for slot in self.slots: + if slot in turn_beliefs: + responses.append(str(turn_beliefs[slot])) + if turn_beliefs[slot] == "dontcare": + gating_label.append(self.gating_dict["dontcare"]) + elif turn_beliefs[slot] == "none": + gating_label.append(self.gating_dict["none"]) + else: + gating_label.append(self.gating_dict["ptr"]) + else: + responses.append("none") + gating_label.append(self.gating_dict["none"]) + + sample = { + 'ID': dialog_dict['dialogue_idx'], + 'domains': dialog_dict['domains'], + 'turn_domain': turn['domain'], + 'turn_id': turn['turn_idx'], + 'dialogue_history': source_text, + 'turn_belief': turn_belief_list, + 'gating_label': gating_label, + 'turn_uttr': turn_uttr_strip, + 'responses': responses, + } + + sample['context_ids'] = self.vocab.tokens2ids(sample['dialogue_history'].split()) + sample['responses_ids'] = [ + self.vocab.tokens2ids(y.split() + [self.vocab.eos]) for y in sample['responses'] + ] + sample['turn_domain'] = self.all_domains[sample['turn_domain']] + + data.append(sample) + + resp_len = len(sample['dialogue_history'].split()) + max_resp_len = max(max_resp_len, resp_len) + + logging.info(f'Domain count{domain_count}') + logging.info(f'Max response length{max_resp_len}') + logging.info(f'Processing {len(data)} samples') + + if shuffle: + logging.info(f'Shuffling samples.') + random.shuffle(data) + + return data, max_resp_len + + def __len__(self): + return len(self.features) + + def __getitem__(self, idx): + item = self.features[idx] + 
return { + 'dialog_id': item['ID'], + 'turn_id': item['turn_id'], + 'turn_belief': item['turn_belief'], + 'gating_label': item['gating_label'], + 'context_ids': item['context_ids'], + 'turn_domain': item['turn_domain'], + 'responses_ids': item['responses_ids'], + } + + +class Vocab: + """ + Vocab class for TRADE model + UNK_token = 0 + PAD_token = 1 + SOS_token = 3 + EOS_token = 2 + """ + + def __init__(self): + self.word2idx = {'UNK': 0, 'PAD': 1, 'EOS': 2, 'BOS': 3} + self.idx2word = ['UNK', 'PAD', 'EOS', 'BOS'] + self.unk_id = self.word2idx['UNK'] + self.pad_id = self.word2idx['PAD'] + self.eos_id = self.word2idx['EOS'] + self.bos_id = self.word2idx['BOS'] + self.unk, self.pad, self.eos, self.bos = 'UNK', 'PAD', 'EOS', 'BOS' + + def __len__(self): + return len(self.idx2word) + + def add_word(self, word): + if word not in self.word2idx: + self.word2idx[word] = len(self.idx2word) + self.idx2word.append(word) + + def add_words(self, sent, level): + """ + level == 'utterance': sent is a string + level == 'slot': sent is a list + level == 'belief': sent is a dictionary + """ + if level == 'utterance': + for word in sent.split(): + self.add_word(word) + elif level == 'slot': + for slot in sent: + domain, info = slot.split('-') + self.add_word(domain) + for subslot in info.split(' '): + self.add_word(subslot) + elif level == 'belief': + for slot, value in sent.items(): + domain, info = slot.split('-') + self.add_word(domain) + for subslot in info.split(' '): + self.add_word(subslot) + for val in value.split(' '): + self.add_word(val) + + def tokens2ids(self, tokens): + """Converts list of tokens to list of ids.""" + return [self.word2idx[w] if w in self.word2idx else self.unk_id for w in tokens] + + +class MultiWOZDataDesc: + """ + Processes MultiWOZ dataset, creates vocabulary file and list of slots. 
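    Illustrative usage (the path is hypothetical):
        data_desc = MultiWOZDataDesc('path/to/multiwoz')
        # data_desc.vocab, data_desc.slots and data_desc.gating_dict are the
        # artifacts consumed by MultiWOZDataset above.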
+ """ + + def __init__(self, data_dir, domains={"attraction": 0, "restaurant": 1, "taxi": 2, "train": 3, "hotel": 4}): + logging.info(f'Processing MultiWOZ dataset') + + self.all_domains = { + 'attraction': 0, + 'restaurant': 1, + 'taxi': 2, + 'train': 3, + 'hotel': 4, + 'hospital': 5, + 'bus': 6, + 'police': 7, + } + self.gating_dict = {'ptr': 0, 'dontcare': 1, 'none': 2} + + self.data_dir = data_dir + self.domains = domains + self.vocab = Vocab() + + ontology_file = open(f'{self.data_dir}/ontology.json', 'r') + self.ontology = json.load(ontology_file) + + self.vocab_file = None + self.slots = None + + self.get_slots() + self.get_vocab() + + def get_vocab(self): + self.vocab_file = f'{self.data_dir}/vocab.pkl' + + if os.path.exists(self.vocab_file): + logging.info(f'Loading vocab from {self.data_dir}') + self.vocab = pickle.load(open(self.vocab_file, 'rb')) + else: + self.create_vocab() + + logging.info(f'Vocab size {len(self.vocab)}') + + def get_slots(self): + used_domains = [key for key in self.ontology if key.split('-')[0] in self.domains] + self.slots = [k.replace(' ', '').lower() if 'book' not in k else k.lower() for k in used_domains] + + def create_vocab(self): + self.vocab.add_words(self.slots, 'slot') + + filename = f'{self.data_dir}/train_dials.json' + logging.info(f'Building vocab from {filename}') + dialogs = json.load(open(filename, 'r')) + + max_value_len = 0 + + for dialog_dict in dialogs: + for turn in dialog_dict['dialogue']: + self.vocab.add_words(turn['system_transcript'], 'utterance') + self.vocab.add_words(turn['transcript'], 'utterance') + + turn_beliefs = fix_general_label_error_multiwoz(turn['belief_state'], self.slots) + lengths = [len(turn_beliefs[slot]) for slot in self.slots if slot in turn_beliefs] + lengths.append(max_value_len) + max_value_len = max(lengths) + + logging.info(f'Saving vocab to {self.data_dir}') + with open(self.vocab_file, 'wb') as handle: + pickle.dump(self.vocab, handle) + + +def fix_general_label_error_multiwoz(labels, slots): + label_dict = dict([label['slots'][0] for label in labels]) + GENERAL_TYPO = { + # type + "guesthouse": "guest house", + "guesthouses": "guest house", + "guest": "guest house", + "mutiple sports": "multiple sports", + "sports": "multiple sports", + "mutliple sports": "multiple sports", + "swimmingpool": "swimming pool", + "concerthall": "concert hall", + "concert": "concert hall", + "pool": "swimming pool", + "night club": "nightclub", + "mus": "museum", + "ol": "architecture", + "colleges": "college", + "coll": "college", + "architectural": "architecture", + "musuem": "museum", + "churches": "church", + # area + "center": "centre", + "center of town": "centre", + "near city center": "centre", + "in the north": "north", + "cen": "centre", + "east side": "east", + "east area": "east", + "west part of town": "west", + "ce": "centre", + "town center": "centre", + "centre of cambridge": "centre", + "city center": "centre", + "the south": "south", + "scentre": "centre", + "town centre": "centre", + "in town": "centre", + "north part of town": "north", + "centre of town": "centre", + "cb30aq": "none", + # price + "mode": "moderate", + "moderate -ly": "moderate", + "mo": "moderate", + # day + "next friday": "friday", + "monda": "monday", + # parking + "free parking": "free", + # internet + "free internet": "yes", + # star + "4 star": "4", + "4 stars": "4", + "0 star rarting": "none", + # others + "y": "yes", + "any": "dontcare", + "n": "no", + "does not care": "dontcare", + "not men": "none", + "not": "none", + "not 
mentioned": "none", + '': "none", + "not mendtioned": "none", + "3 .": "3", + "does not": "no", + "fun": "none", + "art": "none", + } + + hotel_ranges = [ + "nigh", + "moderate -ly priced", + "bed and breakfast", + "centre", + "venetian", + "intern", + "a cheap -er hotel", + ] + locations = ["gastropub", "la raza", "galleria", "gallery", "science", "m"] + detailed_hotels = ["hotel with free parking and free wifi", "4", "3 star hotel"] + areas = ["stansted airport", "cambridge", "silver street"] + attr_areas = ["norwich", "ely", "museum", "same area as hotel"] + + for slot in slots: + if slot in label_dict.keys(): + # general typos + if label_dict[slot] in GENERAL_TYPO.keys(): + label_dict[slot] = label_dict[slot].replace(label_dict[slot], GENERAL_TYPO[label_dict[slot]]) + + # miss match slot and value + if ( + (slot == "hotel-type" and label_dict[slot] in hotel_ranges) + or (slot == "hotel-internet" and label_dict[slot] == "4") + or (slot == "hotel-pricerange" and label_dict[slot] == "2") + or (slot == "attraction-type" and label_dict[slot] in locations) + or ("area" in slot and label_dict[slot] in ["moderate"]) + or ("day" in slot and label_dict[slot] == "t") + ): + label_dict[slot] = "none" + elif slot == "hotel-type" and label_dict[slot] in detailed_hotels: + label_dict[slot] = "hotel" + elif slot == "hotel-star" and label_dict[slot] == "3 star hotel": + label_dict[slot] = "3" + elif "area" in slot: + if label_dict[slot] == "no": + label_dict[slot] = "north" + elif label_dict[slot] == "we": + label_dict[slot] = "west" + elif label_dict[slot] == "cent": + label_dict[slot] = "centre" + elif "day" in slot: + if label_dict[slot] == "we": + label_dict[slot] = "wednesday" + elif label_dict[slot] == "no": + label_dict[slot] = "none" + elif "price" in slot and label_dict[slot] == "ch": + label_dict[slot] = "cheap" + elif "internet" in slot and label_dict[slot] == "free": + label_dict[slot] = "yes" + + # some out-of-define classification slot values + if (slot == "restaurant-area" and label_dict[slot] in areas) or ( + slot == "attraction-area" and label_dict[slot] in attr_areas + ): + label_dict[slot] = "none" + + return label_dict diff --git a/nemo/collections/nlp/data/datasets/sentence_classification.py b/nemo/collections/nlp/data/datasets/text_classification_dataset.py similarity index 54% rename from nemo/collections/nlp/data/datasets/sentence_classification.py rename to nemo/collections/nlp/data/datasets/text_classification_dataset.py index 1847eaf7b205..11340ffa4da5 100644 --- a/nemo/collections/nlp/data/datasets/sentence_classification.py +++ b/nemo/collections/nlp/data/datasets/text_classification_dataset.py @@ -25,11 +25,24 @@ import numpy as np from torch.utils.data import Dataset -import nemo -from . import utils - - -class BertSentenceClassificationDataset(Dataset): +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import ( + get_intent_labels, + get_label_stats, + get_stats, + process_imdb, + process_jarvis_datasets, + process_nlu, + process_sst_2, + process_thucnews, +) +from nemo.collections.nlp.utils.callback_utils import list2str +from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, if_exist + +__all__ = ['BertTextClassificationDataset'] + + +class BertTextClassificationDataset(Dataset): """A dataset class that converts from raw data to a dataset that can be used by DataLayerNM. @@ -44,16 +57,14 @@ class BertSentenceClassificationDataset(Dataset): shuffle (bool): whether to shuffle your data. 
""" - def __init__( - self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=True, - ): + def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=True): with open(input_file, "r") as f: sent_labels, all_sent_subtokens = [], [] sent_lengths = [] too_long_count = 0 lines = f.readlines()[1:] - nemo.logging.info(f'{input_file}: {len(lines)}') + logging.info(f'{input_file}: {len(lines)}') if shuffle or num_samples > -1: random.seed(0) @@ -63,7 +74,7 @@ def __init__( for index, line in enumerate(lines): if index % 20000 == 0: - nemo.logging.debug(f"Processing line {index}/{len(lines)}") + logging.debug(f"Processing line {index}/{len(lines)}") sent_label = int(line.split()[-1]) sent_labels.append(sent_label) @@ -79,7 +90,7 @@ def __init__( all_sent_subtokens.append(sent_subtokens) sent_lengths.append(len(sent_subtokens)) - utils.get_stats(sent_lengths) + get_stats(sent_lengths) self.max_seq_length = min(max_seq_length, max(sent_lengths)) for i in range(len(all_sent_subtokens)): @@ -88,7 +99,7 @@ def __init__( all_sent_subtokens[i] = ['[CLS]'] + shorten_sent too_long_count += 1 - nemo.logging.info( + logging.info( f'{too_long_count} out of {len(sent_lengths)} \ sentencess with more than {max_seq_length} subtokens.' ) @@ -120,8 +131,7 @@ def convert_sequences_to_features(self, all_sent_subtokens, sent_labels, tokeniz for sent_id in range(len(all_sent_subtokens)): sent_subtokens = all_sent_subtokens[sent_id] sent_label = sent_labels[sent_id] - word_count = 0 - # input_ids = tokenizer.tokens_to_ids(sent_subtokens) + input_ids = [tokenizer._convert_token_to_id(t) for t in sent_subtokens] # The mask has 1 for real tokens and 0 for padding tokens. @@ -138,12 +148,12 @@ def convert_sequences_to_features(self, all_sent_subtokens, sent_labels, tokeniz assert len(input_mask) == max_seq_length if sent_id == 0: - nemo.logging.info("*** Example ***") - nemo.logging.info("example_index: %s" % sent_id) - nemo.logging.info("subtokens: %s" % " ".join(sent_subtokens)) - nemo.logging.info("sent_label: %s" % sent_label) - nemo.logging.info("input_ids: %s" % utils.list2str(input_ids)) - nemo.logging.info("input_mask: %s" % utils.list2str(input_mask)) + logging.info("*** Example ***") + logging.info("example_index: %s" % sent_id) + logging.info("subtokens: %s" % " ".join(sent_subtokens)) + logging.info("sent_label: %s" % sent_label) + logging.info("input_ids: %s" % list2str(input_ids)) + logging.info("input_mask: %s" % list2str(input_mask)) self.features.append( InputFeatures( @@ -165,3 +175,74 @@ def __init__(self, sent_id, sent_label, input_ids, input_mask, segment_ids): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids + + +class SentenceClassificationDataDesc: + def __init__(self, dataset_name, data_dir, do_lower_case): + if dataset_name == 'sst-2': + self.data_dir = process_sst_2(data_dir) + self.num_labels = 2 + self.eval_file = self.data_dir + '/dev.tsv' + elif dataset_name == 'imdb': + self.num_labels = 2 + self.data_dir = process_imdb(data_dir, do_lower_case) + self.eval_file = self.data_dir + '/test.tsv' + elif dataset_name == 'thucnews': + self.num_labels = 14 + self.data_dir = process_thucnews(data_dir) + self.eval_file = self.data_dir + '/test.tsv' + elif dataset_name.startswith('nlu-'): + if dataset_name.endswith('chat'): + self.data_dir = f'{data_dir}/ChatbotCorpus.json' + self.num_labels = 2 + elif dataset_name.endswith('ubuntu'): + self.data_dir = f'{data_dir}/AskUbuntuCorpus.json' + self.num_labels = 5 + elif 
dataset_name.endswith('web'): + data_dir = f'{data_dir}/WebApplicationsCorpus.json' + self.num_labels = 8 + self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name) + self.eval_file = self.data_dir + '/test.tsv' + elif dataset_name.startswith('jarvis'): + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False + ) + + intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv') + self.num_labels = len(intents) + else: + raise ValueError( + "Looks like you passed a dataset name that isn't " + "already supported by NeMo. Please make sure " + "that you build the preprocessing method for it." + ) + + self.train_file = self.data_dir + '/train.tsv' + + for mode in ['train', 'test', 'eval']: + + if not if_exist(self.data_dir, [f'{mode}.tsv']): + logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') + continue + + input_file = f'{self.data_dir}/{mode}.tsv' + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] # Skipping headers at index 0 + + queries, raw_sentences = [], [] + for input_line in input_lines: + parts = input_line.strip().split() + raw_sentences.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + infold = input_file[: input_file.rfind('/')] + + logging.info(f'Three most popular classes during {mode}ing') + total_sents, sent_label_freq = get_label_stats(raw_sentences, infold + f'/{mode}_sentence_stats.tsv') + + if mode == 'train': + self.class_weights = calc_class_weights(sent_label_freq) + logging.info(f'Class weights are - {self.class_weights}') + + logging.info(f'Total Sentences - {total_sents}') + logging.info(f'Sentence class frequencies - {sent_label_freq}') diff --git a/nemo/collections/nlp/data/datasets/token_classification.py b/nemo/collections/nlp/data/datasets/token_classification_dataset.py similarity index 88% rename from nemo/collections/nlp/data/datasets/token_classification.py rename to nemo/collections/nlp/data/datasets/token_classification_dataset.py index 857153ca9b3a..966fefd42498 100644 --- a/nemo/collections/nlp/data/datasets/token_classification.py +++ b/nemo/collections/nlp/data/datasets/token_classification_dataset.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """ Utility functions for Token Classification NLP tasks Some parts of this code were adapted from the HuggingFace library at @@ -27,8 +28,10 @@ import numpy as np from torch.utils.data import Dataset -import nemo -from . 
import utils +import nemo.collections.nlp.data.datasets.datasets_utils as datasets_utils +from nemo import logging + +__all__ = ['BertTokenClassificationDataset', 'BertTokenClassificationInferDataset'] def get_features( @@ -109,8 +112,8 @@ def get_features( all_labels.append(labels) max_seq_length = min(max_seq_length, max(sent_lengths)) - nemo.logging.info(f'Max length: {max_seq_length}') - utils.get_stats(sent_lengths) + logging.info(f'Max length: {max_seq_length}') + datasets_utils.get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -138,27 +141,18 @@ def get_features( all_segment_ids.append([0] * max_seq_length) - nemo.logging.warning(f'{too_long_count} are longer than {max_seq_length}') + logging.warning(f'{too_long_count} are longer than {max_seq_length}') for i in range(min(len(all_input_ids), 5)): - nemo.logging.debug("*** Example ***") - nemo.logging.debug("i: %s", i) - nemo.logging.debug("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) - nemo.logging.debug("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) - nemo.logging.debug("input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) - nemo.logging.debug( - "subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i]))), - ) + logging.debug("*** Example ***") + logging.debug("i: %s", i) + logging.debug("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) + logging.debug("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) + logging.debug("input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) + logging.debug("subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i])))) if with_label: - nemo.logging.debug("labels: %s", " ".join(list(map(str, all_labels[i])))) - return ( - all_input_ids, - all_segment_ids, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_labels, - ) + logging.debug("labels: %s", " ".join(list(map(str, all_labels[i])))) + return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_labels) class BertTokenClassificationDataset(Dataset): @@ -224,10 +218,10 @@ def __init__( if use_cache and os.path.exists(features_pkl) and os.path.exists(label_ids_pkl): # If text_file was already processed, load from pickle features = pickle.load(open(features_pkl, 'rb')) - nemo.logging.info(f'features restored from {features_pkl}') + logging.info(f'features restored from {features_pkl}') label_ids = pickle.load(open(label_ids_pkl, 'rb')) - nemo.logging.info(f'Labels to ids dict restored from {label_ids_pkl}') + logging.info(f'Labels to ids dict restored from {label_ids_pkl}') else: if num_samples == 0: raise ValueError("num_samples has to be positive", num_samples) @@ -261,16 +255,16 @@ def __init__( # for dev/test sets use label mapping from training set if label_ids: if len(label_ids) != len(unique_labels): - nemo.logging.warning( + logging.warning( f'Not all labels from the specified' + ' label_ids dictionary are present in the' + ' current dataset. Using the provided' + ' label_ids dictionary.' ) else: - nemo.logging.info(f'Using the provided label_ids dictionary.') + logging.info(f'Using the provided label_ids dictionary.') else: - nemo.logging.info( + logging.info( f'Creating a new label to label_id dictionary.' 
+ ' It\'s recommended to use label_ids generated' + ' during training for dev/test sets to avoid' @@ -298,10 +292,10 @@ def __init__( if use_cache: pickle.dump(features, open(features_pkl, "wb")) - nemo.logging.info(f'features saved to {features_pkl}') + logging.info(f'features saved to {features_pkl}') pickle.dump(label_ids, open(label_ids_pkl, "wb")) - nemo.logging.info(f'labels to ids dict saved to {label_ids_pkl}') + logging.info(f'labels to ids dict saved to {label_ids_pkl}') self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -313,15 +307,15 @@ def __init__( infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(self.all_labels) - nemo.logging.info('Three most popular labels') - _, self.label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') + logging.info('Three most popular labels') + _, self.label_frequencies = datasets_utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') # save label_ids out = open(infold + '/label_ids.csv', 'w') labels, _ = zip(*sorted(self.label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) - nemo.logging.info(f'Labels: {self.label_ids}') - nemo.logging.info(f'Labels mapping saved to : {out.name}') + logging.info(f'Labels: {self.label_ids}') + logging.info(f'Labels mapping saved to : {out.name}') def __len__(self): return len(self.all_input_ids) diff --git a/nemo/collections/nlp/data/datasets/utils.py b/nemo/collections/nlp/data/datasets/utils.py deleted file mode 100644 index 4ec542e50ddf..000000000000 --- a/nemo/collections/nlp/data/datasets/utils.py +++ /dev/null @@ -1,1681 +0,0 @@ -import csv -import glob -import itertools -import json -import os -import random -import re -import shutil -import subprocess -from collections import Counter - -import numpy as np -from sentencepiece import SentencePieceTrainer as SPT -from tqdm import tqdm - -import nemo -from ...utils.nlp_utils import get_vocab, label2idx, write_vocab, write_vocab_in_order - -DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' -MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' - - -def get_stats(lengths): - lengths = np.asarray(lengths) - nemo.logging.info( - f'Min: {np.min(lengths)} | \ - Max: {np.max(lengths)} | \ - Mean: {np.mean(lengths)} | \ - Median: {np.median(lengths)}' - ) - nemo.logging.info(f'75 percentile: {np.percentile(lengths, 75)}') - nemo.logging.info(f'99 percentile: {np.percentile(lengths, 99)}') - - -def get_label_stats(labels, outfile='stats.tsv'): - labels = Counter(labels) - total = sum(labels.values()) - out = open(outfile, 'w') - i = 0 - label_frequencies = labels.most_common() - for k, v in label_frequencies: - out.write(f'{k}\t{v / total}\n') - if i < 3: - nemo.logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') - i += 1 - return total, label_frequencies - - -def list2str(l): - return ' '.join([str(x) for x in l]) - - -def tensor2list(tensor): - return tensor.detach().cpu().tolist() - - -def if_exist(outfold, files): - if not os.path.exists(outfold): - return False - for file in files: - if not os.path.exists(f'{outfold}/{file}'): - return False - return True - - -def process_sst_2(data_dir): - if not os.path.exists(data_dir): - link = 'https://gluebenchmark.com/tasks' - raise ValueError(f'Data not found at {data_dir}. 
' f'Please download SST-2 from {link}.') - nemo.logging.info('Keep in mind that SST-2 is only available in lower case.') - return data_dir - - -def process_imdb(data_dir, uncased, modes=['train', 'test']): - if not os.path.exists(data_dir): - link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) - return outfold - nemo.logging.info(f'Processing IMDB dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - for sent in ['neg', 'pos']: - if sent == 'neg': - label = 0 - else: - label = 1 - files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') - for file in files: - with open(file, 'r') as f: - review = f.read().strip() - if uncased: - review = review.lower() - review = review.replace("<br />
", "") - outfiles[mode].write(f'{review}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_thucnews(data_dir): - modes = ['train', 'test'] - train_size = 0.8 - if not os.path.exists(data_dir): - link = 'thuctc.thunlp.org/' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') - - outfold = f'{data_dir}/nemo-processed-thucnews' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) - return outfold - nemo.logging.info(f'Processing THUCNews dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') - outfiles[mode].write('sentence\tlabel\n') - categories = [ - '体育', - '娱乐', - '家居', - '彩票', - '房产', - '教育', - '时尚', - '时政', - '星座', - '游戏', - '社会', - '科技', - '股票', - '财经', - ] - for category in categories: - label = categories.index(category) - category_files = glob.glob(f'{data_dir}/{category}/*.txt') - test_num = int(len(category_files) * (1 - train_size)) - test_files = category_files[:test_num] - train_files = category_files[test_num:] - for mode in modes: - nemo.logging.info(f'Processing {mode} data of the category {category}') - if mode == 'test': - files = test_files - else: - files = train_files - for file in tqdm(files): - with open(file, 'r', encoding='utf-8') as f: - news = f.read().strip().replace('\r', '') - news = news.replace('\n', '').replace('\t', ' ') - outfiles[mode].write(f'{news}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): - """ Dataset has to be of: - - ubuntu - - chat - - web - """ - - if not os.path.exists(filename): - link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' - raise ValueError(f'Data not found at {filename}. 
' 'Please download IMDB from {link}.') - - if dataset_name == 'nlu-ubuntu': - INTENT = { - 'makeupdate': 1, - 'setupprinter': 2, - 'shutdowncomputer': 3, - 'softwarerecommendation': 4, - 'none': 0, - } - elif dataset_name == 'nlu-chat': - INTENT = {'departuretime': 0, 'findconnection': 1} - elif dataset_name == 'nlu-web': - INTENT = { - 'changepassword': 1, - 'deleteaccount': 2, - 'downloadvideo': 3, - 'exportdata': 4, - 'filterspam': 5, - 'findalternative': 6, - 'syncaccounts': 7, - 'none': 0, - } - else: - raise ValueError(f'{dataset_name}: Invalid dataset name') - - infold = filename[: filename.rfind('/')] - outfold = f'{infold}/{dataset_name}-nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) - return outfold - nemo.logging.info(f'Processing data and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - - with open(filename, 'r') as f: - data = json.load(f) - - for obj in data['sentences']: - sentence = obj['text'].strip() - if uncased: - sentence = sentence.lower() - intent = obj['intent'].lower().replace(' ', '') - label = INTENT[intent] - txt = f'{sentence}\t{label}\n' - if obj['training']: - outfiles['train'].write(txt) - else: - outfiles['test'].write(txt) - for mode in modes: - outfiles[mode].close() - return outfold - - -def get_intent_labels(intent_file): - labels = {} - label = 0 - with open(intent_file, 'r') as f: - for line in f: - intent = line.strip() - labels[intent] = label - label += 1 - return labels - - -def process_twitter_airline(filename, uncased, modes=['train', 'test']): - """ Dataset from Kaggle: - https://www.kaggle.com/crowdflower/twitter-airline-sentiment - """ - pass - - -def ids2text(ids, vocab): - return ' '.join([vocab[int(id_)] for id_ in ids]) - - -def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): - """ MSFT's dataset, processed by Kaggle - https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk - """ - outfold = f'{infold}/nemo-processed' - vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') - - if uncased: - outfold = f'{outfold}-uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) - return outfold - nemo.logging.info(f'Processing ATIS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() - intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() - slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() - - for i, query in enumerate(queries): - sentence = ids2text(query.strip().split()[1:-1], vocab) - outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') - slot = ' '.join(slots[i].strip().split()[1:-1]) - outfiles[mode + '_slots'].write(slot + '\n') - - shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') - shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def 
process_jarvis_datasets( - infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False, -): - """ process and convert Jarvis datasets into NeMo's BIO format - """ - outfold = f'{infold}/{dataset_name}-nemo-processed' - infold = f'{infold}/' - - if uncased: - outfold = f'{outfold}-uncased' - - if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): - nemo.logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) - return outfold - - nemo.logging.info(f'Processing {dataset_name} dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - intents_list = {} - slots_list = {} - slots_list_all = {} - - outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') - outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') - - outfiles['dict_slots'].write('O\n') - slots_list["O"] = 0 - slots_list_all["O"] = 0 - - for mode in modes: - if if_exist(outfold, [f'{mode}.tsv']): - nemo.logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) - continue - - if not if_exist(infold, [f'{mode}.tsv']): - nemo.logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') - continue - - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/{mode}.tsv', 'r').readlines() - - for i, query in enumerate(queries): - line_splits = query.strip().split("\t") - if len(line_splits) == 3: - intent_str, slot_tags_str, sentence = line_splits - else: - intent_str, sentence = line_splits - slot_tags_str = "" - - if intent_str not in intents_list: - intents_list[intent_str] = len(intents_list) - outfiles['dict_intents'].write(f'{intent_str}\n') - - if ignore_prev_intent: - start_token = 2 - else: - start_token = 1 - sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) - outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') - - slot_tags_list = [] - if slot_tags_str.strip(): - slot_tags = slot_tags_str.strip().split(",") - for st in slot_tags: - if not st.strip(): - continue - [start_i, end_i, slot_name] = st.strip().split(":") - slot_tags_list.append([int(start_i), int(end_i), slot_name]) - if slot_name not in slots_list: - slots_list[slot_name] = len(slots_list) - slots_list_all[f'B-{slot_name}'] = len(slots_list_all) - slots_list_all[f'I-{slot_name}'] = len(slots_list_all) - outfiles['dict_slots'].write(f'B-{slot_name}\n') - outfiles['dict_slots'].write(f'I-{slot_name}\n') - - slot_tags_list.sort(key=lambda x: x[0]) - slots = [] - processed_index = 0 - for tag_start, tag_end, tag_str in slot_tags_list: - if tag_start > processed_index: - words_list = sentence[processed_index:tag_start].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - words_list = sentence[tag_start:tag_end].strip().split() - slots.append(str(slots_list_all[f'B-{tag_str}'])) - slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) - processed_index = tag_end - - if processed_index < len(sentence): - words_list = sentence[processed_index:].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - - slots = slots[1:-1] - slot = ' '.join(slots) - outfiles[mode + '_slots'].write(slot + '\n') - - outfiles[mode + '_slots'].close() - outfiles[mode].close() - - outfiles['dict_slots'].close() - outfiles['dict_intents'].close() - - return outfold - - -def reverse_dict(entity2value): - 
value2entity = {} - for entity in entity2value: - for value in entity2value[entity]: - value2entity[value] = entity - return value2entity - - -def map_entities(entity2value, entities): - for key in entities: - if 'data' in entities[key]: - if key not in entity2value: - entity2value[key] = set([]) - - values = [] - for value in entities[key]['data']: - values.append(value['value']) - values.extend(value['synonyms']) - entity2value[key] = entity2value[key] | set(values) - - return entity2value - - -def get_entities(files): - entity2value = {} - for file in files: - with open(file, 'r') as json_file: - data = json.load(json_file) - entity2value = map_entities(entity2value, data['entities']) - - value2entity = reverse_dict(entity2value) - return entity2value, value2entity - - -def get_data(files, entity2value, value2entity): - all_data, all_slots, all_intents = [], set(['O']), set() - for file in files: - file_data = [] - with open(file, 'r') as json_file: - data = json.load(json_file) - for intent in data['intents']: - all_intents.add(intent) - utterances = data['intents'][intent]['utterances'] - for utterance in utterances: - tokens, slots = [], [] - for frag in utterance['data']: - frag_tokens = frag['text'].strip().split() - tokens.extend(frag_tokens) - if 'slot_name' not in frag: - slot = 'O' - else: - slot = frag['slot_name'] - all_slots.add(slot) - slots.extend([slot] * len(frag_tokens)) - file_data.append((tokens, slots, intent)) - all_data.append(file_data) - return all_data, all_slots, all_intents - - -def get_dataset(files, dev_split=0.1): - entity2value, value2entity = get_entities(files) - data, slots, intents = get_data(files, entity2value, value2entity) - if len(data) == 1: - train, dev = partition(data[0], split=dev_split) - else: - train, dev = data[0], data[1] - return train, dev, slots, intents - - -def partition(data, split=0.1): - n = len(data) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev, train = [], [] - - for i, item in enumerate(data): - if i in dev_idx: - dev.append(item) - else: - train.append(item) - return train, dev - - -def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): - intent_file = open(f'{outfold}/{mode}.tsv', 'w') - intent_file.write('sentence\tlabel\n') - slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') - for tokens, slots, intent in data: - text = ' '.join(tokens) - if uncased: - text = text.lower() - intent_file.write(f'{text}\t{intent_dict[intent]}\n') - slots = [str(slot_dict[slot]) for slot in slots] - slot_file.write(' '.join(slots) + '\n') - intent_file.close() - slot_file.close() - - -def create_dataset(train, dev, slots, intents, uncased, outfold): - os.makedirs(outfold, exist_ok=True) - if 'O' in slots: - slots.remove('O') - slots = sorted(list(slots)) + ['O'] - intents = sorted(list(intents)) - slots = write_vocab(slots, f'{outfold}/dict.slots.csv') - intents = write_vocab(intents, f'{outfold}/dict.intents.csv') - write_data(train, slots, intents, outfold, 'train', uncased) - write_data(dev, slots, intents, outfold, 'test', uncased) - - -def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.github.com/snipsco/spoken-language' - '-understanding-research-datasets' - raise ValueError(f'Data not found at {data_dir}. 
' 'Resquest to download the SNIPS dataset from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}-uncased' - - exist = True - for dataset in ['light', 'speak', 'all']: - if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) - else: - exist = False - if exist: - return outfold - - nemo.logging.info(f'Processing SNIPS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - speak_dir = 'smart-speaker-en-close-field' - light_dir = 'smart-lights-en-close-field' - - light_files = [f'{data_dir}/{light_dir}/dataset.json'] - speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] - speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') - - light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) - speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) - - create_dataset( - light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light', - ) - create_dataset( - speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak', - ) - create_dataset( - light_train + speak_train, - light_dev + speak_dev, - light_slots | speak_slots, - light_intents | speak_intents, - uncased, - f'{outfold}/all', - ) - - return outfold - - -# def list2str(nums): -# return ' '.join([str(num) for num in nums]) - - -def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): - outfold = f'{data_dir}/{dataset_name}' - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) - slots = get_vocab(f'{outfold}/dict.slots.csv') - none_slot = 0 - for key in slots: - if slots[key] == 'O': - none_slot = key - break - return outfold, int(none_slot) - - os.makedirs(outfold, exist_ok=True) - - data_files, slot_files = {}, {} - for mode in modes: - data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') - data_files[mode].write('sentence\tlabel\n') - slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - intents, slots = {}, {} - intent_shift, slot_shift = 0, 0 - none_intent, none_slot = -1, -1 - - for subdir in subdirs: - curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') - curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') - - for key in curr_intents: - if intent_shift > 0 and curr_intents[key] == 'O': - continue - if curr_intents[key] == 'O' and intent_shift == 0: - none_intent = int(key) - intents[int(key) + intent_shift] = curr_intents[key] - - for key in curr_slots: - if slot_shift > 0 and curr_slots[key] == 'O': - continue - if slot_shift == 0 and curr_slots[key] == 'O': - none_slot = int(key) - slots[int(key) + slot_shift] = curr_slots[key] - - for mode in modes: - with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: - for line in f.readlines()[1:]: - text, label = line.strip().split('\t') - label = int(label) - if curr_intents[label] == 'O': - label = none_intent - else: - label = label + intent_shift - data_files[mode].write(f'{text}\t{label}\n') - - with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: - for line in f.readlines(): - labels = [int(label) for label in line.strip().split()] - shifted_labels = [] - for label in labels: - if curr_slots[label] == 'O': - shifted_labels.append(none_slot) - else: - shifted_labels.append(label + slot_shift) - slot_files[mode].write(list2str(shifted_labels) + '\n') - - intent_shift += len(curr_intents) 
- slot_shift += len(curr_slots) - - write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') - write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') - return outfold, none_slot - - -def get_intent_query_files_dialogflow(path): - fileslist = [] - for root, _, files in os.walk(path): - for file in files: - if '_usersays_en.json' in file: - fileslist.append(os.path.join(root, file)) - return fileslist - - -def get_intents_slots_dialogflow(files, slot_labels): - intent_names = [] - intent_queries = [] - slot_tags = [] - - for index, file in enumerate(files): - intent_names.append(os.path.basename(file).split('_usersays')[0]) - - with open(file) as json_file: - intent_data = json.load(json_file) - for query in intent_data: - query_text = "" - slots = "" - for segment in query['data']: - query_text = ''.join([query_text, segment['text']]) - if 'alias' in segment: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get(segment['alias'])]) - else: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get('O')]) - query_text = f'{query_text.strip()}\t{index}\n' - intent_queries.append(query_text) - slots = f'{slots.strip()}\n' - slot_tags.append(slots) - return intent_queries, intent_names, slot_tags - - -def get_slots_dialogflow(files): - slot_labels = {} - count = 0 - for file in files: - intent_head_file = ''.join([file.split('_usersays')[0], '.json']) - with open(intent_head_file) as json_file: - intent_meta_data = json.load(json_file) - for params in intent_meta_data['responses'][0]['parameters']: - if params['name'] not in slot_labels: - slot_labels[params['name']] = str(count) - count += 1 - slot_labels['O'] = str(count) - return slot_labels - - -# The following works for the specified DialogFlow and Mturk output format -def partition_data(intent_queries, slot_tags, split=0.1): - n = len(intent_queries) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] - - dev_intents.append('sentence\tlabel\n') - train_intents.append('sentence\tlabel\n') - - for i, item in enumerate(intent_queries): - if i in dev_idx: - dev_intents.append(item) - dev_slots.append(slot_tags[i]) - else: - train_intents.append(item) - train_slots.append(slot_tags[i]) - return train_intents, train_slots, dev_intents, dev_slots - - -# The following works for the specified DialogFlow and Mturk output format -def write_files(data, outfile): - with open(outfile, 'w') as f: - for item in data: - item = f'{item.strip()}\n' - f.write(item) - - -def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.dialogflow.com' - raise ValueError( - f'Data not found at {data_dir}. ' 'Export your dialogflow data from' '{link} and unzip at {data_dir}.' - ) - - outfold = f'{data_dir}/dialogflow/nemo-processed' - - '''TO DO - check for nemo-processed directory - already exists. If exists, skip the entire creation steps below. 
''' - - os.makedirs(outfold, exist_ok=True) - - files = get_intent_query_files_dialogflow(data_dir) - - slot_labels = get_slots_dialogflow(files) - - intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -def read_csv(file_path): - rows = [] - with open(file_path, 'r') as csvfile: - read_csv = csv.reader(csvfile, delimiter=',') - for row in read_csv: - rows.append(row) - return rows - - -def get_intents_mturk(utterances, outfold): - intent_names = {} - intent_count = 0 - - agreed_all = {} - - print('Printing all intent_labels') - intent_dict = f'{outfold}/dict.intents.csv' - if os.path.exists(intent_dict): - with open(intent_dict, 'r') as f: - for intent_name in f.readlines(): - intent_names[intent_name.strip()] = intent_count - intent_count += 1 - print(intent_names) - - for i, utterance in enumerate(utterances[1:]): - - if utterance[1] not in agreed_all: - agreed_all[utterance[0]] = utterance[1] - - if utterance[1] not in intent_names: - intent_names[utterance[1]] = intent_count - intent_count += 1 - - print(f'Total number of utterance samples: {len(agreed_all)}') - - return agreed_all, intent_names - - -def get_slot_labels(slot_annotations, task_name): - slot_labels = json.loads(slot_annotations[0]) - - all_labels = {} - count = 0 - # Generating labels with the IOB format. - for label in slot_labels[task_name]['annotations']['labels']: - b_slot = 'B-' + label['label'] - i_slot = 'I-' + label['label'] - all_labels[b_slot] = str(count) - count += 1 - all_labels[i_slot] = str(count) - count += 1 - all_labels['O'] = str(count) - - return all_labels - - -def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): - slot_tags = [] - inorder_utterances = [] - all_labels = get_slot_labels(slot_annotations, task_name) - print(f'agreed_all - {len(agreed_all)}') - print(f'Slot annotations - {len(slot_annotations)}') - - for annotation in slot_annotations[0:]: - an = json.loads(annotation) - utterance = an['source'] - if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): - utterance = utterance[1:-1] - - if utterance in agreed_all: - entities = {} - annotated_entities = an[task_name]['annotations']['entities'] - for i, each_anno in enumerate(annotated_entities): - entities[int(each_anno['startOffset'])] = i - - lastptr = 0 - slotlist = [] - # sorting annotations by the start offset - for i in sorted(entities.keys()): - annotated_entities = an[task_name]['annotations']['entities'] - tags = annotated_entities[entities.get(i)] - untagged_words = utterance[lastptr : tags['startOffset']] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - anno_words = utterance[tags['startOffset'] : tags['endOffset']] - # tagging with the IOB format. 
- for j, _ in enumerate(anno_words.split()): - if j == 0: - b_slot = 'B-' + tags['label'] - slotlist.append(all_labels.get(b_slot)) - else: - i_slot = 'I-' + tags['label'] - slotlist.append(all_labels.get(i_slot)) - lastptr = tags['endOffset'] - - untagged_words = utterance[lastptr : len(utterance)] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - - slotstr = ' '.join(slotlist) - slotstr = f'{slotstr.strip()}\n' - - slot_tags.append(slotstr) - intent_num = intent_names.get(agreed_all.get(utterance)) - query_text = f'{utterance.strip()}\t{intent_num}\n' - inorder_utterances.append(query_text) - # else: - # print(utterance) - - print(f'inorder utterances - {len(inorder_utterances)}') - - return all_labels, inorder_utterances, slot_tags - - -def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.mturk.com' - raise ValueError( - f'Data not found at {data_dir}. ' 'Export your mturk data from' '{link} and unzip at {data_dir}.' - ) - - outfold = f'{data_dir}/nemo-processed' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) - return outfold - - nemo.logging.info(f'Processing dataset from mturk and storing at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - classification_data_file = f'{data_dir}/classification.csv' - annotation_data_file = f'{data_dir}/annotation.manifest' - - if not os.path.exists(classification_data_file): - raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') - - if not os.path.exists(annotation_data_file): - raise FileNotFoundError(f'File not found at {annotation_data_file}') - - utterances = [] - utterances = read_csv(classification_data_file) - - # This function assumes that the intent classification data has been - # reviewed and cleaned and only one label per utterance is present. - agreed_all, intent_names = get_intents_mturk(utterances, outfold) - - with open(annotation_data_file, 'r') as f: - slot_annotations = f.readlines() - - # This function assumes that the preprocess step would have made - # the task_name of all the annotations generic - task_name = 'retail-combined' - - # It is assumed that every utterances will have corresponding - # slot annotation information - if len(slot_annotations) < len(agreed_all): - raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') - - slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( - slot_annotations, agreed_all, intent_names, task_name - ) - - assert len(slot_tags) == len(intent_queries) - - dev_split = 0.1 - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -# The following works for the DialogFlow and Mturk output format -# def write_files(data, outfile): -# with open(f'{outfile}', 'w') as f: -# for item in data: -# item = f'{item.strip()}\n' -# f.write(item) - - -def calc_class_weights(label_freq): - """ - Goal is to give more weight to the classes with less samples - so as to match the one with the higest frequency. 
We achieve this by - dividing the highest frequency by the freq of each label. - Example - - [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4] - - Here label_freq is assumed to be sorted by the frequency. I.e. - label_freq[0] is the most frequent element. - - """ - - most_common_label_freq = label_freq[0] - weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) - return [weight for (_, weight) in weighted_slots] - - -class JointIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - JointIntentSlotDataset. - - By default, the None label for slots is 'O'. - - JointIntentSlotDataset requires two files: - - input_file: file to sequence + label. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing, we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir (str): the directory of the dataset - do_lower_case (bool): whether to set your dataset to lowercase - dataset_name (str): the name of the dataset. If it's a dataset - that follows the standard JointIntentSlotDataset format, - you can set the name as 'default'. - none_slot_label (str): the label for slots that aren't indentified - defaulted to 'O' - pad_label (int): the int used for padding. If set to -1, - it'll be set to the whatever the None label is. - - """ - - def __init__( - self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1, - ): - if dataset_name == 'atis': - self.data_dir = process_atis(data_dir, do_lower_case) - elif dataset_name == 'snips-atis': - self.data_dir, self.pad_label = merge( - data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all',], dataset_name, - ) - elif dataset_name == 'dialogflow': - self.data_dir = process_dialogflow(data_dir, do_lower_case) - elif dataset_name == 'mturk-processed': - self.data_dir = process_mturk(data_dir, do_lower_case) - elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): - self.data_dir = process_snips(data_dir, do_lower_case) - if dataset_name.endswith('light'): - self.data_dir = f'{self.data_dir}/light' - elif dataset_name.endswith('speak'): - self.data_dir = f'{self.data_dir}/speak' - elif dataset_name.endswith('all'): - self.data_dir = f'{self.data_dir}/all' - elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets( - data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False, - ) - else: - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." 
- ) - self.data_dir = data_dir - - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - self.num_intents = len(get_vocab(self.intent_dict_file)) - slots = label2idx(self.slot_dict_file) - self.num_slots = len(slots) - - for mode in ['train', 'test', 'eval']: - - if not if_exist(self.data_dir, [f'{mode}.tsv']): - nemo.logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." - ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - infold = input_file[: input_file.rfind('/')] - - nemo.logging.info(f'Three most popular intents during {mode}ing') - total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') - merged_slots = itertools.chain.from_iterable(raw_slots) - - nemo.logging.info(f'Three most popular slots during {mode}ing') - slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - if mode == 'train': - self.slot_weights = calc_class_weights(slots_label_freq) - nemo.logging.info(f'Slot weights are - {self.slot_weights}') - - self.intent_weights = calc_class_weights(intent_label_freq) - nemo.logging.info(f'Intent weights are - {self.intent_weights}') - - nemo.logging.info(f'Total intents - {total_intents}') - nemo.logging.info(f'Intent label frequency - {intent_label_freq}') - nemo.logging.info(f'Total Slots - {slots_total}') - nemo.logging.info(f'Slots label frequency - {slots_label_freq}') - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in slots: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = slots[none_slot_label] - - -class SentenceClassificationDataDesc: - def __init__(self, dataset_name, data_dir, do_lower_case): - if dataset_name == 'sst-2': - self.data_dir = process_sst_2(data_dir) - self.num_labels = 2 - self.eval_file = self.data_dir + '/dev.tsv' - elif dataset_name == 'imdb': - self.num_labels = 2 - self.data_dir = process_imdb(data_dir, do_lower_case) - self.eval_file = self.data_dir + '/test.tsv' - elif dataset_name == 'thucnews': - self.num_labels = 14 - self.data_dir = process_thucnews(data_dir) - self.eval_file = self.data_dir + '/test.tsv' - elif dataset_name.startswith('nlu-'): - if dataset_name.endswith('chat'): - self.data_dir = f'{data_dir}/ChatbotCorpus.json' - self.num_labels = 2 - elif dataset_name.endswith('ubuntu'): - self.data_dir = f'{data_dir}/AskUbuntuCorpus.json' - self.num_labels = 5 - elif dataset_name.endswith('web'): - data_dir = f'{data_dir}/WebApplicationsCorpus.json' - self.num_labels = 8 - self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name) - 
self.eval_file = self.data_dir + '/test.tsv' - elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets( - data_dir, do_lower_case, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False, - ) - - intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv') - self.num_labels = len(intents) - else: - raise ValueError( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure " - "that you build the preprocessing method for it." - ) - - self.train_file = self.data_dir + '/train.tsv' - - for mode in ['train', 'test', 'eval']: - - if not if_exist(self.data_dir, [f'{mode}.tsv']): - nemo.logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - queries, raw_sentences = [], [] - for input_line in input_lines: - parts = input_line.strip().split() - raw_sentences.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - infold = input_file[: input_file.rfind('/')] - - nemo.logging.info(f'Three most popular classes during {mode}ing') - total_sents, sent_label_freq = get_label_stats(raw_sentences, infold + f'/{mode}_sentence_stats.tsv') - - if mode == 'train': - self.class_weights = calc_class_weights(sent_label_freq) - nemo.logging.info(f'Class weights are - {self.class_weights}') - - nemo.logging.info(f'Total Sentences - {total_sents}') - nemo.logging.info(f'Sentence class frequencies - {sent_label_freq}') - - -def create_vocab_lm(data_dir, do_lower_case): - if if_exist(data_dir, ['train.txt', 'vocab.txt']): - nemo.logging.info("Vocabulary has been created.") - with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: - vocab_size = len(f.readlines()) - return vocab_size - - nemo.logging.info(f'Creating vocabulary from training data at {data_dir}') - - with open(f'{data_dir}/train.txt', 'r') as f: - txt = f.read() - if do_lower_case: - txt = txt.lower() - lines = re.split(r'[\n]', txt) - sentences = [line.strip().split() for line in lines if line.strip()] - - vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} - idx = 4 - for sentence in sentences: - for word in sentence: - if word not in vocab: - vocab[word] = idx - idx += 1 - - with open(f'{data_dir}/vocab.txt', 'w') as f: - for word in sorted(vocab.keys()): - f.write(word + '\n') - nemo.logging.info(f"Created vocabulary of size {len(vocab)}") - - return len(vocab) - - -def download_wkt2(data_dir): - os.makedirs('data/lm', exist_ok=True) - nemo.logging.warning(f'Data not found at {data_dir}. ' f'Downloading wikitext-2 to data/lm') - data_dir = 'data/lm/wikitext-2' - subprocess.call('scripts/get_wkt2.sh') - return data_dir - - -class LanguageModelDataDesc: - def __init__(self, dataset_name, data_dir, do_lower_case): - if dataset_name == 'wikitext-2': - if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.vocab_size = create_vocab_lm(data_dir, do_lower_case) - self.data_dir = data_dir - else: - nemo.logging.warning( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure that " - "you build the preprocessing method for it." 
- ) - - -def create_vocab_mlm( - data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='', -): - vocab = special_tokens[:] - bert_dir = f'{data_dir}/bert' - if if_exist(bert_dir, ['tokenizer.model']): - nemo.logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) - return data_dir, f'{bert_dir}/tokenizer.model' - nemo.logging.info(f'Processing WikiText dataset and store at {bert_dir}') - os.makedirs(bert_dir, exist_ok=True) - - if not train_file: - files = glob.glob(f'{data_dir}/*.txt') - train_file = f'{bert_dir}/merged.txt' - nemo.logging.info(f"Merging {len(files)} txt files into {train_file}") - - with open(train_file, "w") as merged: - for file in tqdm(files): - with open(file, 'r') as inf: - content = inf.read().strip() - merged.write(content + '\n\n\n') - else: - train_file = f'{data_dir}/{train_file}' - - cmd = ( - f"--input={train_file} --model_prefix={bert_dir}/tokenizer " - f"--vocab_size={vocab_size - len(vocab)} " - f"--input_sentence_size={sample_size} " - f"--shuffle_input_sentence=true --hard_vocab_limit=false " - f"--bos_id=-1 --eos_id=-1" - ) - SPT.Train(cmd) - - # Add BERT control symbols - tokens = [] - - with open(f"{bert_dir}/tokenizer.vocab", "r") as f: - f.readline() # skip first token - - # Read tokens from each line and parse for vocab - for line in f: - piece = line.split("\t")[0] - token = piece[1:] if piece.startswith("▁") else f"##{piece}" - tokens.append(token) - - vocab.extend(tokens) - - # Save vocabulary to output file - with open(f'{bert_dir}/vocab.txt', "w") as f: - for token in vocab: - f.write(f"{token}\n".format()) - return data_dir, f'{bert_dir}/tokenizer.model' - - -class BERTPretrainingDataDesc: - def __init__( - self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file='', - ): - if dataset_name == 'wikitext-2': - if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.data_dir, self.tokenizer_model = create_vocab_mlm( - data_dir, vocab_size, sample_size, special_tokens, train_file - ) - else: - nemo.logging.warning( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure that " - "you build the preprocessing method for it." - ) - - self.train_file = f'{data_dir}/train.txt' - self.eval_file = f'{data_dir}/valid.txt' - self.test_file = f'{data_dir}/test.txt' - - -""" -Utility functions for GLUE tasks -This code was adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second - sequence. Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - # if sys.version_info[0] == 2: - # line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - nemo.logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched",) - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched",) - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def 
get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): 
- """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, -} - -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", -} - -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} diff --git a/nemo/collections/nlp/data/tokenizers/__init__.py b/nemo/collections/nlp/data/tokenizers/__init__.py index ba9baba6c89c..4affa23c5655 100644 --- a/nemo/collections/nlp/data/tokenizers/__init__.py +++ b/nemo/collections/nlp/data/tokenizers/__init__.py @@ -1,6 +1,22 @@ -from .bert_tokenizer import NemoBertTokenizer -from .char_tokenizer import CharTokenizer -from .gpt2_tokenizer import NemoGPT2Tokenizer -from .spc_tokenizer import SentencePieceTokenizer -from .word_tokenizer import WordTokenizer -from .yttm_tokenizer import YouTokenToMeTokenizer +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.tokenizers.bert_tokenizer import NemoBertTokenizer +from nemo.collections.nlp.data.tokenizers.char_tokenizer import CharTokenizer +from nemo.collections.nlp.data.tokenizers.gpt2_tokenizer import NemoGPT2Tokenizer +from nemo.collections.nlp.data.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer +from nemo.collections.nlp.data.tokenizers.word_tokenizer import WordTokenizer +from nemo.collections.nlp.data.tokenizers.youtokentome_tokenizer import YouTokenToMeTokenizer diff --git a/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py b/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py index cc6b20e875a8..abb6e27dfd06 100644 --- a/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py @@ -1,8 +1,26 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import re from transformers import BertTokenizer -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['NemoBertTokenizer'] def handle_quotes(text): diff --git a/nemo/collections/nlp/data/tokenizers/char_tokenizer.py b/nemo/collections/nlp/data/tokenizers/char_tokenizer.py index d634277bd3d5..f2d525a5d6e5 100644 --- a/nemo/collections/nlp/data/tokenizers/char_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/char_tokenizer.py @@ -1,4 +1,22 @@ -from .tokenizer_spec import TokenizerSpec +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['CharTokenizer'] class CharTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py b/nemo/collections/nlp/data/tokenizers/fairseq_tokenizer.py similarity index 79% rename from nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py rename to nemo/collections/nlp/data/tokenizers/fairseq_tokenizer.py index f6bfdfad9473..be654de9421a 100644 --- a/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/fairseq_tokenizer.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + """ Code from https://github.com/NVIDIA/DeepLearningExamples/blob/ master/PyTorch/Translation/Transformer/fairseq/tokenizer.py @@ -8,6 +24,8 @@ import unicodedata from collections import defaultdict +__all__ = ['get_unicode_categories', 'tokenize_en'] + def get_unicode_categories(): cats = defaultdict(list) diff --git a/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py b/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py index 60e6c3cf3cd5..fe443d90db35 100644 --- a/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + from transformers import GPT2Tokenizer -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['NemoGPT2Tokenizer'] class NemoGPT2Tokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py b/nemo/collections/nlp/data/tokenizers/sentencepiece_tokenizer.py similarity index 79% rename from nemo/collections/nlp/data/tokenizers/spc_tokenizer.py rename to nemo/collections/nlp/data/tokenizers/sentencepiece_tokenizer.py index 67a2c00bda3e..0cc7e9b62cf2 100644 --- a/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/sentencepiece_tokenizer.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import sentencepiece as spm -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['SentencePieceTokenizer'] class SentencePieceTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py b/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py index eeadf617c189..c9035933ca6c 100644 --- a/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py +++ b/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from abc import ABC, abstractmethod from typing import List +__all__ = ['TokenizerSpec'] + class TokenizerSpec(ABC): @abstractmethod diff --git a/nemo/collections/nlp/data/tokenizers/word_tokenizer.py b/nemo/collections/nlp/data/tokenizers/word_tokenizer.py index f45940f03c58..0d037f981dc6 100644 --- a/nemo/collections/nlp/data/tokenizers/word_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/word_tokenizer.py @@ -1,4 +1,22 @@ -from .tokenizer_spec import TokenizerSpec +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['WordTokenizer'] class WordTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py b/nemo/collections/nlp/data/tokenizers/youtokentome_tokenizer.py similarity index 58% rename from nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py rename to nemo/collections/nlp/data/tokenizers/youtokentome_tokenizer.py index 94acc3e4b1ae..ffc62be9ff28 100644 --- a/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/youtokentome_tokenizer.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import youtokentome as yttm -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['YouTokenToMeTokenizer'] class YouTokenToMeTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/utils.py b/nemo/collections/nlp/data/utils.py deleted file mode 100644 index 1119f48a91aa..000000000000 --- a/nemo/collections/nlp/data/utils.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import pickle -import re -import string - -import numpy as np - -import nemo - - -def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): - """ - Reads dataset from file line by line, tokenizes each line with tokenizer, - and returns list of lists which corresponds to ids of tokenized strings. 
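For orientation, a usage sketch of this removed helper (an illustration only; `tokenizer` stands for any tokenizer instance from the package above that exposes `text_to_ids`, `bos_id` and `eos_id`, and `train.txt` is a hypothetical corpus file):

ids = dataset_to_ids("train.txt", tokenizer, cache_ids=True, add_bos_eos=True)
# With cache_ids=True the token ids are pickled next to the corpus (train.txt.pkl),
# so a second call loads the cache instead of re-tokenizing the file.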
- - Args: - dataset: path to dataset - tokenizer: tokenizer to convert text into ids - cache_ids: if True, ids are saved to disk as pickle file - with similar name (e.g., data.txt --> data.txt.pkl) - add_bos_eos: bool, whether to add and symbols (e.g., for NMT) - Returns: - ids: list of ids which correspond to tokenized strings of the dataset - """ - - cached_ids_dataset = dataset + str(".pkl") - if os.path.isfile(cached_ids_dataset): - nemo.logging.info("Loading cached tokenized dataset ...") - ids = pickle.load(open(cached_ids_dataset, "rb")) - else: - nemo.logging.info("Tokenizing dataset ...") - data = open(dataset, "rb").readlines() - ids = [] - for sentence in data: - sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) - if add_bos_eos: - sent_ids = [tokenizer.bos_id()] + sent_ids + [tokenizer.eos_id()] - ids.append(sent_ids) - if cache_ids: - nemo.logging.info("Caching tokenized dataset ...") - pickle.dump(ids, open(cached_ids_dataset, "wb")) - return ids - - -def clean_src_and_target( - src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5, -): - """ - Cleans source and target sentences to get rid of noisy data. - Specifically, a pair of sentences is removed if - -- either source or target is longer than *max_tokens* - -- either source or target is shorter than *min_tokens* - -- absolute difference between source and target is larger than - *max_tokens_diff* - -- one sentence is *max_tokens_ratio* times longer than the other - """ - - if len(src_ids) != len(tgt_ids): - raise ValueError("Source and target corpora have different lengths!") - src_ids_, tgt_ids_ = [], [] - for i in range(len(src_ids)): - src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) - if ( - src_len > max_tokens - or tgt_len > max_tokens - or src_len < min_tokens - or tgt_len < min_tokens - or (src_ids[i] == tgt_ids[i]) - or np.abs(src_len - tgt_len) > max_tokens_diff - ): - continue - ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) - if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): - continue - src_ids_.append(src_ids[i]) - tgt_ids_.append(tgt_ids[i]) - return src_ids_, tgt_ids_ - - -def remove_punctuation_from_sentence(sentence): - sentence = re.sub('[' + string.punctuation + ']', '', sentence) - sentence = sentence.lower() - return sentence - - -def check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token. - - Because of the sliding window approach taken to scoring documents, - a single token can appear in multiple documents. - - Example: - Doc: the man went to the store and bought a gallon of milk - Span A: the man went to the - Span B: to the store and bought - Span C: and bought a gallon of - ... - - Now the word 'bought' will have two scores from spans B and C. We only - want to consider the score with "maximum context", which we define as - the *minimum* of its left and right context (the *sum* of left and - right context will always be the same, of course). - - In the example the maximum context for 'bought' would be span C since - it has 1 left context and 3 right context, while span B has 4 left context - and 0 right context. - - Code adapted from the code by the Google AI and HuggingFace. 
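The docstring above already works the "maximum context" rule out by hand; the following self-contained sketch (an illustration, not part of the removed module) reproduces those numbers with the same min(left, right) + 0.01 * length score:

import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])

# Doc:    the man went to the store and bought a gallon of milk
# Span A: tokens 0-4, Span B: tokens 3-7, Span C: tokens 6-10
doc_spans = [DocSpan(start=0, length=5), DocSpan(start=3, length=5), DocSpan(start=6, length=5)]
position = 7  # index of the token "bought"

scores = []
for doc_span in doc_spans:
    end = doc_span.start + doc_span.length - 1
    if position < doc_span.start or position > end:
        scores.append(None)  # this span does not contain the token at all
        continue
    num_left_context = position - doc_span.start
    num_right_context = end - position
    scores.append(min(num_left_context, num_right_context) + 0.01 * doc_span.length)

print(scores)  # [None, 0.05, 1.05] -> span C gives "bought" its maximum context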
- """ - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index diff --git a/nemo/collections/nlp/huggingface/__init__.py b/nemo/collections/nlp/huggingface/__init__.py deleted file mode 100644 index 5074307bd60a..000000000000 --- a/nemo/collections/nlp/huggingface/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bert import BERT diff --git a/nemo/collections/nlp/metrics/__init__.py b/nemo/collections/nlp/metrics/__init__.py new file mode 100644 index 000000000000..4b9cfe094485 --- /dev/null +++ b/nemo/collections/nlp/metrics/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.metrics.bleu import * diff --git a/nemo/collections/nlp/utils/metrics/bleu.py b/nemo/collections/nlp/metrics/bleu.py similarity index 88% rename from nemo/collections/nlp/utils/metrics/bleu.py rename to nemo/collections/nlp/metrics/bleu.py index 04e67d1788d6..bab9c5f4c0f6 100644 --- a/nemo/collections/nlp/utils/metrics/bleu.py +++ b/nemo/collections/nlp/metrics/bleu.py @@ -22,17 +22,7 @@ import collections import math - -def compound_split(segment): - segment = segment.replace(".", " . ") - segment = segment.replace(",", " , ") - segment = segment.replace(":", " : ") - segment = segment.replace("!", " ! ") - segment = segment.replace("?", " ? 
") - segment = segment.replace("-", " ##AT##-##AT## ") - segment = segment.replace("\"", " "e ") - segment = segment.replace("%", " % ") - return segment.split() +__all__ = ['compute_bleu'] def _get_ngrams(segment, max_order): @@ -117,11 +107,4 @@ def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=False precisions = [p * 100 for p in precisions] - return ( - bleu * 100, - precisions, - bp, - ratio, - translation_length, - reference_length, - ) + return (bleu * 100, precisions, bp, ratio, translation_length, reference_length) diff --git a/nemo/collections/nlp/utils/metrics/sacrebleu.py b/nemo/collections/nlp/metrics/sacrebleu.py similarity index 90% rename from nemo/collections/nlp/utils/metrics/sacrebleu.py rename to nemo/collections/nlp/metrics/sacrebleu.py index 411743a91d34..586b19bf2d30 100755 --- a/nemo/collections/nlp/utils/metrics/sacrebleu.py +++ b/nemo/collections/nlp/metrics/sacrebleu.py @@ -36,8 +36,8 @@ from itertools import zip_longest from typing import Iterable, List, Tuple, Union -from .fairseq_tokenizer import tokenize_en from nemo import logging +from nemo.collections.nlp.data.tokenizers.fairseq_tokenizer import tokenize_en VERSION = '1.3.5' @@ -117,10 +117,10 @@ '\n publisher = "Association for Computational Linguistics",\n pages = "543--553",' '\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', 'md5': ['8ce1831ac584979ba8cdcd9d4be43e1d'], - 'en-fr': ['1:MTNT/valid/valid.en-fr.tsv', '2:MTNT/valid/valid.en-fr.tsv',], - 'fr-en': ['1:MTNT/valid/valid.fr-en.tsv', '2:MTNT/valid/valid.fr-en.tsv',], - 'en-ja': ['1:MTNT/valid/valid.en-ja.tsv', '2:MTNT/valid/valid.en-ja.tsv',], - 'ja-en': ['1:MTNT/valid/valid.ja-en.tsv', '2:MTNT/valid/valid.ja-en.tsv',], + 'en-fr': ['1:MTNT/valid/valid.en-fr.tsv', '2:MTNT/valid/valid.en-fr.tsv'], + 'fr-en': ['1:MTNT/valid/valid.fr-en.tsv', '2:MTNT/valid/valid.fr-en.tsv'], + 'en-ja': ['1:MTNT/valid/valid.en-ja.tsv', '2:MTNT/valid/valid.en-ja.tsv'], + 'ja-en': ['1:MTNT/valid/valid.ja-en.tsv', '2:MTNT/valid/valid.ja-en.tsv'], }, 'mtnt1.1/train': { 'data': ['https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz'], @@ -132,44 +132,44 @@ '\n publisher = "Association for Computational Linguistics",\n pages = "543--553",' '\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', 'md5': ['8ce1831ac584979ba8cdcd9d4be43e1d'], - 'en-fr': ['1:MTNT/train/train.en-fr.tsv', '2:MTNT/train/train.en-fr.tsv',], - 'fr-en': ['1:MTNT/train/train.fr-en.tsv', '2:MTNT/train/train.fr-en.tsv',], - 'en-ja': ['1:MTNT/train/train.en-ja.tsv', '2:MTNT/train/train.en-ja.tsv',], - 'ja-en': ['1:MTNT/train/train.ja-en.tsv', '2:MTNT/train/train.ja-en.tsv',], + 'en-fr': ['1:MTNT/train/train.en-fr.tsv', '2:MTNT/train/train.en-fr.tsv'], + 'fr-en': ['1:MTNT/train/train.fr-en.tsv', '2:MTNT/train/train.fr-en.tsv'], + 'en-ja': ['1:MTNT/train/train.en-ja.tsv', '2:MTNT/train/train.en-ja.tsv'], + 'ja-en': ['1:MTNT/train/train.ja-en.tsv', '2:MTNT/train/train.ja-en.tsv'], }, 'wmt19': { 'data': ['http://data.statmt.org/wmt19/translation-task/test.tgz'], 'md5': ['84de7162d158e28403103b01aeefc39a'], - 'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm',], - 'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm',], - 'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm',], - 'de-fr': ['sgm/newstest2019-defr-src.de.sgm', 'sgm/newstest2019-defr-ref.fr.sgm',], - 'en-cs': ['sgm/newstest2019-encs-src.en.sgm', 
'sgm/newstest2019-encs-ref.cs.sgm',], - 'en-de': ['sgm/newstest2019-ende-src.en.sgm', 'sgm/newstest2019-ende-ref.de.sgm',], - 'en-fi': ['sgm/newstest2019-enfi-src.en.sgm', 'sgm/newstest2019-enfi-ref.fi.sgm',], - 'en-gu': ['sgm/newstest2019-engu-src.en.sgm', 'sgm/newstest2019-engu-ref.gu.sgm',], - 'en-kk': ['sgm/newstest2019-enkk-src.en.sgm', 'sgm/newstest2019-enkk-ref.kk.sgm',], - 'en-lt': ['sgm/newstest2019-enlt-src.en.sgm', 'sgm/newstest2019-enlt-ref.lt.sgm',], - 'en-ru': ['sgm/newstest2019-enru-src.en.sgm', 'sgm/newstest2019-enru-ref.ru.sgm',], - 'en-zh': ['sgm/newstest2019-enzh-src.en.sgm', 'sgm/newstest2019-enzh-ref.zh.sgm',], - 'fi-en': ['sgm/newstest2019-fien-src.fi.sgm', 'sgm/newstest2019-fien-ref.en.sgm',], - 'fr-de': ['sgm/newstest2019-frde-src.fr.sgm', 'sgm/newstest2019-frde-ref.de.sgm',], - 'gu-en': ['sgm/newstest2019-guen-src.gu.sgm', 'sgm/newstest2019-guen-ref.en.sgm',], - 'kk-en': ['sgm/newstest2019-kken-src.kk.sgm', 'sgm/newstest2019-kken-ref.en.sgm',], - 'lt-en': ['sgm/newstest2019-lten-src.lt.sgm', 'sgm/newstest2019-lten-ref.en.sgm',], - 'ru-en': ['sgm/newstest2019-ruen-src.ru.sgm', 'sgm/newstest2019-ruen-ref.en.sgm',], - 'zh-en': ['sgm/newstest2019-zhen-src.zh.sgm', 'sgm/newstest2019-zhen-ref.en.sgm',], + 'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm'], + 'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm'], + 'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm'], + 'de-fr': ['sgm/newstest2019-defr-src.de.sgm', 'sgm/newstest2019-defr-ref.fr.sgm'], + 'en-cs': ['sgm/newstest2019-encs-src.en.sgm', 'sgm/newstest2019-encs-ref.cs.sgm'], + 'en-de': ['sgm/newstest2019-ende-src.en.sgm', 'sgm/newstest2019-ende-ref.de.sgm'], + 'en-fi': ['sgm/newstest2019-enfi-src.en.sgm', 'sgm/newstest2019-enfi-ref.fi.sgm'], + 'en-gu': ['sgm/newstest2019-engu-src.en.sgm', 'sgm/newstest2019-engu-ref.gu.sgm'], + 'en-kk': ['sgm/newstest2019-enkk-src.en.sgm', 'sgm/newstest2019-enkk-ref.kk.sgm'], + 'en-lt': ['sgm/newstest2019-enlt-src.en.sgm', 'sgm/newstest2019-enlt-ref.lt.sgm'], + 'en-ru': ['sgm/newstest2019-enru-src.en.sgm', 'sgm/newstest2019-enru-ref.ru.sgm'], + 'en-zh': ['sgm/newstest2019-enzh-src.en.sgm', 'sgm/newstest2019-enzh-ref.zh.sgm'], + 'fi-en': ['sgm/newstest2019-fien-src.fi.sgm', 'sgm/newstest2019-fien-ref.en.sgm'], + 'fr-de': ['sgm/newstest2019-frde-src.fr.sgm', 'sgm/newstest2019-frde-ref.de.sgm'], + 'gu-en': ['sgm/newstest2019-guen-src.gu.sgm', 'sgm/newstest2019-guen-ref.en.sgm'], + 'kk-en': ['sgm/newstest2019-kken-src.kk.sgm', 'sgm/newstest2019-kken-ref.en.sgm'], + 'lt-en': ['sgm/newstest2019-lten-src.lt.sgm', 'sgm/newstest2019-lten-ref.en.sgm'], + 'ru-en': ['sgm/newstest2019-ruen-src.ru.sgm', 'sgm/newstest2019-ruen-ref.en.sgm'], + 'zh-en': ['sgm/newstest2019-zhen-src.zh.sgm', 'sgm/newstest2019-zhen-ref.en.sgm'], }, 'wmt19/dev': { 'data': ['http://data.statmt.org/wmt19/translation-task/dev.tgz'], 'description': 'Development data for tasks new to 2019.', 'md5': ['f2ec7af5947c19e0cacb3882eb208002'], - 'lt-en': ['dev/newsdev2019-lten-src.lt.sgm', 'dev/newsdev2019-lten-ref.en.sgm',], - 'en-lt': ['dev/newsdev2019-enlt-src.en.sgm', 'dev/newsdev2019-enlt-ref.lt.sgm',], - 'gu-en': ['dev/newsdev2019-guen-src.gu.sgm', 'dev/newsdev2019-guen-ref.en.sgm',], - 'en-gu': ['dev/newsdev2019-engu-src.en.sgm', 'dev/newsdev2019-engu-ref.gu.sgm',], - 'kk-en': ['dev/newsdev2019-kken-src.kk.sgm', 'dev/newsdev2019-kken-ref.en.sgm',], - 'en-kk': ['dev/newsdev2019-enkk-src.en.sgm', 'dev/newsdev2019-enkk-ref.kk.sgm',], + 
'lt-en': ['dev/newsdev2019-lten-src.lt.sgm', 'dev/newsdev2019-lten-ref.en.sgm'], + 'en-lt': ['dev/newsdev2019-enlt-src.en.sgm', 'dev/newsdev2019-enlt-ref.lt.sgm'], + 'gu-en': ['dev/newsdev2019-guen-src.gu.sgm', 'dev/newsdev2019-guen-ref.en.sgm'], + 'en-gu': ['dev/newsdev2019-engu-src.en.sgm', 'dev/newsdev2019-engu-ref.gu.sgm'], + 'kk-en': ['dev/newsdev2019-kken-src.kk.sgm', 'dev/newsdev2019-kken-ref.en.sgm'], + 'en-kk': ['dev/newsdev2019-enkk-src.en.sgm', 'dev/newsdev2019-enkk-ref.kk.sgm'], }, 'wmt18': { 'data': ['http://data.statmt.org/wmt18/translation-task/test.tgz'], @@ -183,20 +183,20 @@ '\n address = "Belgium, Brussels",\n publisher = "Association for Computational ' 'Linguistics",\n url = "https://www.aclweb.org/anthology/W18-6401",\n pages = "272--303",' '\n}', - 'cs-en': ['test/newstest2018-csen-src.cs.sgm', 'test/newstest2018-csen-ref.en.sgm',], - 'de-en': ['test/newstest2018-deen-src.de.sgm', 'test/newstest2018-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2018-encs-src.en.sgm', 'test/newstest2018-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2018-ende-src.en.sgm', 'test/newstest2018-ende-ref.de.sgm',], - 'en-et': ['test/newstest2018-enet-src.en.sgm', 'test/newstest2018-enet-ref.et.sgm',], - 'en-fi': ['test/newstest2018-enfi-src.en.sgm', 'test/newstest2018-enfi-ref.fi.sgm',], - 'en-ru': ['test/newstest2018-enru-src.en.sgm', 'test/newstest2018-enru-ref.ru.sgm',], - 'et-en': ['test/newstest2018-eten-src.et.sgm', 'test/newstest2018-eten-ref.en.sgm',], - 'fi-en': ['test/newstest2018-fien-src.fi.sgm', 'test/newstest2018-fien-ref.en.sgm',], - 'ru-en': ['test/newstest2018-ruen-src.ru.sgm', 'test/newstest2018-ruen-ref.en.sgm',], - 'en-tr': ['test/newstest2018-entr-src.en.sgm', 'test/newstest2018-entr-ref.tr.sgm',], - 'tr-en': ['test/newstest2018-tren-src.tr.sgm', 'test/newstest2018-tren-ref.en.sgm',], - 'en-zh': ['test/newstest2018-enzh-src.en.sgm', 'test/newstest2018-enzh-ref.zh.sgm',], - 'zh-en': ['test/newstest2018-zhen-src.zh.sgm', 'test/newstest2018-zhen-ref.en.sgm',], + 'cs-en': ['test/newstest2018-csen-src.cs.sgm', 'test/newstest2018-csen-ref.en.sgm'], + 'de-en': ['test/newstest2018-deen-src.de.sgm', 'test/newstest2018-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2018-encs-src.en.sgm', 'test/newstest2018-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2018-ende-src.en.sgm', 'test/newstest2018-ende-ref.de.sgm'], + 'en-et': ['test/newstest2018-enet-src.en.sgm', 'test/newstest2018-enet-ref.et.sgm'], + 'en-fi': ['test/newstest2018-enfi-src.en.sgm', 'test/newstest2018-enfi-ref.fi.sgm'], + 'en-ru': ['test/newstest2018-enru-src.en.sgm', 'test/newstest2018-enru-ref.ru.sgm'], + 'et-en': ['test/newstest2018-eten-src.et.sgm', 'test/newstest2018-eten-ref.en.sgm'], + 'fi-en': ['test/newstest2018-fien-src.fi.sgm', 'test/newstest2018-fien-ref.en.sgm'], + 'ru-en': ['test/newstest2018-ruen-src.ru.sgm', 'test/newstest2018-ruen-ref.en.sgm'], + 'en-tr': ['test/newstest2018-entr-src.en.sgm', 'test/newstest2018-entr-ref.tr.sgm'], + 'tr-en': ['test/newstest2018-tren-src.tr.sgm', 'test/newstest2018-tren-ref.en.sgm'], + 'en-zh': ['test/newstest2018-enzh-src.en.sgm', 'test/newstest2018-enzh-ref.zh.sgm'], + 'zh-en': ['test/newstest2018-zhen-src.zh.sgm', 'test/newstest2018-zhen-ref.en.sgm'], }, 'wmt18/test-ts': { 'data': ['http://data.statmt.org/wmt18/translation-task/test-ts.tgz'], @@ -221,8 +221,8 @@ 'data': ['http://data.statmt.org/wmt18/translation-task/dev.tgz'], 'md5': ['486f391da54a7a3247f02ebd25996f24'], 'description': 'Development data (Estonian<>English).', - 'et-en': 
['dev/newsdev2018-eten-src.et.sgm', 'dev/newsdev2018-eten-ref.en.sgm',], - 'en-et': ['dev/newsdev2018-enet-src.en.sgm', 'dev/newsdev2018-enet-ref.et.sgm',], + 'et-en': ['dev/newsdev2018-eten-src.et.sgm', 'dev/newsdev2018-eten-ref.en.sgm'], + 'en-et': ['dev/newsdev2018-enet-src.en.sgm', 'dev/newsdev2018-enet-ref.et.sgm'], }, 'wmt17': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], @@ -238,26 +238,26 @@ '\n address = {Copenhagen, Denmark},\n publisher = {Association for Computational ' 'Linguistics},\n pages = {169--214},\n url = {' 'http://www.aclweb.org/anthology/W17-4717}\n}', - 'cs-en': ['test/newstest2017-csen-src.cs.sgm', 'test/newstest2017-csen-ref.en.sgm',], - 'de-en': ['test/newstest2017-deen-src.de.sgm', 'test/newstest2017-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2017-encs-src.en.sgm', 'test/newstest2017-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2017-ende-src.en.sgm', 'test/newstest2017-ende-ref.de.sgm',], - 'en-fi': ['test/newstest2017-enfi-src.en.sgm', 'test/newstest2017-enfi-ref.fi.sgm',], - 'en-lv': ['test/newstest2017-enlv-src.en.sgm', 'test/newstest2017-enlv-ref.lv.sgm',], - 'en-ru': ['test/newstest2017-enru-src.en.sgm', 'test/newstest2017-enru-ref.ru.sgm',], - 'en-tr': ['test/newstest2017-entr-src.en.sgm', 'test/newstest2017-entr-ref.tr.sgm',], - 'en-zh': ['test/newstest2017-enzh-src.en.sgm', 'test/newstest2017-enzh-ref.zh.sgm',], - 'fi-en': ['test/newstest2017-fien-src.fi.sgm', 'test/newstest2017-fien-ref.en.sgm',], - 'lv-en': ['test/newstest2017-lven-src.lv.sgm', 'test/newstest2017-lven-ref.en.sgm',], - 'ru-en': ['test/newstest2017-ruen-src.ru.sgm', 'test/newstest2017-ruen-ref.en.sgm',], - 'tr-en': ['test/newstest2017-tren-src.tr.sgm', 'test/newstest2017-tren-ref.en.sgm',], - 'zh-en': ['test/newstest2017-zhen-src.zh.sgm', 'test/newstest2017-zhen-ref.en.sgm',], + 'cs-en': ['test/newstest2017-csen-src.cs.sgm', 'test/newstest2017-csen-ref.en.sgm'], + 'de-en': ['test/newstest2017-deen-src.de.sgm', 'test/newstest2017-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2017-encs-src.en.sgm', 'test/newstest2017-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2017-ende-src.en.sgm', 'test/newstest2017-ende-ref.de.sgm'], + 'en-fi': ['test/newstest2017-enfi-src.en.sgm', 'test/newstest2017-enfi-ref.fi.sgm'], + 'en-lv': ['test/newstest2017-enlv-src.en.sgm', 'test/newstest2017-enlv-ref.lv.sgm'], + 'en-ru': ['test/newstest2017-enru-src.en.sgm', 'test/newstest2017-enru-ref.ru.sgm'], + 'en-tr': ['test/newstest2017-entr-src.en.sgm', 'test/newstest2017-entr-ref.tr.sgm'], + 'en-zh': ['test/newstest2017-enzh-src.en.sgm', 'test/newstest2017-enzh-ref.zh.sgm'], + 'fi-en': ['test/newstest2017-fien-src.fi.sgm', 'test/newstest2017-fien-ref.en.sgm'], + 'lv-en': ['test/newstest2017-lven-src.lv.sgm', 'test/newstest2017-lven-ref.en.sgm'], + 'ru-en': ['test/newstest2017-ruen-src.ru.sgm', 'test/newstest2017-ruen-ref.en.sgm'], + 'tr-en': ['test/newstest2017-tren-src.tr.sgm', 'test/newstest2017-tren-ref.en.sgm'], + 'zh-en': ['test/newstest2017-zhen-src.zh.sgm', 'test/newstest2017-zhen-ref.en.sgm'], }, 'wmt17/B': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], 'md5': ['86a1724c276004aa25455ae2a04cef26'], 'description': 'Additional reference for EN-FI and FI-EN.', - 'en-fi': ['test/newstestB2017-enfi-src.en.sgm', 'test/newstestB2017-enfi-ref.fi.sgm',], + 'en-fi': ['test/newstestB2017-enfi-src.en.sgm', 'test/newstestB2017-enfi-ref.fi.sgm'], }, 'wmt17/tworefs': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], @@ -273,24 +273,24 @@ 'data': 
['http://data.statmt.org/wmt17/translation-task/test-update-1.tgz'], 'md5': ['91dbfd5af99bc6891a637a68e04dfd41'], 'description': 'Improved zh-en and en-zh translations.', - 'en-zh': ['newstest2017-enzh-src.en.sgm', 'newstest2017-enzh-ref.zh.sgm',], - 'zh-en': ['newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm',], + 'en-zh': ['newstest2017-enzh-src.en.sgm', 'newstest2017-enzh-ref.zh.sgm'], + 'zh-en': ['newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm'], }, 'wmt17/dev': { 'data': ['http://data.statmt.org/wmt17/translation-task/dev.tgz'], 'md5': ['9b1aa63c1cf49dccdd20b962fe313989'], 'description': 'Development sets released for new languages in 2017.', - 'en-lv': ['dev/newsdev2017-enlv-src.en.sgm', 'dev/newsdev2017-enlv-ref.lv.sgm',], - 'en-zh': ['dev/newsdev2017-enzh-src.en.sgm', 'dev/newsdev2017-enzh-ref.zh.sgm',], - 'lv-en': ['dev/newsdev2017-lven-src.lv.sgm', 'dev/newsdev2017-lven-ref.en.sgm',], - 'zh-en': ['dev/newsdev2017-zhen-src.zh.sgm', 'dev/newsdev2017-zhen-ref.en.sgm',], + 'en-lv': ['dev/newsdev2017-enlv-src.en.sgm', 'dev/newsdev2017-enlv-ref.lv.sgm'], + 'en-zh': ['dev/newsdev2017-enzh-src.en.sgm', 'dev/newsdev2017-enzh-ref.zh.sgm'], + 'lv-en': ['dev/newsdev2017-lven-src.lv.sgm', 'dev/newsdev2017-lven-ref.en.sgm'], + 'zh-en': ['dev/newsdev2017-zhen-src.zh.sgm', 'dev/newsdev2017-zhen-ref.en.sgm'], }, 'wmt17/ms': { 'data': [ 'https://github.com/MicrosoftTranslator/Translator-HumanParityData/archive/master.zip', 'http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', ], - 'md5': ['18fdaa7a3c84cf6ef688da1f6a5fa96f', '91dbfd5af99bc6891a637a68e04dfd41',], + 'md5': ['18fdaa7a3c84cf6ef688da1f6a5fa96f', '91dbfd5af99bc6891a637a68e04dfd41'], 'description': 'Additional Chinese-English references from Microsoft Research.', 'citation': '@inproceedings{achieving-human-parity-on-automatic-chinese-to-english-news-translation,' '\n author = {Hassan Awadalla, Hany and Aue, Anthony and Chen, Chang and Chowdhary, Vishal and ' @@ -317,9 +317,9 @@ 'newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm', 'Translator-HumanParityData-master/Translator-HumanParityData/References/Translator-HumanParityData' - '-Reference-HT.txt', + + '-Reference-HT.txt', 'Translator-HumanParityData-master/Translator-HumanParityData/References/Translator-HumanParityData' - '-Reference-PE.txt', + + '-Reference-PE.txt', ], }, 'wmt16': { @@ -336,24 +336,24 @@ 'Machine Translation},\n month = {August},\n year = {2016},\n address = {Berlin, ' 'Germany},\n publisher = {Association for Computational Linguistics},\n pages = {' '131--198},\n url = {http://www.aclweb.org/anthology/W/W16/W16-2301}\n}', - 'cs-en': ['test/newstest2016-csen-src.cs.sgm', 'test/newstest2016-csen-ref.en.sgm',], - 'de-en': ['test/newstest2016-deen-src.de.sgm', 'test/newstest2016-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2016-encs-src.en.sgm', 'test/newstest2016-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2016-ende-src.en.sgm', 'test/newstest2016-ende-ref.de.sgm',], - 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstest2016-enfi-ref.fi.sgm',], - 'en-ro': ['test/newstest2016-enro-src.en.sgm', 'test/newstest2016-enro-ref.ro.sgm',], - 'en-ru': ['test/newstest2016-enru-src.en.sgm', 'test/newstest2016-enru-ref.ru.sgm',], - 'en-tr': ['test/newstest2016-entr-src.en.sgm', 'test/newstest2016-entr-ref.tr.sgm',], - 'fi-en': ['test/newstest2016-fien-src.fi.sgm', 'test/newstest2016-fien-ref.en.sgm',], - 'ro-en': ['test/newstest2016-roen-src.ro.sgm', 'test/newstest2016-roen-ref.en.sgm',], - 'ru-en': 
['test/newstest2016-ruen-src.ru.sgm', 'test/newstest2016-ruen-ref.en.sgm',], - 'tr-en': ['test/newstest2016-tren-src.tr.sgm', 'test/newstest2016-tren-ref.en.sgm',], + 'cs-en': ['test/newstest2016-csen-src.cs.sgm', 'test/newstest2016-csen-ref.en.sgm'], + 'de-en': ['test/newstest2016-deen-src.de.sgm', 'test/newstest2016-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2016-encs-src.en.sgm', 'test/newstest2016-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2016-ende-src.en.sgm', 'test/newstest2016-ende-ref.de.sgm'], + 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstest2016-enfi-ref.fi.sgm'], + 'en-ro': ['test/newstest2016-enro-src.en.sgm', 'test/newstest2016-enro-ref.ro.sgm'], + 'en-ru': ['test/newstest2016-enru-src.en.sgm', 'test/newstest2016-enru-ref.ru.sgm'], + 'en-tr': ['test/newstest2016-entr-src.en.sgm', 'test/newstest2016-entr-ref.tr.sgm'], + 'fi-en': ['test/newstest2016-fien-src.fi.sgm', 'test/newstest2016-fien-ref.en.sgm'], + 'ro-en': ['test/newstest2016-roen-src.ro.sgm', 'test/newstest2016-roen-ref.en.sgm'], + 'ru-en': ['test/newstest2016-ruen-src.ru.sgm', 'test/newstest2016-ruen-ref.en.sgm'], + 'tr-en': ['test/newstest2016-tren-src.tr.sgm', 'test/newstest2016-tren-ref.en.sgm'], }, 'wmt16/B': { 'data': ['http://data.statmt.org/wmt16/translation-task/test.tgz'], 'md5': ['3d809cd0c2c86adb2c67034d15c4e446'], 'description': 'Additional reference for EN-FI.', - 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstestB2016-enfi-ref.fi.sgm',], + 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstestB2016-enfi-ref.fi.sgm'], }, 'wmt16/tworefs': { 'data': ['http://data.statmt.org/wmt16/translation-task/test.tgz'], @@ -369,10 +369,10 @@ 'data': ['http://data.statmt.org/wmt16/translation-task/dev.tgz'], 'md5': ['4a3dc2760bb077f4308cce96b06e6af6'], 'description': 'Development sets released for new languages in 2016.', - 'en-ro': ['dev/newsdev2016-enro-src.en.sgm', 'dev/newsdev2016-enro-ref.ro.sgm',], - 'en-tr': ['dev/newsdev2016-entr-src.en.sgm', 'dev/newsdev2016-entr-ref.tr.sgm',], - 'ro-en': ['dev/newsdev2016-roen-src.ro.sgm', 'dev/newsdev2016-roen-ref.en.sgm',], - 'tr-en': ['dev/newsdev2016-tren-src.tr.sgm', 'dev/newsdev2016-tren-ref.en.sgm',], + 'en-ro': ['dev/newsdev2016-enro-src.en.sgm', 'dev/newsdev2016-enro-ref.ro.sgm'], + 'en-tr': ['dev/newsdev2016-entr-src.en.sgm', 'dev/newsdev2016-entr-ref.tr.sgm'], + 'ro-en': ['dev/newsdev2016-roen-src.ro.sgm', 'dev/newsdev2016-roen-ref.en.sgm'], + 'tr-en': ['dev/newsdev2016-tren-src.tr.sgm', 'dev/newsdev2016-tren-ref.en.sgm'], }, 'wmt15': { 'data': ['http://statmt.org/wmt15/test.tgz'], @@ -387,16 +387,16 @@ '\n month = {September},\n year = {2015},\n address = {Lisbon, Portugal},' '\n publisher = {Association for Computational Linguistics},\n pages = {1--46},\n url ' ' = {http://aclweb.org/anthology/W15-3001}\n}', - 'en-fr': ['test/newsdiscusstest2015-enfr-src.en.sgm', 'test/newsdiscusstest2015-enfr-ref.fr.sgm',], - 'fr-en': ['test/newsdiscusstest2015-fren-src.fr.sgm', 'test/newsdiscusstest2015-fren-ref.en.sgm',], - 'cs-en': ['test/newstest2015-csen-src.cs.sgm', 'test/newstest2015-csen-ref.en.sgm',], - 'de-en': ['test/newstest2015-deen-src.de.sgm', 'test/newstest2015-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2015-encs-src.en.sgm', 'test/newstest2015-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2015-ende-src.en.sgm', 'test/newstest2015-ende-ref.de.sgm',], - 'en-fi': ['test/newstest2015-enfi-src.en.sgm', 'test/newstest2015-enfi-ref.fi.sgm',], - 'en-ru': ['test/newstest2015-enru-src.en.sgm', 'test/newstest2015-enru-ref.ru.sgm',], - 
'fi-en': ['test/newstest2015-fien-src.fi.sgm', 'test/newstest2015-fien-ref.en.sgm',], - 'ru-en': ['test/newstest2015-ruen-src.ru.sgm', 'test/newstest2015-ruen-ref.en.sgm',], + 'en-fr': ['test/newsdiscusstest2015-enfr-src.en.sgm', 'test/newsdiscusstest2015-enfr-ref.fr.sgm'], + 'fr-en': ['test/newsdiscusstest2015-fren-src.fr.sgm', 'test/newsdiscusstest2015-fren-ref.en.sgm'], + 'cs-en': ['test/newstest2015-csen-src.cs.sgm', 'test/newstest2015-csen-ref.en.sgm'], + 'de-en': ['test/newstest2015-deen-src.de.sgm', 'test/newstest2015-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2015-encs-src.en.sgm', 'test/newstest2015-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2015-ende-src.en.sgm', 'test/newstest2015-ende-ref.de.sgm'], + 'en-fi': ['test/newstest2015-enfi-src.en.sgm', 'test/newstest2015-enfi-ref.fi.sgm'], + 'en-ru': ['test/newstest2015-enru-src.en.sgm', 'test/newstest2015-enru-ref.ru.sgm'], + 'fi-en': ['test/newstest2015-fien-src.fi.sgm', 'test/newstest2015-fien-ref.en.sgm'], + 'ru-en': ['test/newstest2015-ruen-src.ru.sgm', 'test/newstest2015-ruen-ref.en.sgm'], }, 'wmt14': { 'data': ['http://statmt.org/wmt14/test-filtered.tgz'], @@ -410,31 +410,31 @@ 'on Statistical Machine Translation},\n month = {June},\n year = {2014},\n address ' '= {Baltimore, Maryland, USA},\n publisher = {Association for Computational Linguistics},' '\n pages = {12--58},\n url = {http://www.aclweb.org/anthology/W/W14/W14-3302}\n}', - 'cs-en': ['test/newstest2014-csen-src.cs.sgm', 'test/newstest2014-csen-ref.en.sgm',], - 'en-cs': ['test/newstest2014-csen-src.en.sgm', 'test/newstest2014-csen-ref.cs.sgm',], - 'de-en': ['test/newstest2014-deen-src.de.sgm', 'test/newstest2014-deen-ref.en.sgm',], - 'en-de': ['test/newstest2014-deen-src.en.sgm', 'test/newstest2014-deen-ref.de.sgm',], - 'en-fr': ['test/newstest2014-fren-src.en.sgm', 'test/newstest2014-fren-ref.fr.sgm',], - 'fr-en': ['test/newstest2014-fren-src.fr.sgm', 'test/newstest2014-fren-ref.en.sgm',], - 'en-hi': ['test/newstest2014-hien-src.en.sgm', 'test/newstest2014-hien-ref.hi.sgm',], - 'hi-en': ['test/newstest2014-hien-src.hi.sgm', 'test/newstest2014-hien-ref.en.sgm',], - 'en-ru': ['test/newstest2014-ruen-src.en.sgm', 'test/newstest2014-ruen-ref.ru.sgm',], - 'ru-en': ['test/newstest2014-ruen-src.ru.sgm', 'test/newstest2014-ruen-ref.en.sgm',], + 'cs-en': ['test/newstest2014-csen-src.cs.sgm', 'test/newstest2014-csen-ref.en.sgm'], + 'en-cs': ['test/newstest2014-csen-src.en.sgm', 'test/newstest2014-csen-ref.cs.sgm'], + 'de-en': ['test/newstest2014-deen-src.de.sgm', 'test/newstest2014-deen-ref.en.sgm'], + 'en-de': ['test/newstest2014-deen-src.en.sgm', 'test/newstest2014-deen-ref.de.sgm'], + 'en-fr': ['test/newstest2014-fren-src.en.sgm', 'test/newstest2014-fren-ref.fr.sgm'], + 'fr-en': ['test/newstest2014-fren-src.fr.sgm', 'test/newstest2014-fren-ref.en.sgm'], + 'en-hi': ['test/newstest2014-hien-src.en.sgm', 'test/newstest2014-hien-ref.hi.sgm'], + 'hi-en': ['test/newstest2014-hien-src.hi.sgm', 'test/newstest2014-hien-ref.en.sgm'], + 'en-ru': ['test/newstest2014-ruen-src.en.sgm', 'test/newstest2014-ruen-ref.ru.sgm'], + 'ru-en': ['test/newstest2014-ruen-src.ru.sgm', 'test/newstest2014-ruen-ref.en.sgm'], }, 'wmt14/full': { 'data': ['http://statmt.org/wmt14/test-full.tgz'], 'md5': ['a8cd784e006feb32ac6f3d9ec7eb389a'], 'description': 'Evaluation data released after official evaluation for further research.', - 'cs-en': ['test-full/newstest2014-csen-src.cs.sgm', 'test-full/newstest2014-csen-ref.en.sgm',], - 'en-cs': ['test-full/newstest2014-csen-src.en.sgm', 
'test-full/newstest2014-csen-ref.cs.sgm',], - 'de-en': ['test-full/newstest2014-deen-src.de.sgm', 'test-full/newstest2014-deen-ref.en.sgm',], - 'en-de': ['test-full/newstest2014-deen-src.en.sgm', 'test-full/newstest2014-deen-ref.de.sgm',], - 'en-fr': ['test-full/newstest2014-fren-src.en.sgm', 'test-full/newstest2014-fren-ref.fr.sgm',], - 'fr-en': ['test-full/newstest2014-fren-src.fr.sgm', 'test-full/newstest2014-fren-ref.en.sgm',], - 'en-hi': ['test-full/newstest2014-hien-src.en.sgm', 'test-full/newstest2014-hien-ref.hi.sgm',], - 'hi-en': ['test-full/newstest2014-hien-src.hi.sgm', 'test-full/newstest2014-hien-ref.en.sgm',], - 'en-ru': ['test-full/newstest2014-ruen-src.en.sgm', 'test-full/newstest2014-ruen-ref.ru.sgm',], - 'ru-en': ['test-full/newstest2014-ruen-src.ru.sgm', 'test-full/newstest2014-ruen-ref.en.sgm',], + 'cs-en': ['test-full/newstest2014-csen-src.cs.sgm', 'test-full/newstest2014-csen-ref.en.sgm'], + 'en-cs': ['test-full/newstest2014-csen-src.en.sgm', 'test-full/newstest2014-csen-ref.cs.sgm'], + 'de-en': ['test-full/newstest2014-deen-src.de.sgm', 'test-full/newstest2014-deen-ref.en.sgm'], + 'en-de': ['test-full/newstest2014-deen-src.en.sgm', 'test-full/newstest2014-deen-ref.de.sgm'], + 'en-fr': ['test-full/newstest2014-fren-src.en.sgm', 'test-full/newstest2014-fren-ref.fr.sgm'], + 'fr-en': ['test-full/newstest2014-fren-src.fr.sgm', 'test-full/newstest2014-fren-ref.en.sgm'], + 'en-hi': ['test-full/newstest2014-hien-src.en.sgm', 'test-full/newstest2014-hien-ref.hi.sgm'], + 'hi-en': ['test-full/newstest2014-hien-src.hi.sgm', 'test-full/newstest2014-hien-ref.en.sgm'], + 'en-ru': ['test-full/newstest2014-ruen-src.en.sgm', 'test-full/newstest2014-ruen-ref.ru.sgm'], + 'ru-en': ['test-full/newstest2014-ruen-src.ru.sgm', 'test-full/newstest2014-ruen-ref.en.sgm'], }, 'wmt13': { 'data': ['http://statmt.org/wmt13/test.tgz'], @@ -448,16 +448,16 @@ '\n month = {August},\n year = {2013},\n address = {Sofia, Bulgaria},\n publisher ' '= {Association for Computational Linguistics},\n pages = {1--44},\n url = {' 'http://www.aclweb.org/anthology/W13-2201}\n}', - 'cs-en': ['test/newstest2013-src.cs.sgm', 'test/newstest2013-src.en.sgm',], - 'en-cs': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.cs.sgm',], - 'de-en': ['test/newstest2013-src.de.sgm', 'test/newstest2013-src.en.sgm',], - 'en-de': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.de.sgm',], - 'es-en': ['test/newstest2013-src.es.sgm', 'test/newstest2013-src.en.sgm',], - 'en-es': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.es.sgm',], - 'fr-en': ['test/newstest2013-src.fr.sgm', 'test/newstest2013-src.en.sgm',], - 'en-fr': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.fr.sgm',], - 'ru-en': ['test/newstest2013-src.ru.sgm', 'test/newstest2013-src.en.sgm',], - 'en-ru': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.ru.sgm',], + 'cs-en': ['test/newstest2013-src.cs.sgm', 'test/newstest2013-src.en.sgm'], + 'en-cs': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.cs.sgm'], + 'de-en': ['test/newstest2013-src.de.sgm', 'test/newstest2013-src.en.sgm'], + 'en-de': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.de.sgm'], + 'es-en': ['test/newstest2013-src.es.sgm', 'test/newstest2013-src.en.sgm'], + 'en-es': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.es.sgm'], + 'fr-en': ['test/newstest2013-src.fr.sgm', 'test/newstest2013-src.en.sgm'], + 'en-fr': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.fr.sgm'], + 'ru-en': ['test/newstest2013-src.ru.sgm', 
'test/newstest2013-src.en.sgm'], + 'en-ru': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.ru.sgm'], }, 'wmt12': { 'data': ['http://statmt.org/wmt12/test.tgz'], @@ -470,14 +470,14 @@ '\n month = {June},\n year = {2012},\n address = {Montr{\'e}al, Canada},' '\n publisher = {Association for Computational Linguistics},\n pages = {10--51},' '\n url = {http://www.aclweb.org/anthology/W12-3102}\n}', - 'cs-en': ['test/newstest2012-src.cs.sgm', 'test/newstest2012-src.en.sgm',], - 'en-cs': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.cs.sgm',], - 'de-en': ['test/newstest2012-src.de.sgm', 'test/newstest2012-src.en.sgm',], - 'en-de': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.de.sgm',], - 'es-en': ['test/newstest2012-src.es.sgm', 'test/newstest2012-src.en.sgm',], - 'en-es': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.es.sgm',], - 'fr-en': ['test/newstest2012-src.fr.sgm', 'test/newstest2012-src.en.sgm',], - 'en-fr': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.fr.sgm',], + 'cs-en': ['test/newstest2012-src.cs.sgm', 'test/newstest2012-src.en.sgm'], + 'en-cs': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.cs.sgm'], + 'de-en': ['test/newstest2012-src.de.sgm', 'test/newstest2012-src.en.sgm'], + 'en-de': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.de.sgm'], + 'es-en': ['test/newstest2012-src.es.sgm', 'test/newstest2012-src.en.sgm'], + 'en-es': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.es.sgm'], + 'fr-en': ['test/newstest2012-src.fr.sgm', 'test/newstest2012-src.en.sgm'], + 'en-fr': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.fr.sgm'], }, 'wmt11': { 'data': ['http://statmt.org/wmt11/test.tgz'], @@ -510,14 +510,14 @@ '\n address = {Uppsala, Sweden},\n publisher = {Association for Computational Linguistics},' '\n pages = {17--53},\n note = {Revised August 2010},\n url = {' 'http://www.aclweb.org/anthology/W10-1703}\n}', - 'cs-en': ['test/newstest2010-src.cz.sgm', 'test/newstest2010-src.en.sgm',], - 'en-cs': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.cz.sgm',], - 'de-en': ['test/newstest2010-src.de.sgm', 'test/newstest2010-src.en.sgm',], - 'en-de': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.de.sgm',], - 'es-en': ['test/newstest2010-src.es.sgm', 'test/newstest2010-src.en.sgm',], - 'en-es': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.es.sgm',], - 'fr-en': ['test/newstest2010-src.fr.sgm', 'test/newstest2010-src.en.sgm',], - 'en-fr': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.fr.sgm',], + 'cs-en': ['test/newstest2010-src.cz.sgm', 'test/newstest2010-src.en.sgm'], + 'en-cs': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.cz.sgm'], + 'de-en': ['test/newstest2010-src.de.sgm', 'test/newstest2010-src.en.sgm'], + 'en-de': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.de.sgm'], + 'es-en': ['test/newstest2010-src.es.sgm', 'test/newstest2010-src.en.sgm'], + 'en-es': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.es.sgm'], + 'fr-en': ['test/newstest2010-src.fr.sgm', 'test/newstest2010-src.en.sgm'], + 'en-fr': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.fr.sgm'], }, 'wmt09': { 'data': ['http://statmt.org/wmt09/test.tgz'], @@ -530,18 +530,18 @@ '2009},\n address = {Athens, Greece},\n publisher = {Association for Computational ' 'Linguistics},\n pages = {1--28},\n url = {' 'http://www.aclweb.org/anthology/W/W09/W09-0401}\n}', - 'cs-en': ['test/newstest2009-src.cz.sgm', 'test/newstest2009-src.en.sgm',], - 'en-cs': 
['test/newstest2009-src.en.sgm', 'test/newstest2009-src.cz.sgm',], - 'de-en': ['test/newstest2009-src.de.sgm', 'test/newstest2009-src.en.sgm',], - 'en-de': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.de.sgm',], - 'es-en': ['test/newstest2009-src.es.sgm', 'test/newstest2009-src.en.sgm',], - 'en-es': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.es.sgm',], - 'fr-en': ['test/newstest2009-src.fr.sgm', 'test/newstest2009-src.en.sgm',], - 'en-fr': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.fr.sgm',], - 'hu-en': ['test/newstest2009-src.hu.sgm', 'test/newstest2009-src.en.sgm',], - 'en-hu': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.hu.sgm',], - 'it-en': ['test/newstest2009-src.it.sgm', 'test/newstest2009-src.en.sgm',], - 'en-it': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.it.sgm',], + 'cs-en': ['test/newstest2009-src.cz.sgm', 'test/newstest2009-src.en.sgm'], + 'en-cs': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.cz.sgm'], + 'de-en': ['test/newstest2009-src.de.sgm', 'test/newstest2009-src.en.sgm'], + 'en-de': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.de.sgm'], + 'es-en': ['test/newstest2009-src.es.sgm', 'test/newstest2009-src.en.sgm'], + 'en-es': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.es.sgm'], + 'fr-en': ['test/newstest2009-src.fr.sgm', 'test/newstest2009-src.en.sgm'], + 'en-fr': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.fr.sgm'], + 'hu-en': ['test/newstest2009-src.hu.sgm', 'test/newstest2009-src.en.sgm'], + 'en-hu': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.hu.sgm'], + 'it-en': ['test/newstest2009-src.it.sgm', 'test/newstest2009-src.en.sgm'], + 'en-it': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.it.sgm'], }, 'wmt08': { 'data': ['http://statmt.org/wmt08/test.tgz'], @@ -553,23 +553,23 @@ 'Workshop on Statistical Machine Translation},\n month = {June},\n year = {2008},' '\n address = {Columbus, Ohio},\n publisher = {Association for Computational Linguistics},' '\n pages = {70--106},\n url = {http://www.aclweb.org/anthology/W/W08/W08-0309}\n}', - 'cs-en': ['test/newstest2008-src.cz.sgm', 'test/newstest2008-src.en.sgm',], - 'en-cs': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.cz.sgm',], - 'de-en': ['test/newstest2008-src.de.sgm', 'test/newstest2008-src.en.sgm',], - 'en-de': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.de.sgm',], - 'es-en': ['test/newstest2008-src.es.sgm', 'test/newstest2008-src.en.sgm',], - 'en-es': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.es.sgm',], - 'fr-en': ['test/newstest2008-src.fr.sgm', 'test/newstest2008-src.en.sgm',], - 'en-fr': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.fr.sgm',], - 'hu-en': ['test/newstest2008-src.hu.sgm', 'test/newstest2008-src.en.sgm',], - 'en-hu': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.hu.sgm',], + 'cs-en': ['test/newstest2008-src.cz.sgm', 'test/newstest2008-src.en.sgm'], + 'en-cs': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.cz.sgm'], + 'de-en': ['test/newstest2008-src.de.sgm', 'test/newstest2008-src.en.sgm'], + 'en-de': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.de.sgm'], + 'es-en': ['test/newstest2008-src.es.sgm', 'test/newstest2008-src.en.sgm'], + 'en-es': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.es.sgm'], + 'fr-en': ['test/newstest2008-src.fr.sgm', 'test/newstest2008-src.en.sgm'], + 'en-fr': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.fr.sgm'], + 'hu-en': ['test/newstest2008-src.hu.sgm', 
'test/newstest2008-src.en.sgm'], + 'en-hu': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.hu.sgm'], }, 'wmt08/nc': { 'data': ['http://statmt.org/wmt08/test.tgz'], 'md5': ['0582e4e894a3342044059c894e1aea3d'], 'description': 'Official evaluation data (news commentary).', - 'cs-en': ['test/nc-test2008-src.cz.sgm', 'test/nc-test2008-src.en.sgm',], - 'en-cs': ['test/nc-test2008-src.en.sgm', 'test/nc-test2008-src.cz.sgm',], + 'cs-en': ['test/nc-test2008-src.cz.sgm', 'test/nc-test2008-src.en.sgm'], + 'en-cs': ['test/nc-test2008-src.en.sgm', 'test/nc-test2008-src.cz.sgm'], }, 'wmt08/europarl': { 'data': ['http://statmt.org/wmt08/test.tgz'], @@ -618,12 +618,12 @@ '\n booktitle = {14th International Workshop on Spoken Language Translation},\n month = {' 'December},\n year = {2017},\n address = {Tokyo, Japan},\n pages = {2--14},' '\n url = {http://workshop2017.iwslt.org/downloads/iwslt2017_proceeding_v2.pdf}\n}', - 'en-fr': ['en-fr/IWSLT17.TED.tst2017.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2017.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2017.en-de.en.xml', 'de-en/IWSLT17.TED.tst2017.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2017.de-en.de.xml', 'en-de/IWSLT17.TED.tst2017.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2017.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2017.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2017.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2017.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2017.en-de.en.xml', 'de-en/IWSLT17.TED.tst2017.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2017.de-en.de.xml', 'en-de/IWSLT17.TED.tst2017.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2017.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2017.en-zh.en.xml'], }, 'iwslt17/tst2016': { 'data': [ @@ -643,12 +643,12 @@ "cc51d9b7fe1ff2af858c6a0dd80b8815", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2016.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2016.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2016.en-de.en.xml', 'de-en/IWSLT17.TED.tst2016.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2016.de-en.de.xml', 'en-de/IWSLT17.TED.tst2016.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2016.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2016.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2016.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2016.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2016.en-de.en.xml', 'de-en/IWSLT17.TED.tst2016.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2016.de-en.de.xml', 'en-de/IWSLT17.TED.tst2016.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2016.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2016.en-zh.en.xml'], }, 'iwslt17/tst2015': { 'data': [ @@ -668,12 +668,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 
2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2015.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2015.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2015.en-de.en.xml', 'de-en/IWSLT17.TED.tst2015.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2015.de-en.de.xml', 'en-de/IWSLT17.TED.tst2015.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2015.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2015.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2015.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2015.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2015.en-de.en.xml', 'de-en/IWSLT17.TED.tst2015.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2015.de-en.de.xml', 'en-de/IWSLT17.TED.tst2015.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2015.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2015.en-zh.en.xml'], }, 'iwslt17/tst2014': { 'data': [ @@ -693,12 +693,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2014.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2014.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2014.en-de.en.xml', 'de-en/IWSLT17.TED.tst2014.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2014.de-en.de.xml', 'en-de/IWSLT17.TED.tst2014.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2014.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2014.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2014.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2014.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2014.en-de.en.xml', 'de-en/IWSLT17.TED.tst2014.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2014.de-en.de.xml', 'en-de/IWSLT17.TED.tst2014.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2014.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2014.en-zh.en.xml'], }, 'iwslt17/tst2013': { 'data': [ @@ -718,12 +718,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2013.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2013.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2013.en-de.en.xml', 'de-en/IWSLT17.TED.tst2013.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2013.de-en.de.xml', 'en-de/IWSLT17.TED.tst2013.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2013.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2013.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2013.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2013.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2013.en-de.en.xml', 'de-en/IWSLT17.TED.tst2013.de-en.de.xml'], + 'de-en': 
['de-en/IWSLT17.TED.tst2013.de-en.de.xml', 'en-de/IWSLT17.TED.tst2013.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2013.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2013.en-zh.en.xml'], }, 'iwslt17/tst2012': { 'data': [ @@ -743,12 +743,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2012.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2012.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2012.en-de.en.xml', 'de-en/IWSLT17.TED.tst2012.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2012.de-en.de.xml', 'en-de/IWSLT17.TED.tst2012.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2012.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2012.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2012.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2012.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2012.en-de.en.xml', 'de-en/IWSLT17.TED.tst2012.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2012.de-en.de.xml', 'en-de/IWSLT17.TED.tst2012.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2012.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2012.en-zh.en.xml'], }, 'iwslt17/tst2011': { 'data': [ @@ -768,12 +768,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2011.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2011.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2011.en-de.en.xml', 'de-en/IWSLT17.TED.tst2011.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2011.de-en.de.xml', 'en-de/IWSLT17.TED.tst2011.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2011.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2011.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2011.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2011.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2011.en-de.en.xml', 'de-en/IWSLT17.TED.tst2011.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2011.de-en.de.xml', 'en-de/IWSLT17.TED.tst2011.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2011.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2011.en-zh.en.xml'], }, 'iwslt17/tst2010': { 'data': [ @@ -793,12 +793,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2010.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2010.en-de.en.xml', 'de-en/IWSLT17.TED.tst2010.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2010.de-en.de.xml', 'en-de/IWSLT17.TED.tst2010.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml',], - 
'zh-en': ['zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2010.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2010.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2010.en-de.en.xml', 'de-en/IWSLT17.TED.tst2010.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2010.de-en.de.xml', 'en-de/IWSLT17.TED.tst2010.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2010.en-zh.en.xml'], }, 'iwslt17/dev2010': { 'data': [ @@ -818,12 +818,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.dev2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.dev2010.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.dev2010.en-de.en.xml', 'de-en/IWSLT17.TED.dev2010.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.dev2010.de-en.de.xml', 'en-de/IWSLT17.TED.dev2010.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.dev2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.dev2010.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.dev2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.dev2010.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.dev2010.en-de.en.xml', 'de-en/IWSLT17.TED.dev2010.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.dev2010.de-en.de.xml', 'en-de/IWSLT17.TED.dev2010.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.dev2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.dev2010.en-zh.en.xml'], }, } @@ -1087,15 +1087,7 @@ def bleu_signature(args, numrefs): """ # Abbreviations for the signature - abbr = { - 'test': 't', - 'lang': 'l', - 'smooth': 's', - 'case': 'c', - 'tok': 'tok', - 'numrefs': '#', - 'version': 'v', - } + abbr = {'test': 't', 'lang': 'l', 'smooth': 's', 'case': 'c', 'tok': 'tok', 'numrefs': '#', 'version': 'v'} signature = { 'tok': args.tokenize, @@ -1124,15 +1116,7 @@ def chrf_signature(args, numrefs): """ # Abbreviations for the signature - abbr = { - 'test': 't', - 'lang': 'l', - 'numchars': 'n', - 'space': 's', - 'case': 'c', - 'numrefs': '#', - 'version': 'v', - } + abbr = {'test': 't', 'lang': 'l', 'numchars': 'n', 'space': 's', 'case': 'c', 'numrefs': '#', 'version': 'v'} signature = { 'tok': args.tokenize, @@ -1225,24 +1209,20 @@ def process_to_text(rawfile, txtfile, field: int = None): with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: if line.startswith('(.*).*?', '\\1', line)), file=fout, - ) + logging.info(_clean(re.sub(r'(.*).*?', '\\1', line)), file=fout) elif rawfile.endswith('.xml'): # IWSLT with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: if line.startswith('(.*).*?', '\\1', line)), file=fout, - ) + logging.info(_clean(re.sub(r'(.*).*?', '\\1', line)), file=fout) elif rawfile.endswith('.txt'): # wmt17/ms with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: - print(line.rstrip(), file=fout) + logging.info(line.rstrip(), file=fout) elif rawfile.endswith('.tsv'): # MTNT with smart_open(rawfile) 
as fin, smart_open(txtfile, 'wt') as fout: for line in fin: - print(line.rstrip().split('\t')[field], file=fout) + logging.info(line.rstrip().split('\t')[field], file=fout) def print_test_set(test_set, langpair, side): @@ -1260,7 +1240,7 @@ def print_test_set(test_set, langpair, side): streams = [smart_open(file) for file in files] for lines in zip(*streams): - print('\t'.join(map(lambda x: x.rstrip(), lines))) + logging.info('\t'.join(map(lambda x: x.rstrip(), lines))) def download_test_set(test_set, langpair=None): @@ -1586,7 +1566,7 @@ def delete_whitespace(text: str) -> str: def get_sentence_statistics( - hypothesis: str, reference: str, order: int = CHRF_ORDER, remove_whitespace: bool = True, + hypothesis: str, reference: str, order: int = CHRF_ORDER, remove_whitespace: bool = True ) -> List[float]: hypothesis = delete_whitespace(hypothesis) if remove_whitespace else hypothesis reference = delete_whitespace(reference) if remove_whitespace else reference @@ -1603,11 +1583,11 @@ def get_sentence_statistics( def get_corpus_statistics( - hypotheses: Iterable[str], references: Iterable[str], order: int = CHRF_ORDER, remove_whitespace: bool = True, + hypotheses: Iterable[str], references: Iterable[str], order: int = CHRF_ORDER, remove_whitespace: bool = True ) -> List[float]: corpus_statistics = [0] * (order * 3) for hypothesis, reference in zip(hypotheses, references): - statistics = get_sentence_statistics(hypothesis, reference, order=order, remove_whitespace=remove_whitespace,) + statistics = get_sentence_statistics(hypothesis, reference, order=order, remove_whitespace=remove_whitespace) for i in range(len(statistics)): corpus_statistics[i] += statistics[i] return corpus_statistics @@ -1656,15 +1636,13 @@ def corpus_chrf( :param beta: Defines importance of recall w.r.t precision. If beta=1, same importance. :return: Chrf score. """ - corpus_statistics = get_corpus_statistics( - hypotheses, references, order=order, remove_whitespace=remove_whitespace, - ) + corpus_statistics = get_corpus_statistics(hypotheses, references, order=order, remove_whitespace=remove_whitespace) avg_precision, avg_recall = _avg_precision_and_recall(corpus_statistics, order) return _chrf(avg_precision, avg_recall, beta=beta) def sentence_chrf( - hypothesis: str, reference: str, order: int = CHRF_ORDER, beta: float = CHRF_BETA, remove_whitespace: bool = True, + hypothesis: str, reference: str, order: int = CHRF_ORDER, beta: float = CHRF_BETA, remove_whitespace: bool = True ) -> float: """ Computes ChrF on a single sentence pair. @@ -1688,10 +1666,10 @@ def main(): ' cat output.detok.de | ./sacreBLEU -t wmt14 -l en-de' ) arg_parser.add_argument( - '--test-set', '-t', type=str, default=None, choices=DATASETS.keys(), help='the test set to use', + '--test-set', '-t', type=str, default=None, choices=DATASETS.keys(), help='the test set to use' ) arg_parser.add_argument( - '-lc', action='store_true', default=False, help='use case-insensitive BLEU (default: actual case)', + '-lc', action='store_true', default=False, help='use case-insensitive BLEU (default: actual case)' ) arg_parser.add_argument( '--smooth', @@ -1709,7 +1687,7 @@ def main(): help='The value to pass to the smoothing technique, when relevant. ' 'Default: %(default)s. 
', ) arg_parser.add_argument( - '--tokenize', '-tok', choices=TOKENIZERS.keys(), default=None, help='tokenization method to use', + '--tokenize', '-tok', choices=TOKENIZERS.keys(), default=None, help='tokenization method to use' ) arg_parser.add_argument( '--language-pair', @@ -1718,9 +1696,7 @@ def main(): default=None, help='source-target language pair (2-char ISO639-1 codes)', ) - arg_parser.add_argument( - '--download', type=str, default=None, help='download a test set and quit', - ) + arg_parser.add_argument('--download', type=str, default=None, help='download a test set and quit') arg_parser.add_argument( '--echo', choices=['src', 'ref', 'both'], @@ -1728,9 +1704,7 @@ def main(): default=None, help='output the source (src), reference (ref), or both (both, ' 'pasted) to STDOUT and quit ', ) - arg_parser.add_argument( - '--input', '-i', type=str, default='-', help='Read input from a file instead of STDIN', - ) + arg_parser.add_argument('--input', '-i', type=str, default='-', help='Read input from a file instead of STDIN') arg_parser.add_argument( 'refs', nargs='*', @@ -1746,10 +1720,10 @@ def main(): help='metrics to compute (default: bleu)', ) arg_parser.add_argument( - '--chrf-order', type=int, default=CHRF_ORDER, help='chrf character order (default: %(default)s)', + '--chrf-order', type=int, default=CHRF_ORDER, help='chrf character order (default: %(default)s)' ) arg_parser.add_argument( - '--chrf-beta', type=int, default=CHRF_BETA, help='chrf BETA parameter (default: %(default)s)', + '--chrf-beta', type=int, default=CHRF_BETA, help='chrf BETA parameter (default: %(default)s)' ) arg_parser.add_argument( '--chrf-whitespace', @@ -1758,17 +1732,15 @@ def main(): help='include whitespace in chrF calculation (default: %(default)s)', ) arg_parser.add_argument( - '--short', default=False, action='store_true', help='produce a shorter (less human readable) signature', + '--short', default=False, action='store_true', help='produce a shorter (less human readable) signature' ) arg_parser.add_argument( - '--score-only', '-b', default=False, action='store_true', help='output only the BLEU score', + '--score-only', '-b', default=False, action='store_true', help='output only the BLEU score' ) arg_parser.add_argument( - '--force', default=False, action='store_true', help='insist that your tokenized input is actually detokenized', - ) - arg_parser.add_argument( - '--quiet', '-q', default=False, action='store_true', help='suppress informative output', + '--force', default=False, action='store_true', help='insist that your tokenized input is actually detokenized' ) + arg_parser.add_argument('--quiet', '-q', default=False, action='store_true', help='suppress informative output') arg_parser.add_argument( '--encoding', '-e', @@ -1777,18 +1749,14 @@ def main(): help='open text files with specified encoding (default: %(default)s)', ) arg_parser.add_argument( - '--citation', '--cite', default=False, action='store_true', help='dump the bibtex citation and quit.', - ) - arg_parser.add_argument( - '--width', '-w', type=int, default=1, help='floating point width (default: %(default)s)', - ) - arg_parser.add_argument( - '-V', '--version', action='version', version='%(prog)s {}'.format(VERSION), + '--citation', '--cite', default=False, action='store_true', help='dump the bibtex citation and quit.' 
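The chrF helpers whose signatures are reformatted above (`get_sentence_statistics`, `get_corpus_statistics`, `corpus_chrf`, `sentence_chrf`) all reduce to an F-beta combination of averaged character n-gram precision and recall. The sketch below shows only that combination; the helper name is made up for the example, it is not the module's `_chrf` function, and beta = 3.0 is just the conventional chrF default (the module reads its value from a `CHRF_BETA` constant that is not shown in this hunk).

```python
# Stand-alone sketch of the F-beta combination behind chrF; not the module's
# exact `_chrf` helper.  beta=3.0 is assumed here as the conventional default.
def chrf_from_precision_recall(precision: float, recall: float, beta: float = 3.0) -> float:
    """Combine averaged character n-gram precision and recall into an F-beta score."""
    if precision == 0.0 and recall == 0.0:
        return 0.0
    beta_sq = beta ** 2
    return (1.0 + beta_sq) * precision * recall / (beta_sq * precision + recall)


# Hypothetical averaged statistics; real values come from get_corpus_statistics
# over hypothesis/reference pairs.
print(round(chrf_from_precision_recall(0.62, 0.58), 4))
```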
) + arg_parser.add_argument('--width', '-w', type=int, default=1, help='floating point width (default: %(default)s)') + arg_parser.add_argument('-V', '--version', action='version', version='%(prog)s {}'.format(VERSION)) args = arg_parser.parse_args() # Explicitly set the encoding - sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True, newline="\n",) + sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True, newline="\n") sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True) if not args.quiet: @@ -1806,7 +1774,7 @@ def main(): logging.error('No citation found for %s', args.test_set) sys.exit(1) - print(DATASETS[args.test_set]['citation']) + logging.info(DATASETS[args.test_set]['citation']) sys.exit(0) if args.test_set is not None and args.test_set not in DATASETS: @@ -1871,7 +1839,7 @@ def main(): if args.test_set: _, *refs = download_test_set(args.test_set, args.langpair) if len(refs) == 0: - print('No references found for test set {}/{}.'.format(args.test_set, args.langpair)) + logging.info('No references found for test set {}/{}.'.format(args.test_set, args.langpair)) sys.exit(1) else: refs = args.refs @@ -1899,11 +1867,7 @@ def main(): ) if 'chrf' in args.metrics: chrf = corpus_chrf( - system, - refs[0], - beta=args.chrf_beta, - order=args.chrf_order, - remove_whitespace=not args.chrf_whitespace, + system, refs[0], beta=args.chrf_beta, order=args.chrf_order, remove_whitespace=not args.chrf_whitespace ) except EOFError: logging.error('The input and reference stream(s) were of different lengths.\n') @@ -1927,17 +1891,17 @@ def main(): for metric in args.metrics: if metric == 'bleu': if args.score_only: - print('{0:.{1}f}'.format(bleu.score, width)) + logging.info('{0:.{1}f}'.format(bleu.score, width)) else: version_str = bleu_signature(args, len(refs)) - print(bleu.format(width).replace('BLEU', 'BLEU+' + version_str)) + logging.info(bleu.format(width).replace('BLEU', 'BLEU+' + version_str)) elif metric == 'chrf': if args.score_only: - print('{0:.{1}f}'.format(chrf, width)) + logging.info('{0:.{1}f}'.format(chrf, width)) else: version_str = chrf_signature(args, len(refs)) - print('chrF{0:d}+{1} = {2:.{3}f}'.format(args.chrf_beta, version_str, chrf, width)) + logging.info('chrF{0:d}+{1} = {2:.{3}f}'.format(args.chrf_beta, version_str, chrf, width)) if __name__ == '__main__': diff --git a/nemo/collections/nlp/utils/metrics/squad_metrics.py b/nemo/collections/nlp/metrics/squad_metrics.py similarity index 85% rename from nemo/collections/nlp/utils/metrics/squad_metrics.py rename to nemo/collections/nlp/metrics/squad_metrics.py index 13eb29de1931..e5f0af1e2517 100644 --- a/nemo/collections/nlp/utils/metrics/squad_metrics.py +++ b/nemo/collections/nlp/metrics/squad_metrics.py @@ -15,13 +15,27 @@ See the License for the specific language governing permissions and limitations under the License. 
""" + import collections -import math -import re -import string from transformers.tokenization_bert import BasicTokenizer +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import get_tokens, normalize_answer + +__all__ = [ + 'f1_score', + 'exact_match_score', + 'apply_no_ans_threshold', + 'make_eval_dict', + 'merge_eval', + 'find_all_best_thresh', + 'find_best_thresh', + 'normalize_answer', + '_get_best_indexes', + 'get_final_text', +] + def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" @@ -35,74 +49,6 @@ def _get_best_indexes(logits, n_best_size): return best_indexes -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -def get_tokens(s): - if not s: - return [] - return normalize_answer(s).split() - - -def f1_score(prediction, ground_truth): - prediction_tokens = get_tokens(prediction) - ground_truth_tokens = get_tokens(ground_truth) - common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens) - num_same = sum(common.values()) - if len(ground_truth_tokens) == 0 or len(prediction_tokens) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(ground_truth_tokens == prediction_tokens) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(prediction_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - -def exact_match_score(prediction, ground_truth): - return int(normalize_answer(prediction) == normalize_answer(ground_truth)) - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): """Project the tokenized prediction back to the original text.""" @@ -154,7 +100,7 @@ def _strip_spaces(text): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logging.warning("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -163,7 +109,7 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - print( + logging.warning( "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, ) return orig_text @@ -182,7 +128,7 @@ def _strip_spaces(text): if orig_start_position is None: if verbose_logging: - print("Couldn't map start position") + logging.warning("Couldn't map start position") return orig_text orig_end_position = None @@ -193,13 +139,33 @@ def _strip_spaces(text): if orig_end_position is None: if verbose_logging: - print("Couldn't map end position") + 
logging.warning("Couldn't map end position") return orig_text output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text +def f1_score(prediction, ground_truth): + prediction_tokens = get_tokens(prediction) + ground_truth_tokens = get_tokens(ground_truth) + common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens) + num_same = sum(common.values()) + if len(ground_truth_tokens) == 0 or len(prediction_tokens) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(ground_truth_tokens == prediction_tokens) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return int(normalize_answer(prediction) == normalize_answer(ground_truth)) + + def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): new_scores = {} for qid, s in scores.items(): @@ -225,7 +191,7 @@ def make_eval_dict(exact_scores, f1_scores, qid_list=None): total = len(qid_list) return collections.OrderedDict( [ - ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total,), + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), ("total", total), ] diff --git a/nemo/collections/nlp/modules/__init__.py b/nemo/collections/nlp/modules/__init__.py deleted file mode 100644 index 97328a8b6cbf..000000000000 --- a/nemo/collections/nlp/modules/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .classifiers import * -from .losses import * -from .transformer_nm import * diff --git a/nemo/collections/nlp/modules/classifiers.py b/nemo/collections/nlp/modules/classifiers.py deleted file mode 100644 index 0d6259cdd31a..000000000000 --- a/nemo/collections/nlp/modules/classifiers.py +++ /dev/null @@ -1,363 +0,0 @@ -__all__ = [ - 'TokenClassifier', - 'BertTokenClassifier', - 'SequenceClassifier', - 'JointIntentSlotClassifier', - 'SequenceRegression', -] - -import torch.nn as nn - -from ..transformer.utils import transformer_weights_init -from nemo.backends.pytorch.common import MultiLayerPerceptron -from nemo.backends.pytorch.nm import LossNM, TrainableNM -from nemo.collections.nlp.transformer.utils import gelu -from nemo.core.neural_types import * - -ACT2FN = {"gelu": gelu, "relu": nn.functional.relu} - - -class BertTokenClassifier(TrainableNM): - """ - Neural module which consists of MLP followed by softmax classifier for each - token in the sequence. - - Args: - hidden_size (int): hidden size (d_model) of the Transformer - num_classes (int): number of classes in softmax classifier, e.g. size - of the vocabulary in language modeling objective - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - log_softmax (bool): whether to apply log_softmax to MLP output - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - def __init__( - self, - hidden_size, - num_classes, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True, - ): - super().__init__() - if activation not in ACT2FN: - raise ValueError(f'activation "{activation}" not found') - self.dense = nn.Linear(hidden_size, hidden_size) - self.act = ACT2FN[activation] - self.norm = nn.LayerNorm(hidden_size, eps=1e-12) - self.mlp = MultiLayerPerceptron( - hidden_size, num_classes, self._device, num_layers=1, activation=activation, log_softmax=log_softmax, - ) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - self.to(self._device) - - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = self.act(hidden_states) - transform = self.norm(hidden_states) - logits = self.mlp(transform) - return logits - - -class TokenClassifier(TrainableNM): - """ - Neural module which consists of MLP followed by softmax classifier for each - token in the sequence. - - Args: - hidden_size (int): hidden size (d_model) of the Transformer - num_classes (int): number of classes in softmax classifier, e.g. size - of the vocabulary in language modeling objective - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - log_softmax (bool): whether to apply log_softmax to MLP output - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - def __init__( - self, - hidden_size, - num_classes, - name=None, - num_layers=2, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True, - ): - super().__init__() - - self.name = name - self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax,) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) # sometimes this is necessary - - def __str__(self): - name = TrainableNM.__str__(self) - - if self.name: - name = self.name + name - return name - - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - logits = self.mlp(hidden_states) - return logits - - -class SequenceClassifier(TrainableNM): - """ - Neural module which consists of MLP followed by softmax classifier for each - sequence in the batch. - - Args: - hidden_size (int): hidden size (d_model) of the Transformer - num_classes (int): number of classes in softmax classifier, e.g. 
number - of different sentiments - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - log_softmax (bool): whether to apply log_softmax to MLP output - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - """ - return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} - - def __init__( - self, - hidden_size, - num_classes, - num_layers=2, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True, - ): - super().__init__() - self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax,) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) # sometimes this is necessary - - def forward(self, hidden_states, idx_conditioned_on=0): - hidden_states = self.dropout(hidden_states) - logits = self.mlp(hidden_states[:, idx_conditioned_on]) - return logits - - -class JointIntentSlotClassifier(TrainableNM): - """ - The softmax classifier for the joint intent classification and slot - filling task which consists of a dense layer + relu + softmax for - predicting the slots and similar for predicting the intents. - - Args: - hidden_size (int): the size of the hidden state for the dense layer - num_intents (int): number of intents - num_slots (int): number of slots - dropout (float): dropout to be applied to the layer - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - intent_logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - - slot_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return { - "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - } - - def __init__(self, hidden_size, num_intents, num_slots, dropout=0.0, use_transformer_pretrained=True): - super().__init__() - self.dropout = nn.Dropout(dropout) - self.slot_mlp = MultiLayerPerceptron( - hidden_size, - num_classes=num_slots, - device=self._device, - num_layers=2, - activation='relu', - log_softmax=False, - ) - self.intent_mlp = MultiLayerPerceptron( - hidden_size, - num_classes=num_intents, - device=self._device, - num_layers=2, - activation='relu', - log_softmax=False, - ) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) - - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - intent_logits = self.intent_mlp(hidden_states[:, 0]) - slot_logits = self.slot_mlp(hidden_states) - return intent_logits, slot_logits - - -class SequenceRegression(TrainableNM): - """ - Neural module which consists of MLP, generates a single number prediction - that could be used for a regression task. An example of this task would be - semantic textual similatity task, for example, STS-B (from GLUE tasks). - - Args: - hidden_size (int): the size of the hidden state for the dense layer - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. 
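The `JointIntentSlotClassifier` removed above feeds the hidden state of the first token through one MLP for intent prediction and every token's hidden state through a second MLP for slot prediction. Below is a minimal PyTorch sketch of that pattern, with single linear layers standing in for NeMo's `MultiLayerPerceptron`; the class name, sizes, and inputs are illustrative only.

```python
import torch
import torch.nn as nn


class TinyJointIntentSlotHead(nn.Module):
    """Illustrative head: intent logits from the first-token position, slot logits per token."""

    def __init__(self, hidden_size: int, num_intents: int, num_slots: int, dropout: float = 0.0):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.intent_proj = nn.Linear(hidden_size, num_intents)  # stands in for the intent MLP
        self.slot_proj = nn.Linear(hidden_size, num_slots)      # stands in for the slot MLP

    def forward(self, hidden_states: torch.Tensor):
        # hidden_states: [batch, time, hidden]
        hidden_states = self.dropout(hidden_states)
        intent_logits = self.intent_proj(hidden_states[:, 0])   # first token only
        slot_logits = self.slot_proj(hidden_states)             # every token
        return intent_logits, slot_logits


head = TinyJointIntentSlotHead(hidden_size=768, num_intents=7, num_slots=12)
intents, slots = head(torch.randn(4, 16, 768))
print(intents.shape, slots.shape)  # torch.Size([4, 7]) torch.Size([4, 16, 12])
```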
- - preds: - 0: AxisType(RegressionTag) - """ - return { - "preds": NeuralType({0: AxisType(RegressionTag)}), - } - - def __init__( - self, hidden_size, num_layers=2, activation='relu', dropout=0.0, use_transformer_pretrained=True, - ): - super().__init__() - self.mlp = MultiLayerPerceptron( - hidden_size, - num_classes=1, - device=self._device, - num_layers=num_layers, - activation=activation, - log_softmax=False, - ) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) # sometimes this is necessary - - def forward(self, hidden_states, idx_conditioned_on=0): - hidden_states = self.dropout(hidden_states) - preds = self.mlp(hidden_states[:, idx_conditioned_on]) - return preds.view(-1) diff --git a/nemo/collections/nlp/modules/losses.py b/nemo/collections/nlp/modules/losses.py deleted file mode 100644 index 34912f609fa4..000000000000 --- a/nemo/collections/nlp/modules/losses.py +++ /dev/null @@ -1,422 +0,0 @@ -import torch -from torch import nn - -from ..utils.nlp_utils import mask_padded_tokens -from .pytorch_utils import SmoothedCrossEntropyLoss -from nemo.backends.pytorch.nm import LossNM -from nemo.core.neural_types import * - -__all__ = [ - 'JointIntentSlotLoss', - 'LossAggregatorNM', - 'MaskedLanguageModelingLossNM', - 'PaddedSmoothedCrossEntropyLossNM', - 'QuestionAnsweringLoss', - 'TokenClassificationLoss', -] - - -class QuestionAnsweringLoss(LossNM): - """ - Neural module which implements QuestionAnswering loss. - Args: - logits: Output of question answering head, which is a token classfier. - start_positions: Ground truth start positions of the answer w.r.t. - input sequence. If question is unanswerable, this will be - pointing to start token, e.g. [CLS], of the input sequence. - end_positions: Ground truth end positions of the answer w.r.t. - input sequence. If question is unanswerable, this will be - pointing to start token, e.g. [CLS], of the input sequence. - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - start_positions: - 0: AxisType(BatchTag) - - end_positions: - 0: AxisType(BatchTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "start_positions": NeuralType({0: AxisType(BatchTag)}), - "end_positions": NeuralType({0: AxisType(BatchTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. 
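`QuestionAnsweringLoss`, whose removal begins above, scores answer spans by splitting the token classifier's two output channels into start and end logits and averaging two cross-entropy terms, with out-of-range target positions clamped to an ignored index. The hypothetical `qa_span_loss` function below sketches that computation on random inputs.

```python
import torch
import torch.nn as nn


def qa_span_loss(logits: torch.Tensor, start_positions: torch.Tensor, end_positions: torch.Tensor):
    """Illustrative span loss: split per-token logits into start/end scores and
    average two cross-entropy terms, clamping out-of-range targets so they are ignored."""
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)   # [batch, time]
    end_logits = end_logits.squeeze(-1)
    ignored_index = start_logits.size(1)
    loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
    start_loss = loss_fct(start_logits, start_positions.clamp(0, ignored_index))
    end_loss = loss_fct(end_logits, end_positions.clamp(0, ignored_index))
    return (start_loss + end_loss) / 2


# Toy batch: 2 sequences of length 8, with 2 logits (start/end) per token.
logits = torch.randn(2, 8, 2)
print(qa_span_loss(logits, torch.tensor([1, 3]), torch.tensor([2, 5])))
```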
- - loss: - NeuralType(None) - - start_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - end_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "loss": NeuralType(None), - "start_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "end_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__(self): - super().__init__() - - def _loss_function(self, **kwargs): - logits = kwargs['logits'] - start_positions = kwargs['start_positions'] - end_positions = kwargs['end_positions'] - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return total_loss, start_logits, end_logits - - -class MaskedLanguageModelingLossNM(LossNM): - """ - Neural module which implements Masked Language Modeling (MLM) loss. - - Args: - label_smoothing (float): label smoothing regularization coefficient - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - output_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, label_smoothing=0.0): - super().__init__() - self._criterion = SmoothedCrossEntropyLoss(label_smoothing) - - def _loss_function(self, logits, output_ids, output_mask): - loss = self._criterion(logits, output_ids, output_mask) - return loss - - -class LossAggregatorNM(LossNM): - """ - Neural module which combines sums several losses into one. - - Args: - num_inputs (int): number of input losses - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - """ - input_ports = {} - for i in range(self.num_losses): - input_ports["loss_" + str(i + 1)] = NeuralType(None) - - return input_ports - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, num_inputs=2): - super().__init__() - # Store number of inputs/losses. - self.num_losses = num_inputs - - def _loss_function(self, **kwargs): - values = [kwargs[x] for x in sorted(kwargs.keys())] - loss = values[0] - for loss_i in values[1:]: - loss = loss.add(loss_i) - return loss - - -class TokenClassificationLoss(LossNM): - """ - Neural module which implements Token Classification loss. - - Args: - num_classes (int): number of classes in a classifier, e.g. 
size - of the vocabulary in language modeling objective - logits (float): output of the classifier - labels (long): ground truth labels - loss_mask (long): to differentiate from original tokens and paddings - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, num_classes, class_weights=None): - super().__init__() - if class_weights: - class_weights = torch.FloatTensor(class_weights).to(self._device) - - self._criterion = nn.CrossEntropyLoss(weight=class_weights) - self.num_classes = num_classes - - def _loss_function(self, logits, labels, loss_mask): - active_loss = loss_mask.view(-1) > 0.5 - active_logits = logits.view(-1, self.num_classes)[active_loss] - active_labels = labels.view(-1)[active_loss] - - loss = self._criterion(active_logits, active_labels) - return loss - - -class JointIntentSlotLoss(LossNM): - """ - Loss function for the joint intent classification and slot - filling task. - - The loss is a joint loss of both tasks, aim to maximize: - p(y^i | x)P(y^s1, y^s2, ..., y^sn | x) - - with y^i being the predicted intent and y^s1, y^s2, ..., y^sn - are the predicted slots corresponding to x1, x2, ..., xn. - - Args: - hidden_states: output of the hidden layers - intents: ground truth intents, - slots: ground truth slots. - input_mask: to differentiate from original tokens and paddings - intent_loss_weight: the loss is the sum of: - intent_loss_weight * intent_loss + - (1 - intent_loss_weight) * slot_loss - - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - intent_logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - - slot_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - intents: - 0: AxisType(BatchTag) - - slots: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "intents": NeuralType({0: AxisType(BatchTag),}), - "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. 
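`TokenClassificationLoss` above uses `loss_mask` to keep padding (and, optionally, special tokens) out of the cross-entropy, so only real token positions contribute to the gradient. The hypothetical function below is a self-contained sketch of that masking step.

```python
import torch
import torch.nn as nn


def masked_token_classification_loss(logits, labels, loss_mask, num_classes):
    """Cross-entropy over positions where loss_mask > 0.5, mirroring the masking above."""
    active = loss_mask.view(-1) > 0.5                      # flatten to [batch * time]
    active_logits = logits.view(-1, num_classes)[active]   # keep only real tokens
    active_labels = labels.view(-1)[active]
    return nn.CrossEntropyLoss()(active_logits, active_labels)


# Toy batch: 2 sequences of length 4, 5 classes, last position of each sequence padded.
logits = torch.randn(2, 4, 5)
labels = torch.randint(0, 5, (2, 4))
loss_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 1, 0]])
print(masked_token_classification_loss(logits, labels, loss_mask, num_classes=5))
```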
- - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__( - self, num_slots, slot_classes_loss_weights=None, intent_classes_loss_weights=None, intent_loss_weight=0.6 - ): - super().__init__() - self.num_slots = num_slots - self.intent_loss_weight = intent_loss_weight - self.slot_classes_loss_weights = slot_classes_loss_weights - self.intent_classes_loss_weights = intent_classes_loss_weights - - # For weighted loss to tackle class imbalance - if slot_classes_loss_weights: - self.slot_classes_loss_weights = torch.FloatTensor(slot_classes_loss_weights).to(self._device) - - if intent_classes_loss_weights: - self.intent_classes_loss_weights = torch.FloatTensor(intent_classes_loss_weights).to(self._device) - - self._criterion_intent = nn.CrossEntropyLoss(weight=self.intent_classes_loss_weights) - self._criterion_slot = nn.CrossEntropyLoss(weight=self.slot_classes_loss_weights) - - def _loss_function(self, intent_logits, slot_logits, loss_mask, intents, slots): - intent_loss = self._criterion_intent(intent_logits, intents) - - active_loss = loss_mask.view(-1) > 0.5 - active_logits = slot_logits.view(-1, self.num_slots)[active_loss] - active_labels = slots.view(-1)[active_loss] - - # To support empty active_labels - if len(active_labels) == 0: - slot_loss = 0.0 - else: - slot_loss = self._criterion_slot(active_logits, active_labels) - loss = intent_loss * self.intent_loss_weight + slot_loss * (1 - self.intent_loss_weight) - - return loss - - -class PaddedSmoothedCrossEntropyLossNM(LossNM): - """ - Neural module which calculates CrossEntropyLoss and - 1) excludes padding tokens from loss calculation - 2) allows to use label smoothing regularization - 3) allows to calculate loss for the desired number of last tokens - - Args: - label_smoothing (float): label smoothing regularization coefficient - predict_last_k (int): how many last tokens to use for the loss - calculation, important for fast evaluation of LM perplexity - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - target_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "target_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, pad_id, label_smoothing=0, predict_last_k=0): - super().__init__() - - # Create the loss function object. - loss_params = {"label_smoothing": label_smoothing, "predict_last_k": predict_last_k} - self._loss_fn = SmoothedCrossEntropyLoss(**loss_params) - # Store padding. - self._pad_id = pad_id - - def _loss_function(self, logits, target_ids): - target_mask = mask_padded_tokens(target_ids, self._pad_id).to(logits.dtype) - loss = self._loss_fn(logits, target_ids, target_mask) - return loss diff --git a/nemo/collections/nlp/nm/__init__.py b/nemo/collections/nlp/nm/__init__.py new file mode 100644 index 000000000000..88ccabb8a58a --- /dev/null +++ b/nemo/collections/nlp/nm/__init__.py @@ -0,0 +1,19 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. 
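`JointIntentSlotLoss` above blends its two cross-entropy terms as `intent_loss_weight * intent_loss + (1 - intent_loss_weight) * slot_loss`, with the slot term computed only over unpadded positions (and set to 0.0 when no slot labels survive the mask). A small numeric sketch of just the weighting, using made-up loss values and a hypothetical helper name:

```python
def joint_intent_slot_loss(intent_loss: float, slot_loss: float, intent_loss_weight: float = 0.6) -> float:
    """Weighted sum used by the joint objective: w * intent + (1 - w) * slot."""
    return intent_loss_weight * intent_loss + (1.0 - intent_loss_weight) * slot_loss


# Illustrative values only; in the module both terms are cross-entropy losses,
# and the slot term is masked as in the token-classification sketch earlier.
print(joint_intent_slot_loss(0.9, 1.5))  # 0.6 * 0.9 + 0.4 * 1.5 = 1.14
```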
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import nemo.collections.nlp.nm.data_layers +import nemo.collections.nlp.nm.losses +import nemo.collections.nlp.nm.trainables diff --git a/nemo/collections/nlp/nm/data_layers/__init__.py b/nemo/collections/nlp/nm/data_layers/__init__.py new file mode 100644 index 000000000000..1b35d9adc25a --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/__init__.py @@ -0,0 +1,27 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.data_layers.glue_benchmark_datalayer import * +from nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer import * +from nemo.collections.nlp.nm.data_layers.lm_bert_datalayer import * +from nemo.collections.nlp.nm.data_layers.lm_transformer_datalayer import * +from nemo.collections.nlp.nm.data_layers.machine_translation_datalayer import * +from nemo.collections.nlp.nm.data_layers.punctuation_capitalization_datalayer import * +from nemo.collections.nlp.nm.data_layers.qa_squad_datalayer import * +from nemo.collections.nlp.nm.data_layers.state_tracking_trade_datalayer import * +from nemo.collections.nlp.nm.data_layers.text_classification_datalayer import * +from nemo.collections.nlp.nm.data_layers.text_datalayer import * +from nemo.collections.nlp.nm.data_layers.token_classification_datalayer import * diff --git a/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py new file mode 100644 index 000000000000..baf55f55c047 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py @@ -0,0 +1,152 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import GLUEDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, CategoricalTag, NeuralType, RegressionTag, TimeTag + +__all__ = ['GlueClassificationDataLayer', 'GlueRegressionDataLayer'] + + +class GlueClassificationDataLayer(TextDataLayer): + """ + Creates the data layer to use for the GLUE classification tasks, + more details here: https://gluebenchmark.com/tasks + + All the data processing is done in GLUEDataset. + + Args: + dataset_type (GLUEDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: + 0: AxisType(CategoricalTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(CategoricalTag)}), + } + + def __init__( + self, + data_dir, + tokenizer, + max_seq_length, + processor, + evaluate=False, + token_params={}, + shuffle=False, + batch_size=64, + dataset_type=GLUEDataset, + ): + dataset_params = { + 'data_dir': data_dir, + 'output_mode': 'classification', + 'processor': processor, + 'evaluate': evaluate, + 'token_params': token_params, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) + + +class GlueRegressionDataLayer(TextDataLayer): + """ + Creates the data layer to use for the GLUE STS-B regression task, + more details here: https://gluebenchmark.com/tasks + + All the data processing is done in GLUEDataset. + + Args: + dataset_type (GLUEDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: + 0: AxisType(RegressionTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(RegressionTag)}), + } + + def __init__( + self, + data_dir, + tokenizer, + max_seq_length, + processor, + evaluate=False, + token_params={}, + shuffle=False, + batch_size=64, + dataset_type=GLUEDataset, + ): + dataset_params = { + 'data_dir': data_dir, + 'output_mode': 'regression', + 'processor': processor, + 'evaluate': evaluate, + 'token_params': token_params, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } + + super().__init__(dataset_type, dataset_params, batch_size, shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py new file mode 100644 index 000000000000..354be6b32a5f --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py @@ -0,0 +1,177 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import BertJointIntentSlotDataset, BertJointIntentSlotInferDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertJointIntentSlotDataLayer', 'BertJointIntentSlotInferDataLayer'] + + +class BertJointIntentSlotDataLayer(TextDataLayer): + """ + Creates the data layer to use for the task of joint intent + and slot classification with pretrained model. + + All the data processing is done in BertJointIntentSlotDataset. + + input_mask: used to ignore some of the input tokens like paddings + + loss_mask: used to mask and ignore tokens in the loss function + + subtokens_mask: used to ignore the outputs of unwanted tokens in + the inference and evaluation like the start and end tokens + + Args: + dataset (BertJointIntentSlotDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. 
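The new GLUE data layers above follow one construction pattern: each subclass packs its arguments into a `dataset_params` dict and hands it, together with a `dataset_type`, to the shared `TextDataLayer` base class. The toy classes below illustrate that delegation in isolation, assuming the base class simply instantiates `dataset_type(**dataset_params)`; the actual `TextDataLayer` is only imported, not shown, in this diff, and all names here are made up.

```python
# Toy re-implementation of the delegation pattern, for illustration only.
class ToyTextDataLayer:
    def __init__(self, dataset_type, dataset_params, batch_size, shuffle):
        self._dataset = dataset_type(**dataset_params)  # build the dataset from packed params
        self._batch_size = batch_size
        self._shuffle = shuffle


class ToyDataset:
    def __init__(self, input_file, max_seq_length):
        self.input_file = input_file
        self.max_seq_length = max_seq_length


class ToyQueryDataLayer(ToyTextDataLayer):
    def __init__(self, input_file, max_seq_length, batch_size=64, shuffle=False):
        dataset_params = {'input_file': input_file, 'max_seq_length': max_seq_length}
        super().__init__(ToyDataset, dataset_params, batch_size, shuffle)


layer = ToyQueryDataLayer('train.tsv', max_seq_length=50)
print(layer._dataset.input_file)  # train.tsv
```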
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + intents: + 0: AxisType(BatchTag) + + slots: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "intents": NeuralType({0: AxisType(BatchTag)}), + "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, + input_file, + slot_file, + pad_label, + tokenizer, + max_seq_length, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + dataset_type=BertJointIntentSlotDataset, + ): + dataset_params = { + 'input_file': input_file, + 'slot_file': slot_file, + 'pad_label': pad_label, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + 'num_samples': num_samples, + 'shuffle': shuffle, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) + + +class BertJointIntentSlotInferDataLayer(TextDataLayer): + """ + Creates the data layer to use for the task of joint intent + and slot classification with pretrained model. This is for + + All the data processing is done in BertJointIntentSlotInferDataset. + + input_mask: used to ignore some of the input tokens like paddings + + loss_mask: used to mask and ignore tokens in the loss function + + subtokens_mask: used to ignore the outputs of unwanted tokens in + the inference and evaluation like the start and end tokens + + Args: + dataset (BertJointIntentSlotInferDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__(self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertJointIntentSlotInferDataset): + dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length} + super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) diff --git a/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py new file mode 100644 index 000000000000..7034c7c18c38 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py @@ -0,0 +1,225 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import os +import random + +import h5py +import numpy as np +import torch +from torch.utils import data as pt_data + +from nemo.backends.pytorch import DataLayerNM +from nemo.collections.nlp.data import BertPretrainingDataset, BertPretrainingPreprocessedDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertPretrainingDataLayer', 'BertPretrainingPreprocessedDataLayer'] + + +class BertPretrainingDataLayer(TextDataLayer): + """ + Data layer for masked language modeling task. + + Args: + tokenizer (TokenizerSpec): tokenizer + dataset (str): directory or a single file with dataset documents + max_seq_length (int): maximum allowed length of the text segments + mask_probability (float): probability of masking input sequence tokens + batch_size (int): batch size in segments + short_seeq_prob (float): Probability of creating sequences which are + shorter than the maximum length. + Defualts to 0.1. + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: indices of tokens which constitute batches of text segments + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: indices of token types (e.g., sentences A & B in BERT) + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: bool tensor with 0s in place of tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_ids: indices of output tokens which should be predicted + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_mask: bool tensor with 0s in place of tokens to be excluded + from loss calculation + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: indices of classes to be predicted from [CLS] token of text + segments (e.g, 0 or 1 in next sentence prediction task) + 0: AxisType(BatchTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag)}), + } + + def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64): + dataset_params = { + 'tokenizer': tokenizer, + 'dataset': dataset, + 'max_seq_length': max_seq_length, + 'mask_probability': mask_probability, + 'short_seq_prob': short_seq_prob, + } + super().__init__(BertPretrainingDataset, dataset_params, batch_size, shuffle=False) + + +class BertPretrainingPreprocessedDataLayer(DataLayerNM): + """ + Data layer for masked language modeling task. + + Args: + tokenizer (TokenizerSpec): tokenizer + dataset (str): directory or a single file with dataset documents + max_seq_length (int): maximum allowed length of the text segments + mask_probability (float): probability of masking input sequence tokens + batch_size (int): batch size in segments + short_seeq_prob (float): Probability of creating sequences which are + shorter than the maximum length. + Defualts to 0.1. + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: indices of tokens which constitute batches of text segments + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: indices of token types (e.g., sentences A & B in BERT) + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: bool tensor with 0s in place of tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_ids: indices of output tokens which should be predicted + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_mask: bool tensor with 0s in place of tokens to be excluded + from loss calculation + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: indices of classes to be predicted from [CLS] token of text + segments (e.g, 0 or 1 in next sentence prediction task) + 0: AxisType(BatchTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag)}), + } + + def __init__(self, dataset, max_pred_length, batch_size=64, training=True): + + if os.path.isdir(dataset): + self.files = [ + os.path.join(dataset, f) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f)) + ] + else: + self.files = [dataset] + self.files.sort() + self.num_files = len(self.files) + self._batch_size = batch_size + self.max_pred_length = max_pred_length + self.training = training + total_length = 0 + for f in self.files: + fp = h5py.File(f, 'r') + total_length += len(fp['input_ids']) + fp.close() + self.total_length = total_length + super().__init__() + + def _collate_fn(self, x): + num_components = len(x[0]) + components = [[] for _ in range(num_components)] + batch_size = len(x) + for i in range(batch_size): + for j in range(num_components): + components[j].append(x[i][j]) + src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids = [np.stack(x, axis=0) for x in components] + src_ids = torch.Tensor(src_ids).long().to(self._device) + src_segment_ids = torch.Tensor(src_segment_ids).long().to(self._device) + src_mask = torch.Tensor(src_mask).long().to(self._device) + tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) + tgt_mask = torch.Tensor(tgt_mask).long().to(self._device) + sent_ids = torch.Tensor(sent_ids).long().to(self._device) + return src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids + + def __len__(self): + return self.total_length + + @property + def dataset(self): + return None + + @property + def data_iterator(self): + while True: + if self.training: + random.shuffle(self.files) + for f_id in range(self.num_files): + data_file = self.files[f_id] + train_data = BertPretrainingPreprocessedDataset( + input_file=data_file, max_pred_length=self.max_pred_length + ) + train_sampler = pt_data.RandomSampler(train_data) + train_dataloader = pt_data.DataLoader( + dataset=train_data, + batch_size=self._batch_size, + collate_fn=self._collate_fn, + shuffle=False, + sampler=train_sampler, + ) + for x in train_dataloader: + yield x diff --git a/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py new file mode 100644 index 000000000000..64e79ffea9f1 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py @@ -0,0 +1,72 @@ 
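As a usage note for the two pretraining data layers above, here is a hedged sketch of the on-the-fly (text file) variant; the corpus path is hypothetical and the tokenizer is assumed to be any `TokenizerSpec` implementation built beforehand.

```python
import nemo.collections.nlp as nemo_nlp

train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(
    tokenizer=tokenizer,             # assumed: e.g. a BERT wordpiece tokenizer
    dataset='data/lm/train.txt',     # hypothetical path to a plain-text corpus
    max_seq_length=128,
    mask_probability=0.15,
    short_seq_prob=0.1,
    batch_size=64,
)

# Port order matches output_ports above.
(input_ids, input_type_ids, input_mask,
 output_ids, output_mask, labels) = train_data_layer()
```

The preprocessed variant (`BertPretrainingPreprocessedDataLayer`) takes a directory of HDF5 shards instead of raw text and shuffles the shard list each epoch when `training=True`.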
+# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import LanguageModelingDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['LanguageModelingDataLayer'] + + +class LanguageModelingDataLayer(TextDataLayer): + """ + Data layer for standard language modeling task. + + Args: + dataset (str): path to text document with data + tokenizer (TokenizerSpec): tokenizer + max_seq_length (int): maximum allowed length of the text segments + batch_step (int): how many tokens to skip between two successive + segments of text when constructing batches + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: indices of tokens which constitute batches of text segments + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: bool tensor with 0s in place of tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: indices of tokens which should be predicted from each of the + corresponding tokens in input_ids; for left-to-right language + modeling equals to input_ids shifted by 1 to the right + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, dataset, tokenizer, max_seq_length, batch_size, batch_step=128, dataset_type=LanguageModelingDataset + ): + dataset_params = { + 'dataset': dataset, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + 'batch_step': batch_step, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) diff --git a/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py new file mode 100644 index 000000000000..23aa1c54e913 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py @@ -0,0 +1,137 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
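A short sketch of wiring the `LanguageModelingDataLayer` defined above; the corpus path is a placeholder and the tokenizer is assumed to exist already.

```python
import nemo.collections.nlp as nemo_nlp

lm_data_layer = nemo_nlp.nm.data_layers.LanguageModelingDataLayer(
    dataset='data/lm/wikitext-2/train.txt',  # hypothetical plain-text corpus path
    tokenizer=tokenizer,                     # assumed: a BPE or wordpiece tokenizer
    max_seq_length=256,
    batch_size=32,
    batch_step=128,     # tokens to skip between two successive text segments
)

input_ids, input_mask, labels = lm_data_layer()
```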
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import torch +from torch.utils import data as pt_data + +import nemo +from nemo.collections.nlp.data import TranslationDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['TranslationDataLayer'] + + +class TranslationDataLayer(TextDataLayer): + """ + Data layer for neural machine translation from source (src) language to + target (tgt) language. + + Args: + tokenizer_src (TokenizerSpec): source language tokenizer + tokenizer_tgt (TokenizerSpec): target language tokenizer + dataset_src (str): path to source data + dataset_tgt (str): path to target data + tokens_in_batch (int): maximum allowed number of tokens in batches, + batches will be constructed to minimize the use of tokens + clean (bool): whether to use parallel data cleaning such as removing + pairs with big difference in sentences length, removing pairs with + the same tokens in src and tgt, etc; useful for training data layer + and should not be used in evaluation data layer + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + src_ids: indices of tokens which correspond to source sentences + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + src_mask: bool tensor with 0s in place of source tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + tgt_ids: indices of tokens which correspond to target sentences + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + tgt_mask: bool tensor with 0s in place of target tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: indices of tokens which should be predicted from each of the + corresponding target tokens in tgt_ids; for standard neural + machine translation equals to tgt_ids shifted by 1 to the right + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + sent_ids: indices of the sentences in a batch; important for + evaluation with external metrics, such as SacreBLEU + 0: AxisType(BatchTag) + + """ + return { + "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "src_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "tgt_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "sent_ids": NeuralType({0: AxisType(BatchTag)}), + } + + def __init__( + self, + tokenizer_src, + tokenizer_tgt, + dataset_src, + dataset_tgt, + tokens_in_batch=1024, + shuffle=False, + clean=False, + dataset_type=TranslationDataset, + ): + dataset_params = { + 'tokenizer_src': tokenizer_src, + 'tokenizer_tgt': tokenizer_tgt, + 'dataset_src': dataset_src, + 'dataset_tgt': dataset_tgt, + 'tokens_in_batch': tokens_in_batch, + 'clean': clean, + } + super().__init__(dataset_type, dataset_params, batch_size=1, shuffle=shuffle) + + if self._placement == nemo.core.DeviceType.AllGpu: + sampler = pt_data.distributed.DistributedSampler(self._dataset) + else: + sampler = None + + self._dataloader = pt_data.DataLoader( + dataset=self._dataset, batch_size=1, collate_fn=self._collate_fn, shuffle=sampler is None, sampler=sampler + ) + + def _collate_fn(self, x): + src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids = x[0] + src_ids = 
torch.Tensor(src_ids).long().to(self._device) + src_mask = torch.Tensor(src_mask).float().to(self._device) + tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) + tgt_mask = torch.Tensor(tgt_mask).float().to(self._device) + labels = torch.Tensor(labels).long().to(self._device) + sent_ids = torch.Tensor(sent_ids).long().to(self._device) + return src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids + + @property + def dataset(self): + return None + + @property + def data_iterator(self): + return self._dataloader diff --git a/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py new file mode 100644 index 000000000000..41b952827043 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py @@ -0,0 +1,106 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import BertPunctuationCapitalizationDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['PunctuationCapitalizationDataLayer'] + + +class PunctuationCapitalizationDataLayer(TextDataLayer): + @property + def output_ports(self): + """Returns definitions of module output ports. 
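A hedged sketch of constructing the translation data layer above for training; file names, tokenizers, and the token budget are placeholders rather than values from this PR.

```python
import nemo.collections.nlp as nemo_nlp

train_data_layer = nemo_nlp.nm.data_layers.TranslationDataLayer(
    tokenizer_src=src_tokenizer,   # assumed: source-language tokenizer (e.g. BPE)
    tokenizer_tgt=tgt_tokenizer,   # assumed: target-language tokenizer
    dataset_src='train.clean.en',  # hypothetical parallel corpus files
    dataset_tgt='train.clean.de',
    tokens_in_batch=8192,
    clean=True,                    # per the docstring, cleaning is for training data only
)

src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids = train_data_layer()
```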
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + punct_labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + capit_labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "punct_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "capit_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, + text_file, + label_file, + tokenizer, + max_seq_length, + pad_label='O', + punct_label_ids=None, + capit_label_ids=None, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + dataset_type=BertPunctuationCapitalizationDataset, + ): + dataset_params = { + 'text_file': text_file, + 'label_file': label_file, + 'max_seq_length': max_seq_length, + 'tokenizer': tokenizer, + 'num_samples': num_samples, + 'shuffle': shuffle, + 'pad_label': pad_label, + 'punct_label_ids': punct_label_ids, + 'capit_label_ids': capit_label_ids, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + 'use_cache': use_cache, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py new file mode 100644 index 000000000000..56d912a35a6d --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py @@ -0,0 +1,108 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import SquadDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertQuestionAnsweringDataLayer'] + + +class BertQuestionAnsweringDataLayer(TextDataLayer): + """ + Creates the data layer to use for Question Answering classification task. + + Args: + data_dir (str): Directory that contains train.*.json and dev.*.json. + tokenizer (obj): Tokenizer object, e.g. NemoBertTokenizer. + version_2_with_negative (bool): True if training should allow + unanswerable questions. 
+ doc_stride (int): When splitting up a long document into chunks, + how much stride to take between chunks. + max_query_length (int): maximum number of tokens allowed in the question; + longer questions are truncated to this length. + max_seq_length (int): maximum total number of tokens in the input sequence + (question plus document chunk) after tokenization; longer sequences are + truncated and shorter ones are padded. + mode (str): Use "train" or "dev" to switch between + training and evaluation. + batch_size (int): Batch size. Defaults to 64. + dataset_type (class): Question Answering class. + Defaults to SquadDataset. + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + start_positions: + 0: AxisType(BatchTag) + + end_positions: + 0: AxisType(BatchTag) + + unique_ids: + 0: AxisType(BatchTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "start_positions": NeuralType({0: AxisType(BatchTag)}), + "end_positions": NeuralType({0: AxisType(BatchTag)}), + "unique_ids": NeuralType({0: AxisType(BatchTag)}), + } + + def __init__( + self, + data_dir, + tokenizer, + version_2_with_negative, + doc_stride, + max_query_length, + max_seq_length, + mode="train", + batch_size=64, + dataset_type=SquadDataset, + ): + dataset_params = { + 'data_dir': data_dir, + 'mode': mode, + 'tokenizer': tokenizer, + 'version_2_with_negative': version_2_with_negative, + 'max_query_length': max_query_length, + 'max_seq_length': max_seq_length, + 'doc_stride': doc_stride, + } + + super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) diff --git a/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py b/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py new file mode 100644 index 000000000000..decfc035c25b --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/state_tracking_trade_datalayer.py @@ -0,0 +1,210 @@ +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2019 Salesforce Research.
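A hedged sketch of the SQuAD data layer defined above; the data directory is hypothetical, and the tokenizer, sequence lengths, and batch size are placeholder values.

```python
import nemo.collections.nlp as nemo_nlp

squad_data_layer = nemo_nlp.nm.data_layers.BertQuestionAnsweringDataLayer(
    data_dir='data/squad/v2.0',   # hypothetical directory containing train/dev JSON
    tokenizer=tokenizer,          # assumed: NemoBertTokenizer
    version_2_with_negative=True,
    doc_stride=128,
    max_query_length=64,
    max_seq_length=384,
    mode='train',
    batch_size=12,
)

(input_ids, input_type_ids, input_mask,
 start_positions, end_positions, unique_ids) = squad_data_layer()
```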
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom +# the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# ============================================================================= + +import numpy as np +import torch +from torch.utils import data as pt_data + +import nemo +from nemo.collections.nlp.data.datasets import MultiWOZDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core.neural_types import * + +__all__ = ['MultiWOZDataLayer'] + + +class MultiWOZDataLayer(TextDataLayer): + @property + def output_ports(self): + """Returns definitions of module output ports. + + src_ids: ids of input sequences + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + src_lens: lengths of input sequences + 0: AxisType(BatchTag) + + tgt_ids: labels for the generator output + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + 2: AxisType(TimeTag) + + tgt_lens: lengths of the generator targets + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + gating_labels: labels for the gating head + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + turn_domain: list of the domains + NeuralType(None) + + """ + return { + "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "src_lens": NeuralType({0: AxisType(BatchTag)}), + "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), + "tgt_lens": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "gating_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "turn_domain": NeuralType(None), + } + + def __init__( + self, + data_dir, + domains, + all_domains, + vocab, + slots, + gating_dict, + num_samples=-1, + batch_size=16, + mode='train', + dataset_type=MultiWOZDataset, + shuffle=False, + num_workers=0, + input_dropout=0, + is_training=False, + ): + + dataset_params = { + 'data_dir': data_dir, + 'domains': domains, + 'num_samples': num_samples, + 'mode': mode, + 'shuffle': shuffle, + 'all_domains': all_domains, + 'vocab': vocab, + 'slots': slots, + 'gating_dict': gating_dict, + } + super().__init__(dataset_type, dataset_params, batch_size=batch_size) + + if self._placement == nemo.core.DeviceType.AllGpu: + sampler = pt_data.distributed.DistributedSampler(self._dataset) + else: + sampler = None + + self._dataloader = pt_data.DataLoader( + dataset=self._dataset, + batch_size=batch_size, + shuffle=sampler is None, + num_workers=num_workers, + collate_fn=self._collate_fn, + sampler=sampler, + ) + self.pad_id = self._dataset.vocab.pad_id + self.gating_dict = 
self._dataset.gating_dict + self.input_dropout = input_dropout + self.is_training = is_training + self.vocab = self._dataset.vocab + self.slots = self._dataset.slots + + def _collate_fn(self, data): + """ data is a list of batch_size sample + each sample is a dictionary of features + """ + + def pad_batch_context(sequences): + ''' + merge from batch * sent_len to batch * max_len + ''' + lengths = [len(seq) for seq in sequences] + max_len = 1 if max(lengths) == 0 else max(lengths) + for i, seq in enumerate(sequences): + sequences[i] = seq + [1] * (max_len - len(seq)) + return torch.tensor(sequences), torch.tensor(lengths) + + def pad_batch_response(sequences, pad_id): + ''' + merge from batch * nb_slot * slot_len to batch * nb_slot * max_slot_len + ''' + lengths = [] + for bsz_seq in sequences: + length = [len(v) for v in bsz_seq] + lengths.append(length) + max_len = max([max(l) for l in lengths]) + padded_seqs = [] + for bsz_seq in sequences: + pad_seq = [] + for v in bsz_seq: + v = v + [pad_id] * (max_len - len(v)) + pad_seq.append(v) + padded_seqs.append(pad_seq) + padded_seqs = torch.tensor(padded_seqs) + lengths = torch.tensor(lengths) + return padded_seqs, lengths + + data.sort(key=lambda x: len(x['context_ids']), reverse=True) + item_info = {} + for key in data[0]: + item_info[key] = [item[key] for item in data] + + src_ids, src_lens = pad_batch_context(item_info['context_ids']) + tgt_ids, tgt_lens = pad_batch_response(item_info['responses_ids'], self._dataset.vocab.pad_id) + gating_label = torch.tensor(item_info['gating_label']) + turn_domain = torch.tensor(item_info['turn_domain']) + + if self.input_dropout > 0 and self.is_training: + bi_mask = np.random.binomial([np.ones(src_ids.size())], 1.0 - self.input_dropout)[0] + rand_mask = torch.Tensor(bi_mask).long().to(src_ids.device) + src_ids = src_ids * rand_mask + + return ( + src_ids.to(self._device), + src_lens.to(self._device), + tgt_ids.to(self._device), + tgt_lens.to(self._device), + gating_label.to(self._device), + turn_domain.to(self._device), + ) + + @property + def dataset(self): + return None + + @property + def data_iterator(self): + return self._dataloader diff --git a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py new file mode 100644 index 000000000000..738144586dd5 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py @@ -0,0 +1,83 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
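To make the padding behaviour inside `_collate_fn` concrete, here is a small self-contained re-implementation of `pad_batch_context` with toy inputs. It is an illustration of the helper above, not code from this PR; the `pad_id=1` default mirrors the literal padding value used there.

```python
import torch

def pad_batch_context(sequences, pad_id=1):
    """Right-pad every dialogue context to the longest one in the batch."""
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths) if max(lengths) > 0 else 1
    padded = [seq + [pad_id] * (max_len - len(seq)) for seq in sequences]
    return torch.tensor(padded), torch.tensor(lengths)

src_ids, src_lens = pad_batch_context([[5, 6, 7], [8, 9]])
print(src_ids)   # tensor([[5, 6, 7], [8, 9, 1]])
print(src_lens)  # tensor([3, 2])
```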
+# ============================================================================= + +from nemo.collections.nlp.data import BertTextClassificationDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertSentenceClassificationDataLayer'] + + +class BertSentenceClassificationDataLayer(TextDataLayer): + """ + Creates the data layer to use for the task of sentence classification + with pretrained model. + + All the data processing is done BertSentenceClassificationDataset. + + Args: + dataset (BertTextClassificationDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: + 0: AxisType(BatchTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag)}), + } + + def __init__( + self, + input_file, + tokenizer, + max_seq_length, + num_samples=-1, + shuffle=False, + batch_size=64, + dataset_type=BertTextClassificationDataset, + ): + dataset_params = { + 'input_file': input_file, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + 'num_samples': num_samples, + 'shuffle': shuffle, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/text_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_datalayer.py new file mode 100644 index 000000000000..a2f2cccf2a64 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/text_datalayer.py @@ -0,0 +1,47 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +from nemo.backends.pytorch import DataLayerNM +from nemo.collections.nlp.data.datasets import * + +__all__ = ['TextDataLayer'] + + +class TextDataLayer(DataLayerNM): + """ + Generic Text Data Layer NM which wraps PyTorch's dataset + + Args: + dataset_type: type of dataset used for this datalayer + dataset_params (dict): all the params for the dataset + """ + + def __init__(self, dataset_type, dataset_params, batch_size, shuffle=False): + super().__init__() + self._dataset = dataset_type(**dataset_params) + self._batch_size = batch_size + self._shuffle = shuffle + + def __len__(self): + return len(self._dataset) + + @property + def dataset(self): + return self._dataset + + @property + def data_iterator(self): + return None diff --git a/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py new file mode 100644 index 000000000000..b4e0d6ecc51a --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py @@ -0,0 +1,143 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import BertTokenClassificationDataset, BertTokenClassificationInferDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertTokenClassificationDataLayer', 'BertTokenClassificationInferDataLayer'] + + +class BertTokenClassificationDataLayer(TextDataLayer): + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, + text_file, + label_file, + tokenizer, + max_seq_length, + pad_label='O', + label_ids=None, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + dataset_type=BertTokenClassificationDataset, + ): + dataset_params = { + 'text_file': text_file, + 'label_file': label_file, + 'max_seq_length': max_seq_length, + 'tokenizer': tokenizer, + 'num_samples': num_samples, + 'shuffle': shuffle, + 'pad_label': pad_label, + 'label_ids': label_ids, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + 'use_cache': use_cache, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) + + +class BertTokenClassificationInferDataLayer(TextDataLayer): + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset, + ): + dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length} + super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) diff --git a/nemo/collections/nlp/nm/losses/__init__.py b/nemo/collections/nlp/nm/losses/__init__.py new file mode 100644 index 000000000000..20333eb42715 --- /dev/null +++ b/nemo/collections/nlp/nm/losses/__init__.py @@ -0,0 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
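A hedged usage sketch for the token classification (e.g. NER) data layer defined above; the file names are hypothetical and the tokenizer is assumed to be constructed elsewhere.

```python
import nemo.collections.nlp as nemo_nlp

ner_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(
    text_file='train_text.txt',     # hypothetical: one whitespace-tokenized sentence per line
    label_file='train_labels.txt',  # hypothetical: aligned per-token labels
    tokenizer=tokenizer,            # assumed: NemoBertTokenizer
    max_seq_length=128,
    batch_size=32,
    shuffle=True,
)

(input_ids, input_type_ids, input_mask,
 loss_mask, subtokens_mask, labels) = ner_data_layer()
```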
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.losses.aggregator_loss import * +from nemo.collections.nlp.nm.losses.joint_intent_slot_loss import * +from nemo.collections.nlp.nm.losses.masked_language_modeling_loss import * +from nemo.collections.nlp.nm.losses.padded_smoothed_cross_entropy_loss import * +from nemo.collections.nlp.nm.losses.qa_squad_loss import * +from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import * +from nemo.collections.nlp.nm.losses.state_tracking_trade_loss import * +from nemo.collections.nlp.nm.losses.token_classification_loss import * diff --git a/nemo/collections/nlp/nm/losses/aggregator_loss.py b/nemo/collections/nlp/nm/losses/aggregator_loss.py new file mode 100644 index 000000000000..7a66c3cb85f1 --- /dev/null +++ b/nemo/collections/nlp/nm/losses/aggregator_loss.py @@ -0,0 +1,61 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.backends.pytorch import LossNM +from nemo.core import NeuralType + +__all__ = ['LossAggregatorNM'] + + +class LossAggregatorNM(LossNM): + """ + Neural module which combines sums several losses into one. + + Args: + num_inputs (int): number of input losses + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + """ + input_ports = {} + for i in range(self.num_losses): + input_ports["loss_" + str(i + 1)] = NeuralType(None) + + return input_ports + + @property + def output_ports(self): + """Returns definitions of module output ports. + + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__(self, num_inputs=2): + # Store number of inputs/losses. + self.num_losses = num_inputs + LossNM.__init__(self) + + def _loss_function(self, **kwargs): + values = [kwargs[x] for x in sorted(kwargs.keys())] + loss = values[0] + for loss_i in values[1:]: + loss = loss.add(loss_i) + return loss diff --git a/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py b/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py new file mode 100644 index 000000000000..3ba4d631f1da --- /dev/null +++ b/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py @@ -0,0 +1,128 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. 
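A short sketch of combining two upstream losses with the aggregator defined above. The input port names follow the loop in `input_ports` (`loss_1`, `loss_2`, ...); the two loss tensors are assumed to be NmTensors produced by other loss modules (for example an MLM loss and an NSP loss).

```python
import nemo.collections.nlp as nemo_nlp

agg_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)

# mlm_loss_tensor / nsp_loss_tensor: assumed outputs of other loss modules.
total_loss = agg_loss(loss_1=mlm_loss_tensor, loss_2=nsp_loss_tensor)
```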
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import torch +from torch import nn + +from nemo.backends.pytorch import LossNM +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['JointIntentSlotLoss'] + + +class JointIntentSlotLoss(LossNM): + """ + Loss function for the joint intent classification and slot + filling task. + + The loss is a joint loss of both tasks, aim to maximize: + p(y^i | x)P(y^s1, y^s2, ..., y^sn | x) + + with y^i being the predicted intent and y^s1, y^s2, ..., y^sn + are the predicted slots corresponding to x1, x2, ..., xn. + + Args: + hidden_states: output of the hidden layers + intents: ground truth intents, + slots: ground truth slots. + input_mask: to differentiate from original tokens and paddings + intent_loss_weight: the loss is the sum of: + intent_loss_weight * intent_loss + + (1 - intent_loss_weight) * slot_loss + + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + intent_logits: + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + slot_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + intents: + 0: AxisType(BatchTag) + + slots: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + """ + return { + "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "intents": NeuralType({0: AxisType(BatchTag)}), + "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__( + self, num_slots, slot_classes_loss_weights=None, intent_classes_loss_weights=None, intent_loss_weight=0.6, + ): + LossNM.__init__(self) + self.num_slots = num_slots + self.intent_loss_weight = intent_loss_weight + self.slot_classes_loss_weights = slot_classes_loss_weights + self.intent_classes_loss_weights = intent_classes_loss_weights + + # For weighted loss to tackle class imbalance + if slot_classes_loss_weights: + self.slot_classes_loss_weights = torch.FloatTensor(slot_classes_loss_weights).to(self._device) + + if intent_classes_loss_weights: + self.intent_classes_loss_weights = torch.FloatTensor(intent_classes_loss_weights).to(self._device) + + self._criterion_intent = nn.CrossEntropyLoss(weight=self.intent_classes_loss_weights) + self._criterion_slot = nn.CrossEntropyLoss(weight=self.slot_classes_loss_weights) + + def _loss_function(self, intent_logits, slot_logits, loss_mask, intents, slots): + intent_loss = self._criterion_intent(intent_logits, intents) + + active_loss = loss_mask.view(-1) > 0.5 + active_logits = slot_logits.view(-1, self.num_slots)[active_loss] + active_labels = slots.view(-1)[active_loss] + + # To support empty active_labels + if len(active_labels) == 0: + slot_loss = 0.0 + else: + slot_loss = self._criterion_slot(active_logits, active_labels) + loss = intent_loss * self.intent_loss_weight + slot_loss * (1 - self.intent_loss_weight) + + return loss diff --git a/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py b/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py new file mode 100644 index 000000000000..e5516d9f33c7 --- /dev/null +++ b/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py @@ -0,0 +1,74 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.backends.pytorch import LossNM +from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['MaskedLanguageModelingLossNM'] + + +class MaskedLanguageModelingLossNM(LossNM): + """ + Neural module which implements Masked Language Modeling (MLM) loss. + + Args: + label_smoothing (float): label smoothing regularization coefficient + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. 
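A hedged sketch of wiring the joint intent/slot loss above into a graph; the logits are assumed to come from an intent/slot classifier head and the remaining tensors from the joint intent/slot data layer earlier in this PR.

```python
import nemo.collections.nlp as nemo_nlp

loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss(
    num_slots=num_slot_labels,    # assumed: size of the slot label vocabulary
    intent_loss_weight=0.6,
)

loss = loss_fn(
    intent_logits=intent_logits,  # assumed: [batch, num_intents] classifier output
    slot_logits=slot_logits,      # assumed: [batch, time, num_slots] classifier output
    loss_mask=loss_mask,
    intents=intents,
    slots=slots,
)
```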
+ + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + output_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. + + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__(self, label_smoothing=0.0): + LossNM.__init__(self) + self._criterion = SmoothedCrossEntropyLoss(label_smoothing) + + def _loss_function(self, logits, output_ids, output_mask): + loss = self._criterion(logits, output_ids, output_mask) + return loss diff --git a/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py b/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py new file mode 100644 index 000000000000..0ad66e21106d --- /dev/null +++ b/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py @@ -0,0 +1,77 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.backends.pytorch import LossNM +from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss +from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['PaddedSmoothedCrossEntropyLossNM'] + + +class PaddedSmoothedCrossEntropyLossNM(LossNM): + """ + Neural module which calculates CrossEntropyLoss and + 1) excludes padding tokens from loss calculation + 2) allows to use label smoothing regularization + 3) allows to calculate loss for the desired number of last tokens + + Args: + label_smoothing (float): label smoothing regularization coefficient + predict_last_k (int): how many last tokens to use for the loss + calculation, important for fast evaluation of LM perplexity + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + target_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "target_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__(self, pad_id, label_smoothing=0, predict_last_k=0): + LossNM.__init__(self) + + self._loss_fn = SmoothedCrossEntropyLoss(label_smoothing, predict_last_k) + self._pad_id = pad_id + + def _loss_function(self, logits, target_ids): + target_mask = mask_padded_tokens(target_ids, self._pad_id).to(logits.dtype) + loss = self._loss_fn(logits, target_ids, target_mask) + return loss diff --git a/nemo/collections/nlp/nm/losses/qa_squad_loss.py b/nemo/collections/nlp/nm/losses/qa_squad_loss.py new file mode 100644 index 000000000000..5f60871d4ebb --- /dev/null +++ b/nemo/collections/nlp/nm/losses/qa_squad_loss.py @@ -0,0 +1,107 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from torch import nn + +from nemo.backends.pytorch import LossNM +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['QuestionAnsweringLoss'] + + +class QuestionAnsweringLoss(LossNM): + """ + Neural module which implements QuestionAnswering loss. + Args: + logits: Output of question answering head, which is a token classfier. + start_positions: Ground truth start positions of the answer w.r.t. + input sequence. If question is unanswerable, this will be + pointing to start token, e.g. [CLS], of the input sequence. + end_positions: Ground truth end positions of the answer w.r.t. + input sequence. If question is unanswerable, this will be + pointing to start token, e.g. [CLS], of the input sequence. + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + start_positions: + 0: AxisType(BatchTag) + + end_positions: + 0: AxisType(BatchTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "start_positions": NeuralType({0: AxisType(BatchTag)}), + "end_positions": NeuralType({0: AxisType(BatchTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
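A minimal sketch of the padded smoothed cross entropy module above, as typically used for machine translation; the pad id, logits, and labels are assumptions (the logits would come from a decoder log-softmax head and the labels from `TranslationDataLayer`).

```python
import nemo.collections.nlp as nemo_nlp

PAD_ID = 0  # assumed: integer id of the padding token in the target vocabulary

nmt_loss = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM(
    pad_id=PAD_ID,
    label_smoothing=0.1,
)

train_loss = nmt_loss(logits=log_softmax_output, target_ids=labels)
```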
+ + loss: + NeuralType(None) + + start_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + end_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "loss": NeuralType(None), + "start_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "end_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__(self): + LossNM.__init__(self) + + def _loss_function(self, **kwargs): + logits = kwargs['logits'] + start_positions = kwargs['start_positions'] + end_positions = kwargs['end_positions'] + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss, start_logits, end_logits diff --git a/nemo/collections/nlp/modules/pytorch_utils.py b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py similarity index 72% rename from nemo/collections/nlp/modules/pytorch_utils.py rename to nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py index 58af90a6b595..b28e63e54059 100644 --- a/nemo/collections/nlp/modules/pytorch_utils.py +++ b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py @@ -1,7 +1,23 @@ -__all__ = ['SmoothedCrossEntropyLoss'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import torch +__all__ = ['SmoothedCrossEntropyLoss'] + class SmoothedCrossEntropyLoss(torch.nn.Module): """ diff --git a/nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py b/nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py new file mode 100644 index 000000000000..c591fc453afb --- /dev/null +++ b/nemo/collections/nlp/nm/losses/state_tracking_trade_loss.py @@ -0,0 +1,162 @@ +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
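A hedged sketch of using the question answering loss above; `qa_logits` is assumed to be the `[batch, time, 2]` output of a token classifier head, and the start/end positions come from `BertQuestionAnsweringDataLayer`.

```python
import nemo.collections.nlp as nemo_nlp

squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss()

loss, start_logits, end_logits = squad_loss(
    logits=qa_logits,                  # assumed QA head output
    start_positions=start_positions,   # from the SQuAD data layer
    end_positions=end_positions,
)
```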
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2019 Salesforce Research. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom +# the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# ============================================================================= + +import torch + +from nemo.backends.pytorch.nm import LossNM +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['TRADEMaskedCrossEntropy', 'CrossEntropyLoss3D'] + + +class TRADEMaskedCrossEntropy(LossNM): + """ + Neural module which implements a cross entropy for trade model with masking feature. + + Args: + logits (float): output of the classifier + targets (long): ground truth targets + loss_mask (long): specifies the ones to get ignored in loss calculation + + + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + logits: 4d tensor of logits + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + 3: AxisType(ChannelTag) + + targets: 3d tensor of labels + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + 2: AxisType(TimeTag) + + loss_mask: specifies the words to be considered in the loss calculation + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + """ + return { + "logits": NeuralType( + {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag), 3: AxisType(ChannelTag)} + ), + "targets": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + loss: loss value + NeuralType(None) + + """ + return {"loss": NeuralType(None)} + + def __init__(self): + LossNM.__init__(self) + + def _loss_function(self, logits, targets, loss_mask): + logits_flat = logits.view(-1, logits.size(-1)) + eps = 1e-10 + log_probs_flat = torch.log(torch.clamp(logits_flat, min=eps)) + target_flat = targets.view(-1, 1) + losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat) + losses = losses_flat.view(*targets.size()) + loss = self.masking(losses, loss_mask) + return loss + + @staticmethod + def masking(losses, mask): + max_len = losses.size(2) + + mask_ = torch.arange(max_len, device=mask.device)[None, None, :] < mask[:, :, None] + mask_ = mask_.float() + losses = losses * mask_ + loss = losses.sum() / mask_.sum() + return loss + + +class CrossEntropyLoss3D(LossNM): + """ + Neural module which implements a cross entropy loss for 3d logits. + Args: + num_classes (int): number of classes in a classifier, e.g. size + of the vocabulary in language modeling objective + logits (float): output of the classifier + labels (long): ground truth labels + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. + """ + return {"loss": NeuralType(None)} + + def __init__(self, num_classes, **kwargs): + LossNM.__init__(self, **kwargs) + self._criterion = torch.nn.CrossEntropyLoss() + self.num_classes = num_classes + + def _loss_function(self, logits, labels): + logits_flatten = logits.view(-1, self.num_classes) + labels_flatten = labels.view(-1) + + loss = self._criterion(logits_flatten, labels_flatten) + return loss diff --git a/nemo/collections/nlp/nm/losses/token_classification_loss.py b/nemo/collections/nlp/nm/losses/token_classification_loss.py new file mode 100644 index 000000000000..5c3c3adcad22 --- /dev/null +++ b/nemo/collections/nlp/nm/losses/token_classification_loss.py @@ -0,0 +1,88 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import torch +from torch import nn + +from nemo.backends.pytorch import LossNM +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['TokenClassificationLoss'] + + +class TokenClassificationLoss(LossNM): + """ + Neural module which implements Token Classification loss. + + Args: + num_classes (int): number of classes in a classifier, e.g. 
size + of the vocabulary in language modeling objective + logits (float): output of the classifier + labels (long): ground truth labels + loss_mask (long): to differentiate from original tokens and paddings + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. + + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__(self, num_classes, class_weights=None): + LossNM.__init__(self) + if class_weights: + class_weights = torch.FloatTensor(class_weights).to(self._device) + + self._criterion = nn.CrossEntropyLoss(weight=class_weights) + self.num_classes = num_classes + + def _loss_function(self, logits, labels, loss_mask): + active_loss = loss_mask.view(-1) > 0.5 + active_logits = logits.view(-1, self.num_classes)[active_loss] + active_labels = labels.view(-1)[active_loss] + + loss = self._criterion(active_logits, active_labels) + return loss diff --git a/nemo/collections/nlp/nm/trainables/__init__.py b/nemo/collections/nlp/nm/trainables/__init__.py new file mode 100644 index 000000000000..d466413a905e --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/__init__.py @@ -0,0 +1,19 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.trainables.common import * +from nemo.collections.nlp.nm.trainables.dialogue_state_tracking import * +from nemo.collections.nlp.nm.trainables.joint_intent_slot import * diff --git a/nemo/collections/nlp/nm/trainables/common/__init__.py b/nemo/collections/nlp/nm/trainables/common/__init__.py new file mode 100644 index 000000000000..57f80bcbcae1 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/__init__.py @@ -0,0 +1,21 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
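# A minimal sketch of how the loss mask in TokenClassificationLoss above restricts
# the cross entropy to real (unpadded) tokens; shapes and values are illustrative.
import torch
from torch import nn

batch, seq_len, num_classes = 2, 5, 3
logits = torch.randn(batch, seq_len, num_classes)
labels = torch.randint(0, num_classes, (batch, seq_len))
loss_mask = torch.tensor([[1, 1, 1, 0, 0],        # 1 = real token, 0 = padding
                          [1, 1, 1, 1, 0]])

active = loss_mask.view(-1) > 0.5                 # keep only unpadded positions
active_logits = logits.view(-1, num_classes)[active]
active_labels = labels.view(-1)[active]
loss = nn.CrossEntropyLoss()(active_logits, active_labels)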
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import nemo.collections.nlp.nm.trainables.common.huggingface +from nemo.collections.nlp.nm.trainables.common.sequence_classification_nm import * +from nemo.collections.nlp.nm.trainables.common.sequence_regression_nm import * +from nemo.collections.nlp.nm.trainables.common.token_classification_nm import * +from nemo.collections.nlp.nm.trainables.common.transformer import * diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/__init__.py b/nemo/collections/nlp/nm/trainables/common/huggingface/__init__.py new file mode 100644 index 000000000000..48c9a2228ee8 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.trainables.common.huggingface.bert_nm import * diff --git a/nemo/collections/nlp/huggingface/bert.py b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py similarity index 84% rename from nemo/collections/nlp/huggingface/bert.py rename to nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py index 616c07f60ce0..1f91576be60a 100644 --- a/nemo/collections/nlp/huggingface/bert.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py @@ -1,4 +1,19 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + from typing import List, Optional from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertConfig, BertModel @@ -7,6 +22,8 @@ from nemo.core.neural_modules import PretrainedModelInfo from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag +__all__ = ['BERT'] + class BERT(TrainableNM): """ @@ -16,6 +33,7 @@ class BERT(TrainableNM): Args: pretrained_model_name (str): If using a pretrained model, this should be the model's name. Otherwise, should be left as None. + config_filename (str): path to model configuration file. Optional. vocab_size (int): Size of the vocabulary file, if not using a pretrained model. hidden_size (int): Size of the encoder and pooler layers. @@ -64,7 +82,7 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} def __init__( self, @@ -156,4 +174,4 @@ def list_pretrained_models() -> Optional[List[PretrainedModelInfo]]: return pretrained_models def forward(self, input_ids, token_type_ids, attention_mask): - return self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,)[0] + return self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py new file mode 100644 index 000000000000..7e0c81c65388 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py @@ -0,0 +1,85 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['SequenceClassifier'] + + +class SequenceClassifier(TrainableNM): + """ + Neural module which consists of MLP followed by softmax classifier for each + sequence in the batch. + + Args: + hidden_size (int): hidden size (d_model) of the Transformer + num_classes (int): number of classes in softmax classifier, e.g. 
number + of different sentiments + num_layers (int): number of layers in classifier MLP + activation (str): activation function applied in classifier MLP layers + log_softmax (bool): whether to apply log_softmax to MLP output + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + """ + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} + + def __init__( + self, + hidden_size, + num_classes, + num_layers=2, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): + super().__init__() + self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) # sometimes this is necessary + + def forward(self, hidden_states, idx_conditioned_on=0): + hidden_states = self.dropout(hidden_states) + logits = self.mlp(hidden_states[:, idx_conditioned_on]) + return logits diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py new file mode 100644 index 000000000000..1032a1f2c43d --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py @@ -0,0 +1,79 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, RegressionTag, TimeTag + +__all__ = ['SequenceRegression'] + + +class SequenceRegression(TrainableNM): + """ + Neural module which consists of MLP, generates a single number prediction + that could be used for a regression task. An example of this task would be + semantic textual similatity task, for example, STS-B (from GLUE tasks). 
+ + Args: + hidden_size (int): the size of the hidden state for the dense layer + num_layers (int): number of layers in classifier MLP + activation (str): activation function applied in classifier MLP layers + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + preds: + 0: AxisType(RegressionTag) + """ + return {"preds": NeuralType({0: AxisType(RegressionTag)})} + + def __init__(self, hidden_size, num_layers=2, activation='relu', dropout=0.0, use_transformer_pretrained=True): + super().__init__() + self.mlp = MultiLayerPerceptron( + hidden_size, + num_classes=1, + device=self._device, + num_layers=num_layers, + activation=activation, + log_softmax=False, + ) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) # sometimes this is necessary + + def forward(self, hidden_states, idx_conditioned_on=0): + hidden_states = self.dropout(hidden_states) + preds = self.mlp(hidden_states[:, idx_conditioned_on]) + return preds.view(-1) diff --git a/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py new file mode 100644 index 000000000000..ba848f247eb3 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py @@ -0,0 +1,171 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu, transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['BertTokenClassifier', 'TokenClassifier'] + +ACT2FN = {"gelu": gelu, "relu": nn.functional.relu} + + +class BertTokenClassifier(TrainableNM): + """ + Neural module which consists of MLP followed by softmax classifier for each + token in the sequence. + + Args: + hidden_size (int): hidden size (d_model) of the Transformer + num_classes (int): number of classes in softmax classifier, e.g. 
size + of the vocabulary in language modeling objective + activation (str): activation function applied in classifier MLP layers + log_softmax (bool): whether to apply log_softmax to MLP output + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + def __init__( + self, + hidden_size, + num_classes, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): + super().__init__() + if activation not in ACT2FN: + raise ValueError(f'activation "{activation}" not found') + self.dense = nn.Linear(hidden_size, hidden_size) + self.act = ACT2FN[activation] + self.norm = nn.LayerNorm(hidden_size, eps=1e-12) + self.mlp = MultiLayerPerceptron( + hidden_size, num_classes, self._device, num_layers=1, activation=activation, log_softmax=log_softmax + ) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + self.to(self._device) + + def forward(self, hidden_states): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.act(hidden_states) + transform = self.norm(hidden_states) + logits = self.mlp(transform) + return logits + + +class TokenClassifier(TrainableNM): + """ + Neural module which consists of MLP followed by softmax classifier for each + token in the sequence. + + Args: + hidden_size (int): hidden size (d_model) of the Transformer + num_classes (int): number of classes in softmax classifier, e.g. size + of the vocabulary in language modeling objective + num_layers (int): number of layers in classifier MLP + activation (str): activation function applied in classifier MLP layers + log_softmax (bool): whether to apply log_softmax to MLP output + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + def __init__( + self, + hidden_size, + num_classes, + name=None, + num_layers=2, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): + super().__init__() + + self.name = name + self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) # sometimes this is necessary + + def __str__(self): + name = TrainableNM.__str__(self) + + if self.name: + name = self.name + name + return name + + def forward(self, hidden_states): + hidden_states = self.dropout(hidden_states) + logits = self.mlp(hidden_states) + return logits diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/__init__.py b/nemo/collections/nlp/nm/trainables/common/transformer/__init__.py new file mode 100644 index 000000000000..4e0a87804d4d --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/transformer/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
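# A minimal sketch of what the token classifiers above compute: a per-token
# projection of BERT-style hidden states into class logits. A single linear layer
# stands in here for the MultiLayerPerceptron; all sizes are illustrative.
import torch
from torch import nn

batch, seq_len, hidden_size, num_classes = 2, 6, 16, 4
hidden_states = torch.randn(batch, seq_len, hidden_size)    # encoder output

classifier = nn.Sequential(
    nn.Dropout(0.1),
    nn.Linear(hidden_size, num_classes),
    nn.LogSoftmax(dim=-1),
)
logits = classifier(hidden_states)                # (batch, seq_len, num_classes)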
+# ============================================================================= + +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_nm import * diff --git a/nemo/collections/nlp/transformer/decoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py similarity index 86% rename from nemo/collections/nlp/transformer/decoders.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py index ccd1b26d2f38..1f3cbf0e4f44 100644 --- a/nemo/collections/nlp/transformer/decoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py @@ -1,12 +1,15 @@ -__all__ = ['TransformerDecoderBlock', 'TransformerDecoder'] - import copy import torch import torch.nn as nn -from .modules import MultiHeadAttention, PositionWiseFF -from .utils import form_attention_mask +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import ( + MultiHeadAttention, + PositionWiseFF, +) +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask + +__all__ = [] class TransformerDecoderBlock(nn.Module): @@ -38,16 +41,14 @@ def __init__( super().__init__() self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.second_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.third_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) - def forward( - self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask, - ): + def forward(self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask): self_attn_output = self.first_sub_layer(decoder_query, decoder_keys, decoder_keys, decoder_mask) enc_dec_attn_output = self.second_sub_layer(self_attn_output, encoder_states, encoder_states, encoder_mask) output_states = self.third_sub_layer(enc_dec_attn_output) @@ -69,7 +70,7 @@ def _get_memory_states(self, decoder_states, decoder_mems_list=None, i=0): return memory_states def forward( - self, decoder_states, decoder_mask, encoder_states, encoder_mask, decoder_mems_list=None, return_mems=False, + self, decoder_states, decoder_mask, encoder_states, encoder_mask, decoder_mems_list=None, return_mems=False ): """ Args: @@ -91,9 +92,7 @@ def forward( cached_mems_list = [memory_states] for i, layer in enumerate(self.layers): - decoder_states = layer( - decoder_states, decoder_attn_mask, memory_states, encoder_states, encoder_attn_mask, - ) + decoder_states = layer(decoder_states, decoder_attn_mask, memory_states, encoder_states, encoder_attn_mask) memory_states = self._get_memory_states(decoder_states, decoder_mems_list, i + 1) cached_mems_list.append(memory_states) diff --git a/nemo/collections/nlp/transformer/encoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py similarity index 91% rename from nemo/collections/nlp/transformer/encoders.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py index 1eb63eb55124..24c6afce55ad 100644 --- a/nemo/collections/nlp/transformer/encoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py @@ -1,17 +1,16 @@ -__all__ = [ - 'TransformerEncoderBlock', - 'TransformerEncoder', - 
'XLNetEncoderBlock', - 'XLNetEncoder', -] - import copy import torch import torch.nn as nn -from .modules import MultiHeadAttention, PositionWiseFF, TwoStreamSelfAttention -from .utils import form_attention_mask +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import ( + MultiHeadAttention, + PositionWiseFF, + TwoStreamSelfAttention, +) +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask + +__all__ = [] class TransformerEncoderBlock(nn.Module): @@ -43,7 +42,7 @@ def __init__( super().__init__() self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.second_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) @@ -68,9 +67,7 @@ def _get_memory_states(self, encoder_states, encoder_mems_list=None, i=0): memory_states = encoder_states return memory_states - def forward( - self, encoder_states, encoder_mask, encoder_mems_list=None, return_mems=False, - ): + def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, return_mems=False): """ Args: encoder_states: output of the embedding_layer (B x L_enc x H) @@ -112,7 +109,7 @@ def __init__( super().__init__() self.first_sub_layer = TwoStreamSelfAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.second_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) @@ -135,5 +132,5 @@ def forward(self, query_states, content_states, input_mask): query_attn_mask = form_attention_mask(input_mask, diagonal=-1) content_attn_mask = form_attention_mask(input_mask, diagonal=0) for layer in self.layers: - query_states, content_states = layer(query_states, content_states, query_attn_mask, content_attn_mask,) + query_states, content_states = layer(query_states, content_states, query_attn_mask, content_attn_mask) return query_states, content_states diff --git a/nemo/collections/nlp/transformer/generators.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py similarity index 95% rename from nemo/collections/nlp/transformer/generators.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py index 9e427a54db61..d878ccd17655 100644 --- a/nemo/collections/nlp/transformer/generators.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py @@ -1,14 +1,10 @@ -__all__ = [ - 'GreedySequenceGenerator', - 'TopKSequenceGenerator', - 'BeamSearchSequenceGenerator', -] +__all__ = [] import torch import torch.nn as nn -from ..utils.nlp_utils import mask_padded_tokens -from .utils import NEG_INF +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import NEG_INF +from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens class GreedySequenceGenerator(nn.Module): @@ -92,7 +88,7 @@ def _forward( ) else: decoder_mems_list = self.decoder.forward( - decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True, + decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True ) log_probs = self.log_softmax.forward(decoder_mems_list[-1]) return log_probs, decoder_mems_list @@ -124,9 +120,7 @@ def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None return tgt, batch_size, max_generation_length - def 
forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, - ): + def forward(self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None): tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) @@ -138,7 +132,7 @@ def forward( for i in range(max_generation_length): log_probs, decoder_mems_list = self._forward( - tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i, + tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i ) next_tokens = torch.argmax(log_probs[:, -1], dim=-1, keepdim=True) @@ -182,7 +176,7 @@ def _forward( pos=0, ): log_probs, decoder_mems_list = super()._forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos, + decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos ) batch_size, seq_len, vocab_size = log_probs.size() @@ -220,9 +214,7 @@ def __init__(self, embedding, decoder, log_softmax, beam_size=1, len_pen=0, **kw self.beam_size = beam_size self.len_pen = len_pen - def forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, - ): + def forward(self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None): tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) @@ -261,7 +253,7 @@ def forward( # generate and score candidates for prefixes continuation log_probs, decoder_mems_list = self._forward( - prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + 1, + prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + 1 ) scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) diff --git a/nemo/collections/nlp/transformer/modules.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py similarity index 92% rename from nemo/collections/nlp/transformer/modules.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py index e958c1951c6c..153843e1aad0 100644 --- a/nemo/collections/nlp/transformer/modules.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py @@ -22,27 +22,25 @@ http://nlp.seas.harvard.edu/2018/04/03/attention.html Copyright by the HuggingFace and Annotated Transformer authors. """ -__all__ = [ - 'FixedPositionalEncoding', - 'TransformerEmbedding', - 'MultiHeadAttention', - 'LightweightConv1d', - 'TwoStreamSelfAttention', - 'PositionWiseFF', -] import math import torch from torch import nn -from .utils import gelu +from nemo import logging +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu + +__all__ = [] + try: from apex.normalization import FusedLayerNorm except (AttributeError, ModuleNotFoundError): # this is lie - it isn't fused in this case - print("Unable to import APEX. Mixed precision, distributed training and " "FusedLayerNorm are not available.") + logging.warning( + "Unable to import APEX. Mixed precision, distributed training and " "FusedLayerNorm are not available." 
+ ) from torch.nn import LayerNorm as FusedLayerNorm @@ -114,7 +112,7 @@ def forward(self, input_ids, token_type_ids=None, start_pos=0): "Input sequence is longer than maximum allowed" " sequence length for positional encoding" ) position_ids = torch.arange( - start=start_pos, end=start_pos + seq_length, dtype=torch.long, device=input_ids.device, + start=start_pos, end=start_pos + seq_length, dtype=torch.long, device=input_ids.device ) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) @@ -144,9 +142,7 @@ class MultiHeadAttention(nn.Module): whole layer, but before layer normalization """ - def __init__( - self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0, - ): + def __init__(self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0): super().__init__() if hidden_size % num_attention_heads != 0: raise ValueError( @@ -168,7 +164,7 @@ def __init__( self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attn_head_size,) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attn_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) @@ -219,9 +215,7 @@ class LightweightConv1d(nn.Module): whole layer, but before layer normalization """ - def __init__( - self, hidden_size, num_attention_heads, kernel_size, conv_weight_dropout=0.0, conv_layer_dropout=0.0, - ): + def __init__(self, hidden_size, num_attention_heads, kernel_size, conv_weight_dropout=0.0, conv_layer_dropout=0.0): super().__init__() self.num_heads = num_attention_heads self.kernel_size = kernel_size @@ -246,7 +240,7 @@ def forward(self, hidden_states, attention_mask): weight[:, :, pivot:] = 0 output_states = output_states.contiguous().view(-1, self.num_heads, seq_len) - output_states = torch.conv1d(output_states, weight, padding=self.kernel_size // 2, groups=self.num_heads,) + output_states = torch.conv1d(output_states, weight, padding=self.kernel_size // 2, groups=self.num_heads) output_states = output_states.view(batch_size, hidden_size, seq_len) output_states = output_states.permute(0, 2, 1) @@ -270,23 +264,19 @@ class TwoStreamSelfAttention(nn.Module): whole layer, but before layer normalization """ - def __init__( - self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0, - ): + def __init__(self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0): super().__init__() self.query_stream = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.content_stream = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) - def forward( - self, query_states, content_states, query_attention_mask, content_attention_mask, - ): + def forward(self, query_states, content_states, query_attention_mask, content_attention_mask): output_query_states = self.query_stream(query_states, content_states, content_states, query_attention_mask) output_content_states = self.content_stream( - query_states, content_states, content_states, content_attention_mask, + query_states, content_states, content_states, content_attention_mask ) return output_query_states, output_content_states diff --git a/nemo/collections/nlp/modules/transformer_nm.py 
b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py similarity index 92% rename from nemo/collections/nlp/modules/transformer_nm.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py index e8e9897a825b..b736588a3d33 100644 --- a/nemo/collections/nlp/modules/transformer_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py @@ -2,26 +2,22 @@ """ This package contains Transformer for translation Neural Module """ -__all__ = [ - 'TransformerEncoderNM', - 'TransformerDecoderNM', - 'GreedyLanguageGeneratorNM', - 'BeamSearchTranslatorNM', -] import math -from ..transformer import ( +from nemo.backends.pytorch.nm import TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_decoders import TransformerDecoder +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_encoders import TransformerEncoder +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_generators import ( BeamSearchSequenceGenerator, GreedySequenceGenerator, - TransformerDecoder, - TransformerEmbedding, - TransformerEncoder, ) -from ..transformer.utils import transformer_weights_init -from nemo.backends.pytorch.nm import LossNM, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import TransformerEmbedding +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init from nemo.core.neural_types import * +__all__ = ['TransformerEncoderNM', 'TransformerDecoderNM', 'GreedyLanguageGeneratorNM', 'BeamSearchTranslatorNM'] + class TransformerEncoderNM(TrainableNM): """ @@ -78,7 +74,7 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} def __init__( self, @@ -178,7 +174,7 @@ def input_ports(self): """ return { "input_ids_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_mask_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @@ -194,7 +190,7 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} def __init__( self, @@ -237,7 +233,7 @@ def __init__( def forward(self, input_ids_tgt, hidden_states_src, input_mask_src, input_mask_tgt): hidden_states_tgt = self.embedding_layer(input_ids_tgt) - hidden_states = self.decoder(hidden_states_tgt, input_mask_tgt, hidden_states_src, input_mask_src,) + hidden_states = self.decoder(hidden_states_tgt, input_mask_tgt, hidden_states_src, input_mask_src) return hidden_states @@ -337,7 +333,7 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), "input_mask_src": NeuralType({0: AxisType(BatchTag), 
1: AxisType(TimeTag)}), } @@ -386,5 +382,5 @@ def __init__( ) def forward(self, hidden_states_src, input_mask_src): - output_ids = self.generator(encoder_hidden_states=hidden_states_src, encoder_input_mask=input_mask_src,) + output_ids = self.generator(encoder_hidden_states=hidden_states_src, encoder_input_mask=input_mask_src) return output_ids diff --git a/nemo/collections/nlp/transformer/utils.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py similarity index 100% rename from nemo/collections/nlp/transformer/utils.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py diff --git a/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/__init__.py b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/__init__.py new file mode 100644 index 000000000000..7d8279b73c0d --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.trainables.dialogue_state_tracking.state_tracking_trade_nm import * diff --git a/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py new file mode 100644 index 000000000000..5a2aa466afe1 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/dialogue_state_tracking/state_tracking_trade_nm.py @@ -0,0 +1,235 @@ +# ============================================================================= +# Copyright 2019 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +# ============================================================================= +# Copyright 2019 Salesforce Research. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom +# the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +# THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# ============================================================================= + + +import random + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn as nn + +from nemo.backends.pytorch.nm import TrainableNM +from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['TRADEGenerator'] + + +class TRADEGenerator(TrainableNM): + @property + def input_ports(self): + """Returns definitions of module input ports. + + encoder_hidden: hidden states of the encoder + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + encoder_outputs: outputs of the encoder + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + input_lens: lengths of the input sequences to encoder + 0: AxisType(BatchTag) + + src_ids: input sequences to encoder + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + targets: targets for the output of the generator + 0: AxisType(BatchTag) + + 1: AxisType(BatchTag) + + 2: AxisType(TimeTag) + + """ + return { + 'encoder_hidden': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + 'encoder_outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + 'input_lens': NeuralType({0: AxisType(BatchTag)}), + 'src_ids': NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + 'targets': NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + point_outputs: outputs of the generator + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + 3: AxisType(ChannelTag) + + gate_outputs: outputs of gating heads + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + 2: AxisType(ChannelTag) + + """ + return { + 'point_outputs': NeuralType( + {0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag), 3: AxisType(ChannelTag)} + ), + 'gate_outputs': NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag)}), + } + + def __init__(self, vocab, embeddings, hid_size, dropout, slots, nb_gate, teacher_forcing=0.5): + super().__init__() + self.vocab_size = len(vocab) + self.vocab = vocab + self.embedding = embeddings + self.dropout = nn.Dropout(dropout) + self.rnn = nn.GRU(hid_size, hid_size, dropout=dropout, batch_first=True) + self.nb_gate = nb_gate + self.hidden_size = hid_size + self.w_ratio = nn.Linear(3 * hid_size, 1) + self.w_gate = nn.Linear(hid_size, nb_gate) + self.softmax = nn.Softmax(dim=1) + self.sigmoid = nn.Sigmoid() + self.slots = slots + self.teacher_forcing = teacher_forcing + + self._slots_split_to_index() + self.slot_emb = nn.Embedding(len(self.slot_w2i), hid_size) + self.slot_emb.weight.data.normal_(0, 0.1) + self.to(self._device) + + def _slots_split_to_index(self): + split_slots = [slot.split('-') for slot in self.slots] + domains = [split_slot[0] for split_slot in split_slots] + slots = [split_slot[1] for split_slot in split_slots] + split_slots = list({s: 0 for s in sum(split_slots, [])}) + self.slot_w2i = {split_slots[i]: i for i in range(len(split_slots))} + self.domain_idx = torch.tensor([self.slot_w2i[domain] for domain in domains], device=self._device) + self.subslot_idx = torch.tensor([self.slot_w2i[slot] for slot in slots], device=self._device) + + def forward(self, encoder_hidden, encoder_outputs, input_lens, src_ids, targets=None): + + if (not self.training) or (random.random() > self.teacher_forcing): + use_teacher_forcing = False + else: + use_teacher_forcing = True + + # TODO: set max_res_len to 10 in evaluation mode or + # when targets are not provided + max_res_len = targets.shape[2] + batch_size = encoder_hidden.shape[0] + + targets = targets.transpose(0, 1) + + all_point_outputs = torch.zeros(len(self.slots), batch_size, max_res_len, self.vocab_size, device=self._device) + all_gate_outputs = torch.zeros(len(self.slots), batch_size, self.nb_gate, device=self._device) + + domain_emb = self.slot_emb(self.domain_idx).to(self._device) + subslot_emb = self.slot_emb(self.subslot_idx).to(self._device) + slot_emb = domain_emb + subslot_emb + slot_emb = slot_emb.unsqueeze(1) + slot_emb = slot_emb.repeat(1, batch_size, 1) + decoder_input = self.dropout(slot_emb).view(-1, self.hidden_size) + hidden = encoder_hidden.transpose(0, 1).repeat(len(self.slots), 1, 1) + + hidden = hidden.view(-1, self.hidden_size).unsqueeze(0) + + enc_len = input_lens.repeat(len(self.slots)) + + maxlen = encoder_outputs.size(1) + padding_mask_bool = ~(torch.arange(maxlen, device=self._device)[None, :] <= enc_len[:, None]) + padding_mask = torch.zeros_like(padding_mask_bool, dtype=encoder_outputs.dtype, device=self._device) + padding_mask.masked_fill_(mask=padding_mask_bool, value=-np.inf) + + for wi in range(max_res_len): + dec_state, hidden = self.rnn(decoder_input.unsqueeze(1), hidden) + + enc_out = encoder_outputs.repeat(len(self.slots), 1, 1) + context_vec, logits, prob = TRADEGenerator.attend(enc_out, hidden.squeeze(0), padding_mask) + + if wi == 0: + all_gate_outputs = 
torch.reshape(self.w_gate(context_vec), all_gate_outputs.size()) + + p_vocab = TRADEGenerator.attend_vocab(self.embedding.weight, hidden.squeeze(0)) + p_gen_vec = torch.cat([dec_state.squeeze(1), context_vec, decoder_input], -1) + vocab_pointer_switches = self.sigmoid(self.w_ratio(p_gen_vec)) + p_context_ptr = torch.zeros(p_vocab.size(), device=self._device) + + p_context_ptr.scatter_add_(1, src_ids.repeat(len(self.slots), 1), prob) + + final_p_vocab = (1 - vocab_pointer_switches).expand_as( + p_context_ptr + ) * p_context_ptr + vocab_pointer_switches.expand_as(p_context_ptr) * p_vocab + pred_word = torch.argmax(final_p_vocab, dim=1) + + all_point_outputs[:, :, wi, :] = torch.reshape( + final_p_vocab, (len(self.slots), batch_size, self.vocab_size) + ) + + if use_teacher_forcing: + decoder_input = self.embedding(torch.flatten(targets[:, :, wi])) + else: + decoder_input = self.embedding(pred_word) + + decoder_input = decoder_input.to(self._device) + all_point_outputs = all_point_outputs.transpose(0, 1).contiguous() + all_gate_outputs = all_gate_outputs.transpose(0, 1).contiguous() + return all_point_outputs, all_gate_outputs + + @staticmethod + def attend(seq, cond, padding_mask): + scores_ = cond.unsqueeze(1).expand_as(seq).mul(seq).sum(2) + scores_ = scores_ + padding_mask + scores = F.softmax(scores_, dim=1) + context = scores.unsqueeze(2).expand_as(seq).mul(seq).sum(1) + return context, scores_, scores + + @staticmethod + def attend_vocab(seq, cond): + scores_ = cond.matmul(seq.transpose(1, 0)) + scores = F.softmax(scores_, dim=1) + return scores diff --git a/nemo/collections/nlp/nm/trainables/joint_intent_slot/__init__.py b/nemo/collections/nlp/nm/trainables/joint_intent_slot/__init__.py new file mode 100644 index 000000000000..600a32ece82d --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/joint_intent_slot/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm import * diff --git a/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py new file mode 100644 index 000000000000..b8707646f746 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py @@ -0,0 +1,95 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
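# A minimal sketch of the pointer-generator mixing used in TRADEGenerator above,
# reduced to plain tensors: attention over source positions is scattered onto the
# vocabulary and blended with the generation distribution through a learned switch.
# All sizes and the random inputs below are illustrative placeholders.
import torch

batch, src_len, vocab_size, hidden = 2, 5, 11, 4
enc_out = torch.randn(batch, src_len, hidden)     # encoder outputs
dec_state = torch.randn(batch, hidden)            # current decoder hidden state
src_ids = torch.randint(0, vocab_size, (batch, src_len))

# attention over source positions (the module also adds a -inf padding mask here)
scores = torch.softmax((dec_state.unsqueeze(1) * enc_out).sum(-1), dim=1)

# copy distribution over the vocabulary, built with scatter_add_ from source ids
p_context_ptr = torch.zeros(batch, vocab_size)
p_context_ptr.scatter_add_(1, src_ids, scores)

# generation distribution and the soft switch between copying and generating
p_vocab = torch.softmax(torch.randn(batch, vocab_size), dim=1)
switch = torch.sigmoid(torch.randn(batch, 1))     # vocab_pointer_switches
final_p_vocab = (1 - switch) * p_context_ptr + switch * p_vocab
pred_word = final_p_vocab.argmax(dim=1)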
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['JointIntentSlotClassifier'] + + +class JointIntentSlotClassifier(TrainableNM): + """ + The softmax classifier for the joint intent classification and slot + filling task which consists of a dense layer + relu + softmax for + predicting the slots and similar for predicting the intents. + + Args: + hidden_size (int): the size of the hidden state for the dense layer + num_intents (int): number of intents + num_slots (int): number of slots + dropout (float): dropout to be applied to the layer + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + intent_logits: + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + slot_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return { + "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + } + + def __init__(self, hidden_size, num_intents, num_slots, dropout=0.0, use_transformer_pretrained=True, **kwargs): + super().__init__(**kwargs) + self.dropout = nn.Dropout(dropout) + self.slot_mlp = MultiLayerPerceptron( + hidden_size, num_classes=num_slots, device=self._device, num_layers=2, activation='relu', log_softmax=False + ) + self.intent_mlp = MultiLayerPerceptron( + hidden_size, + num_classes=num_intents, + device=self._device, + num_layers=2, + activation='relu', + log_softmax=False, + ) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) + + def forward(self, hidden_states): + hidden_states = self.dropout(hidden_states) + intent_logits = self.intent_mlp(hidden_states[:, 0]) + slot_logits = self.slot_mlp(hidden_states) + return intent_logits, slot_logits diff --git a/nemo/collections/nlp/transformer/__init__.py b/nemo/collections/nlp/transformer/__init__.py deleted file mode 100644 index 1f91c6035a59..000000000000 --- a/nemo/collections/nlp/transformer/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -from .decoders import * -from .encoders import * -from .generators import * -from .modules import * diff --git a/nemo/collections/nlp/utils/__init__.py b/nemo/collections/nlp/utils/__init__.py index 894348fc3114..49948c01f0c6 100644 --- a/nemo/collections/nlp/utils/__init__.py +++ b/nemo/collections/nlp/utils/__init__.py @@ -1 +1,3 @@ -from . 
import callbacks, metrics, nlp_utils +from nemo.collections.nlp.utils.callback_utils import * +from nemo.collections.nlp.utils.common_nlp_utils import * +from nemo.collections.nlp.utils.loss_utils import * diff --git a/nemo/collections/nlp/utils/callback_utils.py b/nemo/collections/nlp/utils/callback_utils.py new file mode 100644 index 000000000000..a3da1106d5c9 --- /dev/null +++ b/nemo/collections/nlp/utils/callback_utils.py @@ -0,0 +1,97 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import os +import time + +import numpy as np +from matplotlib import pyplot as plt +from sklearn.metrics import confusion_matrix + +from nemo import logging + +__all__ = ['list2str', 'tensor2list', 'plot_confusion_matrix'] + + +def list2str(l): + return ' '.join([str(x) for x in l]) + + +def tensor2list(tensor): + return tensor.detach().cpu().tolist() + + +def plot_confusion_matrix(labels, preds, graph_fold, label_ids=None, normalize=False, prefix=''): + ''' + Plot confusion matrix. + Args: + label_ids (dict): label to id map, for example: {'O': 0, 'LOC': 1} + labels (list of ints): list of true labels + preds (list of ints): list of predicted labels + graph_fold (str): path to output folder + normalize (bool): flag to indicate whether to normalize confusion matrix + prefix (str): prefix for the plot name + + ''' + if label_ids is None: + _plot_confusion_matrix(labels, preds, graph_fold) + + else: + # remove labels from label_ids that don't appear in the dev set + used_labels = set(labels) | set(preds) + label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} + + ids_to_labels = {label_ids[k]: k for k in label_ids} + classes = [ids_to_labels[id] for id in sorted(label_ids.values())] + + title = 'Confusion matrix' + cm = confusion_matrix(labels, preds) + if normalize: + sums = cm.sum(axis=1)[:, np.newaxis] + sums = np.where(sums == 0, 1, sums) + cm = cm.astype('float') / sums + title = 'Normalized ' + title + + fig = plt.figure() + ax = fig.add_subplot(111) + + cax = ax.matshow(cm) + ax.set_xticks(np.arange(-1, len(classes) + 1)) + ax.set_yticks(np.arange(-1, len(classes) + 1)) + ax.set_xticklabels([''] + classes, rotation=90) + ax.set_yticklabels([''] + classes) + ax.set_ylabel('True') + ax.set_xlabel('Predicted') + + os.makedirs(graph_fold, exist_ok=True) + fig.colorbar(cax) + + title = (prefix + ' ' + title).strip() + plt.savefig(os.path.join(graph_fold, title + '_' + time.strftime('%Y%m%d-%H%M%S'))) + + +def _plot_confusion_matrix(labels, preds, graph_fold): + cm = confusion_matrix(labels, preds) + logging.info(f'Confusion matrix:\n{cm}') + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(cm) + plt.title('Confusion matrix of the classifier') + fig.colorbar(cax) + plt.xlabel('Predicted') + plt.ylabel('True') + 
os.makedirs(graph_fold, exist_ok=True) + plt.savefig(os.path.join(graph_fold, time.strftime('%Y%m%d-%H%M%S'))) diff --git a/nemo/collections/nlp/utils/callbacks/__init__.py b/nemo/collections/nlp/utils/callbacks/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/nemo/collections/nlp/utils/callbacks/language_modeling.py b/nemo/collections/nlp/utils/callbacks/language_modeling.py deleted file mode 100644 index daffe2c64d2d..000000000000 --- a/nemo/collections/nlp/utils/callbacks/language_modeling.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - -import numpy as np - -import nemo - -GLOBAL_KEYS = ["eval_loss", "sys"] - - -def eval_iter_callback(tensors, global_vars): - for key in GLOBAL_KEYS: - if key not in global_vars.keys(): - global_vars[key] = [] - - for kv, v in tensors.items(): - if "loss" in kv: - for eval_loss in v: - global_vars["eval_loss"].append(eval_loss.item()) - - -def eval_epochs_done_callback(global_vars): - eval_loss = np.mean(global_vars["eval_loss"]) - eval_ppl = np.exp(eval_loss) - - nemo.logging.info("------------------------------------------------------") - nemo.logging.info("Eval loss: {0}".format(np.round(eval_loss, 3))) - nemo.logging.info("Eval ppl: {0}".format(np.round(eval_ppl, 3))) - nemo.logging.info("------------------------------------------------------") - for key in GLOBAL_KEYS: - global_vars[key] = [] - return dict({"Eval_loss": eval_loss, "Eval_ppl": eval_ppl}) diff --git a/nemo/collections/nlp/utils/callbacks/sentence_classification.py b/nemo/collections/nlp/utils/callbacks/sentence_classification.py deleted file mode 100644 index 4810bab9dde1..000000000000 --- a/nemo/collections/nlp/utils/callbacks/sentence_classification.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - -import os -import random -import time - -import numpy as np # nopep8 -from matplotlib import pyplot as plt # nopep8 -from sklearn.metrics import classification_report, confusion_matrix # nopep8 - -import nemo - -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - - -def eval_iter_callback(tensors, global_vars, eval_data_layer): - if "all_preds" not in global_vars.keys(): - global_vars["all_preds"] = [] - if "all_labels" not in global_vars.keys(): - global_vars["all_labels"] = [] - - logits_lists = [] - labels_lists = [] - - for kv, v in tensors.items(): - if 'logits' in kv: - for v_tensor in v: - for logit_tensor in v_tensor: - logits_lists.append(logit_tensor.detach().cpu().tolist()) - - if 'labels' in kv: - for v_tensor in v: - for label_tensor in v_tensor: - labels_lists.append(label_tensor.detach().cpu().tolist()) - - preds = list(np.argmax(np.asarray(logits_lists), 1)) - global_vars["all_preds"].extend(preds) - global_vars["all_labels"].extend(labels_lists) - - -def list2str(l): - return ' '.join([str(j) for j in l]) - - -def eval_epochs_done_callback(global_vars, graph_fold): - labels = np.asarray(global_vars['all_labels']) - preds = np.asarray(global_vars['all_preds']) - accuracy = sum(labels == preds) / labels.shape[0] - nemo.logging.info(f'Accuracy: {accuracy}') - i = 0 - if preds.shape[0] > 21: - i = random.randint(0, preds.shape[0] - 21) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + 20])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + 20])) - cm = confusion_matrix(labels, preds) - fig = plt.figure() - ax = 
fig.add_subplot(111) - cax = ax.matshow(cm) - plt.title('Confusion matrix of the classifier') - fig.colorbar(cax) - plt.xlabel('Predicted') - plt.ylabel('True') - os.makedirs(graph_fold, exist_ok=True) - plt.savefig(os.path.join(graph_fold, time.strftime('%Y%m%d-%H%M%S'))) - - nemo.logging.info(classification_report(labels, preds)) - - return dict({"accuracy": accuracy}) diff --git a/nemo/collections/nlp/utils/common_nlp_utils.py b/nemo/collections/nlp/utils/common_nlp_utils.py new file mode 100644 index 000000000000..47634ae71e83 --- /dev/null +++ b/nemo/collections/nlp/utils/common_nlp_utils.py @@ -0,0 +1,144 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import os +import re +import string + +import numpy as np + +from nemo import logging + +__all__ = [ + '_is_whitespace', + 'mask_padded_tokens', + 'read_intent_slot_outputs', + 'get_vocab', + 'write_vocab', + 'label2idx', + 'write_vocab_in_order', + 'if_exist', + 'remove_punctuation_from_sentence', + 'ids2text', + 'calc_class_weights', +] + + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def mask_padded_tokens(tokens, pad_id): + mask = tokens != pad_id + return mask + + +def read_intent_slot_outputs( + queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None +): + intent_dict = get_vocab(intent_file) + slot_dict = get_vocab(slot_file) + pred_intents = np.argmax(intent_logits, 1) + pred_slots = np.argmax(slot_logits, axis=2) + slot_masks = slot_masks > 0.5 + for i, query in enumerate(queries): + logging.info(f'Query: {query}') + pred = pred_intents[i] + logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') + if intents is not None: + logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') + + pred_slot = pred_slots[i][slot_masks[i]] + tokens = query.strip().split() + + if len(pred_slot) != len(tokens): + raise ValueError('Pred_slot and tokens must be of the same length') + + for j, token in enumerate(tokens): + output = f'{token}\t{slot_dict[pred_slot[j]]}' + if slots is not None: + output = f'{output}\t{slot_dict[slots[i][j]]}' + logging.info(output) + + +def get_vocab(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {i: lines[i] for i in range(len(lines))} + return labels + + +def write_vocab(items, outfile): + vocab = {} + idx = 0 + with open(outfile, 'w') as f: + for item in items: + f.write(item + '\n') + vocab[item] = idx + idx += 1 + return vocab + + +def label2idx(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {lines[i]: i for i in range(len(lines))} + return labels + + +def write_vocab_in_order(vocab, 
outfile):
+    with open(outfile, 'w') as f:
+        for key in sorted(vocab.keys()):
+            f.write(f'{vocab[key]}\n')
+
+
+def if_exist(outfold, files):
+    if not os.path.exists(outfold):
+        return False
+    for file in files:
+        if not os.path.exists(f'{outfold}/{file}'):
+            return False
+    return True
+
+
+def remove_punctuation_from_sentence(sentence):
+    sentence = re.sub('[' + string.punctuation + ']', '', sentence)
+    sentence = sentence.lower()
+    return sentence
+
+
+def ids2text(ids, vocab):
+    return ' '.join([vocab[int(id_)] for id_ in ids])
+
+
+def calc_class_weights(label_freq):
+    """
+    The goal is to give more weight to the classes with fewer samples
+    so as to match the one with the highest frequency. We achieve this by
+    dividing the highest frequency by the freq of each label.
+    Example -
+    [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4]
+
+    Here label_freq is assumed to be sorted by the frequency. I.e.
+    label_freq[0] is the most frequent element.
+
+    """
+
+    most_common_label_freq = label_freq[0]
+    weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq])
+    return [weight for (_, weight) in weighted_slots]
diff --git a/nemo/collections/nlp/utils/loss_utils.py b/nemo/collections/nlp/utils/loss_utils.py
new file mode 100644
index 000000000000..f491f7d43fa6
--- /dev/null
+++ b/nemo/collections/nlp/utils/loss_utils.py
@@ -0,0 +1,42 @@
+# =============================================================================
+# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================= + +import math + +__all__ = ['_compute_softmax'] + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs diff --git a/nemo/collections/nlp/utils/metrics/__init__.py b/nemo/collections/nlp/utils/metrics/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/nemo/collections/nlp/utils/nlp_utils.py b/nemo/collections/nlp/utils/nlp_utils.py deleted file mode 100644 index 1b1ef57bb27a..000000000000 --- a/nemo/collections/nlp/utils/nlp_utils.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import time - -import numpy as np -from matplotlib import pyplot as plt -from sklearn.metrics import confusion_matrix - -import nemo - - -def _is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -def mask_padded_tokens(tokens, pad_id): - mask = tokens != pad_id - return mask - - -def read_intent_slot_outputs( - queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None, -): - intent_dict = get_vocab(intent_file) - slot_dict = get_vocab(slot_file) - pred_intents = np.argmax(intent_logits, 1) - pred_slots = np.argmax(slot_logits, axis=2) - slot_masks = slot_masks > 0.5 - for i, query in enumerate(queries): - nemo.logging.info(f'Query: {query}') - pred = pred_intents[i] - nemo.logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') - if intents is not None: - nemo.logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') - - pred_slot = pred_slots[i][slot_masks[i]] - tokens = query.strip().split() - - if len(pred_slot) != len(tokens): - raise ValueError('Pred_slot and tokens must be of the same length') - - for j, token in enumerate(tokens): - output = f'{token}\t{slot_dict[pred_slot[j]]}' - if slots is not None: - output = f'{output}\t{slot_dict[slots[i][j]]}' - nemo.logging.info(output) - - -def get_vocab(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {i: lines[i] for i in range(len(lines))} - return labels - - -def write_vocab(items, outfile): - vocab = {} - idx = 0 - with open(outfile, 'w') as f: - for item in items: - f.write(item + '\n') - vocab[item] = idx - idx += 1 - return vocab - - -def label2idx(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {lines[i]: i for i in range(len(lines))} - return labels - - -def write_vocab_in_order(vocab, outfile): - with open(outfile, 'w') as f: - for key in sorted(vocab.keys()): - f.write(f'{vocab[key]}\n') - - -def plot_confusion_matrix(label_ids, labels, preds, graph_fold, normalize=False, prefix=''): - ''' - Plot confusion matrix. 
- Args: - label_ids (dict): label to id map, for example: {'O': 0, 'LOC': 1} - labels (list of ints): list of true labels - preds (list of ints): list of predicted labels - graph_fold (str): path to output folder - normalize (bool): flag to indicate whether to normalize confusion matrix - prefix (str): prefix for the plot name - - ''' - # remove labels from label_ids that don't appear in the dev set - used_labels = set(labels) | set(preds) - label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - - ids_to_labels = {label_ids[k]: k for k in label_ids} - classes = [ids_to_labels[id] for id in sorted(label_ids.values())] - - title = 'Confusion matrix' - cm = confusion_matrix(labels, preds) - if normalize: - sums = cm.sum(axis=1)[:, np.newaxis] - sums = np.where(sums == 0, 1, sums) - cm = cm.astype('float') / sums - title = 'Normalized ' + title - - fig = plt.figure() - ax = fig.add_subplot(111) - - cax = ax.matshow(cm) - ax.set_xticks(np.arange(-1, len(classes) + 1)) - ax.set_yticks(np.arange(-1, len(classes) + 1)) - ax.set_xticklabels([''] + classes, rotation=90) - ax.set_yticklabels([''] + classes) - ax.set_ylabel('True') - ax.set_xlabel('Predicted') - - os.makedirs(graph_fold, exist_ok=True) - fig.colorbar(cax) - - title = (prefix + ' ' + title).strip() - plt.savefig(os.path.join(graph_fold, title + '_' + time.strftime('%Y%m%d-%H%M%S'))) diff --git a/tests/nlp/test_bert.py b/tests/nlp/test_bert.py index ced011720b19..b15e040d7e1f 100644 --- a/tests/nlp/test_bert.py +++ b/tests/nlp/test_bert.py @@ -22,5 +22,5 @@ class TestBert(NeMoUnitTest): def test_list_pretrained_models(self): - pretrained_models = nemo_nlp.huggingface.BERT.list_pretrained_models() + pretrained_models = nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models() self.assertTrue(len(pretrained_models) > 0) diff --git a/tests/nlp/test_spc_tokenizer.py b/tests/nlp/test_spc_tokenizer.py index ac8363a507d7..fa0259fbc120 100644 --- a/tests/nlp/test_spc_tokenizer.py +++ b/tests/nlp/test_spc_tokenizer.py @@ -16,7 +16,7 @@ # limitations under the License. 
 # =============================================================================
 
-from nemo.collections.nlp import SentencePieceTokenizer
+from nemo.collections.nlp.data import SentencePieceTokenizer
 
 from tests.common_setup import NeMoUnitTest
diff --git a/tests/nlp/test_squad.py b/tests/nlp/test_squad.py
index f6c7fe0580f1..8d1c460798ca 100644
--- a/tests/nlp/test_squad.py
+++ b/tests/nlp/test_squad.py
@@ -20,22 +20,53 @@
 import os
 import shutil
 
+from examples.nlp.scripts.get_squad import SquadDownloader
+
 import nemo
 import nemo.collections.nlp as nemo_nlp
-from nemo.collections.nlp.utils.callbacks.squad import eval_epochs_done_callback, eval_iter_callback
-from nemo.collections.nlp.utils.download_squad import SquadDownloader
+import nemo.collections.nlp.nm.data_layers.qa_squad_datalayer
+import nemo.collections.nlp.nm.trainables.common.token_classification_nm
+from nemo.collections.nlp.callbacks.qa_squad_callback import eval_epochs_done_callback, eval_iter_callback
 from nemo.utils.lr_policies import get_lr_policy
 from tests.common_setup import NeMoUnitTest
 
+logging = nemo.logging
+
 
 class TestSquad(NeMoUnitTest):
     @classmethod
     def setUpClass(cls) -> None:
         super().setUpClass()
+        data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/nlp/squad"))
+        squad_v1_folder = os.path.join(data_folder, "v1.1")
+        for f in os.listdir(squad_v1_folder):
+            ff = os.path.join(squad_v1_folder, f)
+            if f.startswith("cache"):
+                logging.info(f"remove {ff}")
+                os.remove(ff)
+        squad_v2_folder = os.path.join(data_folder, "v2.0")
+        for f in os.listdir(squad_v2_folder):
+            ff = os.path.join(squad_v2_folder, f)
+            if f.startswith("cache"):
+                logging.info(f"remove {ff}")
+                os.remove(ff)
 
     @classmethod
     def tearDownClass(cls) -> None:
         super().tearDownClass()
+        data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/nlp/squad"))
+        squad_v1_folder = os.path.join(data_folder, "v1.1")
+        for f in os.listdir(squad_v1_folder):
+            ff = os.path.join(squad_v1_folder, f)
+            if f.startswith("cache"):
+                logging.info(f"remove {ff}")
+                os.remove(ff)
+        squad_v2_folder = os.path.join(data_folder, "v2.0")
+        for f in os.listdir(squad_v2_folder):
+            ff = os.path.join(squad_v2_folder, f)
+            if f.startswith("cache"):
+                logging.info(f"remove {ff}")
+                os.remove(ff)
 
     def test_squad_v1(self):
         version_2_with_negative = False
@@ -54,16 +85,18 @@ def test_squad_v1(self):
         max_answer_length = 20
         null_score_diff_threshold = 0.0
 
-        tokenizer = nemo_nlp.NemoBertTokenizer(pretrained_bert_model)
+        tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_bert_model)
 
         neural_factory = nemo.core.NeuralModuleFactory(
-            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
+            backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False
        )
 
-        model = nemo_nlp.huggingface.BERT(pretrained_model_name=pretrained_bert_model)
+        model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name=pretrained_bert_model)
         hidden_size = model.hidden_size
 
-        qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False,)
-        squad_loss = nemo_nlp.QuestionAnsweringLoss()
+        qa_head = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier(
+            hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False
+        )
+        squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss()
 
-        data_layer = nemo_nlp.BertQuestionAnsweringDataLayer(
+        data_layer = 
nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='train', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -74,14 +107,14 @@ def test_squad_v1(self): doc_stride=doc_stride, ) - (input_ids, input_type_ids, input_mask, start_positions, end_positions, _,) = data_layer() + (input_ids, input_type_ids, input_mask, start_positions, end_positions, _) = data_layer() - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) qa_output = qa_head(hidden_states=hidden_states) - loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions,) + loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions) - data_layer_eval = nemo_nlp.BertQuestionAnsweringDataLayer( + data_layer_eval = nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='dev', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -101,12 +134,12 @@ def test_squad_v1(self): ) = data_layer_eval() hidden_states_eval = model( - input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval, + input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval ) qa_output_eval = qa_head(hidden_states=hidden_states_eval) _, start_logits_eval, end_logits_eval = squad_loss( - logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval, + logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval ) eval_output = [start_logits_eval, end_logits_eval, unique_ids_eval] @@ -134,7 +167,7 @@ def test_squad_v1(self): eval_step=eval_step_freq, ) - lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion,) + lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion) neural_factory.train( tensors_to_optimize=[loss], @@ -161,16 +194,20 @@ def test_squad_v2(self): max_answer_length = 20 null_score_diff_threshold = 0.0 - tokenizer = nemo_nlp.NemoBertTokenizer(pretrained_bert_model) + tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_bert_model) neural_factory = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False, + backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False ) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=pretrained_bert_model) + model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name=pretrained_bert_model) + hidden_size = model.hidden_size - qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False,) - squad_loss = nemo_nlp.QuestionAnsweringLoss() - data_layer = nemo_nlp.BertQuestionAnsweringDataLayer( + qa_head = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False + ) + squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss() + + data_layer = nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='train', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -181,14 +218,14 @@ def test_squad_v2(self): doc_stride=doc_stride, ) - (input_ids, 
input_type_ids, input_mask, start_positions, end_positions, _,) = data_layer() + (input_ids, input_type_ids, input_mask, start_positions, end_positions, _) = data_layer() - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) qa_output = qa_head(hidden_states=hidden_states) - loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions,) + loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions) - data_layer_eval = nemo_nlp.BertQuestionAnsweringDataLayer( + data_layer_eval = nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='dev', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -208,12 +245,12 @@ def test_squad_v2(self): ) = data_layer_eval() hidden_states_eval = model( - input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval, + input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval ) qa_output_eval = qa_head(hidden_states=hidden_states_eval) _, start_logits_eval, end_logits_eval = squad_loss( - logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval, + logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval ) eval_output = [start_logits_eval, end_logits_eval, unique_ids_eval] @@ -241,7 +278,7 @@ def test_squad_v2(self): eval_step=eval_step_freq, ) - lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion,) + lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion) neural_factory.train( tensors_to_optimize=[loss], diff --git a/tests/test_deploy_export.py b/tests/test_deploy_export.py index 5cde3cbbb10e..be6a1a39573c 100644 --- a/tests/test_deploy_export.py +++ b/tests/test_deploy_export.py @@ -31,6 +31,7 @@ import nemo import nemo.collections.asr as nemo_asr import nemo.collections.nlp as nemo_nlp +import nemo.collections.nlp.nm.trainables.common.token_classification_nm from tests.common_setup import NeMoUnitTest @@ -47,9 +48,7 @@ def __test_export_route(self, module, out_name, mode, input_example=None): if out.exists(): os.remove(out) - self.nf.deployment_export( - module=module, output=out_name, input_example=input_example, d_format=mode, - ) + self.nf.deployment_export(module=module, output=out_name, input_example=input_example, d_format=mode) self.assertTrue(out.exists()) if mode == nemo.core.DeploymentFormat.ONNX: @@ -89,7 +88,9 @@ def test_simple_module_export(self): ) def test_TokenClassifier_module_export(self): - t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16, use_transformer_pretrained=False) + t_class = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + hidden_size=512, num_classes=16, use_transformer_pretrained=False + ) self.__test_export_route( module=t_class, out_name="t_class.pt", @@ -98,7 +99,9 @@ def test_TokenClassifier_module_export(self): ) def test_TokenClassifier_module_onnx_export(self): - t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16, use_transformer_pretrained=False) + t_class = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + hidden_size=512, num_classes=16, use_transformer_pretrained=False + ) 
self.__test_export_route( module=t_class, out_name="t_class.onnx", @@ -109,25 +112,23 @@ def test_TokenClassifier_module_onnx_export(self): def test_jasper_decoder_export_ts(self): j_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=33) self.__test_export_route( - module=j_decoder, out_name="j_decoder.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=None, + module=j_decoder, out_name="j_decoder.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=None ) def test_hf_bert_ts(self): - bert = nemo_nlp.huggingface.BERT(pretrained_model_name="bert-base-uncased") + bert = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name="bert-base-uncased") input_example = ( torch.randint(low=0, high=16, size=(2, 16)).cuda(), torch.randint(low=0, high=1, size=(2, 16)).cuda(), torch.randint(low=0, high=1, size=(2, 16)).cuda(), ) self.__test_export_route( - module=bert, out_name="bert.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=input_example, + module=bert, out_name="bert.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=input_example ) def test_hf_bert_pt(self): - bert = nemo_nlp.huggingface.BERT(pretrained_model_name="bert-base-uncased") - self.__test_export_route( - module=bert, out_name="bert.pt", mode=nemo.core.DeploymentFormat.PYTORCH, - ) + bert = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name="bert-base-uncased") + self.__test_export_route(module=bert, out_name="bert.pt", mode=nemo.core.DeploymentFormat.PYTORCH) def test_jasper_encoder_to_onnx(self): with open("tests/data/jasper_smaller.yaml") as file: @@ -144,5 +145,5 @@ def test_jasper_encoder_to_onnx(self): module=jasper_encoder, out_name="jasper_encoder.onnx", mode=nemo.core.DeploymentFormat.ONNX, - input_example=(torch.randn(16, 64, 256).cuda(), torch.randn(256).cuda(),), + input_example=(torch.randn(16, 64, 256).cuda(), torch.randn(256).cuda()), ) diff --git a/tests/test_infer.py b/tests/test_infer.py index c6faeb8cdcec..05cec60c6fb9 100644 --- a/tests/test_infer.py +++ b/tests/test_infer.py @@ -105,22 +105,20 @@ def test_infer_errors(self): with self.assertRaisesRegex(ValueError, "use_cache was set, but cache was empty"): evaluated_tensors = neural_factory.infer( - tensors=[twenty_tensor, thirty_tensor], verbose=False, use_cache=True, + tensors=[twenty_tensor, thirty_tensor], verbose=False, use_cache=True ) new_ten_tensor = minusten(mod_in=twenty_tensor) evaluated_tensors = neural_factory.infer(tensors=[new_ten_tensor], verbose=False, cache=True) with self.assertRaisesRegex(ValueError, "cache was set but was not empty"): - evaluated_tensors = neural_factory.infer( - tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True, - ) + evaluated_tensors = neural_factory.infer(tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True) neural_factory.clear_cache() evaluated_tensors = neural_factory.infer(tensors=[new_ten_tensor], verbose=False, cache=True) with self.assertRaisesRegex(ValueError, "cache and use_cache were both set."): evaluated_tensors = neural_factory.infer( - tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True, use_cache=True, + tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True, use_cache=True ) self.assertEqual(evaluated_tensors[0][0].squeeze().data, 10) diff --git a/tests/test_neural_types.py b/tests/test_neural_types.py index eb52abcffd7b..c2741ca3d7c6 100644 --- a/tests/test_neural_types.py +++ b/tests/test_neural_types.py @@ -43,13 +43,13 @@ def 
setUp(self) -> None: logging.info("ASR data found in: {0}".format(data_folder + "asr")) def test_same(self): - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) - btc2 = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) + btc2 = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) self.assertEqual(btc2.compare(btc), NeuralTypeComparisonResult.SAME) def test_transpose_same(self): - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) - tbc = NeuralType(axis2type={1: AxisType(BatchTag), 0: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) + tbc = NeuralType(axis2type={1: AxisType(BatchTag), 0: AxisType(TimeTag), 2: AxisType(ChannelTag)}) self.assertEqual(btc.compare(tbc), NeuralTypeComparisonResult.TRANSPOSE_SAME) self.assertEqual(tbc.compare(btc), NeuralTypeComparisonResult.TRANSPOSE_SAME) @@ -74,9 +74,9 @@ def test_dim_incompatible(self): self.assertEqual(nchw1.compare(nchw2), NeuralTypeComparisonResult.DIM_INCOMPATIBLE) def test_rank_incompatible(self): - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) nchw = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag)} ) self.assertEqual(nchw.compare(btc), NeuralTypeComparisonResult.INCOMPATIBLE) @@ -91,10 +91,10 @@ def test_axis_type(self): def test_semantic_incompatible(self): nchw = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag)} ) badd = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag), 3: AxisType(WidthTag)} ) self.assertEqual(nchw.compare(badd), NeuralTypeComparisonResult.INCOMPATIBLE) self.assertEqual(badd.compare(nchw), NeuralTypeComparisonResult.INCOMPATIBLE) @@ -102,9 +102,9 @@ def test_semantic_incompatible(self): def test_root(self): root = NeuralType({}) non_tensor = NeuralType(None) - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) nchw = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag)} ) self.assertEqual(root.compare(btc), NeuralTypeComparisonResult.SAME) self.assertEqual(root.compare(nchw), NeuralTypeComparisonResult.SAME) @@ -117,10 +117,10 @@ def test_root(self): def test_combiner_type_infer(self): combiner = nemo.backends.pytorch.common.SimpleCombiner(mode="add") x_tg = nemo.core.NmTensor( - producer=None, producer_args=None, name=None, ntype=NeuralType({0: 
AxisType(BatchTag),}), + producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag)}) ) y_tg = nemo.core.NmTensor( - producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag),}), + producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag)}) ) res = combiner(x1=y_tg, x2=x_tg) self.assertEqual(res.compare(x_tg), NeuralTypeComparisonResult.SAME) @@ -157,7 +157,7 @@ def test_optional_input_no_input(self): optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1}, + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1} ) def test_optional_input_no_with_input(self): @@ -169,7 +169,7 @@ def test_optional_input_no_with_input(self): loss_tensor = loss(predictions=y_pred, target=y) optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1}, + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1} ) def test_optional_input_no_with_wrong_input(self): @@ -188,9 +188,7 @@ def wrong_fn(): loss_tensor = loss(predictions=y_pred, target=y) optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], - optimizer="sgd", - optimization_params={"lr": 0.0003, "num_epochs": 1}, + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1} ) self.assertRaises(NeuralPortNmTensorMismatchError, wrong_fn) @@ -202,7 +200,7 @@ def test_simple_dags(self): labels = jasper_model_definition['labels'] data_layer = nemo_asr.AudioToTextDataLayer( - manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4, + manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4 ) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( **jasper_model_definition['AudioToMelSpectrogramPreprocessor'] @@ -216,7 +214,7 @@ def test_simple_dags(self): greedy_decoder = nemo_asr.GreedyCTCDecoder() # DAG definition - (audio_signal, audio_signal_len, transcript, transcript_len,) = data_layer() + (audio_signal, audio_signal_len, transcript, transcript_len) = data_layer() processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len) spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5) @@ -226,7 +224,7 @@ def test_simple_dags(self): log_probs = jasper_decoder(encoder_output=encoded) predictions = greedy_decoder(log_probs=log_probs) loss = ctc_loss( - log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len, + log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len ) def wrong(): @@ -235,7 +233,7 @@ def wrong(): labels = jasper_config['labels'] data_layer = nemo_asr.AudioToTextDataLayer( - manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4, + manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4 ) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( **jasper_config['AudioToMelSpectrogramPreprocessor'] @@ -246,7 +244,7 @@ def wrong(): ) jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels)) # DAG definition - (audio_signal, audio_signal_len, transcript, transcript_len,) = data_layer() + (audio_signal, 
audio_signal_len, transcript, transcript_len) = data_layer() processed_signal, processed_signal_len = data_preprocessor( input_signal=audio_signal, length=audio_signal_len )