
Commit

changed buckets to use only one because of tensorflow#1790; applied _some_ changes from https://github.com/b0noI/tensorflow
voidgit committed Oct 14, 2017
1 parent baf8deb commit 964c6bd
Showing 2 changed files with 101 additions and 52 deletions.
148 changes: 97 additions & 51 deletions tutorials/rnn/translate/data_utils.py
@@ -244,6 +244,83 @@ def data_to_token_ids(data_path, target_path, vocabulary_path,
tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


# def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None):
# """Get WMT data into data_dir, create vocabularies and tokenize data.

# Args:
# data_dir: directory in which the data sets will be stored.
# en_vocabulary_size: size of the English vocabulary to create and use.
# fr_vocabulary_size: size of the French vocabulary to create and use.
# tokenizer: a function to use to tokenize each data sentence;
# if None, basic_tokenizer will be used.

# Returns:
# A tuple of 6 elements:
# (1) path to the token-ids for English training data-set,
# (2) path to the token-ids for French training data-set,
# (3) path to the token-ids for English development data-set,
# (4) path to the token-ids for French development data-set,
# (5) path to the English vocabulary file,
# (6) path to the French vocabulary file.
# """
# # Get wmt data to the specified directory.
# train_path = get_wmt_enfr_train_set(data_dir)
# dev_path = get_wmt_enfr_dev_set(data_dir)

# from_train_path = train_path + ".en"
# to_train_path = train_path + ".fr"
# from_dev_path = dev_path + ".en"
# to_dev_path = dev_path + ".fr"
# return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size,
# fr_vocabulary_size, tokenizer)


# def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
# to_vocabulary_size, tokenizer=None):
# """Preapre all necessary files that are required for the training.

# Args:
# data_dir: directory in which the data sets will be stored.
# from_train_path: path to the file that includes "from" training samples.
# to_train_path: path to the file that includes "to" training samples.
# from_dev_path: path to the file that includes "from" dev samples.
# to_dev_path: path to the file that includes "to" dev samples.
# from_vocabulary_size: size of the "from language" vocabulary to create and use.
# to_vocabulary_size: size of the "to language" vocabulary to create and use.
# tokenizer: a function to use to tokenize each data sentence;
# if None, basic_tokenizer will be used.

# Returns:
# A tuple of 6 elements:
# (1) path to the token-ids for "from language" training data-set,
# (2) path to the token-ids for "to language" training data-set,
# (3) path to the token-ids for "from language" development data-set,
# (4) path to the token-ids for "to language" development data-set,
# (5) path to the "from language" vocabulary file,
# (6) path to the "to language" vocabulary file.
# """
# # Create vocabularies of the appropriate sizes.
# to_vocab_path = os.path.join(data_dir, "vocab%d.to" % to_vocabulary_size)
# from_vocab_path = os.path.join(data_dir, "vocab%d.from" % from_vocabulary_size)
# create_vocabulary(to_vocab_path, to_train_path, to_vocabulary_size, tokenizer)
# create_vocabulary(from_vocab_path, from_train_path, from_vocabulary_size, tokenizer)

# # Create token ids for the training data.
# to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
# from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
# data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
# data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)

# # Create token ids for the development data.
# to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
# from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
# data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
# data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)

# return (from_train_ids_path, to_train_ids_path,
# from_dev_ids_path, to_dev_ids_path,
# from_vocab_path, to_vocab_path)

def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None):
"""Get WMT data into data_dir, create vocabularies and tokenize data.
@@ -264,59 +341,28 @@ def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer
(6) path to the French vocabulary file.
"""
# Get wmt data to the specified directory.
train_path = get_wmt_enfr_train_set(data_dir)
dev_path = get_wmt_enfr_dev_set(data_dir)

from_train_path = train_path + ".en"
to_train_path = train_path + ".fr"
from_dev_path = dev_path + ".en"
to_dev_path = dev_path + ".fr"
return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size,
fr_vocabulary_size, tokenizer)


def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
to_vocabulary_size, tokenizer=None):
"""Preapre all necessary files that are required for the training.
Args:
data_dir: directory in which the data sets will be stored.
from_train_path: path to the file that includes "from" training samples.
to_train_path: path to the file that includes "to" training samples.
from_dev_path: path to the file that includes "from" dev samples.
to_dev_path: path to the file that includes "to" dev samples.
from_vocabulary_size: size of the "from language" vocabulary to create and use.
to_vocabulary_size: size of the "to language" vocabulary to create and use.
tokenizer: a function to use to tokenize each data sentence;
if None, basic_tokenizer will be used.
Returns:
A tuple of 6 elements:
(1) path to the token-ids for "from language" training data-set,
(2) path to the token-ids for "to language" training data-set,
(3) path to the token-ids for "from language" development data-set,
(4) path to the token-ids for "to language" development data-set,
(5) path to the "from language" vocabulary file,
(6) path to the "to language" vocabulary file.
"""
train_path = os.path.join(data_dir, "train")
dev_path = train_path

# Create vocabularies of the appropriate sizes.
to_vocab_path = os.path.join(data_dir, "vocab%d.to" % to_vocabulary_size)
from_vocab_path = os.path.join(data_dir, "vocab%d.from" % from_vocabulary_size)
create_vocabulary(to_vocab_path, to_train_path, to_vocabulary_size, tokenizer)
create_vocabulary(from_vocab_path, from_train_path, from_vocabulary_size, tokenizer)
fr_vocab_path = os.path.join(data_dir, "vocab%d.fr" % fr_vocabulary_size)
en_vocab_path = os.path.join(data_dir, "vocab%d.en" % en_vocabulary_size)
create_vocabulary(fr_vocab_path, train_path + ".fr", fr_vocabulary_size, tokenizer)
create_vocabulary(en_vocab_path, train_path + ".en", en_vocabulary_size, tokenizer)

# Create token ids for the training data.
to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)
fr_train_ids_path = train_path + (".ids%d.fr" % fr_vocabulary_size)
en_train_ids_path = train_path + (".ids%d.en" % en_vocabulary_size)
data_to_token_ids(train_path + ".fr", fr_train_ids_path, fr_vocab_path, tokenizer)
data_to_token_ids(train_path + ".en", en_train_ids_path, en_vocab_path, tokenizer)

# Create token ids for the development data.
to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)

return (from_train_ids_path, to_train_ids_path,
from_dev_ids_path, to_dev_ids_path,
from_vocab_path, to_vocab_path)
fr_dev_ids_path = dev_path + (".ids%d.fr" % fr_vocabulary_size)
en_dev_ids_path = dev_path + (".ids%d.en" % en_vocabulary_size)
data_to_token_ids(dev_path + ".fr", fr_dev_ids_path, fr_vocab_path, tokenizer)
data_to_token_ids(dev_path + ".en", en_dev_ids_path, en_vocab_path, tokenizer)

return (en_train_ids_path, fr_train_ids_path,
en_dev_ids_path, fr_dev_ids_path,
en_vocab_path, fr_vocab_path)
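
After this hunk, prepare_wmt_data no longer downloads the WMT corpus: it expects data_dir to already contain train.en and train.fr, and it reuses the training files as the dev set (dev_path = train_path). A minimal usage sketch under those assumptions — the directory path and vocabulary sizes below are hypothetical, not part of the commit:

import data_utils

# Assumes /tmp/translate_data already contains train.en and train.fr;
# prepare_wmt_data then writes vocab40000.en / vocab40000.fr and
# train.ids40000.en / train.ids40000.fr next to them. Because
# dev_path = train_path, the dev ids paths equal the train ids paths.
(en_train_ids, fr_train_ids,
 en_dev_ids, fr_dev_ids,
 en_vocab, fr_vocab) = data_utils.prepare_wmt_data(
    "/tmp/translate_data",
    en_vocabulary_size=40000,  # hypothetical size
    fr_vocabulary_size=40000)  # hypothetical size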

5 changes: 4 additions & 1 deletion tutorials/rnn/translate/translate.py
@@ -78,7 +78,10 @@

# We use a number of buckets and pad to the closest one for efficiency.
# See seq2seq_model.Seq2SeqModel for details of how they work.
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
# _buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
_buckets = [(10, 15)]
# _buckets = [(40, 50)] # looks ok



def read_data(source_path, target_path, max_size=None):
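
For context, a minimal sketch (an assumption paraphrasing the tutorial's read_data bucketing, not code from this commit) of how the bucket list is consumed: each sentence pair lands in the first bucket it fits and is padded to that bucket's size, and pairs that fit no bucket are dropped, so a single (10, 15) bucket also filters out longer pairs.

_buckets = [(10, 15)]

def pick_bucket(source_len, target_len, buckets=_buckets):
    # The first (smallest) bucket that fits wins; the pair is later
    # padded to exactly (source_size, target_size).
    for bucket_id, (source_size, target_size) in enumerate(buckets):
        if source_len < source_size and target_len < target_size:
            return bucket_id
    return None  # no bucket fits: the pair is skipped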
