
Commit

changed buckets to use only one because of tensorflow#1790; applied _some_ changes from https://github.com/b0noI/tensorflow
voidgit committed Oct 14, 2017
1 parent baf8deb commit 964c6bd
Showing 2 changed files with 101 additions and 52 deletions.
148 changes: 97 additions & 51 deletions tutorials/rnn/translate/data_utils.py
@@ -244,6 +244,83 @@ def data_to_token_ids(data_path, target_path, vocabulary_path,
tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")


# def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None):
# """Get WMT data into data_dir, create vocabularies and tokenize data.

# Args:
# data_dir: directory in which the data sets will be stored.
# en_vocabulary_size: size of the English vocabulary to create and use.
# fr_vocabulary_size: size of the French vocabulary to create and use.
# tokenizer: a function to use to tokenize each data sentence;
# if None, basic_tokenizer will be used.

# Returns:
# A tuple of 6 elements:
# (1) path to the token-ids for English training data-set,
# (2) path to the token-ids for French training data-set,
# (3) path to the token-ids for English development data-set,
# (4) path to the token-ids for French development data-set,
# (5) path to the English vocabulary file,
# (6) path to the French vocabulary file.
# """
# # Get wmt data to the specified directory.
# train_path = get_wmt_enfr_train_set(data_dir)
# dev_path = get_wmt_enfr_dev_set(data_dir)

# from_train_path = train_path + ".en"
# to_train_path = train_path + ".fr"
# from_dev_path = dev_path + ".en"
# to_dev_path = dev_path + ".fr"
# return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size,
# fr_vocabulary_size, tokenizer)


# def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
# to_vocabulary_size, tokenizer=None):
# """Preapre all necessary files that are required for the training.

# Args:
# data_dir: directory in which the data sets will be stored.
# from_train_path: path to the file that includes "from" training samples.
# to_train_path: path to the file that includes "to" training samples.
# from_dev_path: path to the file that includes "from" dev samples.
# to_dev_path: path to the file that includes "to" dev samples.
# from_vocabulary_size: size of the "from language" vocabulary to create and use.
# to_vocabulary_size: size of the "to language" vocabulary to create and use.
# tokenizer: a function to use to tokenize each data sentence;
# if None, basic_tokenizer will be used.

# Returns:
# A tuple of 6 elements:
# (1) path to the token-ids for "from language" training data-set,
# (2) path to the token-ids for "to language" training data-set,
# (3) path to the token-ids for "from language" development data-set,
# (4) path to the token-ids for "to language" development data-set,
# (5) path to the "from language" vocabulary file,
# (6) path to the "to language" vocabulary file.
# """
# # Create vocabularies of the appropriate sizes.
# to_vocab_path = os.path.join(data_dir, "vocab%d.to" % to_vocabulary_size)
# from_vocab_path = os.path.join(data_dir, "vocab%d.from" % from_vocabulary_size)
# create_vocabulary(to_vocab_path, to_train_path, to_vocabulary_size, tokenizer)
# create_vocabulary(from_vocab_path, from_train_path, from_vocabulary_size, tokenizer)

# # Create token ids for the training data.
# to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
# from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
# data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
# data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)

# # Create token ids for the development data.
# to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
# from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
# data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
# data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)

# return (from_train_ids_path, to_train_ids_path,
# from_dev_ids_path, to_dev_ids_path,
# from_vocab_path, to_vocab_path)

def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None):
"""Get WMT data into data_dir, create vocabularies and tokenize data.
@@ -264,59 +341,28 @@ def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer
(6) path to the French vocabulary file.
"""
# Get wmt data to the specified directory.
train_path = get_wmt_enfr_train_set(data_dir)
dev_path = get_wmt_enfr_dev_set(data_dir)

from_train_path = train_path + ".en"
to_train_path = train_path + ".fr"
from_dev_path = dev_path + ".en"
to_dev_path = dev_path + ".fr"
return prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, en_vocabulary_size,
fr_vocabulary_size, tokenizer)


def prepare_data(data_dir, from_train_path, to_train_path, from_dev_path, to_dev_path, from_vocabulary_size,
to_vocabulary_size, tokenizer=None):
"""Preapre all necessary files that are required for the training.
Args:
data_dir: directory in which the data sets will be stored.
from_train_path: path to the file that includes "from" training samples.
to_train_path: path to the file that includes "to" training samples.
from_dev_path: path to the file that includes "from" dev samples.
to_dev_path: path to the file that includes "to" dev samples.
from_vocabulary_size: size of the "from language" vocabulary to create and use.
to_vocabulary_size: size of the "to language" vocabulary to create and use.
tokenizer: a function to use to tokenize each data sentence;
if None, basic_tokenizer will be used.
Returns:
A tuple of 6 elements:
(1) path to the token-ids for "from language" training data-set,
(2) path to the token-ids for "to language" training data-set,
(3) path to the token-ids for "from language" development data-set,
(4) path to the token-ids for "to language" development data-set,
(5) path to the "from language" vocabulary file,
(6) path to the "to language" vocabulary file.
"""
train_path = os.path.join(data_dir, "train")
dev_path = train_path

# Create vocabularies of the appropriate sizes.
to_vocab_path = os.path.join(data_dir, "vocab%d.to" % to_vocabulary_size)
from_vocab_path = os.path.join(data_dir, "vocab%d.from" % from_vocabulary_size)
create_vocabulary(to_vocab_path, to_train_path, to_vocabulary_size, tokenizer)
create_vocabulary(from_vocab_path, from_train_path, from_vocabulary_size, tokenizer)
fr_vocab_path = os.path.join(data_dir, "vocab%d.fr" % fr_vocabulary_size)
en_vocab_path = os.path.join(data_dir, "vocab%d.en" % en_vocabulary_size)
create_vocabulary(fr_vocab_path, train_path + ".fr", fr_vocabulary_size, tokenizer)
create_vocabulary(en_vocab_path, train_path + ".en", en_vocabulary_size, tokenizer)

# Create token ids for the training data.
to_train_ids_path = to_train_path + (".ids%d" % to_vocabulary_size)
from_train_ids_path = from_train_path + (".ids%d" % from_vocabulary_size)
data_to_token_ids(to_train_path, to_train_ids_path, to_vocab_path, tokenizer)
data_to_token_ids(from_train_path, from_train_ids_path, from_vocab_path, tokenizer)
fr_train_ids_path = train_path + (".ids%d.fr" % fr_vocabulary_size)
en_train_ids_path = train_path + (".ids%d.en" % en_vocabulary_size)
data_to_token_ids(train_path + ".fr", fr_train_ids_path, fr_vocab_path, tokenizer)
data_to_token_ids(train_path + ".en", en_train_ids_path, en_vocab_path, tokenizer)

# Create token ids for the development data.
to_dev_ids_path = to_dev_path + (".ids%d" % to_vocabulary_size)
from_dev_ids_path = from_dev_path + (".ids%d" % from_vocabulary_size)
data_to_token_ids(to_dev_path, to_dev_ids_path, to_vocab_path, tokenizer)
data_to_token_ids(from_dev_path, from_dev_ids_path, from_vocab_path, tokenizer)

return (from_train_ids_path, to_train_ids_path,
from_dev_ids_path, to_dev_ids_path,
from_vocab_path, to_vocab_path)
fr_dev_ids_path = dev_path + (".ids%d.fr" % fr_vocabulary_size)
en_dev_ids_path = dev_path + (".ids%d.en" % en_vocabulary_size)
data_to_token_ids(dev_path + ".fr", fr_dev_ids_path, fr_vocab_path, tokenizer)
data_to_token_ids(dev_path + ".en", en_dev_ids_path, en_vocab_path, tokenizer)

return (en_train_ids_path, fr_train_ids_path,
en_dev_ids_path, fr_dev_ids_path,
en_vocab_path, fr_vocab_path)
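
After this hunk, prepare_wmt_data no longer downloads the WMT corpus: it expects data_dir to already contain train.en and train.fr, and it reuses the training files as the dev set (dev_path = train_path). A minimal usage sketch under those assumptions — the directory path and vocabulary sizes below are hypothetical, not part of the commit:

import data_utils

# Assumes /tmp/translate_data already contains train.en and train.fr;
# prepare_wmt_data then writes vocab40000.en / vocab40000.fr and
# train.ids40000.en / train.ids40000.fr next to them. Because
# dev_path = train_path, the dev ids paths equal the train ids paths.
(en_train_ids, fr_train_ids,
 en_dev_ids, fr_dev_ids,
 en_vocab, fr_vocab) = data_utils.prepare_wmt_data(
    "/tmp/translate_data",
    en_vocabulary_size=40000,  # hypothetical size
    fr_vocabulary_size=40000)  # hypothetical size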

5 changes: 4 additions & 1 deletion tutorials/rnn/translate/translate.py
@@ -78,7 +78,10 @@

# We use a number of buckets and pad to the closest one for efficiency.
# See seq2seq_model.Seq2SeqModel for details of how they work.
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
# _buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
_buckets = [(10, 15)]
# _buckets = [(40, 50)] # looks ok



def read_data(source_path, target_path, max_size=None):
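
For context, a minimal sketch (an assumption paraphrasing the tutorial's read_data bucketing, not code from this commit) of how the bucket list is consumed: each sentence pair lands in the first bucket it fits and is padded to that bucket's size, and pairs that fit no bucket are dropped, so a single (10, 15) bucket also filters out longer pairs.

_buckets = [(10, 15)]

def pick_bucket(source_len, target_len, buckets=_buckets):
    # The first (smallest) bucket that fits wins; the pair is later
    # padded to exactly (source_size, target_size).
    for bucket_id, (source_size, target_size) in enumerate(buckets):
        if source_len < source_size and target_len < target_size:
            return bucket_id
    return None  # no bucket fits: the pair is skipped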
