Commit

Merge pull request #1 from tensorflow/master
Updated
ayushmankumar7 committed Mar 4, 2020
2 parents 8b47aa3 + 75d1304 commit 43178d7
Showing 80 changed files with 422 additions and 6,788 deletions.
2 changes: 1 addition & 1 deletion official/benchmark/bert_squad_benchmark.py
@@ -172,7 +172,7 @@ def _run_and_report_benchmark(self,
run_eagerly=False,
ds_type='mirrored'):
"""Runs the benchmark and reports various metrics."""
-    if FLAGS.train_batch_size <= 4:
+    if FLAGS.train_batch_size <= 4 or run_eagerly:
FLAGS.input_meta_data_path = SQUAD_MEDIUM_INPUT_META_DATA_PATH
else:
FLAGS.input_meta_data_path = SQUAD_LONG_INPUT_META_DATA_PATH
70 changes: 68 additions & 2 deletions official/modeling/hyperparams/base_config.py
@@ -143,12 +143,10 @@ def _get_subconfig_type(cls, k) -> Type[params_dict.ParamsDict]:
return subconfig_type

def __post_init__(self, default_params, restrictions, *args, **kwargs):
-    logging.error('DEBUG before init %r', type(self))
super().__init__(default_params=default_params,
restrictions=restrictions,
*args,
**kwargs)
-    logging.error('DEBUG after init %r', type(self))

def _set(self, k, v):
"""Overrides same method in ParamsDict.
@@ -246,3 +244,71 @@ def from_args(cls, *args, **kwargs):
default_params = {a: p for a, p in zip(attributes, args)}
default_params.update(kwargs)
return cls(default_params)


@dataclasses.dataclass
class RuntimeConfig(Config):
"""High-level configurations for Runtime.
These include parameters that are not directly related to the experiment,
e.g. directories, accelerator type, etc.
Attributes:
distribution_strategy: e.g. 'mirrored', 'tpu', etc.
enable_eager: Whether or not to enable eager mode.
enable_xla: Whether or not to enable XLA.
per_gpu_thread_count: thread count per GPU.
gpu_threads_enabled: Whether or not GPU threads are enabled.
gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
dataset_num_private_threads: Number of threads for a private threadpool
created for all datasets computation.
tpu: The address of the TPU to use, if any.
num_gpus: The number of GPUs to use, if any.
worker_hosts: comma-separated list of worker ip:port pairs for running
multi-worker models with DistributionStrategy.
task_index: If multi-worker training, the task index of this worker.
all_reduce_alg: Defines the algorithm for performing all-reduce.
"""
distribution_strategy: str = 'mirrored'
enable_eager: bool = False
enable_xla: bool = False
gpu_threads_enabled: bool = False
gpu_thread_mode: Optional[str] = None
dataset_num_private_threads: Optional[int] = None
per_gpu_thread_count: int = 0
tpu: Optional[str] = None
num_gpus: int = 0
worker_hosts: Optional[str] = None
task_index: int = -1
all_reduce_alg: Optional[str] = None


@dataclasses.dataclass
class TensorboardConfig(Config):
"""Configuration for Tensorboard.
Attributes:
track_lr: Whether or not to track the learning rate in Tensorboard. Defaults
to True.
write_model_weights: Whether or not to write the model weights as
images in Tensorboard. Defaults to False.
"""
track_lr: bool = True
write_model_weights: bool = False


@dataclasses.dataclass
class CallbacksConfig(Config):
"""Configuration for Callbacks.
Attributes:
enable_checkpoint_and_export: Whether or not to enable checkpoints as a
Callback. Defaults to True.
enable_tensorboard: Whether or not to enable Tensorboard as a Callback.
Defaults to True.
"""
enable_checkpoint_and_export: bool = True
enable_tensorboard: bool = True
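
The dataclasses added above follow one pattern: typed fields with defaults, layered on the Config/ParamsDict machinery. A self-contained sketch of that pattern, using a hypothetical DemoRuntimeConfig rather than this module and folding the from_args mapping from the earlier hunk into the class itself (the real Config consumes a default_params dict rather than direct field keywords):

import dataclasses
from typing import Optional


@dataclasses.dataclass
class DemoRuntimeConfig:
  """Hypothetical stand-in for the RuntimeConfig dataclass above."""
  distribution_strategy: str = 'mirrored'
  enable_xla: bool = False
  num_gpus: int = 0
  tpu: Optional[str] = None

  @classmethod
  def from_args(cls, *args, **kwargs):
    # Same mapping as Config.from_args above: positional values are zipped
    # onto the declared fields in order, then keyword values override.
    attributes = [f.name for f in dataclasses.fields(cls)]
    params = {a: p for a, p in zip(attributes, args)}
    params.update(kwargs)
    return cls(**params)


cfg = DemoRuntimeConfig.from_args('tpu', tpu='grpc://10.0.0.1:8470')
print(cfg.distribution_strategy, cfg.num_gpus, cfg.tpu)  # tpu 0 grpc://10.0.0.1:8470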

6 changes: 3 additions & 3 deletions official/nlp/modeling/layers/attention.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based attention layer."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -45,7 +45,7 @@ class Attention(tf.keras.layers.Layer):
interpolated by these probabilities, then concatenated back to a single
tensor and returned.
-  Attributes:
+  Arguments:
num_heads: Number of attention heads.
head_size: Size of each attention head.
dropout: Dropout probability.
@@ -186,7 +186,7 @@ def call(self, inputs):
class CachedAttention(Attention):
"""Attention layer with cache used for auto-regressive decoding.
-  Attributes:
+  Arguments:
num_heads: Number of attention heads.
head_size: Size of each attention head.
**kwargs: Other keyword arguments inherit from `Attention` class.
4 changes: 2 additions & 2 deletions official/nlp/modeling/layers/dense_einsum.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based einsum layer."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -30,7 +30,7 @@ class DenseEinsum(tf.keras.layers.Layer):
This layer can perform einsum calculations of arbitrary dimensionality.
-  Attributes:
+  Arguments:
output_shape: Positive integer or tuple, dimensionality of the output space.
num_summed_dimensions: The number of dimensions to sum over. Standard 2D
matmul should use 1, 3D matmul should use 2, and so forth.
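
As a rough illustration of the num_summed_dimensions idea in the DenseEinsum docstring above — a sketch in plain tf.einsum, not the layer's actual implementation:

import tensorflow as tf

x2 = tf.random.normal([8, 16])          # [batch, in]
w2 = tf.random.normal([16, 4])          # [in, out]
y2 = tf.einsum('bi,io->bo', x2, w2)     # standard 2-D matmul: 1 summed dimension

x3 = tf.random.normal([8, 10, 16])      # [batch, seq, in]
w3 = tf.random.normal([10, 16, 4])      # [seq, in, out]
y3 = tf.einsum('bsi,sio->bo', x3, w3)   # contracts two dimensions at once

print(y2.shape, y3.shape)               # (8, 4) (8, 4)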
4 changes: 2 additions & 2 deletions official/nlp/modeling/layers/masked_softmax.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based softmax layer with optional masking."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -26,7 +26,7 @@
class MaskedSoftmax(tf.keras.layers.Layer):
"""Performs a softmax with optional masking on a tensor.
-  Attributes:
+  Arguments:
mask_expansion_axes: Any axes that should be padded on the mask tensor.
"""

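
For orientation, the usual additive-mask trick that a masked softmax applies — an illustrative sketch, not necessarily this layer's exact code path:

import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 0.5]])
mask = tf.constant([[1.0, 1.0, 0.0]])          # 0 marks a position to hide
adder = (1.0 - mask) * -1e9                    # large negative bias on masked slots
probs = tf.nn.softmax(scores + adder, axis=-1)
print(probs.numpy())                           # masked position gets ~0 probability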
4 changes: 2 additions & 2 deletions official/nlp/modeling/layers/on_device_embedding.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based one-hot embedding layer."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -31,7 +31,7 @@ class OnDeviceEmbedding(tf.keras.layers.Layer):
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
-  Attributes:
+  Arguments:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
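
The docstring above mentions two lookup strategies; a small sketch of both, using only core TF ops rather than the layer's own API:

import tensorflow as tf

vocab_size, embedding_width = 6, 4
table = tf.random.normal([vocab_size, embedding_width])
ids = tf.constant([[0, 3, 5]])                           # [batch, seq]

by_gather = tf.gather(table, ids)                        # [1, 3, 4]
one_hot = tf.one_hot(ids, depth=vocab_size)              # [1, 3, 6]
by_one_hot = tf.einsum('bsv,vd->bsd', one_hot, table)    # [1, 3, 4]

tf.debugging.assert_near(by_gather, by_one_hot)          # same embeddings either way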
4 changes: 2 additions & 2 deletions official/nlp/modeling/layers/position_embedding.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based positional embedding layer."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -37,7 +37,7 @@ class PositionEmbedding(tf.keras.layers.Layer):
can have a dynamic 1st dimension, while if `use_dynamic_slicing` is False the
input size must be fixed.
-  Attributes:
+  Arguments:
use_dynamic_slicing: Whether to use the dynamic slicing path.
max_sequence_length: The maximum size of the dynamic sequence. Only
applicable if `use_dynamic_slicing` is True.
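
A rough sketch of what the dynamic-slicing path amounts to, assuming a learned [max_sequence_length, width] table (illustrative only, not the layer's implementation):

import tensorflow as tf

max_sequence_length, width = 512, 8
position_table = tf.random.normal([max_sequence_length, width])

inputs = tf.random.normal([2, 10, width])                # runtime sequence length 10
seq_len = tf.shape(inputs)[1]
position_embeddings = position_table[:seq_len, :]        # slice to the actual length
print(position_embeddings.shape)                         # (10, 8) when run eagerly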
4 changes: 2 additions & 2 deletions official/nlp/modeling/layers/transformer.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based transformer block layer."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -32,7 +32,7 @@ class Transformer(tf.keras.layers.Layer):
This layer implements the Transformer from "Attention Is All You Need".
(https://arxiv.org/abs/1706.03762).
-  Attributes:
+  Arguments:
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
4 changes: 2 additions & 2 deletions official/nlp/modeling/layers/transformer_scaffold.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based transformer scaffold layer."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -35,7 +35,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
`attention_cfg`, in which case the scaffold will instantiate the class with
the config, or pass a class instance to `attention_cls`.
-  Attributes:
+  Arguments:
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/albert_transformer_encoder.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""ALBERT (https://arxiv.org/abs/1810.04805) text encoder network."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -41,7 +41,7 @@ class AlbertTransformerEncoder(network.Network):
The default values for this object are taken from the ALBERT-Base
implementation described in the paper.
-  Attributes:
+  Arguments:
vocab_size: The size of the token vocabulary.
embedding_width: The width of the word embeddings. If the embedding width
is not equal to hidden size, embedding parameters will be factorized into
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/bert_classifier.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Trainer network for BERT-style models."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -36,7 +36,7 @@ class BertClassifier(tf.keras.Model):
instantiates a classification network based on the passed `num_classes`
argument.
-  Attributes:
+  Arguments:
network: A transformer network. This network should output a sequence output
and a classification output. Furthermore, it should expose its embedding
table via a "get_embedding_table" method.
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/bert_pretrainer.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Trainer network for BERT-style models."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -37,7 +37,7 @@ class BertPretrainer(tf.keras.Model):
instantiates the masked language model and classification networks that are
used to create the training objectives.
-  Attributes:
+  Arguments:
network: A transformer network. This network should output a sequence output
and a classification output. Furthermore, it should expose its embedding
table via a "get_embedding_table" method.
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/bert_span_labeler.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Trainer network for BERT-style models."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -35,7 +35,7 @@ class BertSpanLabeler(tf.keras.Model):
The BertSpanLabeler allows a user to pass in a transformer stack, and
instantiates a span labeling network based on a single dense layer.
-  Attributes:
+  Arguments:
network: A transformer network. This network should output a sequence output
and a classification output. Furthermore, it should expose its embedding
table via a "get_embedding_table" method.
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/classification.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Classification network."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -31,7 +31,7 @@ class Classification(network.Network):
This network implements a simple classifier head based on a dense layer.
-  Attributes:
+  Arguments:
input_width: The innermost dimension of the input tensor to this network.
num_classes: The number of classes that this network should classify to.
activation: The activation, if any, for the dense layer in this network.
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/encoder_scaffold.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Transformer-based text encoder network."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -46,7 +46,7 @@ class instance. If a class is passed, that class will be instantiated using
If the hidden_cls is not overridden, a default transformer layer will be
instantiated.
-  Attributes:
+  Arguments:
num_output_classes: The output size of the classification layer.
classification_layer_initializer: The initializer for the classification
layer.
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/masked_lm.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Masked language model network."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -32,7 +32,7 @@ class MaskedLM(network.Network):
This network implements a masked language model based on the provided network.
It assumes that the network being passed has a "get_embedding_table()" method.
-  Attributes:
+  Arguments:
input_width: The innermost dimension of the input tensor to this network.
num_predictions: The number of predictions to make per sequence.
source_network: The network with the embedding layer to use for the
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/span_labeling.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Span labeling network."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -31,7 +31,7 @@ class SpanLabeling(network.Network):
This network implements a simple single-span labeler based on a dense layer.
-  Attributes:
+  Arguments:
input_width: The innermost dimension of the input tensor to this network.
activation: The activation, if any, for the dense layer in this network.
initializer: The initializer for the dense layer in this network. Defaults to
4 changes: 2 additions & 2 deletions official/nlp/modeling/networks/transformer_encoder.py
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Transformer-based text encoder network."""
-
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -40,7 +40,7 @@ class TransformerEncoder(network.Network):
in "BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding".
-  Attributes:
+  Arguments:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
