google · mingxingtan · Sep 21, 2020 · Aug 26, 2020 · Aug 26, 2020 · Aug 26, 2020
diff --git a/efficientdet/README.md b/efficientdet/README.md
@@ -369,4 +369,26 @@ For more instructions about training on TPUs, please refer to the following tuto
 
   * EfficientNet tutorial: https://cloud.google.com/tpu/docs/tutorials/efficientnet
 
+## 11. Reducing Memory Usage when Training EfficientDets on GPU.
+
+EfficientDets use a lot of GPU memory for a few reasons:
+
+* Large input resolution: because resolution is one of the scaling dimensions, our input resolution tends to be higher, which significantly increases the size of the activations (although it does not increase the number of parameters).
+* Large internal activations in the backbone: our backbone uses a relatively large expansion ratio (6), which produces large expanded activations.
+* Deep BiFPN: our BiFPN has multiple top-down and bottom-up paths, which leads to a lot of intermediate memory usage during training.
+
+To train this model on a GPU with limited memory, there is an experimental option, `gradient_checkpointing`.
+
+Check these links for a high-level idea of what gradient checkpointing is doing:
+1. https://github.com/cybertronai/gradient-checkpointing
+2. https://medium.com/tensorflow/fitting-larger-networks-into-memory-583e3c758ff9
+
+**gradient_checkpointing: True**
+
+If set to True, the strings defined by gradient_checkpointing_list (["Add"] by default) are searched for in the tensor names, and any tensors that match a string from the list are kept as checkpoints. When this option is used, the standard tensorflow.python.ops.gradients method is replaced with a custom method.
+
+Testing shows that:
+* On a d4 network with a batch size of 1 (mixed precision enabled), it uses only 1/3.2 of the memory, with roughly 32% slower computation.
+* It also makes it possible to train a d6 network with a batch size of 2 (mixed precision enabled) on an 11 GB (2080 Ti) GPU.
+
 NOTE: this is not an official Google product.
diff --git a/efficientdet/det_model_fn.py b/efficientdet/det_model_fn.py
@@ -18,7 +18,6 @@
 from absl import logging
 import numpy as np
 import tensorflow.compat.v1 as tf
-
 import coco_metric
 import efficientdet_arch
 import hparams_config
@@ -153,7 +152,7 @@ def focal_loss(y_pred, y_true, alpha, gamma, normalizer, label_smoothing=0.0):
     pred_prob = tf.sigmoid(y_pred)
     p_t = (y_true * pred_prob) + ((1 - y_true) * (1 - pred_prob))
     alpha_factor = y_true * alpha + (1 - y_true) * (1 - alpha)
-    modulating_factor = (1.0 - p_t) ** gamma
+    modulating_factor = (1.0 - p_t)**gamma
 
     # apply label smoothing for cross_entropy for each entry.
     y_true = y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing
@@ -302,8 +301,7 @@ class and box losses from all levels.
   box_loss = tf.add_n(box_losses) if box_losses else 0
 
   total_loss = (
-      cls_loss +
-      params['box_loss_weight'] * box_loss +
+      cls_loss + params['box_loss_weight'] * box_loss +
       params['iou_loss_weight'] * box_iou_loss)
 
   return total_loss, cls_loss, box_loss, box_iou_loss
@@ -347,6 +345,7 @@ def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
   params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)
 
   if params['use_keras_model']:
+
     def model_fn(inputs):
       model = efficientdet_keras.EfficientDetNet(
           config=hparams_config.Config(params))
@@ -418,6 +417,21 @@ def model_fn(inputs):
 
     if params['strategy'] == 'tpu':
       optimizer = tf.tpu.CrossShardOptimizer(optimizer)
+    if params['gradient_checkpointing']:
+      from third_party.grad_checkpoint import memory_saving_gradients  # pylint: disable=import-outside-toplevel
+      from tensorflow.python.ops import gradients  # pylint: disable=import-outside-toplevel
+
+      # monkey patch tf.gradients to point to our custom version,
+      # with automatic checkpoint selection
+      def gradients_(ys, xs, grad_ys=None, **kwargs):
+        return memory_saving_gradients.gradients(
+            ys,
+            xs,
+            grad_ys,
+            checkpoints=params['gradient_checkpointing_list'],
+            **kwargs)
+
+      gradients.__dict__["gradients"] = gradients_
 
     # Batch norm requires update_ops to be added as a train_op dependency.
     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
@@ -615,6 +629,24 @@ def before_run(self, run_context):
         every_n_iter=params.get('iterations_per_loop', 100),
     )
     training_hooks.append(logging_hook)
+
+    if params["nvgpu_logging"]:
+      try:
+        from third_party.tools.nvgpu import gpu_memory_util_message  # pylint: disable=import-outside-toplevel
+
+        mem_message = tf.py_func(gpu_memory_util_message, [], [tf.string])[0]
+
+        logging_hook_nvgpu = tf.estimator.LoggingTensorHook(
+            tensors={
+                "mem_message": mem_message,
+            },
+            every_n_iter=params.get('iterations_per_loop', 100),
+            formatter=lambda x: x["mem_message"].decode("utf-8"),
+        )
+        training_hooks.append(logging_hook_nvgpu)
+      except:
+        logging.error("nvgpu error: nvidia-smi format not recognized")
+
   if params['strategy'] == 'tpu':
     return tf.estimator.tpu.TPUEstimatorSpec(
         mode=mode,

diff --git a/efficientdet/hparams_config.py b/efficientdet/hparams_config.py
@@ -136,7 +136,7 @@ def add_kv_recursive(k, v):
               return {k: [eval_str_fn(vv) for vv in v.split('*')]}
             return {k: eval_str_fn(v)}
           pos = k.index('.')
-          return {k[:pos]: add_kv_recursive(k[pos+1:], v)}
+          return {k[:pos]: add_kv_recursive(k[pos + 1:], v)}
 
         def merge_dict_recursive(target, src):
           """Recursively merge two nested dictionary."""
@@ -161,7 +161,7 @@ def as_dict(self):
       else:
         config_dict[k] = copy.deepcopy(v)
     return config_dict
-# pylint: enable=protected-access
+    # pylint: enable=protected-access
 
 
 def default_detection_configs():
@@ -281,6 +281,16 @@ def default_detection_configs():
   h.dataset_type = None
   h.positives_momentum = None
 
+  # Reduces memory during training
+  h.gradient_checkpointing = False
+
+  # Values that could be used: "Add", "Mul", "Conv2d", "Floor", "Sigmoid", etc.,
+  # or more specific, e.g. "blocks_10/se/conv2d_1"
+  h.gradient_checkpointing_list = ["Add"]
+
+  # enable memory logging for NVIDIA cards
+  h.nvgpu_logging = False
+
   return h
 
 

diff --git a/efficientdet/main.py b/efficientdet/main.py
@@ -26,7 +26,6 @@
 import hparams_config
 import utils
 
-
 flags.DEFINE_string(
     'tpu',
     default=None,
@@ -341,8 +340,7 @@ def run_train_and_eval(e):
           input_fn=train_input_fn,
           max_steps=e * FLAGS.num_examples_per_epoch // FLAGS.train_batch_size)
       print('\n   =====> Starting evaluation, epoch: %d.' % e)
-      eval_results = eval_est.evaluate(
-          input_fn=eval_input_fn, steps=eval_steps)
+      eval_results = eval_est.evaluate(input_fn=eval_input_fn, steps=eval_steps)
       ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
       utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
 

diff --git a/efficientdet/third_party/tools/nvgpu.py b/efficientdet/third_party/tools/nvgpu.py
@@ -96,3 +96,39 @@ def gpu_info():
     return XmlDictConfig(root)
   except FileNotFoundError:
     return None
+
+
+def gpu_memory_util_message():
+  """Provide information about GPUs."""
+  gpu_info_d = gpu_info()
+  if gpu_info_d is not None:
+    mem_used = gpu_info_d['gpu']['fb_memory_usage']['used']
+    mem_total = gpu_info_d['gpu']['fb_memory_usage']['total']
+    mem_util = commonsize(mem_used) / commonsize(mem_total)
+    logstring = ("GPU memory used: {} = {:.1%} ".format(mem_used, mem_util) +
+                 "of total GPU memory: {}".format(mem_total))
+    return logstring
+  return None
+
+
+def commonsize(inp):
+  """Convert all to MiB."""
+  const_sizes = {
+      'B': 1,
+      'KB': 1e3,
+      'MB': 1e6,
+      'GB': 1e9,
+      'TB': 1e12,
+      'PB': 1e15,
+      'KiB': 1024,
+      'MiB': 1048576,
+      'GiB': 1073741824
+  }
+  inp = inp.split(" ")
+  # convert all to MiB
+  if inp[1] != 'MiB':
+    inp_ = float(inp[0]) * (const_sizes[inp[1]] / 1048576.0)
+  else:
+    inp_ = float(inp[0])
+
+  return inp_