Merge pull request #10537 from srihari-humbarwadi:panoptic-deeplab
PiperOrigin-RevId: 452568716
tensorflower-gardener committed Jun 2, 2022
2 parents 0290848 + 1f765c5 commit 5290234
Showing 18 changed files with 3,308 additions and 4 deletions.
14 changes: 14 additions & 0 deletions official/vision/beta/projects/panoptic_maskrcnn/README.md
@@ -83,6 +83,12 @@ ResNet-50 | 3x | `panoptic_fpn_coco` | 40.64 | 36.29

**Note**: Here, a 1x schedule refers to ~12 epochs

### Panoptic Deeplab
Backbone | Experiment name | Overall PQ | Things PQ | Stuff PQ | Checkpoints
:---------------------| :-------------------------------| ---------- | --------- | -------- | ------------:
Dilated ResNet-50 | `panoptic_deeplab_resnet_coco` | 36.80 | 37.51 | 35.73 | [ckpt](gs://tf_model_garden/vision/panoptic/panoptic_deeplab/coco/resnet50)
Dilated ResNet-101 | `panoptic_deeplab_resnet_coco` | 38.39 | 39.47 | 36.75 | [ckpt](gs://tf_model_garden/vision/panoptic/panoptic_deeplab/coco/resnet101)
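
Both checkpoints are trained with the registered experiment `panoptic_deeplab_resnet_coco`. As a minimal usage sketch (not part of this commit, and assuming the TF Model Garden `official` package is importable and the config module added below has been imported so its factory registration runs), the experiment config can be built by name:

```python
# Usage sketch: look up the registered Panoptic Deeplab experiment by name.
from official.core import exp_factory

config = exp_factory.get_exp_config('panoptic_deeplab_resnet_coco')
print(config.task.model.num_classes)    # 201 panoptic COCO categories
print(config.task.model.backbone.type)  # 'dilated_resnet'
```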

___
## Citation
```
@@ -94,4 +100,12 @@ ___
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@article{Cheng2020PanopticDeepLabAS,
title={Panoptic-DeepLab: A Simple, Strong, and Fast Baseline for Bottom-Up Panoptic Segmentation},
author={Bowen Cheng and Maxwell D. Collins and Yukun Zhu and Ting Liu and Thomas S. Huang and Hartwig Adam and Liang-Chieh Chen},
journal={2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
year={2020},
pages={12472-12482}
}
```
@@ -0,0 +1,346 @@
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Panoptic Deeplab configuration definition."""
import dataclasses
import os
from typing import List, Optional, Union

import numpy as np

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.vision.configs import common
from official.vision.configs import decoders
from official.vision.configs.google import backbones


_COCO_INPUT_PATH_BASE = 'coco/tfrecords'
_COCO_TRAIN_EXAMPLES = 118287
_COCO_VAL_EXAMPLES = 5000


@dataclasses.dataclass
class Parser(hyperparams.Config):
  """Panoptic deeplab parser."""
  ignore_label: int = 0
  # If resize_eval_groundtruth is set to False, original image sizes are used
  # for eval. In that case, groundtruth_padded_size has to be specified too to
  # allow for batching the variable input sizes of images.
  resize_eval_groundtruth: bool = True
  groundtruth_padded_size: List[int] = dataclasses.field(default_factory=list)
  aug_scale_min: float = 1.0
  aug_scale_max: float = 1.0
  aug_rand_hflip: bool = True
  aug_type: common.Augmentation = common.Augmentation()
  sigma: float = 8.0
  small_instance_area_threshold: int = 4096
  small_instance_weight: float = 3.0
  dtype: str = 'float32'


@dataclasses.dataclass
class TfExampleDecoder(common.TfExampleDecoder):
  """A simple TF Example decoder config."""
  panoptic_category_mask_key: str = 'image/panoptic/category_mask'
  panoptic_instance_mask_key: str = 'image/panoptic/instance_mask'


@dataclasses.dataclass
class DataDecoder(common.DataDecoder):
  """Data decoder config."""
  simple_decoder: TfExampleDecoder = TfExampleDecoder()


@dataclasses.dataclass
class DataConfig(cfg.DataConfig):
  """Input config for training."""
  decoder: DataDecoder = DataDecoder()
  parser: Parser = Parser()
  input_path: str = ''
  drop_remainder: bool = True
  file_type: str = 'tfrecord'
  is_training: bool = True
  global_batch_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabHead(hyperparams.Config):
  """Panoptic Deeplab head config."""
  level: int = 3
  num_convs: int = 2
  num_filters: int = 256
  kernel_size: int = 5
  use_depthwise_convolution: bool = False
  upsample_factor: int = 1
  low_level: List[int] = dataclasses.field(default_factory=lambda: [3, 2])
  low_level_num_filters: List[int] = dataclasses.field(
      default_factory=lambda: [64, 32])
  fusion_num_output_filters: int = 256


@dataclasses.dataclass
class SemanticHead(PanopticDeeplabHead):
  """Semantic head config."""
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class InstanceHead(PanopticDeeplabHead):
  """Instance head config."""
  prediction_kernel_size: int = 1


@dataclasses.dataclass
class PanopticDeeplabPostProcessor(hyperparams.Config):
  """Panoptic Deeplab PostProcessing config."""
  output_size: List[int] = dataclasses.field(default_factory=list)
  center_score_threshold: float = 0.1
  thing_class_ids: List[int] = dataclasses.field(default_factory=list)
  label_divisor: int = 256 * 256 * 256
  stuff_area_limit: int = 4096
  ignore_label: int = 0
  nms_kernel: int = 7
  keep_k_centers: int = 200
  rescale_predictions: bool = True


@dataclasses.dataclass
class PanopticDeeplab(hyperparams.Config):
  """Panoptic Deeplab model config."""
  num_classes: int = 2
  input_size: List[int] = dataclasses.field(default_factory=list)
  min_level: int = 3
  max_level: int = 6
  norm_activation: common.NormActivation = common.NormActivation()
  backbone: backbones.Backbone = backbones.Backbone(
      type='resnet', resnet=backbones.ResNet())
  decoder: decoders.Decoder = decoders.Decoder(type='aspp')
  semantic_head: SemanticHead = SemanticHead()
  instance_head: InstanceHead = InstanceHead()
  shared_decoder: bool = False
  generate_panoptic_masks: bool = True
  post_processor: PanopticDeeplabPostProcessor = PanopticDeeplabPostProcessor()


@dataclasses.dataclass
class Losses(hyperparams.Config):
  """Loss config."""
  label_smoothing: float = 0.0
  ignore_label: int = 0
  class_weights: List[float] = dataclasses.field(default_factory=list)
  l2_weight_decay: float = 1e-4
  top_k_percent_pixels: float = 0.15
  segmentation_loss_weight: float = 1.0
  center_heatmap_loss_weight: float = 200.0
  center_offset_loss_weight: float = 0.01


@dataclasses.dataclass
class Evaluation(hyperparams.Config):
  """Evaluation config."""
  ignored_label: int = 0
  max_instances_per_category: int = 256
  offset: int = 256 * 256 * 256
  is_thing: List[bool] = dataclasses.field(default_factory=list)
  rescale_predictions: bool = True
  report_per_class_pq: bool = False
  report_per_class_iou: bool = False
  report_train_mean_iou: bool = True  # Turning this off can speed up training.


@dataclasses.dataclass
class PanopticDeeplabTask(cfg.TaskConfig):
  """Panoptic deeplab task config."""
  model: PanopticDeeplab = PanopticDeeplab()
  train_data: DataConfig = DataConfig(is_training=True)
  validation_data: DataConfig = DataConfig(
      is_training=False, drop_remainder=False)
  losses: Losses = Losses()
  init_checkpoint: Optional[str] = None
  init_checkpoint_modules: Union[
      str, List[str]] = 'all'  # all, backbone, and/or decoder
  evaluation: Evaluation = Evaluation()


@exp_factory.register_config_factory('panoptic_deeplab_resnet_coco')
def panoptic_deeplab_coco() -> cfg.ExperimentConfig:
  """COCO panoptic segmentation with Panoptic Deeplab."""
  train_steps = 200000
  train_batch_size = 64
  eval_batch_size = 1
  steps_per_epoch = _COCO_TRAIN_EXAMPLES // train_batch_size
  validation_steps = _COCO_VAL_EXAMPLES // eval_batch_size

  num_panoptic_categories = 201
  num_thing_categories = 91
  ignore_label = 0

  # Label 0 is ignored; categories 1..num_thing_categories are "things",
  # the remaining categories are "stuff".
  is_thing = [False]
  for idx in range(1, num_panoptic_categories):
    is_thing.append(idx <= num_thing_categories)

  input_size = [640, 640, 3]
  output_stride = 16
  aspp_dilation_rates = [6, 12, 18]
  multigrid = [1, 2, 4]
  stem_type = 'v1'
  level = int(np.log2(output_stride))

  config = cfg.ExperimentConfig(
      runtime=cfg.RuntimeConfig(
          mixed_precision_dtype='bfloat16', enable_xla=True),
      task=PanopticDeeplabTask(
          init_checkpoint='gs://tf_model_garden/vision/panoptic/panoptic_deeplab/imagenet/resnet50_v1/ckpt-436800',  # pylint: disable=line-too-long
          init_checkpoint_modules=['backbone'],
          model=PanopticDeeplab(
              num_classes=num_panoptic_categories,
              input_size=input_size,
              backbone=backbones.Backbone(
                  type='dilated_resnet',
                  dilated_resnet=backbones.DilatedResNet(
                      model_id=50,
                      stem_type=stem_type,
                      output_stride=output_stride,
                      multigrid=multigrid,
                      se_ratio=0.25,
                      last_stage_repeats=1,
                      stochastic_depth_drop_rate=0.2)),
              decoder=decoders.Decoder(
                  type='aspp',
                  aspp=decoders.ASPP(
                      level=level,
                      num_filters=256,
                      pool_kernel_size=input_size[:2],
                      dilation_rates=aspp_dilation_rates,
                      use_depthwise_convolution=True,
                      dropout_rate=0.1)),
              semantic_head=SemanticHead(
                  level=level,
                  num_convs=1,
                  num_filters=256,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[64, 32],
                  fusion_num_output_filters=256,
                  prediction_kernel_size=1),
              instance_head=InstanceHead(
                  level=level,
                  num_convs=1,
                  num_filters=32,
                  kernel_size=5,
                  use_depthwise_convolution=True,
                  upsample_factor=1,
                  low_level=[3, 2],
                  low_level_num_filters=[32, 16],
                  fusion_num_output_filters=128,
                  prediction_kernel_size=1),
              shared_decoder=False,
              generate_panoptic_masks=True,
              post_processor=PanopticDeeplabPostProcessor(
                  output_size=input_size[:2],
                  center_score_threshold=0.1,
                  thing_class_ids=list(range(1, num_thing_categories)),
                  label_divisor=256,
                  stuff_area_limit=4096,
                  ignore_label=ignore_label,
                  nms_kernel=41,
                  keep_k_centers=200,
                  rescale_predictions=True)),
          losses=Losses(
              label_smoothing=0.0,
              ignore_label=ignore_label,
              l2_weight_decay=0.0,
              top_k_percent_pixels=0.2,
              segmentation_loss_weight=1.0,
              center_heatmap_loss_weight=200,
              center_offset_loss_weight=0.01),
          train_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'train*'),
              is_training=True,
              global_batch_size=train_batch_size,
              parser=Parser(
                  aug_scale_min=0.5,
                  aug_scale_max=1.5,
                  aug_rand_hflip=True,
                  aug_type=common.Augmentation(
                      type='autoaug',
                      autoaug=common.AutoAugment(
                          augmentation_name='panoptic_deeplab_policy')),
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0)),
          validation_data=DataConfig(
              input_path=os.path.join(_COCO_INPUT_PATH_BASE, 'val*'),
              is_training=False,
              global_batch_size=eval_batch_size,
              parser=Parser(
                  resize_eval_groundtruth=False,
                  groundtruth_padded_size=[640, 640],
                  aug_scale_min=1.0,
                  aug_scale_max=1.0,
                  aug_rand_hflip=False,
                  aug_type=None,
                  sigma=8.0,
                  small_instance_area_threshold=4096,
                  small_instance_weight=3.0),
              drop_remainder=False),
          evaluation=Evaluation(
              ignored_label=ignore_label,
              max_instances_per_category=256,
              offset=256 * 256 * 256,
              is_thing=is_thing,
              rescale_predictions=True,
              report_per_class_pq=False,
              report_per_class_iou=False,
              report_train_mean_iou=False)),
      trainer=cfg.TrainerConfig(
          train_steps=train_steps,
          validation_steps=validation_steps,
          validation_interval=steps_per_epoch,
          steps_per_loop=steps_per_epoch,
          summary_interval=steps_per_epoch,
          checkpoint_interval=steps_per_epoch,
          optimizer_config=optimization.OptimizationConfig({
              'optimizer': {
                  'type': 'adam',
              },
              'learning_rate': {
                  'type': 'polynomial',
                  'polynomial': {
                      'initial_learning_rate': 0.0005,
                      'decay_steps': train_steps,
                      'end_learning_rate': 0.0,
                      'power': 0.9
                  }
              },
              'warmup': {
                  'type': 'linear',
                  'linear': {
                      'warmup_steps': 2000,
                      'warmup_learning_rate': 0
                  }
              }
          })),
      restrictions=[
          'task.train_data.is_training != None',
          'task.validation_data.is_training != None'
      ])
  return config
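
Since `_COCO_INPUT_PATH_BASE` above is a relative placeholder, a consumer of this config typically overrides the data paths before training. Below is a hedged sketch (not part of this commit) using the `override`, `validate`, and `lock` methods inherited from `hyperparams.Config`; the `gs://my-bucket/...` paths are hypothetical:

```python
# Usage sketch (not part of this commit); bucket paths are hypothetical.
from official.core import exp_factory

config = exp_factory.get_exp_config('panoptic_deeplab_resnet_coco')
config.override({
    'task': {
        'train_data': {'input_path': 'gs://my-bucket/coco/tfrecords/train*'},
        'validation_data': {'input_path': 'gs://my-bucket/coco/tfrecords/val*'},
    }
}, is_strict=True)
config.validate()  # Enforces the `restrictions` declared in the config.
config.lock()
```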