Skip to content
This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit

Permalink
Add new RegNet for SwAV (#214)
Browse files Browse the repository at this point in the history
Summary:
Adds a new RegNet configuration for SwAV, plus a unit test that checks the associated pre-training works on 1 node with 8 GPUs.

Pull Request resolved: fairinternal/ssl_scaling#214

Reviewed By: prigoyal

Differential Revision: D33801136

Pulled By: QuentinDuval

fbshipit-source-id: 3b8bf89039d91ab7cb9686bf8e60d640ace95907
  • Loading branch information
QuentinDuval authored and facebook-github-bot committed Jan 29, 2022
1 parent 7b18cd7 commit 7337369
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 0 deletions.
94 changes: 94 additions & 0 deletions configs/config/pretrain/seer/models/regnet10B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# @package _global_
# SwAV pre-training settings for the regnet10B model: selects the FSDP task,
# the regnet_fsdp trunk, the swav_head_fsdp head and the sgd_fsdp optimizer.
config:
TRAINER:
TASK_NAME: self_supervision_fsdp_task
DATA:
TRAIN:
BATCHSIZE_PER_REPLICA: 16
# Multi-crop augmentation: 6 crops in total (num_crops 2 + 4), with two crop
# sizes and two crop scale ranges listed in the same order.
TRANSFORMS:
- name: ImgPilToMultiCrop
total_num_crops: 6
size_crops: [160, 96]
num_crops: [2, 4]
crop_scales: [[0.14, 1], [0.05, 0.14]]
- name: RandomHorizontalFlip
p: 0.5
- name: ImgPilColorDistortion
strength: 1.0
- name: ImgPilGaussianBlur
p: 0.5
radius_min: 0.1
radius_max: 2.0
- name: ToTensor
- name: Normalize
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
COLLATE_FUNCTION_PARAMS:
create_multidimensional_tensor: True
MODEL:
TRUNK:
NAME: regnet_fsdp
REGNET:
block_type: res_bottleneck_block
depth: 27
group_width: 1010
w_0: 1744
w_a: 620.83
w_m: 2.52
# Four entries, presumably one per trunk stage, listing the block indices at
# which activations are checkpointed - TODO confirm against the trunk code.
stage_checkpoints: [[2], [7], [9, 17], []]
# NOTE(review): dims[0] (28280) presumably has to equal the trunk's output
# feature width - confirm whenever the REGNET settings above are changed.
# num_clusters uses the same value as num_prototypes under swav_loss ([16000]).
HEAD:
PARAMS: [
["swav_head_fsdp", {
"dims": [28280, 8192, 8192, 256],
"use_bn": False,
"num_clusters": [16000]
}],
]
# Mixed precision is enabled with float16 compute; fp32 reduce-scatter is off.
FSDP_CONFIG:
AUTO_WRAP_THRESHOLD: 100000000
flatten_parameters: False
mixed_precision: True
fp32_reduce_scatter: False
compute_dtype: float16
CUDA_CACHE:
CLEAR_CUDA_CACHE: True
CLEAR_FREQ: 5000
SYNC_BN_CONFIG:
CONVERT_BN_TO_SYNC_BN: True
SYNC_BN_TYPE: "pytorch"
AMP_PARAMS:
USE_AMP: True
AMP_TYPE: "pytorch"
ACTIVATION_CHECKPOINTING:
USE_ACTIVATION_CHECKPOINTING: True
LOSS:
swav_loss:
num_iters: 10
epsilon: 0.03
temp_hard_assignment_iters: 0
# Same crop count as total_num_crops in the ImgPilToMultiCrop transform.
num_crops: 6
num_prototypes: [16000]
OPTIMIZER:
name: "sgd_fsdp"
use_larc: True
construct_single_param_group_only: True
weight_decay: 0.00001
num_epochs: 1
param_schedulers:
lr:
# The learning rate is scaled automatically, following the scaling rule
# specified in https://arxiv.org/abs/1706.02677 (ImageNet in 1 Hour).
auto_lr_scaling:
auto_scale: True
base_value: 0.3
# The two lengths sum to 1.0; presumably each is the fraction of training
# spent in one phase of the lr schedule - TODO confirm.
lengths: [0.043648,0.956352]
CHECKPOINT:
CHECKPOINT_ITER_FREQUENCY: 100
LATEST_CHECKPOINT_RESUME_FILE_NUM: 1
USE_SYMLINK_CHECKPOINT_FOR_RESUME: True
# 62 nodes x 8 processes per node = 496 processes in total.
DISTRIBUTED:
NCCL_DEBUG: False
NUM_NODES: 62
NUM_PROC_PER_NODE: 8
NCCL_SOCKET_NTHREADS: ''
LOG_FREQUENCY: 1
52 changes: 52 additions & 0 deletions tests/test_regnet_fsdp_10b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import unittest

from vissl.utils.hydra_config import compose_hydra_configuration, convert_to_attrdict
from vissl.utils.test_utils import (
gpu_test,
in_temporary_directory,
run_integration_test,
)


class TestRegnet10B(unittest.TestCase):
    """
    Integration test for SwAV pre-training using the regnet10B model config.
    """

    @staticmethod
    def _create_10B_pretrain_config(num_gpus: int, num_steps: int, batch_size: int):
        """
        Compose the pre-training configuration used by the test.

        The configuration starts from "pretrain/swav/swav_8node_resnet", adds
        the "regnet10B" model config on top, and overrides it to run on a
        single node with synthetic images.

        Args:
            num_gpus: value used for DISTRIBUTED.NUM_PROC_PER_NODE
            num_steps: number of steps, only used to size DATA_LIMIT
            batch_size: value used for DATA.TRAIN.BATCHSIZE_PER_REPLICA

        Returns:
            The second element of the pair returned by convert_to_attrdict.
        """
        # Enough samples for every process to get `num_steps` batches
        data_limit = num_steps * batch_size * num_gpus

        model_overrides = [
            "config=pretrain/swav/swav_8node_resnet",
            "+config/pretrain/seer/models=regnet10B",
            "config.OPTIMIZER.num_epochs=1",
            "config.LOG_FREQUENCY=1",
        ]
        # Testing on fake images
        data_overrides = [
            "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
            "config.DATA.TRAIN.RANDOM_SYNTHETIC_IMAGES=True",
            "config.DATA.TRAIN.USE_DEBUGGING_SAMPLER=True",
        ]
        # Disable overlap communication and computation for test
        fsdp_overrides = ["config.MODEL.FSDP_CONFIG.FORCE_SYNC_CUDA=True"]
        # Testing on 8 V100 32GB GPU only
        hardware_overrides = [
            f"config.DATA.TRAIN.BATCHSIZE_PER_REPLICA={batch_size}",
            f"config.DATA.TRAIN.DATA_LIMIT={data_limit}",
            "config.DISTRIBUTED.NUM_NODES=1",
            f"config.DISTRIBUTED.NUM_PROC_PER_NODE={num_gpus}",
            "config.DISTRIBUTED.RUN_ID=auto",
        ]

        # The relative order of the overrides is kept as in the single-list form
        overrides = (
            model_overrides + data_overrides + fsdp_overrides + hardware_overrides
        )
        cfg = compose_hydra_configuration(overrides)
        _, config = convert_to_attrdict(cfg)
        return config

    @gpu_test(gpu_count=8)
    def test_regnet_10b_swav_pretraining(self):
        """
        Run a two-step pre-training on 8 GPUs inside a temporary directory
        and check that exactly two losses are reported.
        """
        with in_temporary_directory():
            config = self._create_10B_pretrain_config(
                num_gpus=8,
                num_steps=2,
                batch_size=4,
            )
            losses = run_integration_test(config).get_losses()
            print(losses)
            self.assertEqual(len(losses), 2)

0 comments on commit 7337369

Please sign in to comment.