From 77c8e27e603bea9a69e7647587ca8d509dc1990d Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Tue, 7 Jul 2020 01:54:39 +0700
Subject: [PATCH 1/3] Convert BatchNorm to SyncBatchNorm

---
 train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/train.py b/train.py
index ce211f1f5322..9b5deee5d5aa 100644
--- a/train.py
+++ b/train.py
@@ -194,6 +194,7 @@ def train(hyp, tb_writer, opt, device):
     # DDP mode
     if device.type != 'cpu' and opt.local_rank != -1:
         # pip install torch==1.4.0+cku100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
+        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
         if mixed_precision:
             model = DDP(model, delay_allreduce=True)
         else:

From 2aa330139f3cc1237aeb3132245ed7e5d6da1683 Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Tue, 7 Jul 2020 12:07:40 +0700
Subject: [PATCH 2/3] Remove apex.parallel. Use torch.nn.parallel

For future compatibility
---
 train.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/train.py b/train.py
index 9b5deee5d5aa..7eaedf5ecdf0 100644
--- a/train.py
+++ b/train.py
@@ -7,6 +7,7 @@
 import torch.optim.lr_scheduler as lr_scheduler
 import torch.utils.data
 from torch.utils.tensorboard import SummaryWriter
+from torch.nn.parallel import DistributedDataParallel as DDP
 
 import test  # import test.py to get mAP after each epoch
 from models.yolo import Model
@@ -17,9 +18,7 @@
 mixed_precision = True
 try:  # Mixed precision training https://github.com/NVIDIA/apex
     from apex import amp
-    from apex.parallel import DistributedDataParallel as DDP
 except:
-    from torch.nn.parallel import DistributedDataParallel as DDP
     print('Apex recommended for faster mixed precision training: https://github.com/NVIDIA/apex')
     mixed_precision = False  # not installed
 
@@ -195,10 +194,7 @@ def train(hyp, tb_writer, opt, device):
     if device.type != 'cpu' and opt.local_rank != -1:
         # pip install torch==1.4.0+cku100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
-        if mixed_precision:
-            model = DDP(model, delay_allreduce=True)
-        else:
-            model = DDP(model, device_ids=[opt.local_rank])
+        model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)
 
     # Model parameters
     hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset

From 050b2a5a79a89c9405854d439a1f70f892139b1c Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Tue, 7 Jul 2020 12:38:14 +0700
Subject: [PATCH 3/3] Add cleanup for process_group

---
 train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/train.py b/train.py
index 7eaedf5ecdf0..27c83cd7e56a 100644
--- a/train.py
+++ b/train.py
@@ -399,6 +399,7 @@ def train(hyp, tb_writer, opt, device):
     if not opt.evolve:
         plot_results()  # save as results.png
         print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
 
+    dist.destroy_process_group() if device.type != 'cpu' and torch.cuda.device_count() > 1 else None
     torch.cuda.empty_cache()
     return results
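
For context, here is a minimal standalone sketch of the DDP pattern these three patches move train.py toward: a --local_rank argument filled in by the launcher, BatchNorm converted to SyncBatchNorm, the model wrapped with torch.nn.parallel DistributedDataParallel instead of apex.parallel, and the process group destroyed at the end. The toy model, script name, and launch command are illustrative assumptions, not part of the PR; the real train.py builds models.yolo.Model and runs the full training loop.

# Minimal sketch of the DDP setup/teardown pattern adopted by these patches.
# Assumed launch (PyTorch 1.x-era launcher, as in this PR's timeframe):
#   python -m torch.distributed.launch --nproc_per_node=2 ddp_sketch.py
import argparse

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=-1)  # set by the launcher
    opt = parser.parse_args()

    # One process per GPU: bind this process to its GPU and join the process group
    torch.cuda.set_device(opt.local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')
    device = torch.device('cuda', opt.local_rank)

    # Toy model standing in for models.yolo.Model; its BatchNorm2d layer is what
    # convert_sync_batchnorm rewrites.
    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU()).to(device)

    # Patch 1: convert BatchNorm to SyncBatchNorm so batch statistics sync across ranks
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # Patch 2: wrap with torch.nn.parallel DDP (instead of apex.parallel DDP)
    model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)

    # ... training loop would go here; a single forward pass stands in for it ...
    out = model(torch.randn(2, 3, 32, 32, device=device))
    print(f'rank {opt.local_rank}: output shape {tuple(out.shape)}')

    # Patch 3: clean up the process group once training finishes
    dist.destroy_process_group()


if __name__ == '__main__':
    main()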