diff --git a/train.py b/train.py
index 1d26b529ffed..ba69de43d255 100644
--- a/train.py
+++ b/train.py
@@ -67,7 +67,7 @@ def train(hyp, tb_writer, opt, device):
     local_rank = opt.local_rank

     # TODO: Init DDP logging. Only the first process is allowed to log.
-    # Since I see lots of print here, the logging is skipped here.
+    # Since there is a lot of printing here, the logging configuration is skipped for now; repeated outputs may appear.

     # Configure
     init_seeds(1)
@@ -177,7 +177,8 @@ def train(hyp, tb_writer, opt, device):
     # From https://github.com/rwightman/pytorch-image-models/blob/master/train.py:
     # "Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper"
     # chenyzsjtu: ema should be placed after SyncBN, as SyncBN introduces new modules.
-    if device.type != 'cpu' and local_rank != -1:
+    if opt.sync_bn and device.type != 'cpu' and local_rank != -1:
+        print("SyncBN activated!")
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
     ema = torch_utils.ModelEMA(model) if local_rank in [-1, 0] else None

@@ -258,11 +259,10 @@ def train(hyp, tb_writer, opt, device):
         mloss = torch.zeros(4, device=device)  # mean losses
         if local_rank != -1:
             dataloader.sampler.set_epoch(epoch)
+        pbar = enumerate(dataloader)
         if local_rank in [-1, 0]:
             print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
-            pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
-        else:
-            pbar = enumerate(dataloader)
+            pbar = tqdm(pbar, total=nb)  # progress bar
         optimizer.zero_grad()
         for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
             ni = i + nb * epoch  # number integrated batches (since train start)
@@ -429,6 +429,7 @@ def train(hyp, tb_writer, opt, device):
     parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
     parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
     parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
+    parser.add_argument("--sync-bn", action="store_true", help="Use sync-bn, only available in DDP mode.")
     # Parameter For DDP.
     parser.add_argument('--local_rank', type=int, default=-1, help="Extra parameter for DDP implementation. Don't use it manually.")
     opt = parser.parse_args()
@@ -437,7 +438,7 @@ def train(hyp, tb_writer, opt, device):
     if last and not opt.weights:
         print(f'Resuming training from {last}')
     opt.weights = last if opt.resume and not opt.weights else opt.weights
-    with torch_distributed_zero_first(opt.local_rank):
+    if opt.local_rank in [-1, 0]:
         check_git_status()
     opt.cfg = check_file(opt.cfg)  # check file
     opt.data = check_file(opt.data)  # check file
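
For context, here is a minimal sketch (not part of the diff above) of the ordering this change relies on: BatchNorm layers are converted to SyncBatchNorm only when --sync-bn is set and the process is actually running under DDP, and the conversion has to happen before the model is wrapped in DistributedDataParallel. The build_model and setup_model helpers below are illustrative placeholders, not code from train.py.

# Sketch only; assumes torch.distributed has already been initialised elsewhere
# (train.py does this when --local_rank is provided by the launcher).
import torch
import torch.nn as nn


def build_model():
    # Hypothetical stand-in; the real script builds a YOLO model from opt.cfg.
    return nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())


def setup_model(local_rank: int, sync_bn: bool) -> nn.Module:
    ddp = local_rank != -1 and torch.cuda.is_available()
    device = torch.device(f'cuda:{local_rank}' if ddp else 'cpu')
    model = build_model().to(device)

    # SyncBN only makes sense across multiple processes, so gate it on DDP mode,
    # mirroring `opt.sync_bn and device.type != 'cpu' and local_rank != -1`.
    if sync_bn and ddp:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
        print("SyncBN activated!")

    # Wrap with DistributedDataParallel only after the SyncBN conversion,
    # so the wrapper sees the converted modules.
    if ddp:
        model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    return model

Under DDP the script is typically started with one process per GPU, e.g. `python -m torch.distributed.launch --nproc_per_node 2 train.py --sync-bn ...` (illustrative; the launcher supplies --local_rank, and the remaining flags depend on the rest of the command line).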