Squashed commit of the following:

commit 94147314e559a6bdd13cb9de62490d385c27596f Merge: 65157e2 37acbdc Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Thu Jul 16 14:00:17 2020 +0800 Merge branch 'master' of https://github.com/ultralytics/yolov4 into feature/DDP_fixed commit 37acbdc Author: Glenn Jocher <glenn.jocher@ultralytics.com> Date: Wed Jul 15 20:03:41 2020 -0700 update test.py --save-txt commit b8c2da4 Author: Glenn Jocher <glenn.jocher@ultralytics.com> Date: Wed Jul 15 20:00:48 2020 -0700 update test.py --save-txt commit 65157e2 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Wed Jul 15 16:44:13 2020 +0800 Revert the README.md removal commit 1c802bf Merge: cd55b44 0f3b8bb Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Wed Jul 15 16:43:38 2020 +0800 Merge branch 'feature/DDP_fixed' of https://github.com/MagicFrogSJTU/yolov5 into feature/DDP_fixed commit cd55b44 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Wed Jul 15 16:42:33 2020 +0800 fix the DDP performance deterioration bug. commit 0f3b8bb Author: Glenn Jocher <glenn.jocher@ultralytics.com> Date: Wed Jul 15 00:28:53 2020 -0700 Delete README.md commit f5921ba Merge: 85ab2f3 bd3fdbb Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Wed Jul 15 11:20:17 2020 +0800 Merge branch 'feature/DDP_fixed' of https://github.com/MagicFrogSJTU/yolov5 into feature/DDP_fixed commit bd3fdbb Author: Glenn Jocher <glenn.jocher@ultralytics.com> Date: Tue Jul 14 18:38:20 2020 -0700 Update README.md commit c1a97a7 Merge: 2bf86b8 f796708 Author: Glenn Jocher <glenn.jocher@ultralytics.com> Date: Tue Jul 14 18:36:53 2020 -0700 Merge branch 'master' into feature/DDP_fixed commit 2bf86b8 Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 14 22:18:15 2020 +0700 Fixed world_size not found when called from test commit 85ab2f3 Merge: 5a19011 c8357ad Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 14 22:19:58 2020 +0800 Merge branch 'feature/DDP_fixed' of https://github.com/MagicFrogSJTU/yolov5 into feature/DDP_fixed commit 5a19011 
Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 14 22:19:15 2020 +0800 Add assertion for <=2 gpus DDP commit c8357ad Merge: e742dd9 787582f Author: yzchen <Chenyzsjtu@gmail.com> Date: Tue Jul 14 22:10:02 2020 +0800 Merge pull request #8 from MagicFrogSJTU/NanoCode012-patch-1 Modify number of dataloaders' workers commit 787582f Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 14 20:38:58 2020 +0700 Fixed issue with single gpu not having world_size commit 6364892 Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 14 19:16:15 2020 +0700 Add assert message for clarification Clarify why assertion was thrown to users commit 69364d6 Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 14 17:36:48 2020 +0700 Changed number of workers check commit d738487 Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 14 17:33:38 2020 +0700 Adding world_size Reduce calls to torch.distributed. For use in create_dataloader. commit e742dd9 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 14 15:38:48 2020 +0800 Make SyncBN a choice commit e90d400 Merge: 5bf8beb cd90360 Author: yzchen <Chenyzsjtu@gmail.com> Date: Tue Jul 14 15:32:10 2020 +0800 Merge pull request #6 from NanoCode012/patch-5 Update train.py commit cd90360 Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 14 13:39:29 2020 +0700 Update train.py Remove redundant `opt.` prefix. 
commit 5bf8beb Merge: c9558a9 a1c8406 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 14 14:09:51 2020 +0800 Merge branch 'master' of https://github.com/ultralytics/yolov5 into feature/DDP_fixed commit c9558a9 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 14 13:51:34 2020 +0800 Add device allocation for loss compute commit 4f08c69 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Thu Jul 9 11:16:27 2020 +0800 Revert drop_last commit 1dabe33 Merge: a1ce9b1 4b8450b Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Thu Jul 9 11:15:49 2020 +0800 Merge branch 'feature/DDP_fixed' of https://github.com/MagicFrogSJTU/yolov5 into feature/DDP_fixed commit a1ce9b1 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Thu Jul 9 11:15:21 2020 +0800 fix lr warning commit 4b8450b Merge: b9a50ae 02c63ef Author: yzchen <Chenyzsjtu@gmail.com> Date: Wed Jul 8 21:24:24 2020 +0800 Merge pull request #4 from NanoCode012/patch-4 Add drop_last for multi gpu commit 02c63ef Author: NanoCode012 <kevinvong@rocketmail.com> Date: Wed Jul 8 10:08:30 2020 +0700 Add drop_last for multi gpu commit b9a50ae Merge: ec2dc6c 121d90b Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 7 19:48:04 2020 +0800 Merge branch 'master' of https://github.com/ultralytics/yolov5 into feature/DDP_fixed commit ec2dc6c Merge: d0326e3 82a6182 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 7 19:34:31 2020 +0800 Merge branch 'feature/DDP_fixed' of https://github.com/MagicFrogSJTU/yolov5 into feature/DDP_fixed commit d0326e3 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Tue Jul 7 19:31:24 2020 +0800 Add SyncBN commit 82a6182 Merge: 96fa40a 050b2a5 Author: yzchen <Chenyzsjtu@gmail.com> Date: Tue Jul 7 19:21:01 2020 +0800 Merge pull request #1 from NanoCode012/patch-2 Convert BatchNorm to SyncBatchNorm commit 050b2a5 Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 7 12:38:14 2020 +0700 Add cleanup for process_group commit 2aa3301 Author: NanoCode012 
<kevinvong@rocketmail.com> Date: Tue Jul 7 12:07:40 2020 +0700 Remove apex.parallel. Use torch.nn.parallel For future compatibility commit 77c8e27 Author: NanoCode012 <kevinvong@rocketmail.com> Date: Tue Jul 7 01:54:39 2020 +0700 Convert BatchNorm to SyncBatchNorm commit 96fa40a Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Mon Jul 6 21:53:56 2020 +0800 Fix the dataset inconsistency problem commit 16e7c26 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Mon Jul 6 11:34:03 2020 +0800 Add loss multiplication to preserve the single-process performance commit e838055 Merge: 625bb49 3bdea3f Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Fri Jul 3 20:56:30 2020 +0800 Merge branch 'master' of https://github.com/ultralytics/yolov5 into feature/DDP_fixed commit 625bb49 Author: yizhi.chen <chenyzsjtu@outlook.com> Date: Thu Jul 2 22:45:15 2020 +0800 DDP established
MagicFrogSJTU · Jul 16, 2020 · 52a540d · 52a540d
1 parent 49fcf62
commit 52a540d
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 20 deletions.
diff --git a/test.py b/test.py
@@ -18,15 +18,21 @@ def test(data,
          model=None,
          dataloader=None,
          save_dir='',
-         merge=False):
+         merge=False,
+         save_txt=False):
     # Initialize/load model and set device
     training = model is not None
     if training:  # called by train.py
         device = next(model.parameters()).device  # get model device
 
     else:  # called directly
         device = torch_utils.select_device(opt.device, batch_size=batch_size)
-        merge = opt.merge  # use Merge NMS
+        merge, save_txt = opt.merge, opt.save_txt  # use Merge NMS, save *.txt labels
+        if save_txt:
+            out = Path('inference/output')
+            if os.path.exists(out):
+                shutil.rmtree(out)  # delete output folder
+            os.makedirs(out)  # make new output folder
 
         # Remove previous
         for f in glob.glob(str(Path(save_dir) / 'test_batch*.jpg')):
@@ -105,8 +111,14 @@ def test(data,
                 continue
 
             # Append to text file
-            # with open('test.txt', 'a') as file:
-            #    [file.write('%11.5g' * 7 % tuple(x) + '\n') for x in pred]
+            if save_txt:
+                gn = torch.tensor(shapes[si][0])[[1, 0, 1, 0]]  # normalization gain whwh
+                txt_path = str(out / Path(paths[si]).stem)
+                pred[:, :4] = scale_coords(img[si].shape[1:], pred[:, :4], shapes[si][0], shapes[si][1])  # to original
+                for *xyxy, conf, cls in pred:
+                    xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
+                    with open(txt_path + '.txt', 'a') as f:
+                        f.write(('%g ' * 5 + '\n') % (cls, *xywh))  # label format
 
             # Clip boxes to image bounds
             clip_coords(pred, (height, width))
@@ -235,6 +247,7 @@ def test(data,
     parser.add_argument('--augment', action='store_true', help='augmented inference')
     parser.add_argument('--merge', action='store_true', help='use Merge NMS')
     parser.add_argument('--verbose', action='store_true', help='report mAP by class')
+    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
     opt = parser.parse_args()
     opt.save_json |= opt.data.endswith('coco.yaml')
     opt.data = check_file(opt.data)  # check file

diff --git a/train.py b/train.py
@@ -70,7 +70,7 @@ def train(hyp, tb_writer, opt, device):
     # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs.
 
     # Configure
-    init_seeds(1)
+    init_seeds(2+local_rank)
     with open(opt.data) as f:
         data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
     train_path = data_dict['train']
@@ -208,20 +208,20 @@ def train(hyp, tb_writer, opt, device):
     model.names = names
 
     # Class frequency
-    if tb_writer:
-        # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
+    # Only one check and log is needed.
+    if local_rank in [-1, 0]:
         labels = np.concatenate(dataset.labels, 0)
         c = torch.tensor(labels[:, 0])  # classes
         # cf = torch.bincount(c.long(), minlength=nc) + 1.
         # model._initialize_biases(cf.to(device))
-        plot_labels(labels)
-        tb_writer.add_histogram('classes', c, 0)
-
-
-    # Check anchors
-    if not opt.noautoanchor:
-        check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
-
+        plot_labels(labels, save_dir=log_dir)
+        if tb_writer:
+            # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
+            tb_writer.add_histogram('classes', c, 0)
+
+        # Check anchors
+        if not opt.noautoanchor:
+            check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
     # Start training
     t0 = time.time()
     nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
@@ -414,7 +414,7 @@ def train(hyp, tb_writer, opt, device):
     parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path')
     parser.add_argument('--hyp', type=str, default='', help='hyp.yaml path (optional)')
     parser.add_argument('--epochs', type=int, default=300)
-    parser.add_argument('--batch-size', type=int, default=16, help="batch size for all gpus.")
+    parser.add_argument('--batch-size', type=int, default=16, help="Total batch size for all gpus.")
     parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
     parser.add_argument('--rect', action='store_true', help='rectangular training')
     parser.add_argument('--resume', nargs='?', const='get_last', default=False,
@@ -450,9 +450,9 @@ def train(hyp, tb_writer, opt, device):
     opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size)))  # extend to 2 sizes (train, test)
     device = torch_utils.select_device(opt.device, apex=mixed_precision, batch_size=opt.batch_size)
     opt.total_batch_size = opt.batch_size
+    opt.world_size = 1
     if device.type == 'cpu':
         mixed_precision = False
-        opt.world_size = 1
     elif opt.local_rank != -1:
         # DDP mode
         assert torch.cuda.device_count() > opt.local_rank
@@ -461,7 +461,7 @@ def train(hyp, tb_writer, opt, device):
         dist.init_process_group(backend='nccl', init_method='env://')  # distributed backend
 
         opt.world_size = dist.get_world_size()
-        assert opt.batch_size % opt.world_size == 0
+        assert opt.batch_size % opt.world_size == 0, "Batch size is not a multiple of the number of devices given!"
         opt.batch_size = opt.total_batch_size // opt.world_size
     print(opt)
 

diff --git a/utils/datasets.py b/utils/datasets.py
@@ -59,7 +59,7 @@ def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=Fa
                                     pad=pad)
 
     batch_size = min(batch_size, len(dataset))
-    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
+    nw = min([os.cpu_count()//(opt.world_size if hasattr(opt, "world_size") else 1), batch_size if batch_size > 1 else 0, 8])  # number of workers
     train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if local_rank != -1 else None
     dataloader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
@@ -305,7 +305,7 @@ def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, r
                     f += glob.iglob(p + os.sep + '*.*')
                 else:
                     raise Exception('%s does not exist' % p)
-            self.img_files = [x.replace('/', os.sep) for x in f if os.path.splitext(x)[-1].lower() in img_formats]
+            self.img_files = sorted([x.replace('/', os.sep) for x in f if os.path.splitext(x)[-1].lower() in img_formats])
         except Exception as e:
             raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url))