diff --git a/requirements.txt b/requirements.txt index 0871ed666685..aaeedd9afd66 100755 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,9 @@ torch>=1.6.0 torchvision>=0.7.0 tqdm>=4.41.0 +# logging ------------------------------------- +# wandb + # coco ---------------------------------------- # pycocotools>=2.0 diff --git a/test.py b/test.py index ae42503b4d10..e2749908e1d6 100644 --- a/test.py +++ b/test.py @@ -33,7 +33,9 @@ def test(data, save_dir=Path(''), # for saving images save_txt=False, # for auto-labelling save_conf=False, - plots=True): + plots=True, + log_imgs=0): # number of logged images + # Initialize/load model and set device training = model is not None if training: # called by train.py @@ -77,6 +79,13 @@ def test(data, iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95 niou = iouv.numel() + # Logging + log_imgs = min(log_imgs, 100) # ceil + try: + import wandb # Weights & Biases + except ImportError: + log_imgs = 0 + # Dataloader if not training: img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img @@ -91,7 +100,7 @@ def test(data, s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95') p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0. loss = torch.zeros(3, device=device) - jdict, stats, ap, ap_class = [], [], [], [] + jdict, stats, ap, ap_class, wandb_images = [], [], [], [], [] for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): img = img.to(device, non_blocking=True) img = img.half() if half else img.float() # uint8 to fp16/32 @@ -139,6 +148,14 @@ def test(data, with open(str(out / Path(paths[si]).stem) + '.txt', 'a') as f: f.write(('%g ' * len(line) + '\n') % line) + # W&B logging + if len(wandb_images) < log_imgs: + bbox_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]}, + "class_id": int(cls), + "scores": {"class_score": conf}, + "domain": "pixel"} for *xyxy, conf, cls in pred.clone().tolist()] + wandb_images.append(wandb.Image(img[si], boxes={"predictions": {"box_data": bbox_data}})) + # Clip boxes to image bounds clip_coords(pred, (height, width)) @@ -196,6 +213,10 @@ def test(data, f = save_dir / f'test_batch{batch_i}_pred.jpg' plot_images(img, output_to_target(output, width, height), paths, str(f), names) # predictions + # W&B logging + if wandb_images: + wandb.log({"outputs": wandb_images}) + # Compute statistics stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy if len(stats) and stats[0].any(): diff --git a/train.py b/train.py index 943688453782..86d2b9d213d1 100644 --- a/train.py +++ b/train.py @@ -33,7 +33,7 @@ logger = logging.getLogger(__name__) -def train(hyp, opt, device, tb_writer=None): +def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory wdir = log_dir / 'weights' # weights directory @@ -118,6 +118,11 @@ def train(hyp, opt, device, tb_writer=None): scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) + # Logging + if wandb and wandb.run is None: + id = ckpt.get('wandb_id') if 'ckpt' in locals() else None + wandb_run = wandb.init(config=opt, resume="allow", project=os.path.basename(log_dir), id=id) + # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: @@ -317,7 +322,8 @@ def train(hyp, opt, device, tb_writer=None): single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir, - plots=epoch == 0 or final_epoch) # plot first and last + plots=epoch == 0 or final_epoch, # plot first and last + log_imgs=opt.log_imgs) # Write with open(results_file, 'a') as f: @@ -325,14 +331,16 @@ def train(hyp, opt, device, tb_writer=None): if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) - # Tensorboard - if tb_writer: - tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss - 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', - 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss - 'x/lr0', 'x/lr1', 'x/lr2'] # params - for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): - tb_writer.add_scalar(tag, x, epoch) + # Log + tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss + 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', + 'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss + 'x/lr0', 'x/lr1', 'x/lr2'] # params + for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): + if tb_writer: + tb_writer.add_scalar(tag, x, epoch) # tensorboard + if wandb: + wandb.log({tag: x}) # W&B # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95] @@ -347,7 +355,8 @@ def train(hyp, opt, device, tb_writer=None): 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, - 'optimizer': None if final_epoch else optimizer.state_dict()} + 'optimizer': None if final_epoch else optimizer.state_dict(), + 'wandb_id': wandb_run.id if wandb else None} # Save last, best and delete torch.save(ckpt, last) @@ -403,7 +412,9 @@ def train(hyp, opt, device, tb_writer=None): parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') parser.add_argument('--logdir', type=str, default='runs/', help='logging directory') + parser.add_argument('--log-imgs', type=int, default=10, help='number of images for W&B logging, max 100') parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers') + opt = parser.parse_args() # Set DDP variables @@ -452,12 +463,23 @@ def train(hyp, opt, device, tb_writer=None): # Train logger.info(opt) if not opt.evolve: - tb_writer = None + tb_writer, wandb = None, None # init loggers if opt.global_rank in [-1, 0]: + # Tensorboard logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/') tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0 - train(hyp, opt, device, tb_writer) + # W&B + try: + import wandb + + assert os.environ.get('WANDB_DISABLED') != 'true' + logger.info("Weights & Biases logging enabled, to disable set os.environ['WANDB_DISABLED'] = 'true'") + except (ImportError, AssertionError): + opt.log_imgs = 0 + logger.info("Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)") + + train(hyp, opt, device, tb_writer, wandb) # Evolve hyperparameters (optional) else: