diff --git a/models/common.py b/models/common.py index 464d639a1f0b..83cc8b5ce27b 100644 --- a/models/common.py +++ b/models/common.py @@ -278,7 +278,7 @@ def display(self, pprint=False, show=False, save=False, render=False, save_dir=' def print(self): self.display(pprint=True) # print results print(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {tuple(self.s)}' % - tuple(self.t)) + tuple(self.t)) def show(self): self.display(show=True) # show results diff --git a/test.py b/test.py index 39e0992264ec..61d6965f7414 100644 --- a/test.py +++ b/test.py @@ -35,8 +35,9 @@ def test(data, save_hybrid=False, # for hybrid auto-labelling save_conf=False, # save auto-label confidences plots=True, - log_imgs=0, # number of logged images - compute_loss=None): + wandb_logger=None, + compute_loss=None, + is_coco=False): # Initialize/load model and set device training = model is not None if training: # called by train.py @@ -66,21 +67,19 @@ def test(data, # Configure model.eval() - is_coco = data.endswith('coco.yaml') # is COCO dataset - with open(data) as f: - data = yaml.load(f, Loader=yaml.SafeLoader) # model dict + if isinstance(data, str): + is_coco = data.endswith('coco.yaml') + with open(data) as f: + data = yaml.load(f, Loader=yaml.SafeLoader) check_dataset(data) # check nc = 1 if single_cls else int(data['nc']) # number of classes iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95 niou = iouv.numel() # Logging - log_imgs, wandb = min(log_imgs, 100), None # ceil - try: - import wandb # Weights & Biases - except ImportError: - log_imgs = 0 - + log_imgs = 0 + if wandb_logger and wandb_logger.wandb: + log_imgs = min(wandb_logger.log_imgs, 100) # Dataloader if not training: if device.type != 'cpu': @@ -147,15 +146,17 @@ def test(data, with open(save_dir / 'labels' / (path.stem + '.txt'), 'a') as f: f.write(('%g ' * len(line)).rstrip() % line + '\n') - # W&B logging - if plots and len(wandb_images) < log_imgs: - box_data = 
[{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]}, - "class_id": int(cls), - "box_caption": "%s %.3f" % (names[cls], conf), - "scores": {"class_score": conf}, - "domain": "pixel"} for *xyxy, conf, cls in pred.tolist()] - boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space - wandb_images.append(wandb.Image(img[si], boxes=boxes, caption=path.name)) + # W&B logging - Media Panel Plots + if len(wandb_images) < log_imgs and wandb_logger.current_epoch > 0: # Check for test operation + if wandb_logger.current_epoch % wandb_logger.bbox_interval == 0: + box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]}, + "class_id": int(cls), + "box_caption": "%s %.3f" % (names[cls], conf), + "scores": {"class_score": conf}, + "domain": "pixel"} for *xyxy, conf, cls in pred.tolist()] + boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space + wandb_images.append(wandb_logger.wandb.Image(img[si], boxes=boxes, caption=path.name)) + wandb_logger.log_training_progress(predn, path, names) # logs dsviz tables # Append to pycocotools JSON dictionary if save_json: @@ -239,9 +240,11 @@ def test(data, # Plots if plots: confusion_matrix.plot(save_dir=save_dir, names=list(names.values())) - if wandb and wandb.run: - val_batches = [wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('test*.jpg'))] - wandb.log({"Images": wandb_images, "Validation": val_batches}, commit=False) + if wandb_logger and wandb_logger.wandb: + val_batches = [wandb_logger.wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('test*.jpg'))] + wandb_logger.log({"Validation": val_batches}) + if wandb_images: + wandb_logger.log({"Bounding Box Debugger/Images": wandb_images}) # Save JSON if save_json and len(jdict): diff --git a/utils/wandb_logging/log_dataset.py b/utils/wandb_logging/log_dataset.py index d790a9ce721e..97e68425cddd 100644 --- 
a/utils/wandb_logging/log_dataset.py +++ b/utils/wandb_logging/log_dataset.py @@ -12,20 +12,7 @@ def create_dataset_artifact(opt): with open(opt.data) as f: data = yaml.load(f, Loader=yaml.SafeLoader) # data dict - logger = WandbLogger(opt, '', None, data, job_type='create_dataset') - nc, names = (1, ['item']) if opt.single_cls else (int(data['nc']), data['names']) - names = {k: v for k, v in enumerate(names)} # to index dictionary - logger.log_dataset_artifact(LoadImagesAndLabels(data['train']), names, name='train') # trainset - logger.log_dataset_artifact(LoadImagesAndLabels(data['val']), names, name='val') # valset - - # Update data.yaml with artifact links - data['train'] = WANDB_ARTIFACT_PREFIX + str(Path(opt.project) / 'train') - data['val'] = WANDB_ARTIFACT_PREFIX + str(Path(opt.project) / 'val') - path = opt.data if opt.overwrite_config else opt.data.replace('.', '_wandb.') # updated data.yaml path - data.pop('download', None) # download via artifact instead of predefined field 'download:' - with open(path, 'w') as f: - yaml.dump(data, f) - print("New Config file => ", path) + logger = WandbLogger(opt, '', None, data, job_type='Dataset Creation') if __name__ == '__main__': @@ -33,7 +20,6 @@ def create_dataset_artifact(opt): parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path') parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') parser.add_argument('--project', type=str, default='YOLOv5', help='name of W&B Project') - parser.add_argument('--overwrite_config', action='store_true', help='overwrite data.yaml') opt = parser.parse_args() create_dataset_artifact(opt) diff --git a/utils/wandb_logging/wandb_utils.py b/utils/wandb_logging/wandb_utils.py index 264cd4840e3c..c9a32f5b6026 100644 --- a/utils/wandb_logging/wandb_utils.py +++ b/utils/wandb_logging/wandb_utils.py @@ -1,13 +1,18 @@ +import argparse import json +import os import shutil import sys +import torch +import yaml from 
datetime import datetime from pathlib import Path - -import torch +from tqdm import tqdm sys.path.append(str(Path(__file__).parent.parent.parent)) # add utils/ to path -from utils.general import colorstr, xywh2xyxy +from utils.datasets import LoadImagesAndLabels +from utils.datasets import img2label_paths +from utils.general import colorstr, xywh2xyxy, check_dataset try: import wandb @@ -22,87 +27,183 @@ def remove_prefix(from_string, prefix): return from_string[len(prefix):] +def check_wandb_config_file(data_config_file): + wandb_config = '_wandb.'.join(data_config_file.rsplit('.', 1)) # updated data.yaml path + if Path(wandb_config).is_file(): + return wandb_config + return data_config_file + + +def resume_and_get_id(opt): + # It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call, so wandb-artifact:// runs are resumed here + if isinstance(opt.resume, str): + if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): + run_path = Path(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX)) + run_id = run_path.stem + project = run_path.parent.stem + model_artifact_name = WANDB_ARTIFACT_PREFIX + 'run_' + run_id + '_model' + assert wandb, 'install wandb to resume wandb runs' + # Resume wandb-artifact:// runs here | workaround for not overwriting wandb.config + run = wandb.init(id=run_id, project=project, resume='allow') + opt.resume = model_artifact_name + return run + return None + + class WandbLogger(): def __init__(self, opt, name, run_id, data_dict, job_type='Training'): - self.wandb = wandb - self.wandb_run = wandb.init(config=opt, resume="allow", - project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, - name=name, - job_type=job_type, - id=run_id) if self.wandb else None - - if job_type == 'Training': - self.setup_training(opt, data_dict) - if opt.bbox_interval == -1: - opt.bbox_interval = (opt.epochs // 10) if opt.epochs > 10 else opt.epochs - if opt.save_period == -1: - opt.save_period = (opt.epochs // 10) if opt.epochs > 10 
else opt.epochs + # Pre-training routine -- + self.job_type = job_type + self.wandb, self.wandb_run, self.data_dict = wandb, None if not wandb else wandb.run, data_dict + if self.wandb: + self.wandb_run = wandb.init(config=opt, + resume="allow", + project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, + name=name, + job_type=job_type, + id=run_id) if not wandb.run else wandb.run + if self.job_type == 'Training': + if not opt.resume: + wandb_data_dict = self.check_and_upload_dataset(opt) if opt.upload_dataset else data_dict + # Info useful for resuming from artifacts + self.wandb_run.config.opt = vars(opt) + self.wandb_run.config.data_dict = wandb_data_dict + self.data_dict = self.setup_training(opt, data_dict) + if self.job_type == 'Dataset Creation': + self.data_dict = self.check_and_upload_dataset(opt) + + def check_and_upload_dataset(self, opt): + assert wandb, 'Install wandb to upload dataset' + check_dataset(self.data_dict) + config_path = self.log_dataset_artifact(opt.data, + opt.single_cls, + 'YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem) + print("Created dataset config file ", config_path) + with open(config_path) as f: + wandb_data_dict = yaml.load(f, Loader=yaml.SafeLoader) + return wandb_data_dict def setup_training(self, opt, data_dict): - self.log_dict = {} - self.train_artifact_path, self.trainset_artifact = \ - self.download_dataset_artifact(data_dict['train'], opt.artifact_alias) - self.test_artifact_path, self.testset_artifact = \ - self.download_dataset_artifact(data_dict['val'], opt.artifact_alias) - self.result_artifact, self.result_table, self.weights = None, None, None - if self.train_artifact_path is not None: - train_path = Path(self.train_artifact_path) / 'data/images/' - data_dict['train'] = str(train_path) - if self.test_artifact_path is not None: - test_path = Path(self.test_artifact_path) / 'data/images/' - data_dict['val'] = str(test_path) + self.log_dict, self.current_epoch, self.log_imgs = 
{}, 0, 16 # Logging Constants + self.bbox_interval = opt.bbox_interval + if isinstance(opt.resume, str): + modeldir, _ = self.download_model_artifact(opt) + if modeldir: + self.weights = Path(modeldir) / "last.pt" + config = self.wandb_run.config + opt.weights, opt.save_period, opt.batch_size, opt.bbox_interval, opt.epochs, opt.hyp = str( + self.weights), config.save_period, config.total_batch_size, config.bbox_interval, config.epochs, \ + config.opt['hyp'] + data_dict = dict(self.wandb_run.config.data_dict) # eliminates the need for config file to resume + if 'val_artifact' not in self.__dict__: # If --upload_dataset is set, use the existing artifact, don't download + self.train_artifact_path, self.train_artifact = self.download_dataset_artifact(data_dict.get('train'), + opt.artifact_alias) + self.val_artifact_path, self.val_artifact = self.download_dataset_artifact(data_dict.get('val'), + opt.artifact_alias) + self.result_artifact, self.result_table, self.val_table, self.weights = None, None, None, None + if self.train_artifact_path is not None: + train_path = Path(self.train_artifact_path) / 'data/images/' + data_dict['train'] = str(train_path) + if self.val_artifact_path is not None: + val_path = Path(self.val_artifact_path) / 'data/images/' + data_dict['val'] = str(val_path) + self.val_table = self.val_artifact.get("val") + self.map_val_table_path() + if self.val_artifact is not None: self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation") self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"]) - if opt.resume_from_artifact: - modeldir, _ = self.download_model_artifact(opt.resume_from_artifact) - if modeldir: - self.weights = Path(modeldir) / "best.pt" - opt.weights = self.weights + if opt.bbox_interval == -1: + self.bbox_interval = opt.bbox_interval = (opt.epochs // 10) if opt.epochs > 10 else 1 + return data_dict def download_dataset_artifact(self, path, alias): if 
path.startswith(WANDB_ARTIFACT_PREFIX): dataset_artifact = wandb.use_artifact(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias) assert dataset_artifact is not None, "'Error: W&B dataset artifact doesn\'t exist'" datadir = dataset_artifact.download() - labels_zip = Path(datadir) / "data/labels.zip" - shutil.unpack_archive(labels_zip, Path(datadir) / 'data/labels', 'zip') - print("Downloaded dataset to : ", datadir) return datadir, dataset_artifact return None, None - def download_model_artifact(self, name): - model_artifact = wandb.use_artifact(name + ":latest") - assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist' - modeldir = model_artifact.download() - print("Downloaded model to : ", modeldir) - return modeldir, model_artifact + def download_model_artifact(self, opt): + if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): + model_artifact = wandb.use_artifact(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX) + ":latest") + assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist' + modeldir = model_artifact.download() + epochs_trained = model_artifact.metadata.get('epochs_trained') + total_epochs = model_artifact.metadata.get('total_epochs') + assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' 
% ( + total_epochs) + return modeldir, model_artifact + return None, None - def log_model(self, path, opt, epoch): - datetime_suffix = datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + def log_model(self, path, opt, epoch, fitness_score, best_model=False): model_artifact = wandb.Artifact('run_' + wandb.run.id + '_model', type='model', metadata={ 'original_url': str(path), - 'epoch': epoch + 1, + 'epochs_trained': epoch + 1, 'save period': opt.save_period, 'project': opt.project, - 'datetime': datetime_suffix + 'total_epochs': opt.epochs, + 'fitness_score': fitness_score }) model_artifact.add_file(str(path / 'last.pt'), name='last.pt') - model_artifact.add_file(str(path / 'best.pt'), name='best.pt') - wandb.log_artifact(model_artifact) + wandb.log_artifact(model_artifact, + aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else '']) print("Saving model artifact on epoch ", epoch + 1) - def log_dataset_artifact(self, dataset, class_to_id, name='dataset'): + def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False): + with open(data_file) as f: + data = yaml.load(f, Loader=yaml.SafeLoader) # data dict + nc, names = (1, ['item']) if single_cls else (int(data['nc']), data['names']) + names = {k: v for k, v in enumerate(names)} # to index dictionary + self.train_artifact = self.create_dataset_table(LoadImagesAndLabels( + data['train']), names, name='train') if data.get('train') else None + self.val_artifact = self.create_dataset_table(LoadImagesAndLabels( + data['val']), names, name='val') if data.get('val') else None + if data.get('train'): + data['train'] = WANDB_ARTIFACT_PREFIX + str(Path(project) / 'train') + if data.get('val'): + data['val'] = WANDB_ARTIFACT_PREFIX + str(Path(project) / 'val') + path = data_file if overwrite_config else '_wandb.'.join(data_file.rsplit('.', 1)) # updated data.yaml path + data.pop('download', None) + with open(path, 'w') as f: + yaml.dump(data, f) + + if self.job_type == 
'Training': # builds correct artifact pipeline graph + self.wandb_run.use_artifact(self.val_artifact) + self.wandb_run.use_artifact(self.train_artifact) + self.val_artifact.wait() + self.val_table = self.val_artifact.get('val') + self.map_val_table_path() + else: + self.wandb_run.log_artifact(self.train_artifact) + self.wandb_run.log_artifact(self.val_artifact) + return path + + def map_val_table_path(self): + self.val_table_map = {} + print("Mapping dataset") + for i, data in enumerate(tqdm(self.val_table.data)): + self.val_table_map[data[3]] = data[0] + + def create_dataset_table(self, dataset, class_to_id, name='dataset'): + # TODO: Explore multiprocessing to split this loop in parallel | This is essential for speeding up the logging artifact = wandb.Artifact(name=name, type="dataset") - image_path = dataset.path - artifact.add_dir(image_path, name='data/images') - table = wandb.Table(columns=["id", "train_image", "Classes"]) + for img_file in tqdm([dataset.path]) if Path(dataset.path).is_dir() else tqdm(dataset.img_files): + if Path(img_file).is_dir(): + artifact.add_dir(img_file, name='data/images') + labels_path = 'labels'.join(dataset.path.rsplit('images', 1)) + artifact.add_dir(labels_path, name='data/labels') + else: + artifact.add_file(img_file, name='data/images/' + Path(img_file).name) + label_file = Path(img2label_paths([img_file])[0]) + artifact.add_file(str(label_file), + name='data/labels/' + label_file.name) if label_file.exists() else None + table = wandb.Table(columns=["id", "train_image", "Classes", "name"]) class_set = wandb.Classes([{'id': id, 'name': name} for id, name in class_to_id.items()]) - for si, (img, labels, paths, shapes) in enumerate(dataset): + for si, (img, labels, paths, shapes) in enumerate(tqdm(dataset)): height, width = shapes[0] - labels[:, 2:] = (xywh2xyxy(labels[:, 2:].view(-1, 4))) - labels[:, 2:] *= torch.Tensor([width, height, width, height]) - box_data = [] - img_classes = {} + labels[:, 2:] = (xywh2xyxy(labels[:, 
2:].view(-1, 4))) * torch.Tensor([width, height, width, height]) + box_data, img_classes = [], {} for cls, *xyxy in labels[:, 1:].tolist(): cls = int(cls) box_data.append({"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]}, @@ -112,34 +213,52 @@ def log_dataset_artifact(self, dataset, class_to_id, name='dataset'): "domain": "pixel"}) img_classes[cls] = class_to_id[cls] boxes = {"ground_truth": {"box_data": box_data, "class_labels": class_to_id}} # inference-space - table.add_data(si, wandb.Image(paths, classes=class_set, boxes=boxes), json.dumps(img_classes)) + table.add_data(si, wandb.Image(paths, classes=class_set, boxes=boxes), json.dumps(img_classes), + Path(paths).name) artifact.add(table, name) - labels_path = 'labels'.join(image_path.rsplit('images', 1)) - zip_path = Path(labels_path).parent / (name + '_labels.zip') - if not zip_path.is_file(): # make_archive won't check if file exists - shutil.make_archive(zip_path.with_suffix(''), 'zip', labels_path) - artifact.add_file(str(zip_path), name='data/labels.zip') - wandb.log_artifact(artifact) - print("Saving data to W&B...") + return artifact + + def log_training_progress(self, predn, path, names): + if self.val_table and self.result_table: + class_set = wandb.Classes([{'id': id, 'name': name} for id, name in names.items()]) + box_data = [] + total_conf = 0 + for *xyxy, conf, cls in predn.tolist(): + if conf >= 0.25: + box_data.append( + {"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]}, + "class_id": int(cls), + "box_caption": "%s %.3f" % (names[cls], conf), + "scores": {"class_score": conf}, + "domain": "pixel"}) + total_conf = total_conf + conf + boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space + id = self.val_table_map[Path(path).name] + self.result_table.add_data(self.current_epoch, + id, + wandb.Image(self.val_table.data[id][1], boxes=boxes, classes=class_set), + total_conf / max(1, len(box_data)) + ) 
def log(self, log_dict): if self.wandb_run: for key, value in log_dict.items(): self.log_dict[key] = value - def end_epoch(self): - if self.wandb_run and self.log_dict: + def end_epoch(self, best_result=False): + if self.wandb_run: wandb.log(self.log_dict) - self.log_dict = {} + self.log_dict = {} + if self.result_artifact: + train_results = wandb.JoinedTable(self.val_table, self.result_table, "id") + self.result_artifact.add(train_results, 'result') + wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch), + ('best' if best_result else '')]) + self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"]) + self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation") def finish_run(self): if self.wandb_run: - if self.result_artifact: - print("Add Training Progress Artifact") - self.result_artifact.add(self.result_table, 'result') - train_results = wandb.JoinedTable(self.testset_artifact.get("val"), self.result_table, "id") - self.result_artifact.add(train_results, 'joined_result') - wandb.log_artifact(self.result_artifact) if self.log_dict: wandb.log(self.log_dict) wandb.run.finish()