diff --git a/train.py b/train.py index 78d91fa37ba4..c5cb858230a4 100644 --- a/train.py +++ b/train.py @@ -443,7 +443,7 @@ def train(hyp, opt, device, tb_writer=None): if wandb_logger.wandb and not opt.evolve: # Log the stripped model wandb_logger.wandb.log_artifact(str(final), type='model', name='run_' + wandb_logger.wandb_run.id + '_model', - aliases=['last', 'best', 'stripped']) + aliases=['latest', 'best', 'stripped']) wandb_logger.finish_run() else: dist.destroy_process_group() diff --git a/utils/wandb_logging/wandb_utils.py b/utils/wandb_logging/wandb_utils.py index 72a11018b429..57ce9035a777 100644 --- a/utils/wandb_logging/wandb_utils.py +++ b/utils/wandb_logging/wandb_utils.py @@ -1,3 +1,4 @@ +"""Utilities and tools for tracking runs with Weights & Biases.""" import json import sys from pathlib import Path @@ -35,8 +36,9 @@ def get_run_info(run_path): run_path = Path(remove_prefix(run_path, WANDB_ARTIFACT_PREFIX)) run_id = run_path.stem project = run_path.parent.stem + entity = run_path.parent.parent.stem model_artifact_name = 'run_' + run_id + '_model' - return run_id, project, model_artifact_name + return entity, project, run_id, model_artifact_name def check_wandb_resume(opt): @@ -44,9 +46,9 @@ def check_wandb_resume(opt): if isinstance(opt.resume, str): if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): if opt.global_rank not in [-1, 0]: # For resuming DDP runs - run_id, project, model_artifact_name = get_run_info(opt.resume) + entity, project, run_id, model_artifact_name = get_run_info(opt.resume) api = wandb.Api() - artifact = api.artifact(project + '/' + model_artifact_name + ':latest') + artifact = api.artifact(entity + '/' + project + '/' + model_artifact_name + ':latest') modeldir = artifact.download() opt.weights = str(Path(modeldir) / "last.pt") return True @@ -78,6 +80,18 @@ def process_wandb_config_ddp_mode(opt): class WandbLogger(): + """Log training runs, datasets, models, and predictions to Weights & Biases. + + This logger sends information to W&B at wandb.ai. By default, this information + includes hyperparameters, system configuration and metrics, model metrics, + and basic data metrics and analyses. + + By providing additional command line arguments to train.py, datasets, + models and predictions can also be logged. + + For more on how this logger is used, see the Weights & Biases documentation: + https://docs.wandb.com/guides/integrations/yolov5 + """ def __init__(self, opt, name, run_id, data_dict, job_type='Training'): # Pre-training routine -- self.job_type = job_type @@ -85,16 +99,17 @@ def __init__(self, opt, name, run_id, data_dict, job_type='Training'): # It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call if isinstance(opt.resume, str): # checks resume from artifact if opt.resume.startswith(WANDB_ARTIFACT_PREFIX): - run_id, project, model_artifact_name = get_run_info(opt.resume) + entity, project, run_id, model_artifact_name = get_run_info(opt.resume) model_artifact_name = WANDB_ARTIFACT_PREFIX + model_artifact_name assert wandb, 'install wandb to resume wandb runs' # Resume wandb-artifact:// runs here| workaround for not overwriting wandb.config - self.wandb_run = wandb.init(id=run_id, project=project, resume='allow') + self.wandb_run = wandb.init(id=run_id, project=project, entity=entity, resume='allow') opt.resume = model_artifact_name elif self.wandb: self.wandb_run = wandb.init(config=opt, resume="allow", project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, + entity=opt.entity, name=name, job_type=job_type, id=run_id) if not wandb.run else wandb.run @@ -172,8 +187,8 @@ def download_model_artifact(self, opt): modeldir = model_artifact.download() epochs_trained = model_artifact.metadata.get('epochs_trained') total_epochs = model_artifact.metadata.get('total_epochs') - assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % ( - total_epochs) + is_finished = total_epochs is None + assert not is_finished, 'training is finished, can only resume incomplete runs.' return modeldir, model_artifact return None, None @@ -188,7 +203,7 @@ def log_model(self, path, opt, epoch, fitness_score, best_model=False): }) model_artifact.add_file(str(path / 'last.pt'), name='last.pt') wandb.log_artifact(model_artifact, - aliases=['latest', 'epoch ' + str(self.current_epoch), 'best' if best_model else '']) + aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), 'best' if best_model else '']) print("Saving model artifact on epoch ", epoch + 1) def log_dataset_artifact(self, data_file, single_cls, project, overwrite_config=False): @@ -291,7 +306,7 @@ def end_epoch(self, best_result=False): if self.result_artifact: train_results = wandb.JoinedTable(self.val_table, self.result_table, "id") self.result_artifact.add(train_results, 'result') - wandb.log_artifact(self.result_artifact, aliases=['latest', 'epoch ' + str(self.current_epoch), + wandb.log_artifact(self.result_artifact, aliases=['latest', 'last', 'epoch ' + str(self.current_epoch), ('best' if best_result else '')]) self.result_table = wandb.Table(["epoch", "id", "prediction", "avg_confidence"]) self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")