update expt name comment and folder parsing for training #978

Merged
merged 9 commits into from Oct 13, 2020
Changes from 6 commits
21 changes: 12 additions & 9 deletions train.py
@@ -6,6 +6,7 @@
import shutil
import time
from pathlib import Path
from pprint import pprint

import numpy as np
import torch.distributed as dist
@@ -207,7 +208,8 @@ def train(hyp, opt, device, tb_writer=None):
results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
scheduler.last_epoch = start_epoch - 1 # do not move
scaler = amp.GradScaler(enabled=cuda)
logger.info('Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n'
logger.info('Image sizes %g train, %g test\n'
'Using %g dataloader workers\nLogging results to %s\n'
'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------
model.train()
@@ -393,7 +395,7 @@ def train(hyp, opt, device, tb_writer=None):
parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
parser.add_argument('--cache-images', action='store_true', help='cache images for faster training')
parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied')
parser.add_argument('--name', default='', help='renames experiment folder exp{N} to exp{N}_{name} if supplied')
parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
@@ -440,15 +442,15 @@ def train(hyp, opt, device, tb_writer=None):
assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count'
opt.batch_size = opt.total_batch_size // opt.world_size

logger.info(opt)
pprint(vars(opt))
with open(opt.hyp) as f:
hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps

# Train
if not opt.evolve:
tb_writer = None
if opt.global_rank in [-1, 0]:
logger.info('Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir)
logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/')
tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0

train(hyp, opt, device, tb_writer)
@@ -470,7 +472,8 @@ def train(hyp, opt, device, tb_writer=None):
'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight
'iou_t': (0, 0.1, 0.7), # IoU training threshold
'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold
'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore)
# temp fix for https://github.com/ultralytics/yolov5/issues/607#issuecomment-692589883
# 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore)
'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5)
'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction)
'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction)
@@ -488,11 +491,11 @@ def train(hyp, opt, device, tb_writer=None):
assert opt.local_rank == -1, 'DDP mode not implemented for --evolve'
opt.notest, opt.nosave = True, True # only test/save final epoch
# ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices
yaml_file = Path('runs/evolve/hyp_evolved.yaml') # save best result here
yaml_file = Path(opt.logdir) / 'evolve' / 'hyp_evolved.yaml' # save best result here
if opt.bucket:
os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists

for _ in range(300): # generations to evolve
for _ in tqdm(range(300), desc='perform evolve >>'): # generations to evolve
Member:

We can't use tqdm here, as each training run emits many print/logging statements that tqdm does not handle correctly. I just tested evolution on this branch to verify the issue. Will remove.
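(Illustrative aside, not part of the original thread: a minimal sketch of the behaviour described above. Plain print/logging calls inside a tqdm loop interleave with the progress bar and leave stale bar fragments; tqdm.write is the usual workaround, though using it here would mean routing every print/logging call in training through it.)

import time
from tqdm import tqdm

for gen in tqdm(range(5), desc='perform evolve >>'):
    # print(f'generation {gen} ...')            # plain print breaks the bar rendering
    tqdm.write(f'generation {gen} starting')    # clears the bar, prints, then redraws it
    time.sleep(0.2)                             # stand-in for one training run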

@Borda (Contributor, Author), Oct 12, 2020:

Well, we can, and I would recommend logging somewhere which generation index is actually running; moreover, tqdm gives you an estimate of how much longer the evolution needs to finish...
Otherwise you have to compute the duration of a single training run the hard way, from an estimate of each epoch, and extrapolate to all queued experiments; in the worst case you open the terminal after some time and somehow count how many trainings have finished, because there is no such counter... :]

Member:

Yes, I understand; unfortunately tqdm is badly behaved when printing within a tqdm loop. The best way to monitor evolution progress is to look at the yolov5/evolve.txt file, which will show 1 row per generation (sorted by fitness, top row best). hyp_evolved.yaml will also show the best generation index; I should probably update this line to show the best generation / total generations:

# Hyperparameter Evolution Results
# Generations: 306
# P R mAP.5 mAP.5:.95 box obj cls
# Metrics: 0.6 0.936 0.896 0.684 0.0115 0.00805 0.00146

This method also helps you keep track of distributed evolution progress using the example in #607, where multiple single-GPU processes can evolve to the same central evolve.txt and hyp file.
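(A minimal sketch of the monitoring approach described above, assuming evolve.txt is whitespace-separated with one row per completed generation; the file name and the 300-generation budget come from the comment and the diff, everything else is illustrative.)

from pathlib import Path
import numpy as np

def evolve_progress(evolve_txt='evolve.txt', total=300):
    # Count completed generations: evolve.txt gains one row per generation.
    path = Path(evolve_txt)
    if not path.exists():
        print('no generations finished yet')
        return
    done = len(np.loadtxt(path, ndmin=2))
    print(f'{done}/{total} generations finished ({100 * done / total:.1f}%)')

evolve_progress()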

if os.path.exists('evolve.txt'): # if evolve.txt exists: select best hyps and mutate
# Select parent(s)
parent = 'single' # parent selection method: 'single' or 'weighted'
@@ -532,5 +535,5 @@ def train(hyp, opt, device, tb_writer=None):

# Plot results
plot_evolution(yaml_file)
print('Hyperparameter evolution complete. Best results saved as: %s\nCommand to train a new model with these '
'hyperparameters: $ python train.py --hyp %s' % (yaml_file, yaml_file))
print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n'
f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}')
10 changes: 7 additions & 3 deletions utils/general.py
@@ -7,6 +7,7 @@
import shutil
import subprocess
import time
import re
from contextlib import contextmanager
from copy import copy
from pathlib import Path
@@ -952,9 +953,12 @@ def increment_dir(dir, comment=''):
# Increments a directory runs/exp1 --> runs/exp2_comment
n = 0 # number
dir = str(Path(dir)) # os-agnostic
d = sorted(glob.glob(dir + '*')) # directories
if len(d):
n = max([int(x[len(dir):x.rfind('_') if '_' in Path(x).name else None]) for x in d]) + 1 # increment
dirs = sorted(glob.glob(dir + '*')) # directories
if dirs:
matches = [re.search(r"exp(\d+)", d) for d in dirs]
idxs = [int(m.groups()[0]) for m in matches if m]
if idxs:
n = max(idxs) + 1 # increment
return dir + str(n) + ('_' + comment if comment else '')
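(A hedged usage sketch of the new parsing, with directory names assumed for illustration: the regex extracts the numeric index from exp{N} even when the folder name contains extra underscores, which the old substring slicing could mis-parse.)

import re

def next_exp_dir(existing, base='runs/exp', comment=''):
    # Mirror of the new increment_dir logic: take the highest exp{N} index and add 1.
    matches = [re.search(r"exp(\d+)", d) for d in existing]
    idxs = [int(m.groups()[0]) for m in matches if m]
    n = max(idxs) + 1 if idxs else 0
    return base + str(n) + ('_' + comment if comment else '')

print(next_exp_dir(['runs/exp0', 'runs/exp1_foo_bar', 'runs/exp12']))                    # runs/exp13
print(next_exp_dir(['runs/exp0', 'runs/exp1_foo_bar', 'runs/exp12'], comment='mytag'))   # runs/exp13_mytag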

