#!/bin/bash
# AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html
# This script will run only once on first instance start (for a re-start script see mime.sh)
# /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir
# Use >300 GB SSD

# Use an absolute path so the script does not depend on the caller's working directory,
# and stop immediately if the directory is missing rather than cloning somewhere unexpected.
cd /home/ubuntu || exit 1

if [ ! -d yolov5 ]; then
  echo "Running first-time script." # install dependencies, download COCO, pull Docker
  git clone https://github.com/ultralytics/yolov5 && sudo chmod -R 777 yolov5
  cd yolov5 || exit 1 # clone failed: do not run the steps below from the wrong directory
  # The two long-running steps are backgrounded so they proceed in parallel.
  bash data/scripts/get_coco.sh && echo "Data done." &
  sudo docker pull ultralytics/yolov5:latest && echo "Docker done." &
  # python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." &
else
  echo "Running re-start script." # resume interrupted runs
  i=0
  list=$(docker ps -qa) # container list i.e. $'one\ntwo\nthree\nfour'
  while IFS= read -r id; do
    # A here-string always supplies at least one (possibly empty) line, so skip blank ids
    # instead of calling docker with no container when none exist.
    [ -n "$id" ] || continue
    ((i++))
    echo "restarting container $i: $id"
    docker start "$id"
    # docker exec -it $id python train.py --resume # single-GPU
    # $i doubles as the --master_port so each container gets a distinct port
    docker exec -d "$id" python -m torch.distributed.launch --nproc_per_node 8 --master_port $i train.py --resume # multi-GPU
  done <<<"$list"
fi
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py

import os
import shlex
from pathlib import Path


def parse_devices(device):
    """Return the non-empty device ids listed in an opt.yaml ``device`` value.

    ``''.split(',')`` evaluates to ``['']``, so blank entries are filtered out; an unset
    device therefore yields ``[]`` rather than a one-element list.

    Args:
        device: The raw ``device`` entry, e.g. ``''``, ``'0'`` or ``'0,1,2,3'``.

    Returns:
        List of stripped device id strings, possibly empty.
    """
    raw = '' if device is None else str(device)
    return [d.strip() for d in raw.split(',') if d.strip()]


def ddp_world_size(devices, cuda_count):
    """Return the number of DDP processes to launch, or 0 for a single-process resume.

    Args:
        devices: Device ids as returned by ``parse_devices``.
        cuda_count: Number of CUDA devices visible on this machine.

    Returns:
        ``len(devices)`` when several devices were requested explicitly, ``cuda_count``
        when no device was requested but more than one GPU is visible, otherwise 0.
    """
    nd = len(devices)
    if nd > 1:
        return nd
    if nd == 0 and cuda_count > 1:
        return cuda_count
    return 0


def build_command(last, nproc, port):
    """Build the background shell command that resumes training from ``last``.

    Args:
        last: Path to the ``last.pt`` checkpoint to resume from.
        nproc: Number of DDP processes; 0 selects the plain single-process command.
        port: Value passed as ``--master_port`` (only used when ``nproc`` is non-zero).

    Returns:
        The command string, with output discarded and a trailing ``&``.
    """
    target = shlex.quote(str(last))  # keep paths with spaces/metacharacters as one shell word
    if nproc:  # multi-GPU
        cmd = (f'python -m torch.distributed.launch --nproc_per_node {nproc} '
               f'--master_port {port} train.py --resume {target}')
    else:  # single-GPU
        cmd = f'python train.py --resume {target}'
    return cmd + ' > /dev/null 2>&1 &'  # redirect output to /dev/null and run in the background


def main():
    """Find every ``last.pt`` below the current directory and relaunch its training run."""
    # Imported here so the pure helpers above stay importable without these packages.
    import torch
    import yaml

    port = 0  # --master_port, incremented per DDP run so concurrent runs do not share a port
    path = Path('').resolve()
    for last in path.rglob('*/**/last.pt'):
        # NOTE: torch.load unpickles the file; only run this on checkpoints you produced.
        # map_location='cpu' stops the launcher restoring tensors onto the saved device.
        ckpt = torch.load(last, map_location='cpu')
        if ckpt['optimizer'] is None:  # presumably a finished run with nothing to resume - TODO confirm
            continue

        # Load opt.yaml
        with open(last.parent.parent / 'opt.yaml') as f:
            opt = yaml.load(f, Loader=yaml.SafeLoader)

        # Get device count
        nproc = ddp_world_size(parse_devices(opt['device']), torch.cuda.device_count())
        if nproc:
            port += 1

        cmd = build_command(last, nproc, port)
        print(cmd)
        os.system(cmd)


if __name__ == '__main__':
    main()
Sep 17 00:00:00 2001 From: Glenn Jocher Date: Tue, 23 Feb 2021 23:06:14 -0800 Subject: [PATCH 3/3] cleanup --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 1df4ddb46620..4065e1f149ef 100644 --- a/train.py +++ b/train.py @@ -444,7 +444,7 @@ def train(hyp, opt, device, tb_writer=None, wandb=None): parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') parser.add_argument('--epochs', type=int, default=300) parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs') - parser.add_argument('--img-size', nargs='+', type=int, default=[128, 128], help='[train, test] image sizes') + parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') parser.add_argument('--rect', action='store_true', help='rectangular training') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')