From 0082d18e891d476696b977cc87788f40fa846854 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Tue, 23 Feb 2021 23:10:14 -0800 Subject: [PATCH] Amazon AWS EC2 startup and re-startup scripts (#2185) * Amazon AWS EC2 startup and re-startup scripts * Create resume.py * cleanup --- utils/aws/__init__.py | 0 utils/aws/mime.sh | 26 ++++++++++++++++++++++++++ utils/aws/resume.py | 34 ++++++++++++++++++++++++++++++++++ utils/aws/userdata.sh | 26 ++++++++++++++++++++++++++ 4 files changed, 86 insertions(+) create mode 100644 utils/aws/__init__.py create mode 100644 utils/aws/mime.sh create mode 100644 utils/aws/resume.py create mode 100644 utils/aws/userdata.sh diff --git a/utils/aws/__init__.py b/utils/aws/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/utils/aws/mime.sh b/utils/aws/mime.sh new file mode 100644 index 000000000000..c319a83cfbdf --- /dev/null +++ b/utils/aws/mime.sh @@ -0,0 +1,26 @@ +# AWS EC2 instance startup 'MIME' script https://aws.amazon.com/premiumsupport/knowledge-center/execute-user-data-ec2/ +# This script will run on every instance restart, not only on first start +# --- DO NOT COPY ABOVE COMMENTS WHEN PASTING INTO USERDATA --- + +Content-Type: multipart/mixed; boundary="//" +MIME-Version: 1.0 + +--// +Content-Type: text/cloud-config; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +Content-Disposition: attachment; filename="cloud-config.txt" + +#cloud-config +cloud_final_modules: +- [scripts-user, always] + +--// +Content-Type: text/x-shellscript; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit +Content-Disposition: attachment; filename="userdata.txt" + +#!/bin/bash +# --- paste contents of userdata.sh here --- +--// diff --git a/utils/aws/resume.py b/utils/aws/resume.py new file mode 100644 index 000000000000..338c8b10127b --- /dev/null +++ b/utils/aws/resume.py @@ -0,0 +1,34 @@ +# Resume all interrupted trainings in yolov5/ dir including DPP trainings +# Usage: $ python utils/aws/resume.py + +import os +from pathlib import Path + +import torch +import yaml + +port = 0 # --master_port +path = Path('').resolve() +for last in path.rglob('*/**/last.pt'): + ckpt = torch.load(last) + if ckpt['optimizer'] is None: + continue + + # Load opt.yaml + with open(last.parent.parent / 'opt.yaml') as f: + opt = yaml.load(f, Loader=yaml.SafeLoader) + + # Get device count + d = opt['device'].split(',') # devices + nd = len(d) # number of devices + ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel + + if ddp: # multi-GPU + port += 1 + cmd = f'python -m torch.distributed.launch --nproc_per_node {nd} --master_port {port} train.py --resume {last}' + else: # single-GPU + cmd = f'python train.py --resume {last}' + + cmd += ' > /dev/null 2>&1 &' # redirect output to dev/null and run in daemon thread + print(cmd) + os.system(cmd) diff --git a/utils/aws/userdata.sh b/utils/aws/userdata.sh new file mode 100644 index 000000000000..36405d1a1565 --- /dev/null +++ b/utils/aws/userdata.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# AWS EC2 instance startup script https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/user-data.html +# This script will run only once on first instance start (for a re-start script see mime.sh) +# /home/ubuntu (ubuntu) or /home/ec2-user (amazon-linux) is working dir +# Use >300 GB SSD + +cd home/ubuntu +if [ ! -d yolov5 ]; then + echo "Running first-time script." # install dependencies, download COCO, pull Docker + git clone https://github.com/ultralytics/yolov5 && sudo chmod -R 777 yolov5 + cd yolov5 + bash data/scripts/get_coco.sh && echo "Data done." & + sudo docker pull ultralytics/yolov5:latest && echo "Docker done." & + # python -m pip install --upgrade pip && pip install -r requirements.txt && python detect.py && echo "Requirements done." & +else + echo "Running re-start script." # resume interrupted runs + i=0 + list=$(docker ps -qa) # container list i.e. $'one\ntwo\nthree\nfour' + while IFS= read -r id; do + ((i++)) + echo "restarting container $i: $id" + docker start $id + # docker exec -it $id python train.py --resume # single-GPU + docker exec -d $id python utils/aws/resume.py + done <<<"$list" +fi