From 1417dfc4cbbc7a0d64c3e8b541af83b1dd8b3381 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Fri, 3 Apr 2020 23:57:34 +0200
Subject: [PATCH] simplify examples structure (#1247)

* simplify examples structure
* update changelog
* fix imports
* rename example
* rename scripts
* changelog
---
 pl_examples/README.md | 71 ++++++++++++++++---
 pl_examples/__init__.py | 2 +-
 pl_examples/basic_examples/README.md | 27 ++++++-
 pl_examples/basic_examples/cpu_template.py | 2 +-
 pl_examples/basic_examples/gpu_template.py | 2 +-
 .../multi_node_ddp2_demo.py | 2 +-
 .../multi_node_ddp_demo.py | 2 +-
 .../submit_ddp2_job.sh} | 0
 .../submit_ddp_job.sh} | 0
 .../{gan.py => generative_adversarial_net.py} | 2 +-
 .../imagenet.py} | 0
 .../semantic_segmentation.py} | 2 +-
 .../full_examples/imagenet/__init__.py | 0
 .../models/unet/model.py | 44 ------------
 .../{full_examples => models}/__init__.py | 0
 .../lightning_template.py} | 0
 .../models/unet/parts.py => models/unet.py} | 41 +++++++++++
 pl_examples/multi_node_examples/README.md | 21 ------
 pl_examples/multi_node_examples/__init__.py | 0
 19 files changed, 135 insertions(+), 83 deletions(-)
 rename pl_examples/{multi_node_examples => basic_examples}/multi_node_ddp2_demo.py (92%)
 rename pl_examples/{multi_node_examples => basic_examples}/multi_node_ddp_demo.py (92%)
 rename pl_examples/{multi_node_examples/ddp2_job_submit.sh => basic_examples/submit_ddp2_job.sh} (100%)
 rename pl_examples/{multi_node_examples/ddp_job_submit.sh => basic_examples/submit_ddp_job.sh} (100%)
 rename pl_examples/domain_templates/{gan.py => generative_adversarial_net.py} (99%)
 rename pl_examples/{full_examples/imagenet/imagenet_example.py => domain_templates/imagenet.py} (100%)
 rename pl_examples/{full_examples/semantic_segmentation/semseg.py => domain_templates/semantic_segmentation.py} (99%)
 delete mode 100644 pl_examples/full_examples/imagenet/__init__.py
 delete mode 100644 pl_examples/full_examples/semantic_segmentation/models/unet/model.py
 rename pl_examples/{full_examples => models}/__init__.py (100%)
 rename pl_examples/{basic_examples/lightning_module_template.py => models/lightning_template.py} (100%)
 rename pl_examples/{full_examples/semantic_segmentation/models/unet/parts.py => models/unet.py} (60%)
 delete mode 100644 pl_examples/multi_node_examples/README.md
 delete mode 100644 pl_examples/multi_node_examples/__init__.py

diff --git a/pl_examples/README.md b/pl_examples/README.md
index 5757d64f0079ee..93715b0e44661f 100644
--- a/pl_examples/README.md
+++ b/pl_examples/README.md
@@ -1,14 +1,67 @@
 # Examples
-This folder has 4 sections:
+This folder has 3 sections:
 
-### Basic examples
-These show the most common use of Lightning for either CPU or GPU training.
+## Basic Examples
+Use these examples to test how lightning works.
 
-### Domain templates
-These are templates to show common approaches such as GANs and RL.
+#### Test on CPU
+```bash
+python cpu_template.py
+```
 
-### Full examples
-Contains examples demonstrating ImageNet training, Semantic Segmentation, etc.
+---
+#### Train on a single GPU
+```bash
+python gpu_template.py --gpus 1
+```
 
-### Multi-node examples
-These show how to run jobs on a GPU cluster using lightning.
\ No newline at end of file
+---
+#### DataParallel (dp)
+Train on multiple GPUs using DataParallel.
+
+```bash
+python gpu_template.py --gpus 2 --distributed_backend dp
+```
+
+---
+#### DistributedDataParallel (ddp)
+
+Train on multiple GPUs using DistributedDataParallel
+```bash
+python gpu_template.py --gpus 2 --distributed_backend ddp
+```
+
+---
+#### DistributedDataParallel+DP (ddp2)
+
+Train on multiple GPUs using DistributedDataParallel + dataparallel.
+On a single node, uses all GPUs for 1 model. Then shares gradient information
+across nodes.
+```bash
+python gpu_template.py --gpus 2 --distributed_backend ddp2
+```
+
+## Multi-node example
+
+This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total).
+To run this demo do the following:
+
+1. Log into the jumphost node of your SLURM-managed cluster.
+2. Create a conda environment with Lightning and a GPU PyTorch version.
+3. Choose a script to submit
+
+### DDP
+Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each)
+```bash
+sbatch ddp_job_submit.sh YourEnv
+```
+
+### DDP2
+Submit this job to run with a different implementation of DistributedDataParallel.
+In this version, each node acts like DataParallel but syncs across nodes like DDP.
+```bash
+sbatch ddp2_job_submit.sh YourEnv
+```
+
+## Domain templates
+These are templates to show common approaches such as GANs and RL.
diff --git a/pl_examples/__init__.py b/pl_examples/__init__.py
index c75a843f6cf71a..1c5908539cfdc6 100644
--- a/pl_examples/__init__.py
+++ b/pl_examples/__init__.py
@@ -140,7 +140,7 @@ def optimize_on_cluster(hyperparams):
 """
 
-from .basic_examples.lightning_module_template import LightningTemplateModel
+from pl_examples.models.lightning_template import LightningTemplateModel
 
 __all__ = [
     'LightningTemplateModel'
diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md
index 9faae5a3a2a4d2..63fdc7f8c47c71 100644
--- a/pl_examples/basic_examples/README.md
+++ b/pl_examples/basic_examples/README.md
@@ -1,4 +1,4 @@
-# Basic Examples
+## Basic Examples
 Use these examples to test how lightning works.
 
 #### Test on CPU
@@ -36,4 +36,27 @@ On a single node, uses all GPUs for 1 model. Then shares gradient information
 across nodes.
 ```bash
 python gpu_template.py --gpus 2 --distributed_backend ddp2
-```
\ No newline at end of file
+```
+
+
+# Multi-node example
+
+This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total).
+To run this demo do the following:
+
+1. Log into the jumphost node of your SLURM-managed cluster.
+2. Create a conda environment with Lightning and a GPU PyTorch version.
+3. Choose a script to submit
+
+#### DDP
+Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each)
+```bash
+sbatch ddp_job_submit.sh YourEnv
+```
+
+#### DDP2
+Submit this job to run with a different implementation of DistributedDataParallel.
+In this version, each node acts like DataParallel but syncs across nodes like DDP.
+```bash
+sbatch ddp2_job_submit.sh YourEnv
+```
diff --git a/pl_examples/basic_examples/cpu_template.py b/pl_examples/basic_examples/cpu_template.py
index 537c4cdcfaf67e..1ab195d515379b 100644
--- a/pl_examples/basic_examples/cpu_template.py
+++ b/pl_examples/basic_examples/cpu_template.py
@@ -8,7 +8,7 @@
 import torch
 
 import pytorch_lightning as pl
-from pl_examples.basic_examples.lightning_module_template import LightningTemplateModel
+from pl_examples.models.lightning_template import LightningTemplateModel
 
 SEED = 2334
 torch.manual_seed(SEED)
diff --git a/pl_examples/basic_examples/gpu_template.py b/pl_examples/basic_examples/gpu_template.py
index b51c3770705365..408b62387fc8c2 100644
--- a/pl_examples/basic_examples/gpu_template.py
+++ b/pl_examples/basic_examples/gpu_template.py
@@ -8,7 +8,7 @@
 import torch
 
 import pytorch_lightning as pl
-from pl_examples.basic_examples.lightning_module_template import LightningTemplateModel
+from pl_examples.models.lightning_template import LightningTemplateModel
 
 SEED = 2334
 torch.manual_seed(SEED)
diff --git a/pl_examples/multi_node_examples/multi_node_ddp2_demo.py b/pl_examples/basic_examples/multi_node_ddp2_demo.py
similarity index 92%
rename from pl_examples/multi_node_examples/multi_node_ddp2_demo.py
rename to pl_examples/basic_examples/multi_node_ddp2_demo.py
index 6abf8739e94169..ca9c986a17f259 100644
--- a/pl_examples/multi_node_examples/multi_node_ddp2_demo.py
+++ b/pl_examples/basic_examples/multi_node_ddp2_demo.py
@@ -8,7 +8,7 @@
 import torch
 
 import pytorch_lightning as pl
-from pl_examples.basic_examples.lightning_module_template import LightningTemplateModel
+from pl_examples.models.lightning_template import LightningTemplateModel
 
 SEED = 2334
 torch.manual_seed(SEED)
diff --git a/pl_examples/multi_node_examples/multi_node_ddp_demo.py b/pl_examples/basic_examples/multi_node_ddp_demo.py
similarity index 92%
rename from pl_examples/multi_node_examples/multi_node_ddp_demo.py
rename to pl_examples/basic_examples/multi_node_ddp_demo.py
index eb04611b54f22e..518a9f39cc938a 100644
--- a/pl_examples/multi_node_examples/multi_node_ddp_demo.py
+++ b/pl_examples/basic_examples/multi_node_ddp_demo.py
@@ -8,7 +8,7 @@
 import torch
 
 import pytorch_lightning as pl
-from pl_examples.basic_examples.lightning_module_template import LightningTemplateModel
+from pl_examples.models.lightning_template import LightningTemplateModel
 
 SEED = 2334
 torch.manual_seed(SEED)
diff --git a/pl_examples/multi_node_examples/ddp2_job_submit.sh b/pl_examples/basic_examples/submit_ddp2_job.sh
similarity index 100%
rename from pl_examples/multi_node_examples/ddp2_job_submit.sh
rename to pl_examples/basic_examples/submit_ddp2_job.sh
diff --git a/pl_examples/multi_node_examples/ddp_job_submit.sh b/pl_examples/basic_examples/submit_ddp_job.sh
similarity index 100%
rename from pl_examples/multi_node_examples/ddp_job_submit.sh
rename to pl_examples/basic_examples/submit_ddp_job.sh
diff --git a/pl_examples/domain_templates/gan.py b/pl_examples/domain_templates/generative_adversarial_net.py
similarity index 99%
rename from pl_examples/domain_templates/gan.py
rename to pl_examples/domain_templates/generative_adversarial_net.py
index 795876e20e93a1..cbe21fe2dbab3d 100644
--- a/pl_examples/domain_templates/gan.py
+++ b/pl_examples/domain_templates/generative_adversarial_net.py
@@ -1,6 +1,6 @@
 """
 To run this template just do:
-python gan.py
+python generative_adversarial_net.py
 
 After a few epochs, launch TensorBoard to see the
 images being generated at every batch:
diff --git a/pl_examples/full_examples/imagenet/imagenet_example.py b/pl_examples/domain_templates/imagenet.py
similarity index 100%
rename from pl_examples/full_examples/imagenet/imagenet_example.py
rename to pl_examples/domain_templates/imagenet.py
diff --git a/pl_examples/full_examples/semantic_segmentation/semseg.py b/pl_examples/domain_templates/semantic_segmentation.py
similarity index 99%
rename from pl_examples/full_examples/semantic_segmentation/semseg.py
rename to pl_examples/domain_templates/semantic_segmentation.py
index 5426dfa2a78659..8ce01c3b7088b8 100644
--- a/pl_examples/full_examples/semantic_segmentation/semseg.py
+++ b/pl_examples/domain_templates/semantic_segmentation.py
@@ -6,10 +6,10 @@
 import torch.nn.functional as F
 import torchvision.transforms as transforms
 from PIL import Image
-from models.unet.model import UNet
 from torch.utils.data import DataLoader, Dataset
 
 import pytorch_lightning as pl
+from pl_examples.models.unet import UNet
 
 
 class KITTI(Dataset):
diff --git a/pl_examples/full_examples/imagenet/__init__.py b/pl_examples/full_examples/imagenet/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
diff --git a/pl_examples/full_examples/semantic_segmentation/models/unet/model.py b/pl_examples/full_examples/semantic_segmentation/models/unet/model.py
deleted file mode 100644
index e858ed2f1baef8..00000000000000
--- a/pl_examples/full_examples/semantic_segmentation/models/unet/model.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch.nn as nn
-
-from models.unet.parts import DoubleConv, Down, Up
-
-
-class UNet(nn.Module):
-    """
-    Architecture based on U-Net: Convolutional Networks for Biomedical Image Segmentation
-    Link - https://arxiv.org/abs/1505.04597
-
-    Parameters:
-        num_classes (int): Number of output classes required (default 19 for KITTI dataset)
-        bilinear (bool): Whether to use bilinear interpolation or transposed
-            convolutions for upsampling.
-    """
-
-    def __init__(self, num_classes=19, bilinear=False):
-        super().__init__()
-        self.layer1 = DoubleConv(3, 64)
-        self.layer2 = Down(64, 128)
-        self.layer3 = Down(128, 256)
-        self.layer4 = Down(256, 512)
-        self.layer5 = Down(512, 1024)
-
-        self.layer6 = Up(1024, 512, bilinear=bilinear)
-        self.layer7 = Up(512, 256, bilinear=bilinear)
-        self.layer8 = Up(256, 128, bilinear=bilinear)
-        self.layer9 = Up(128, 64, bilinear=bilinear)
-
-        self.layer10 = nn.Conv2d(64, num_classes, kernel_size=1)
-
-    def forward(self, x):
-        x1 = self.layer1(x)
-        x2 = self.layer2(x1)
-        x3 = self.layer3(x2)
-        x4 = self.layer4(x3)
-        x5 = self.layer5(x4)
-
-        x6 = self.layer6(x5, x4)
-        x6 = self.layer7(x6, x3)
-        x6 = self.layer8(x6, x2)
-        x6 = self.layer9(x6, x1)
-
-        return self.layer10(x6)
diff --git a/pl_examples/full_examples/__init__.py b/pl_examples/models/__init__.py
similarity index 100%
rename from pl_examples/full_examples/__init__.py
rename to pl_examples/models/__init__.py
diff --git a/pl_examples/basic_examples/lightning_module_template.py b/pl_examples/models/lightning_template.py
similarity index 100%
rename from pl_examples/basic_examples/lightning_module_template.py
rename to pl_examples/models/lightning_template.py
diff --git a/pl_examples/full_examples/semantic_segmentation/models/unet/parts.py b/pl_examples/models/unet.py
similarity index 60%
rename from pl_examples/full_examples/semantic_segmentation/models/unet/parts.py
rename to pl_examples/models/unet.py
index 8036ea0544f44c..a7c474f3fc47c9 100644
--- a/pl_examples/full_examples/semantic_segmentation/models/unet/parts.py
+++ b/pl_examples/models/unet.py
@@ -3,6 +3,47 @@
 import torch.nn.functional as F
 
 
+class UNet(nn.Module):
+    """
+    Architecture based on U-Net: Convolutional Networks for Biomedical Image Segmentation
+    Link - https://arxiv.org/abs/1505.04597
+
+    Parameters:
+        num_classes (int): Number of output classes required (default 19 for KITTI dataset)
+        bilinear (bool): Whether to use bilinear interpolation or transposed
+            convolutions for upsampling.
+    """
+
+    def __init__(self, num_classes=19, bilinear=False):
+        super().__init__()
+        self.layer1 = DoubleConv(3, 64)
+        self.layer2 = Down(64, 128)
+        self.layer3 = Down(128, 256)
+        self.layer4 = Down(256, 512)
+        self.layer5 = Down(512, 1024)
+
+        self.layer6 = Up(1024, 512, bilinear=bilinear)
+        self.layer7 = Up(512, 256, bilinear=bilinear)
+        self.layer8 = Up(256, 128, bilinear=bilinear)
+        self.layer9 = Up(128, 64, bilinear=bilinear)
+
+        self.layer10 = nn.Conv2d(64, num_classes, kernel_size=1)
+
+    def forward(self, x):
+        x1 = self.layer1(x)
+        x2 = self.layer2(x1)
+        x3 = self.layer3(x2)
+        x4 = self.layer4(x3)
+        x5 = self.layer5(x4)
+
+        x6 = self.layer6(x5, x4)
+        x6 = self.layer7(x6, x3)
+        x6 = self.layer8(x6, x2)
+        x6 = self.layer9(x6, x1)
+
+        return self.layer10(x6)
+
+
 class DoubleConv(nn.Module):
     """
     Double Convolution and BN and ReLU
diff --git a/pl_examples/multi_node_examples/README.md b/pl_examples/multi_node_examples/README.md
deleted file mode 100644
index f4bb719dea4d27..00000000000000
--- a/pl_examples/multi_node_examples/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Multi-node example
-
-This demo launches a job using 2 GPUs on 2 different nodes (4 GPUs total).
-To run this demo do the following:
-
-1. Log into the jumphost node of your SLURM-managed cluster.
-2. Create a conda environment with Lightning and a GPU PyTorch version.
-3. Choose a script to submit
-
-#### DDP
-Submit this job to run with DistributedDataParallel (2 nodes, 2 gpus each)
-```bash
-sbatch ddp_job_submit.sh YourEnv
-```
-
-#### DDP2
-Submit this job to run with a different implementation of DistributedDataParallel.
-In this version, each node acts like DataParallel but syncs across nodes like DDP.
-```bash
-sbatch ddp2_job_submit.sh YourEnv
-```
diff --git a/pl_examples/multi_node_examples/__init__.py b/pl_examples/multi_node_examples/__init__.py
deleted file mode 100644
index e69de29bb2d1d6..00000000000000
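
The `--gpus` and `--distributed_backend` flags exercised in the README hunks above are forwarded to the Trainer by the templates. Below is a minimal sketch of that mapping, assuming the 0.7-era `pytorch_lightning.Trainer` keyword arguments; the actual argparse wiring inside `gpu_template.py` is not part of this diff.

```python
import pytorch_lightning as pl

# Rough equivalents of the CLI invocations in the README (needs a 2-GPU machine):
#   python gpu_template.py --gpus 2 --distributed_backend dp    -> DataParallel
#   python gpu_template.py --gpus 2 --distributed_backend ddp   -> DistributedDataParallel
#   python gpu_template.py --gpus 2 --distributed_backend ddp2  -> DP inside a node, DDP across nodes
trainer = pl.Trainer(gpus=2, distributed_backend='ddp')

# trainer.fit(model) would then launch training; constructing the model
# (e.g. the relocated LightningTemplateModel) from hparams is not shown in this patch.
```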
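The patch renames the SLURM submission scripts to `submit_ddp_job.sh` and `submit_ddp2_job.sh` under `basic_examples/`, while the README text above still uses the old names. A sketch of submitting the renamed scripts: `YourEnv` is the conda environment name as in the README, and any cluster-specific partition or account flags are outside the scope of this diff.

```bash
# Sketch: submit the renamed SLURM scripts from their new location.
cd pl_examples/basic_examples

# 2 nodes x 2 GPUs with DistributedDataParallel
sbatch submit_ddp_job.sh YourEnv

# same topology, but DataParallel within each node and DDP across nodes
sbatch submit_ddp2_job.sh YourEnv
```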
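The final hunks move the U-Net into `pl_examples/models/unet.py`. The following is a minimal smoke test of the relocated class, assuming the `pl_examples` package from this commit is importable and that the `Up` blocks (defined in the same module but not shown in full in this diff) restore the input resolution through their skip connections.

```python
import torch

from pl_examples.models.unet import UNet  # new module path introduced by this patch

# Signature shown in the diff: UNet(num_classes=19, bilinear=False),
# with a 3-channel input stem (DoubleConv(3, 64)).
model = UNet(num_classes=19, bilinear=False)

# Four Down stages halve the spatial size, so pick H and W divisible by 16.
dummy = torch.randn(1, 3, 64, 64)
with torch.no_grad():
    out = model(dummy)

print(out.shape)  # expected torch.Size([1, 19, 64, 64]) if the Up path mirrors the Down path
```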