Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replaces ddp .spawn with subprocess #2029

Merged
merged 114 commits into from
Jun 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
114 commits
Select commit Hold shift + click to select a range
7ec6aea
replace ddp spawn with subprocess
williamFalcon May 31, 2020
38f3d8e
replace ddp spawn with subprocess
williamFalcon May 31, 2020
689f53b
replace ddp spawn with subprocess
williamFalcon May 31, 2020
7368f4d
replace ddp spawn with subprocess
williamFalcon May 31, 2020
241dd59
replace ddp spawn with subprocess
williamFalcon May 31, 2020
c5667c7
replace ddp spawn with subprocess
williamFalcon May 31, 2020
42def65
replace ddp spawn with subprocess
williamFalcon May 31, 2020
3498ca7
replace ddp spawn with subprocess
williamFalcon May 31, 2020
9249cfd
replace ddp spawn with subprocess
williamFalcon May 31, 2020
be3420a
replace ddp spawn with subprocess
williamFalcon May 31, 2020
d5f4aa9
replace ddp spawn with subprocess
williamFalcon May 31, 2020
9e698e0
replace ddp spawn with subprocess
williamFalcon May 31, 2020
3b6fb44
replace ddp spawn with subprocess
williamFalcon May 31, 2020
824323f
replace ddp spawn with subprocess
williamFalcon May 31, 2020
74bc48e
replace ddp spawn with subprocess
williamFalcon May 31, 2020
6ca43ff
replace ddp spawn with subprocess
williamFalcon May 31, 2020
df1e9bb
replace ddp spawn with subprocess
williamFalcon May 31, 2020
e97f6ca
replace ddp spawn with subprocess
williamFalcon May 31, 2020
4e00774
replace ddp spawn with subprocess
williamFalcon May 31, 2020
eb701da
replace ddp spawn with subprocess
williamFalcon May 31, 2020
98def93
replace ddp spawn with subprocess
williamFalcon May 31, 2020
fac94b0
replace ddp spawn with subprocess
williamFalcon May 31, 2020
ce52526
replace ddp spawn with subprocess
williamFalcon May 31, 2020
1267279
replace ddp spawn with subprocess
williamFalcon May 31, 2020
f3675c2
replace ddp spawn with subprocess
williamFalcon May 31, 2020
39fa7f5
replace ddp spawn with subprocess
williamFalcon May 31, 2020
cd10531
replace ddp spawn with subprocess
williamFalcon May 31, 2020
bdba97c
replace ddp spawn with subprocess
williamFalcon May 31, 2020
3c3694e
replace ddp spawn with subprocess
williamFalcon May 31, 2020
0d60400
replace ddp spawn with subprocess
williamFalcon May 31, 2020
83aaa53
replace ddp spawn with subprocess
williamFalcon May 31, 2020
ae2da5a
replace ddp spawn with subprocess
williamFalcon May 31, 2020
e01f641
replace ddp spawn with subprocess
williamFalcon May 31, 2020
239d81c
replace ddp spawn with subprocess
williamFalcon May 31, 2020
8307539
replace ddp spawn with subprocess
williamFalcon May 31, 2020
e697f26
replace ddp spawn with subprocess
williamFalcon May 31, 2020
b7fb0e9
replace ddp spawn with subprocess
williamFalcon May 31, 2020
a4273d8
replace ddp spawn with subprocess
williamFalcon May 31, 2020
a054d0f
replace ddp spawn with subprocess
williamFalcon May 31, 2020
6c8d178
replace ddp spawn with subprocess
williamFalcon May 31, 2020
1e40ea8
replace ddp spawn with subprocess
williamFalcon May 31, 2020
a3cfb04
replace ddp spawn with subprocess
williamFalcon May 31, 2020
4bdd624
replace ddp spawn with subprocess
williamFalcon May 31, 2020
838ca06
replace ddp spawn with subprocess
williamFalcon May 31, 2020
7564256
replace ddp spawn with subprocess
williamFalcon May 31, 2020
cc9391a
replace ddp spawn with subprocess
williamFalcon May 31, 2020
9181d59
replace ddp spawn with subprocess
williamFalcon May 31, 2020
de61e1a
replace ddp spawn with subprocess
williamFalcon May 31, 2020
28768bb
replace ddp spawn with subprocess
williamFalcon May 31, 2020
8249366
replace ddp spawn with subprocess
williamFalcon May 31, 2020
5bd82d6
replace ddp spawn with subprocess
williamFalcon May 31, 2020
d434ab3
replace ddp spawn with subprocess
williamFalcon May 31, 2020
d9512b0
replace ddp spawn with subprocess
williamFalcon May 31, 2020
40361b8
replace ddp spawn with subprocess
williamFalcon May 31, 2020
c825334
replace ddp spawn with subprocess
williamFalcon May 31, 2020
3c69a45
hot fix
williamFalcon May 31, 2020
d90f8f5
hot fix
williamFalcon May 31, 2020
acff33a
hot fix
williamFalcon May 31, 2020
c375b59
hot fix
williamFalcon May 31, 2020
e11087e
hot fix
williamFalcon May 31, 2020
abfb2ff
hot fix
williamFalcon May 31, 2020
2acdabf
hot fix
williamFalcon May 31, 2020
a7f9ea6
hot fix
williamFalcon May 31, 2020
af8d8b8
hot fix
williamFalcon May 31, 2020
682e342
hot fix
williamFalcon May 31, 2020
0d4909e
hot fix
williamFalcon May 31, 2020
69a5f1a
hot fix
williamFalcon May 31, 2020
7708f0d
hot fix
williamFalcon Jun 1, 2020
3ac8db6
hot fix
williamFalcon Jun 1, 2020
17edcc1
hot fix
williamFalcon Jun 1, 2020
3670e4e
hot fix
williamFalcon Jun 1, 2020
37b5895
hot fix
williamFalcon Jun 1, 2020
40c5e03
hot fix
williamFalcon Jun 1, 2020
a2ec041
hot fix
williamFalcon Jun 1, 2020
399e82b
hot fix
williamFalcon Jun 1, 2020
4a1790e
hot fix
williamFalcon Jun 1, 2020
427d0d5
hot fix
williamFalcon Jun 1, 2020
ef925c1
hot fix
williamFalcon Jun 1, 2020
d39e54a
hot fix
williamFalcon Jun 1, 2020
8efb5f1
hot fix
williamFalcon Jun 1, 2020
d6f0ef3
hot fix
williamFalcon Jun 1, 2020
400f654
hot fix
williamFalcon Jun 1, 2020
1e32958
hot fix
williamFalcon Jun 1, 2020
47759d3
hot fix
williamFalcon Jun 1, 2020
9068ad4
hot fix
williamFalcon Jun 1, 2020
c0bddce
hot fix
williamFalcon Jun 1, 2020
76d6532
hot fix
williamFalcon Jun 1, 2020
ddafec0
hot fix
williamFalcon Jun 1, 2020
34f843f
hot fix
williamFalcon Jun 1, 2020
08c2450
hot fix
williamFalcon Jun 1, 2020
7208dae
hot fix
williamFalcon Jun 1, 2020
1d95a06
hot fix
williamFalcon Jun 1, 2020
c2082e9
hot fix
williamFalcon Jun 1, 2020
6c255bd
hot fix
williamFalcon Jun 1, 2020
e0be6f3
hot fix
williamFalcon Jun 1, 2020
e6cfe9c
hot fix
williamFalcon Jun 1, 2020
40eb079
hot fix
williamFalcon Jun 1, 2020
64de141
hot fix
williamFalcon Jun 1, 2020
12943c8
hot fix
williamFalcon Jun 1, 2020
c1ffd03
hot fix
williamFalcon Jun 1, 2020
c42b4c5
hot fix
williamFalcon Jun 1, 2020
750ffc0
hot fix
williamFalcon Jun 1, 2020
c5cb695
hot fix
williamFalcon Jun 1, 2020
0e27016
hot fix
williamFalcon Jun 1, 2020
fdd9958
hot fix
williamFalcon Jun 1, 2020
4d633e3
hot fix
williamFalcon Jun 1, 2020
17e52d3
hot fix
williamFalcon Jun 1, 2020
601816b
hot fix
williamFalcon Jun 1, 2020
4d0e8a3
hot fix
williamFalcon Jun 1, 2020
a36e451
hot fix
williamFalcon Jun 1, 2020
332b0db
hot fix
williamFalcon Jun 1, 2020
f424040
hot fix
williamFalcon Jun 1, 2020
705e805
hot fix
williamFalcon Jun 1, 2020
07777d0
hot fix
williamFalcon Jun 1, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .run_local_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ rm -rf ./tests/cometruns*
rm -rf ./tests/wandb*
rm -rf ./tests/tests/*
rm -rf ./lightning_logs
python -m coverage run --source pytorch_lightning -m py.test pytorch_lightning tests pl_examples -v --doctest-modules --flake8
python -m coverage run --source pytorch_lightning -m py.test pytorch_lightning tests pl_examples -v --doctest-modules --flake8 --durations=0
python -m coverage report -m

# specific file
# python -m coverage run --source pytorch_lightning -m py.test -k test_trainer.py --flake8
# python -m coverage run --source pytorch_lightning -m py.test -k test_trainer.py --flake8 --durations=0
17 changes: 8 additions & 9 deletions pl_examples/basic_examples/cpu_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,23 @@
import pytorch_lightning as pl
from pl_examples.models.lightning_template import LightningTemplateModel

SEED = 2334
torch.manual_seed(SEED)
np.random.seed(SEED)
pl.seed_everything(234)


def main(hparams):
def main(args):
"""
Main training routine specific for this project
:param hparams:
:param args:
"""
# ------------------------
# 1 INIT LIGHTNING MODEL
# ------------------------
model = LightningTemplateModel(hparams)
model = LightningTemplateModel(**vars(args))

# ------------------------
# 2 INIT TRAINER
# ------------------------
trainer = pl.Trainer(max_epochs=hparams.epochs, overfit_pct=0.01, early_stop_callback=True)
trainer = pl.Trainer.from_argparse_args(args)

# ------------------------
# 3 START TRAINING
Expand All @@ -46,9 +44,10 @@ def main(hparams):

# each LightningModule defines arguments relevant to it
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
hyperparams = parser.parse_args()
parser = pl.Trainer.add_argparse_args(parser)
args = parser.parse_args()

# ---------------------
# RUN TRAINING
# ---------------------
main(hyperparams)
main(args)
2 changes: 1 addition & 1 deletion pytorch_lightning/core/lightning.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,7 @@ def init_ddp_connection(
f"is not equal to the computed world size ({world_size}). Ignored.")

torch_backend = "nccl" if self.trainer.on_gpu else "gloo"
log.info(f"initializing proc_rank {proc_rank} world {world_size}")
log.info(f"initializing ddp: LOCAL_RANK: {proc_rank}/{world_size - 1} WORLD_SIZE:{world_size}")
torch_distrib.init_process_group(torch_backend, rank=proc_rank, world_size=world_size)

def configure_apex(
Expand Down
88 changes: 82 additions & 6 deletions pytorch_lightning/trainer/distrib_data_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ def train_fx(trial_hparams, cluster_manager, _):
import re
from abc import ABC, abstractmethod
from typing import Union
import subprocess
import sys
from time import sleep
import numpy as np
from os.path import abspath

import torch
from pytorch_lightning import _logger as log
Expand Down Expand Up @@ -311,7 +316,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

# when slurm is managing the task it sets the visible devices
if not is_slurm_managing_tasks:
if not is_slurm_managing_tasks and 'CUDA_VISIBLE_DEVICES' not in os.environ:
if isinstance(data_parallel_device_ids, int):
id_str = ','.join(str(x) for x in list(range(data_parallel_device_ids)))
os.environ["CUDA_VISIBLE_DEVICES"] = id_str
Expand All @@ -322,7 +327,74 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
# don't make this debug... this is good UX
log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')

def ddp_train(self, process_idx, model):
def __set_random_port(self):
"""
When running DDP NOT managed by SLURM, the ports might collide
:return:
"""
try:
default_port = os.environ['MASTER_PORT']
except Exception:
import random
default_port = random.randint(10000, 19000)
os.environ['MASTER_PORT'] = str(default_port)

def spawn_ddp_children(self, model):
self.__set_random_port()
port = os.environ['MASTER_PORT']

master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR']
os.environ['MASTER_PORT'] = f'{port}'
os.environ['MASTER_ADDR'] = f'{master_address}'

# allow the user to pass the node rank
node_rank = '0'
if 'NODE_RANK' in os.environ:
node_rank = os.environ['NODE_RANK']
if 'GROUP_RANK' in os.environ:
node_rank = os.environ['GROUP_RANK']

os.environ['NODE_RANK'] = node_rank
os.environ['LOCAL_RANK'] = '0'

# pull out the commands used to run the script and resolve the abs file path
command = sys.argv
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this assumes that it can be called only from cmd with Trainer like arguments...
but what about just a sprit with loaded params from a file...
cc @PyTorchLightning/core-contributors

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you give an example?
these tests were done with cli flags and also trainer args. this local rank is coming from the gpus flag in the trainer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also assumes that it is called as (python) path/to/some/script.py and not e.g. from a console entry point

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

full_path = abspath(command[0])
command[0] = full_path
command = ['python'] + command

# since this script sets the visible devices we replace the gpus flag with a number
num_gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',').__len__()

# if script called without a flag, pass in a flag anyhow
if '--gpus' not in command:
arg_gpus = len(self.gpus) if isinstance(self.gpus, list) else self.gpus
command += ['--gpus', arg_gpus]

gpu_flag_idx = command.index('--gpus')
command[gpu_flag_idx + 1] = f'{num_gpus}'

os.environ['WORLD_SIZE'] = f'{num_gpus * self.num_nodes}'

self.interactive_ddp_procs = []
for local_rank in range(1, self.num_processes):
env_copy = os.environ.copy()
env_copy['LOCAL_RANK'] = f'{local_rank}'

# import pdb; pdb.set_trace()
# start process
proc = subprocess.Popen(command, env=env_copy)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

starts the process using env flags and python command

self.interactive_ddp_procs.append(proc)

# starting all processes at once can cause issues
# with dataloaders delay between 1-10 seconds
delay = np.random.uniform(1, 5, 1)[0]
williamFalcon marked this conversation as resolved.
Show resolved Hide resolved
sleep(delay)

local_rank = 0
self.ddp_train(local_rank, model, is_master=True)

def ddp_train(self, process_idx, model, is_master=False):
"""
Entry point into a DP thread
:param gpu_idx:
Expand Down Expand Up @@ -359,7 +431,14 @@ def ddp_train(self, process_idx, model):
# MODEL
# copy model to each gpu
if self.on_gpu:
self.root_gpu = process_idx
gpu_idx = process_idx
if is_master:
# source of truth is cuda for gpu idx
gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
local_rank = int(os.environ['LOCAL_RANK'])
gpu_idx = int(gpus[local_rank])

self.root_gpu = gpu_idx
torch.cuda.set_device(self.root_gpu)
model.cuda(self.root_gpu)

Expand Down Expand Up @@ -388,9 +467,6 @@ def ddp_train(self, process_idx, model):
# continue training routine
self.run_pretrain_routine(model)

# when ddp ends, we save the model
self.save_spawn_weights(model)

def save_spawn_weights(self, model):
"""
Dump a temporary checkpoint after ddp ends to get weights out of the process
Expand Down
10 changes: 10 additions & 0 deletions pytorch_lightning/trainer/distrib_parts.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,8 +685,18 @@ def sanitize_gpu_ids(gpus):
:return: unmodified gpus variable
"""
all_available_gpus = get_all_available_gpus()
misconfig = False
for gpu in gpus:
if gpu not in all_available_gpus:
misconfig = True

if misconfig:
# sometimes auto ddp might have different flags
# but this is not what the user intended
# correct for the user
if len(gpus) == len(all_available_gpus):
gpus = all_available_gpus
else:
raise MisconfigurationException(f"""
You requested GPUs: {gpus}
But your machine only has: {all_available_gpus}
Expand Down
38 changes: 14 additions & 24 deletions pytorch_lightning/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities import rank_zero_warn, parsing


try:
from apex import amp
except ImportError:
Expand Down Expand Up @@ -119,7 +118,7 @@ def __init__(
distributed_backend: Optional[str] = None,
precision: int = 32,
print_nan_grads: bool = False, # backward compatible, todo: remove in v0.9.0
weights_summary: Optional[str] = 'full',
weights_summary: Optional[str] = 'top',
Comment on lines -122 to +121
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is not reflected in the docs

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mind shot a fix PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes unless it was not intentional. @williamFalcon yes or no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, forgot to update docs on this.
i propose we make it top going forward

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added it here #2021, hope it's fine including it there.

weights_save_path: Optional[str] = None,
num_sanity_val_steps: int = 2,
truncated_bptt_steps: Optional[int] = None,
Expand Down Expand Up @@ -494,6 +493,7 @@ def __init__(
# init flags for SLURM+ddp to work
self.proc_rank = 0
self.world_size = 1
self.interactive_ddp_procs = []
self.configure_slurm_ddp(self.num_nodes)
self.node_rank = self.determine_ddp_node_rank()

Expand Down Expand Up @@ -871,16 +871,12 @@ def fit(
task = int(os.environ['LOCAL_RANK'])
self.ddp_train(task, model)

else:
self.__set_random_port()
# track for predict
elif self.distributed_backend == 'cpu_ddp':
self.model = model
# train
mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))
# load weights if not interrupted
if self.on_colab_kaggle:
self.load_spawn_weights(model)
self.model = model

elif self.distributed_backend == 'ddp':
self.spawn_ddp_children(model)

# 1 gpu or dp option triggers training using DP module
# easier to avoid NCCL issues
Expand Down Expand Up @@ -928,18 +924,6 @@ def fit(
# used for testing or when we need to know that training succeeded
return 1

def __set_random_port(self):
"""
When running DDP NOT managed by SLURM, the ports might collide
:return:
"""
try:
default_port = os.environ['MASTER_PORT']
except Exception:
import random
default_port = random.randint(10000, 19000)
os.environ['MASTER_PORT'] = str(default_port)

def __attach_dataloaders(self, model, train_dataloader=None, val_dataloaders=None, test_dataloaders=None):
# when dataloader is passed via fit, patch the train_dataloader
# functions to overwrite with these implementations
Expand Down Expand Up @@ -1046,7 +1030,10 @@ def run_pretrain_routine(self, model: LightningModule):

# clear cache before training
if self.on_gpu:
torch.cuda.empty_cache()
# use context because of:
# https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898
with torch.cuda.device(f'cuda:{self.root_gpu}'):
torch.cuda.empty_cache()

# CORE TRAINING LOOP
self.train()
Expand Down Expand Up @@ -1096,7 +1083,10 @@ def test(
if model is not None:
self.model = model
self.fit(model)
elif self.use_ddp or self.use_tpu: # pragma: no-cover

# on tpu, .spawn means we don't have a trained model
# TODO: remove TPU spawn
elif self.use_tpu: # pragma: no-cover
# attempt to load weights from a spawn
path = os.path.join(self.default_root_dir, '__temp_weight_ddp_end.ckpt')
test_model = self.model
Expand Down
35 changes: 19 additions & 16 deletions pytorch_lightning/trainer/training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def training_step(self, batch, batch_idx):
from pytorch_lightning.trainer.supporters import TensorRunningAccum
from pytorch_lightning.utilities import rank_zero_warn
from pytorch_lightning.utilities.exceptions import MisconfigurationException
import subprocess

try:
from apex import amp
Expand Down Expand Up @@ -305,13 +306,13 @@ def has_arg(self, *args):

def train(self):
# add signal handlers for process kills
def _signal_kill_handler(*args):
return TrainerTrainLoopMixin.run_training_teardown(self)

orig_signal_handlers = {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as mentioned @justusschock i disabled this since it was a blocker.
TBH, i don't know how this was allowed on master with the exceptions being thrown every time

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with temporarily disabling this. In our CI these exceptions did not appear

for sig_name in SIGNAL_TERMINATE:
orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
_signal_kill_handler)
# def _signal_kill_handler(*args):
# return TrainerTrainLoopMixin.run_training_teardown(self)
#
# orig_signal_handlers = {}
# for sig_name in SIGNAL_TERMINATE:
# orig_signal_handlers[sig_name] = signal.signal(getattr(signal, sig_name),
# _signal_kill_handler)

# get model
model = self.get_model()
Expand Down Expand Up @@ -384,15 +385,17 @@ def _signal_kill_handler(*args):

self.run_training_teardown()

# reset signal handlers
for sig_name in SIGNAL_TERMINATE:
signal.signal(getattr(signal, sig_name), orig_signal_handlers[sig_name])

except KeyboardInterrupt:
if self.proc_rank == 0:
log.info('Detected KeyboardInterrupt, attempting graceful shutdown...')
self.interrupted = True
self.run_training_teardown()
rank_zero_warn('Detected KeyboardInterrupt, attempting graceful shutdown...')

# user could press ctrl+c many times... only shutdown once
if not self.interrupted:
self.interrupted = True

for proc in self.interactive_ddp_procs:
subprocess.Popen.kill(proc)

self.run_training_teardown()

def run_training_epoch(self):

Expand Down Expand Up @@ -678,7 +681,7 @@ def _get_optimizers_iterable(self):
opt_idx = np.argmax(optimizer_freq_cumsum > current_place_in_loop)
return [(opt_idx, self.optimizers[opt_idx])]

@atexit.register
# @atexit.register
def run_training_teardown(self):
if hasattr(self, '_teardown_already_run') and self._teardown_already_run:
return
Expand Down
2 changes: 1 addition & 1 deletion tests/base/model_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def dataloader(self, train):
loader = DataLoader(
dataset=dataset,
batch_size=self.batch_size,
# test and valid shall not be shuffled
num_workers=3,
shuffle=train,
)
return loader
Expand Down
4 changes: 2 additions & 2 deletions tests/base/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def assert_speed_parity(pl_times, pt_times, num_epochs):
f"lightning was slower than PT (threshold {max_diff_per_epoch})"


def run_model_test_without_loggers(trainer_options, model, min_acc=0.50):
def run_model_test_without_loggers(trainer_options, model, min_acc=0.30):
williamFalcon marked this conversation as resolved.
Show resolved Hide resolved
reset_seed()

# fit model
Expand Down Expand Up @@ -155,7 +155,7 @@ def load_model_from_checkpoint(root_weights_dir, module_class=EvalModelTemplate)
return trained_model


def run_prediction(dataloader, trained_model, dp=False, min_acc=0.5):
def run_prediction(dataloader, trained_model, dp=False, min_acc=0.3):
# run prediction on 1 batch
for batch in dataloader:
break
Expand Down
Loading