BUG: Hyperdrive jobs not possible with v2 SDK #880

Draft: wants to merge 9 commits into main
54 changes: 46 additions & 8 deletions .github/workflows/hi-ml-pr.yml
@@ -224,7 +224,7 @@ jobs:
with:
flags: ${{ matrix.folder }}

-himl_smoke_helloworld_v1:
+smoke_helloworld_v1:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -242,7 +242,7 @@ jobs:
cd hi-ml
make smoke_helloworld_v1

-himl_smoke_helloworld_v2:
+smoke_helloworld_v2:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -260,7 +260,7 @@ jobs:
cd hi-ml
make smoke_helloworld_v2

-himl_smoke_helloworld_v1_2nodes:
+smoke_helloworld_v1_2nodes:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -278,7 +278,7 @@ jobs:
cd hi-ml
make smoke_helloworld_v1_2nodes

-himl_smoke_helloworld_v2_2nodes:
+smoke_helloworld_v2_2nodes:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -296,15 +296,53 @@ jobs:
cd hi-ml
make smoke_helloworld_v2_2nodes

smoke_helloworld_v1_crossval:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
lfs: true

- name: Set up Python ${{ env.pythonVersion }}
uses: ./.github/actions/prepare_himl_python_env
with:
python-version: ${{ env.pythonVersion }}

- name: Run smoke_helloworld_v1_crossval
run: |
cd hi-ml
make smoke_helloworld_v1_crossval

smoke_helloworld_v2_crossval:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
lfs: true

- name: Set up Python ${{ env.pythonVersion }}
uses: ./.github/actions/prepare_himl_python_env
with:
python-version: ${{ env.pythonVersion }}

- name: Run smoke_helloworld_v2_crossval
run: |
cd hi-ml
make smoke_helloworld_v2_crossval

himl-smoke-tests-completed:
# This job is just a placeholder to ensure that all smoke tests have completed before
# publishing the package. Reference this job rather than the individual smoke tests.
runs-on: ubuntu-20.04
needs: [
-himl_smoke_helloworld_v1,
-himl_smoke_helloworld_v2,
-himl_smoke_helloworld_v1_2nodes,
-himl_smoke_helloworld_v2_2nodes,
+smoke_helloworld_v1,
+smoke_helloworld_v2,
+smoke_helloworld_v1_2nodes,
+smoke_helloworld_v2_2nodes,
+smoke_helloworld_v1_crossval,
+smoke_helloworld_v2_crossval,
]
steps:
- name: Smoke tests completed
1 change: 1 addition & 0 deletions hi-ml-azure/src/health_azure/himl.py
@@ -584,6 +584,7 @@ def create_command_job(cmd: str) -> Command:
# underlying command such as experiment name and max_total_trials
job_to_submit.experiment_name = experiment_name
job_to_submit.set_limits(max_total_trials=hyperparam_args.get(MAX_TOTAL_TRIALS_ARG, None))
+job_to_submit.shm_size = docker_shm_size

else:
job_to_submit = create_command_job(cmd)
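Note for context: deriving a sweep job from a command job in the v2 SDK does not carry over container settings such as shm_size, so they must be re-applied on the sweep job itself, just like experiment_name and set_limits above. A minimal sketch, assuming hypothetical cluster, environment, script, and metric names:

from azure.ai.ml import command
from azure.ai.ml.sweep import Choice

# The underlying command job; shm_size sets the Docker shared memory of the container:
base_job = command(
    code="./src",
    command="python train.py --learning_rate ${{inputs.learning_rate}}",
    inputs={"learning_rate": 0.01},
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    compute="my-cluster",
    shm_size="20g",
)
# Turn the learning rate into a search space and derive the sweep job:
sweep_job = base_job(learning_rate=Choice([0.01, 0.1])).sweep(
    sampling_algorithm="random",
    primary_metric="val_loss",
    goal="Minimize",
)
sweep_job.set_limits(max_total_trials=2)
# shm_size is not inherited from base_job, so set it again on the sweep job:
sweep_job.shm_size = "20g"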
12 changes: 12 additions & 0 deletions hi-ml/Makefile
@@ -103,3 +103,15 @@ smoke_helloworld_v1_2nodes:
# HelloWorld model training on 2 nodes, submitted using the v2 SDK
smoke_helloworld_v2_2nodes:
python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=False --num_nodes=2 --tag smoke_helloworld_v2_2nodes ${SHARED_ARGS}

# HelloWorld model training via cross-validation, submitted using the v1 SDK
# Use the HelloWorldWithMemoryCheck model because AzureML has had issues with not allocating enough memory
# in Hyperdrive jobs
smoke_helloworld_v1_crossval:
python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=True --crossval_count=2 --tag smoke_helloworld_v1_crossval ${SHARED_ARGS}

# HelloWorld model training via cross-validation, submitted using the v2 SDK
# Use the HelloWorldWithMemoryCheck model because AzureML has had issues with not allocating enough memory
# in Hyperdrive jobs
smoke_helloworld_v2_crossval:
python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=False --crossval_count=2 --tag smoke_helloworld_v2_crossval ${SHARED_ARGS}
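As a usage sketch (hypothetical cluster name; assumes the runner accepts a --cluster override, with SHARED_ARGS being the Make variable the targets above expand): cd hi-ml && make smoke_helloworld_v2_crossval SHARED_ARGS="--cluster=my-cluster"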
21 changes: 21 additions & 0 deletions hi-ml/src/health_ml/configs/hello_world.py
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
+import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

@@ -15,6 +16,7 @@
from torch.utils.data import DataLoader, Dataset

from health_ml.lightning_container import LightningContainer
+from health_ml.utils.common_utils import get_docker_memory_gb

TEST_MSE_FILE = "test_mse.txt"
TEST_MAE_FILE = "test_mae.txt"
@@ -291,3 +293,22 @@ def get_callbacks(self) -> List[Callback]:

def get_additional_aml_run_tags(self) -> Dict[str, str]:
return {"max_epochs": str(self.max_epochs)}


class HelloWorldWithMemoryCheck(HelloWorld):
"""
A variant of the HelloWorld container that checks that there is enough memory available to run the model.
"""

def __init__(self) -> None:
super().__init__()
self.docker_shm_size_gb = 20
self.docker_shm_size = f"{self.docker_shm_size_gb}g"

def before_training_on_global_rank_zero(self) -> None:
docker_memory = get_docker_memory_gb(verbose=True)
assert docker_memory is not None
if docker_memory < self.docker_shm_size_gb:
raise ValueError(
f"Not enough memory available. Requested {self.docker_shm_size_gb} GB, but only got {docker_memory} GB"
)
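For example (illustrative numbers): if AzureML were to place this job in a container that reports only 16.0 GB via get_docker_memory_gb, the check above would raise ValueError("Not enough memory available. Requested 20 GB, but only got 16.0 GB").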
3 changes: 3 additions & 0 deletions hi-ml/src/health_ml/deep_learning_config.py
@@ -168,6 +168,9 @@ class WorkflowParams(param.Parameterized):
This class contains all parameters that affect how the whole training and testing workflow is executed.
"""

+docker_shm_size: str = param.String(
+    "400g", doc="The Docker shared memory size that is required to run this model in AzureML."
+)
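Because the parameter now lives on WorkflowParams, every model container can override it per model (the new HelloWorldWithMemoryCheck above does exactly this). A minimal sketch with a hypothetical container:

from health_ml.lightning_container import LightningContainer


class MyContainer(LightningContainer):
    def __init__(self) -> None:
        super().__init__()
        # Request 32 GB of Docker shared memory when this model is submitted to AzureML:
        self.docker_shm_size = "32g"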
random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.")
src_checkpoint: CheckpointParser = param.ClassSelector(
class_=CheckpointParser, default=None, instantiate=False, doc=CheckpointParser.DOC
1 change: 0 additions & 1 deletion hi-ml/src/health_ml/experiment_config.py
@@ -43,7 +43,6 @@ class ExperimentConfig(param.Parameterized):
"over the network). When running outside AzureML, datasets will "
"always be mounted.",
)
-docker_shm_size: str = param.String("400g", doc="The shared memory in the Docker image for the AzureML VMs.")
wait_for_completion: bool = param.Boolean(
default=False,
doc="If True, wait for AML Run to complete before proceeding. If False, submit the run to AML and exit",
34 changes: 12 additions & 22 deletions hi-ml/src/health_ml/runner.py
@@ -50,7 +50,13 @@
from health_ml.training_runner import TrainingRunner # noqa: E402
from health_ml.utils import fixed_paths # noqa: E402
from health_ml.utils.logging import ConsoleAndFileOutput # noqa: E402
-from health_ml.utils.common_utils import check_conda_environment, choose_conda_env_file, is_linux  # noqa: E402
+from health_ml.utils.common_utils import (
+    check_conda_environment,
+    choose_conda_env_file,
+    get_docker_memory_gb,
+    get_memory_gb,
+    initialize_rpdb,
+)  # noqa: E402
from health_ml.utils.config_loader import ModelConfigLoader # noqa: E402
from health_ml.utils import health_ml_package_setup # noqa: E402

@@ -62,24 +68,6 @@
sys.argv[0] = str(runner_path.resolve())


-def initialize_rpdb() -> None:
-    """
-    On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
-    """
-    # rpdb signal trapping does not work on Windows, as there is no SIGTRAP:
-    if not is_linux():
-        return
-    import rpdb
-
-    rpdb_port = 4444
-    rpdb.handle_trap(port=rpdb_port)
-    # For some reason, os.getpid() does not return the ID of what appears to be the currently running process.
-    logging.info(
-        "rpdb is handling traps. To debug: identify the main runner.py process, then as root: "
-        f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}"
-    )


def create_runner_parser() -> argparse.ArgumentParser:
"""
Creates a commandline parser, that understands all necessary arguments for training a model
@@ -191,9 +179,9 @@ def run(self) -> Tuple[LightningContainer, AzureRunInfo]:
# Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
log_level = logging.INFO if is_local_rank_zero() else logging.ERROR
logging_to_stdout(log_level)
-# When running in Azure, also output logging to a file. This can help in particular when jobs
-# get preempted, but we don't get access to the logs from the previous incarnation of the job
initialize_rpdb()
+get_memory_gb(verbose=True)
+get_docker_memory_gb(verbose=True)
self.parse_and_load_model()
self.validate()
azure_run_info = self.submit_to_azureml_if_needed()
@@ -278,7 +266,7 @@ def submit_to_azureml_if_needed(self) -> AzureRunInfo:
ignored_folders=[],
submit_to_azureml=bool(self.experiment_config.cluster),
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
-docker_shm_size=self.experiment_config.docker_shm_size,
+docker_shm_size=self.lightning_container.docker_shm_size,
hyperdrive_config=hyperdrive_config,
hyperparam_args=hyperparam_args,
display_name=self.lightning_container.tag,
@@ -386,6 +374,8 @@ def run_with_logging(project_root: Path) -> Tuple[LightningContainer, AzureRunInfo]:
Start the main entry point for training and testing models from the commandline.
When running in Azure, this method also redirects the stdout stream, so that all console output is visible both on
the console and stored in a file. The filename is timestamped and contains the DDP rank of the current process.
+This can help in particular when jobs get preempted, because we otherwise don't get access to the logs from the
+previous incarnation of the job.

:param project_root: The root folder that contains all of the source code that should be executed.
:return: If submitting to AzureML, returns the model configuration that was used for training,
84 changes: 82 additions & 2 deletions hi-ml/src/health_ml/utils/common_utils.py
@@ -3,12 +3,13 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

+import logging
import os
from contextlib import contextmanager
from datetime import datetime
from enum import Enum, unique
from pathlib import Path
-from typing import Any, Generator, List, Optional
+from typing import Any, Generator, List, Optional, Tuple

import torch
from torch.nn import Module

MAX_PATH_LENGTH = 260

-# convert string to None if an empty string or whitespace is provided

+logger = logging.getLogger(__name__)


def empty_string_to_none(x: Optional[str]) -> Optional[str]:
+    """convert string to None if an empty string or whitespace is provided"""
    return None if (x is None or len(x.strip()) == 0) else x


@@ -226,3 +229,80 @@ def seed_monai_if_available(seed: int) -> None:
set_determinism(seed=seed)
except ImportError:
pass


def initialize_rpdb() -> None:
"""
On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
"""
# rpdb signal trapping does not work on Windows, as there is no SIGTRAP:
if not is_linux():
return
import rpdb

rpdb_port = 4444
rpdb.handle_trap(port=rpdb_port)
# For some reason, os.getpid() does not return the ID of what appears to be the currently running process.
logger.info(
"rpdb is handling traps. To debug: identify the main runner.py process, then as root: "
f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}"
)


def _is_running_in_docker() -> bool:
"""Return True if the present process is likely to run inside a Docker container"""
return Path("/.dockerenv").exists()


def get_docker_memory_gb(verbose: bool = False) -> Optional[float]:
"""Get the total amount of memory when running in a Docker container. If the process does not
appear to run in a Docker container, return None.

:param verbose: If True, print the amount of total Docker memory to stdout.
:return: The total amount of Docker memory in GB, or None if the process is not running in Docker.
"""
if is_linux() and _is_running_in_docker():
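# This is the cgroup v1 location; hosts running cgroup v2 expose the limit at /sys/fs/cgroup/memory.max instead.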
mem_limit_path = Path("/sys/fs/cgroup/memory/memory.limit_in_bytes")
try:
mem_limit = int(mem_limit_path.read_text().strip())
except (OSError, ValueError):
logger.warning(f"Unable to read {mem_limit_path} or process the contents.")
return None
byte_per_GB = 1024.0**3
docker_gb = round(mem_limit / byte_per_GB, 3)
if verbose:
print(f"Total Docker memory: {docker_gb} GB")
return docker_gb

if verbose:
print("Unable to determine Docker memory size because the process does not appear to run in Docker.")
return None


def get_memory_gb(verbose: bool = False) -> Optional[Tuple[float, float, float, float]]:
"""Get the CPU memory, available CPU memory, total memory (CPU plus swap), available memory in GB.
This relies on the Linux 'free' command being available. The function returns None if the command is not available.

:param verbose: If True, print the output of the 'free' command to stdout.
:return: Tuple of (CPU memory, CPU memory available, total memory, total memory available), all in GB.
"""
free_commandline = "free -t -m"
try:
free_output = os.popen(free_commandline).readlines()
except OSError:
logger.warning(f"Unable to run '{free_commandline}'")
return None
if verbose:
print(f"Checking available memory. Result of running '{free_commandline}' (available memory in MB):")
for line in free_output:
# Lines still contain a newline at the end, so no need to add that
print(line, end="")
if len(free_output) < 4:
logger.warning(f"Unexpected result when running '{free_commandline}': {free_output}")
return None
cpu_mem, _, cpu_mem_available = map(float, free_output[1].split()[1:4])
total_mem, _, total_mem_available = map(float, free_output[3].split()[1:4])
MB_per_GB = 1024.0
values = (cpu_mem, cpu_mem_available, total_mem, total_mem_available)
result = tuple(map(lambda x: round(x / MB_per_GB, 3), values))
return result # type: ignore
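For reference, the index arithmetic above assumes the standard layout of 'free -t -m' output (illustrative values shown): row 1 is the Mem line, row 3 is the Total line, and columns 1-3 of each hold total/used/free in MB, with the 'free' column used as the available figure.

              total        used        free      shared  buff/cache   available
Mem:          15992        4521        8124         301        3346       10854
Swap:          4095           0        4095
Total:        20087        4521       12219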