BUG: Hyperdrive jobs not possible with v2 SDK #880

Draft: wants to merge 9 commits into main
54 changes: 46 additions & 8 deletions .github/workflows/hi-ml-pr.yml
@@ -224,7 +224,7 @@ jobs:
with:
flags: ${{ matrix.folder }}

-himl_smoke_helloworld_v1:
+smoke_helloworld_v1:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -242,7 +242,7 @@ jobs:
cd hi-ml
make smoke_helloworld_v1

-himl_smoke_helloworld_v2:
+smoke_helloworld_v2:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -260,7 +260,7 @@ jobs:
cd hi-ml
make smoke_helloworld_v2

-himl_smoke_helloworld_v1_2nodes:
+smoke_helloworld_v1_2nodes:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -278,7 +278,7 @@ jobs:
cd hi-ml
make smoke_helloworld_v1_2nodes

-himl_smoke_helloworld_v2_2nodes:
+smoke_helloworld_v2_2nodes:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
@@ -296,15 +296,53 @@ jobs:
cd hi-ml
make smoke_helloworld_v2_2nodes

smoke_helloworld_v1_crossval:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
lfs: true

- name: Set up Python ${{ env.pythonVersion }}
uses: ./.github/actions/prepare_himl_python_env
with:
python-version: ${{ env.pythonVersion }}

- name: Run smoke_helloworld_v1_crossval
run: |
cd hi-ml
make smoke_helloworld_v1_crossval

smoke_helloworld_v2_crossval:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
lfs: true

- name: Set up Python ${{ env.pythonVersion }}
uses: ./.github/actions/prepare_himl_python_env
with:
python-version: ${{ env.pythonVersion }}

- name: Run smoke_helloworld_v2_crossval
run: |
cd hi-ml
make smoke_helloworld_v2_crossval

himl-smoke-tests-completed:
# This job is just a placeholder to ensure that all smoke tests have completed before
# publishing the package. Reference this job rather than the individual smoke tests.
runs-on: ubuntu-20.04
needs: [
-himl_smoke_helloworld_v1,
-himl_smoke_helloworld_v2,
-himl_smoke_helloworld_v1_2nodes,
-himl_smoke_helloworld_v2_2nodes,
+smoke_helloworld_v1,
+smoke_helloworld_v2,
+smoke_helloworld_v1_2nodes,
+smoke_helloworld_v2_2nodes,
+smoke_helloworld_v1_crossval,
+smoke_helloworld_v2_crossval,
]
steps:
- name: Smoke tests completed
1 change: 1 addition & 0 deletions hi-ml-azure/src/health_azure/himl.py
@@ -584,6 +584,7 @@ def create_command_job(cmd: str) -> Command:
# underlying command such as experiment name and max_total_trials
job_to_submit.experiment_name = experiment_name
job_to_submit.set_limits(max_total_trials=hyperparam_args.get(MAX_TOTAL_TRIALS_ARG, None))
+job_to_submit.shm_size = docker_shm_size

else:
job_to_submit = create_command_job(cmd)
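Note for context: deriving a sweep job from a command job in the v2 SDK does not carry over container settings such as shm_size, so they must be re-applied on the sweep job itself, just like experiment_name and set_limits above. A minimal sketch, assuming hypothetical cluster, environment, script, and metric names:

from azure.ai.ml import command
from azure.ai.ml.sweep import Choice

# The underlying command job; shm_size sets the Docker shared memory of the container:
base_job = command(
    code="./src",
    command="python train.py --learning_rate ${{inputs.learning_rate}}",
    inputs={"learning_rate": 0.01},
    environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    compute="my-cluster",
    shm_size="20g",
)
# Turn the learning rate into a search space and derive the sweep job:
sweep_job = base_job(learning_rate=Choice([0.01, 0.1])).sweep(
    sampling_algorithm="random",
    primary_metric="val_loss",
    goal="Minimize",
)
sweep_job.set_limits(max_total_trials=2)
# shm_size is not inherited from base_job, so set it again on the sweep job:
sweep_job.shm_size = "20g"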
12 changes: 12 additions & 0 deletions hi-ml/Makefile
@@ -103,3 +103,15 @@ smoke_helloworld_v1_2nodes:
# HelloWorld model training on 2 nodes, submitted using the v2 SDK
smoke_helloworld_v2_2nodes:
python src/health_ml/runner.py --model=health_ml.HelloWorld --strictly_aml_v1=False --num_nodes=2 --tag smoke_helloworld_v2_2nodes ${SHARED_ARGS}

# HelloWorld model training via cross-validation, submitted using the v1 SDK
# Use the HelloWorldWithMemoryCheck model because AzureML has had issues with not allocating enough memory
# in Hyperdrive jobs
smoke_helloworld_v1_crossval:
python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=True --crossval_count=2 --tag smoke_helloworld_v1_crossval ${SHARED_ARGS}

# HelloWorld model training via cross-validation, submitted using the v2 SDK
# Use the HelloWorldWithMemoryCheck model because AzureML has had issues with not allocating enough memory
# in Hyperdrive jobs
smoke_helloworld_v2_crossval:
python src/health_ml/runner.py --model=health_ml.HelloWorldWithMemoryCheck --strictly_aml_v1=False --crossval_count=2 --tag smoke_helloworld_v2_crossval ${SHARED_ARGS}
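As a usage sketch (hypothetical cluster name; assumes the runner accepts a --cluster override, with SHARED_ARGS being the Make variable the targets above expand): cd hi-ml && make smoke_helloworld_v2_crossval SHARED_ARGS="--cluster=my-cluster"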
21 changes: 21 additions & 0 deletions hi-ml/src/health_ml/configs/hello_world.py
@@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
+import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

@@ -15,6 +16,7 @@
from torch.utils.data import DataLoader, Dataset

from health_ml.lightning_container import LightningContainer
+from health_ml.utils.common_utils import get_docker_memory_gb

TEST_MSE_FILE = "test_mse.txt"
TEST_MAE_FILE = "test_mae.txt"
@@ -291,3 +293,22 @@ def get_callbacks(self) -> List[Callback]:

def get_additional_aml_run_tags(self) -> Dict[str, str]:
return {"max_epochs": str(self.max_epochs)}


class HelloWorldWithMemoryCheck(HelloWorld):
"""
A variant of the HelloWorld container that checks that there is enough memory available to run the model.
"""

def __init__(self) -> None:
super().__init__()
self.docker_shm_size_gb = 20
self.docker_shm_size = f"{self.docker_shm_size_gb}g"

def before_training_on_global_rank_zero(self) -> None:
docker_memory = get_docker_memory_gb(verbose=True)
assert docker_memory is not None
if docker_memory < self.docker_shm_size_gb:
raise ValueError(
f"Not enough memory available. Requested {self.docker_shm_size_gb} GB, but only got {docker_memory} GB"
)
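For example (illustrative numbers): if AzureML were to place this job in a container that reports only 16.0 GB via get_docker_memory_gb, the check above would raise ValueError("Not enough memory available. Requested 20 GB, but only got 16.0 GB").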
3 changes: 3 additions & 0 deletions hi-ml/src/health_ml/deep_learning_config.py
@@ -168,6 +168,9 @@ class WorkflowParams(param.Parameterized):
This class contains all parameters that affect how the whole training and testing workflow is executed.
"""

+docker_shm_size: str = param.String(
+    "400g", doc="The Docker shared memory size that is required to run this model in AzureML."
+)
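Because the parameter now lives on WorkflowParams, every model container can override it per model (the new HelloWorldWithMemoryCheck above does exactly this). A minimal sketch with a hypothetical container:

from health_ml.lightning_container import LightningContainer


class MyContainer(LightningContainer):
    def __init__(self) -> None:
        super().__init__()
        # Request 32 GB of Docker shared memory when this model is submitted to AzureML:
        self.docker_shm_size = "32g"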
random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.")
src_checkpoint: CheckpointParser = param.ClassSelector(
class_=CheckpointParser, default=None, instantiate=False, doc=CheckpointParser.DOC
1 change: 0 additions & 1 deletion hi-ml/src/health_ml/experiment_config.py
@@ -43,7 +43,6 @@ class ExperimentConfig(param.Parameterized):
"over the network). When running outside AzureML, datasets will "
"always be mounted.",
)
-docker_shm_size: str = param.String("400g", doc="The shared memory in the Docker image for the AzureML VMs.")
wait_for_completion: bool = param.Boolean(
default=False,
doc="If True, wait for AML Run to complete before proceeding. If False, submit the run to AML and exit",
34 changes: 12 additions & 22 deletions hi-ml/src/health_ml/runner.py
@@ -50,7 +50,13 @@
from health_ml.training_runner import TrainingRunner # noqa: E402
from health_ml.utils import fixed_paths # noqa: E402
from health_ml.utils.logging import ConsoleAndFileOutput # noqa: E402
-from health_ml.utils.common_utils import check_conda_environment, choose_conda_env_file, is_linux  # noqa: E402
+from health_ml.utils.common_utils import (
+    check_conda_environment,
+    choose_conda_env_file,
+    get_docker_memory_gb,
+    get_memory_gb,
+    initialize_rpdb,
+)  # noqa: E402
from health_ml.utils.config_loader import ModelConfigLoader # noqa: E402
from health_ml.utils import health_ml_package_setup # noqa: E402

@@ -62,24 +68,6 @@
sys.argv[0] = str(runner_path.resolve())


-def initialize_rpdb() -> None:
-    """
-    On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
-    """
-    # rpdb signal trapping does not work on Windows, as there is no SIGTRAP:
-    if not is_linux():
-        return
-    import rpdb
-
-    rpdb_port = 4444
-    rpdb.handle_trap(port=rpdb_port)
-    # For some reason, os.getpid() does not return the ID of what appears to be the currently running process.
-    logging.info(
-        "rpdb is handling traps. To debug: identify the main runner.py process, then as root: "
-        f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}"
-    )


def create_runner_parser() -> argparse.ArgumentParser:
"""
Creates a commandline parser, that understands all necessary arguments for training a model
@@ -191,9 +179,9 @@ def run(self) -> Tuple[LightningContainer, AzureRunInfo]:
# Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable
log_level = logging.INFO if is_local_rank_zero() else logging.ERROR
logging_to_stdout(log_level)
-# When running in Azure, also output logging to a file. This can help in particular when jobs
-# get preempted, but we don't get access to the logs from the previous incarnation of the job
initialize_rpdb()
+get_memory_gb(verbose=True)
+get_docker_memory_gb(verbose=True)
self.parse_and_load_model()
self.validate()
azure_run_info = self.submit_to_azureml_if_needed()
@@ -278,7 +266,7 @@ def submit_to_azureml_if_needed(self) -> AzureRunInfo:
ignored_folders=[],
submit_to_azureml=bool(self.experiment_config.cluster),
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
-docker_shm_size=self.experiment_config.docker_shm_size,
+docker_shm_size=self.lightning_container.docker_shm_size,
hyperdrive_config=hyperdrive_config,
hyperparam_args=hyperparam_args,
display_name=self.lightning_container.tag,
@@ -386,6 +374,8 @@ def run_with_logging(project_root: Path) -> Tuple[LightningContainer, AzureRunInfo]:
Start the main entry point for training and testing models from the commandline.
When running in Azure, this method also redirects the stdout stream, so that all console output is visible both on
the console and stored in a file. The filename is timestamped and contains the DDP rank of the current process.
+This can help in particular when jobs get preempted, because we otherwise don't get access to the logs from the
+previous incarnation of the job.

:param project_root: The root folder that contains all of the source code that should be executed.
:return: If submitting to AzureML, returns the model configuration that was used for training,
84 changes: 82 additions & 2 deletions hi-ml/src/health_ml/utils/common_utils.py
@@ -3,12 +3,13 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------

+import logging
import os
from contextlib import contextmanager
from datetime import datetime
from enum import Enum, unique
from pathlib import Path
-from typing import Any, Generator, List, Optional
+from typing import Any, Generator, List, Optional, Tuple

import torch
from torch.nn import Module

MAX_PATH_LENGTH = 260

-# convert string to None if an empty string or whitespace is provided

+logger = logging.getLogger(__name__)


def empty_string_to_none(x: Optional[str]) -> Optional[str]:
+    """convert string to None if an empty string or whitespace is provided"""
    return None if (x is None or len(x.strip()) == 0) else x


@@ -226,3 +229,80 @@ def seed_monai_if_available(seed: int) -> None:
set_determinism(seed=seed)
except ImportError:
pass


def initialize_rpdb() -> None:
"""
On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
"""
# rpdb signal trapping does not work on Windows, as there is no SIGTRAP:
if not is_linux():
return
import rpdb

rpdb_port = 4444
rpdb.handle_trap(port=rpdb_port)
# For some reason, os.getpid() does not return the ID of what appears to be the currently running process.
logger.info(
"rpdb is handling traps. To debug: identify the main runner.py process, then as root: "
f"kill -TRAP <process_id>; nc 127.0.0.1 {rpdb_port}"
)


def _is_running_in_docker() -> bool:
"""Return True if the present process is likely to run inside a Docker container"""
return Path("/.dockerenv").exists()


def get_docker_memory_gb(verbose: bool = False) -> Optional[float]:
"""Get the total amount of memory when running in a Docker container. If the process does not
appear to run in a Docker container, return None.

:param verbose: If True, print the amount of total Docker memory to stdout.
:return: The total amount of Docker memory in GB, or None if the process is not running in Docker.
"""
if is_linux() and _is_running_in_docker():
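# This is the cgroup v1 location; hosts running cgroup v2 expose the limit at /sys/fs/cgroup/memory.max instead.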
mem_limit_path = Path("/sys/fs/cgroup/memory/memory.limit_in_bytes")
try:
mem_limit = int(mem_limit_path.read_text().strip())
except (OSError, ValueError):
logger.warning(f"Unable to read {mem_limit_path} or process the contents.")
return None
byte_per_GB = 1024.0**3
docker_gb = round(mem_limit / byte_per_GB, 3)
if verbose:
print(f"Total Docker memory: {docker_gb} GB")
return docker_gb

if verbose:
print("Unable to determine Docker memory size because the process does not appear to run in Docker.")
return None


def get_memory_gb(verbose: bool = False) -> Optional[Tuple[float, float, float, float]]:
"""Get the CPU memory, available CPU memory, total memory (CPU plus swap), available memory in GB.
This relies on the Linux 'free' command being available. The function returns None if the command is not available.

:param verbose: If True, print the output of the 'free' command to stdout.
:return: Tuple of (CPU memory, CPU memory available, total memory, total memory available), all in GB.
"""
free_commandline = "free -t -m"
try:
free_output = os.popen(free_commandline).readlines()
except OSError:
logger.warning(f"Unable to run '{free_commandline}'")
return None
if verbose:
print(f"Checking available memory. Result of running '{free_commandline}' (available memory in MB):")
for line in free_output:
# Lines still contain a newline at the end, so no need to add that
print(line, end="")
if len(free_output) < 4:
logger.warning(f"Unexpected result when running '{free_commandline}': {free_output}")
return None
cpu_mem, _, cpu_mem_available = map(float, free_output[1].split()[1:4])
total_mem, _, total_mem_available = map(float, free_output[3].split()[1:4])
MB_per_GB = 1024.0
values = (cpu_mem, cpu_mem_available, total_mem, total_mem_available)
result = tuple(map(lambda x: round(x / MB_per_GB, 3), values))
return result # type: ignore
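For reference, the index arithmetic above assumes the standard layout of 'free -t -m' output (illustrative values shown): row 1 is the Mem line, row 3 is the Total line, and columns 1-3 of each hold total/used/free in MB, with the 'free' column used as the available figure.

              total        used        free      shared  buff/cache   available
Mem:          15992        4521        8124         301        3346       10854
Swap:          4095           0        4095
Total:        20087        4521       12219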