feat: Framework splitting (#9318)

* make ngc-pytorch images default * fix a profiler bug * update docs for ngc image changes
determined-ai · May 15, 2024 · a96cafd · a96cafd
1 parent 3b1d0df
commit a96cafd
Show file tree

Hide file tree

Showing 41 changed files with 268 additions and 194 deletions.
diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
@@ -30,13 +30,13 @@ parameters:
     default: ubuntu-2004:2024.01.1
   gpu-machine-image:
     type: string
-    default: ubuntu-2004-cuda-11.2:202103-01
+    default: linux-cuda-12:default
   # DEFAULT_PT_GPU_IMAGE: Pytorch training image reference used by the tests
   # Inject here as a parameter so that it is updated by bumpversion, and can
   # be referenced by --ee testing.
   default-pt-gpu-image:
     type: string
-    default: determinedai/pytorch-tensorflow-cuda-dev:8b3bea3
+    default: determinedai/pytorch-ngc-dev:8b3bea3
   # Some python, go, and react dependencies are cached by circleci via `save_cache`/`restore_cache`.
   # If the dependencies stay the same, but the circleci code that would produce them is changed,
   # it may be necessary to invalidate the cache by incrementing this value.
@@ -251,7 +251,7 @@ commands:
       - when:
           condition: <<parameters.tf2>>
           steps:
-            - run: docker pull determinedai/pytorch-tensorflow-cpu-dev:8b3bea3
+            - run: docker pull determinedai/pytorch-ngc-dev:8b3bea3
 
   login-docker:
     parameters:
@@ -2312,9 +2312,9 @@ jobs:
           paths:
             - test-unit-harness-cpu-pycov
 
-  test-unit-harness-gpu:
+  test-unit-harness-gpu-tf:
     docker:
-      - image: determinedai/pytorch-tensorflow-cuda-dev:8b3bea3
+      - image: determinedai/tensorflow-ngc-dev:8b3bea3
     resource_class: determined-ai/container-runner-gpu
     steps:
       - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
@@ -2326,17 +2326,17 @@ jobs:
       - run: pip install mypy pytest coverage
       - install-codecov
       - setup-paths
-      - run: COVERAGE_FILE=/root/project/test-unit-harness-gpu-pycov make -C harness test-gpu
-      - run: coverage xml -i --data-file=./test-unit-harness-gpu-pycov
+      - run: COVERAGE_FILE=/root/project/test-unit-harness-gpu-tf-pycov make -C harness test-gpu-tf
+      - run: coverage xml -i --data-file=./test-unit-harness-gpu-tf-pycov
       - run: codecov -v -t $CODECOV_TOKEN -F harness
       - persist_to_workspace:
           root: .
           paths:
-            - test-unit-harness-gpu-pycov
+            - test-unit-harness-gpu-tf-pycov
 
   test-unit-harness-pytorch2-gpu:
     docker:
-      - image: determinedai/pytorch-cuda-dev:8b3bea3
+      - image: determinedai/pytorch-ngc-dev:8b3bea3
     resource_class: determined-ai/container-runner-gpu
     steps:
       - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
@@ -2358,7 +2358,7 @@ jobs:
 
   test-unit-harness-pytorch2-cpu:
     docker:
-      - image: determinedai/pytorch-cpu-dev:8b3bea3
+      - image: determinedai/pytorch-ngc-dev:8b3bea3
     steps:
       - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
       - checkout
@@ -2379,7 +2379,7 @@ jobs:
 
   test-unit-harness-gpu-parallel:
     docker:
-      - image: determinedai/pytorch-tensorflow-cuda-dev:8b3bea3
+      - image: determinedai/pytorch-ngc-dev:8b3bea3
     resource_class: determined-ai/container-runner-multi-gpu
     steps:
       - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
@@ -2423,19 +2423,17 @@ jobs:
 
   test-unit-harness-tf2:
     docker:
-      - image: <<pipeline.parameters.docker-image>>
+      - image: determinedai/tensorflow-ngc-dev:f17151a
     steps:
       - checkout
       - add-and-fetch-upstream
       - skip-if-only-docs
       - skip-if-only-github
       - skip-if-only-webui
       - install-codecov
-      - setup-python-venv:
-          install-python: false
-          determined: true
-          extra-requirements-file: "harness/tests/requirements/requirements-harness.txt"
-          executor: <<pipeline.parameters.docker-image>>
+      - run: pip install mypy pytest coverage
+      - install-codecov
+      - setup-paths
       - run: COVERAGE_FILE=$PWD/test-unit-harness-tf2-pycov make -C harness test-tf2
       - run: coverage xml -i --data-file=./test-unit-harness-tf2-pycov
       - run: codecov -v -t $CODECOV_TOKEN -F harness
@@ -3605,7 +3603,7 @@ jobs:
         type: string
         default: "1"
       environment-image:
-        default: determinedai/pytorch-tensorflow-cuda-dev:8b3bea3
+        default: determinedai/pytorch-ngc-dev:8b3bea3
         type: string
       accel-node-taints:
         type: string
@@ -4005,7 +4003,7 @@ workflows:
       - test-unit-harness-pytorch2-cpu
       - test-unit-harness-pytorch2-gpu
 
-      - test-unit-harness-gpu:
+      - test-unit-harness-gpu-tf:
           filters: *any-upstream
       - test-unit-harness-gpu-deepspeed:
           filters: *any-upstream
@@ -4016,8 +4014,8 @@ workflows:
       - request-gpu-unit-tests:
           type: approval
           filters: *any-fork
-      - test-unit-harness-gpu:
-          name: f-test-unit-harness-gpu
+      - test-unit-harness-gpu-tf:
+          name: f-test-unit-harness-gpu-tf
           filters: *any-fork
       - test-unit-harness-gpu-deepspeed:
           name: f-test-unit-harness-gpu-deepspeed
@@ -4033,7 +4031,7 @@ workflows:
       - python-coverage:
           requires:
             - test-unit-harness-cpu
-            - test-unit-harness-gpu
+            - test-unit-harness-gpu-tf
             - test-unit-harness-gpu-parallel
             - test-unit-harness-tf2
             - test-unit-harness-pytorch2-cpu

diff --git a/docs/manage/troubleshooting.rst b/docs/manage/troubleshooting.rst
@@ -4,6 +4,18 @@
  Troubleshooting
 #################
 
+   .. important::
+
+      TensorFlow users must configure their environment image in their :ref:`experiment
+      configuration <experiment-config-reference>` file before submitting an experiment.
+
+      .. code:: yaml
+
+         environment:
+            image:
+               cpu: determinedai/tensorflow-ngc-dev:f17151a
+               gpu: determinedai/tensorflow-ngc-dev:f17151a
+
 ****************
  Error messages
 ****************

diff --git a/docs/model-dev-guide/api-guides/apis-howto/_index.rst b/docs/model-dev-guide/api-guides/apis-howto/_index.rst
@@ -73,16 +73,14 @@ TensorFlow 1 vs 2
 
 Determined supports both TensorFlow 1 and 2. The version of TensorFlow that is used for a particular
 experiment is controlled by the container image that has been configured for that experiment.
-Determined provides prebuilt Docker images that include TensorFlow 2.11, 1.15, and 2.8,
-respectively:
+Determined provides prebuilt Docker images that include TensorFlow 2+, 1.15, and 2.8, respectively:
 
--  ``determinedai/pytorch-tensorflow-cuda-dev:8b3bea3`` (default)
+-  ``determinedai/tensorflow-ngc-dev:8b3bea3``
 -  ``determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.21.2``
 -  ``determinedai/environments:cuda-11.2-tf-2.8-gpu-0.29.1``
 
 We also provide lightweight CPU-only counterparts:
 
--  ``determinedai/pytorch-tensorflow-cpu-dev:8b3bea3``
 -  ``determinedai/environments:py-3.8-tf-2.8-cpu-0.29.1``
 
 To change the container image used for an experiment, specify :ref:`environment.image

diff --git a/docs/model-dev-guide/create-experiment.rst b/docs/model-dev-guide/create-experiment.rst
@@ -197,7 +197,7 @@ a distributed training context, the entry point is:
 
 .. code:: bash
 
-   python3 -m determined.launch.horovod --trial model_def:TrialClass
+   python3 -m determined.launch.torch_distributed --trial model_def:TrialClass
 
 Nested Launchers
 ================
@@ -210,7 +210,7 @@ Example:
 
 .. code:: bash
 
-   dlprof --mode=simple python3 -m determined.launch.autohorovod --trial model_def:MnistTrial
+   dlprof --mode=simple python3 -m determined.launch.torch_distributed --trial model_def:MnistTrial
 
 ************************
  Creating an Experiment

diff --git a/docs/model-dev-guide/prepare-container/custom-env.rst b/docs/model-dev-guide/prepare-container/custom-env.rst
@@ -98,15 +98,31 @@ GPU-specific versions of each library are automatically selected when running on
 Default Images
 ==============
 
-+-------------+-------------------------------------------------------------------------------+
-| Environment | File Name                                                                     |
-+=============+===============================================================================+
-| CPUs        | ``determinedai/pytorch-tensorflow-cpu-dev:8b3bea3``                           |
-+-------------+-------------------------------------------------------------------------------+
-| NVIDIA GPUs | ``determinedai/pytorch-tensorflow-cuda-dev:8b3bea3``                          |
-+-------------+-------------------------------------------------------------------------------+
-| AMD GPUs    | ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4``        |
-+-------------+-------------------------------------------------------------------------------+
+.. list-table::
+   :widths: 25 75
+   :header-rows: 1
+
+   -  -  Environment
+      -  File Name
+   -  -  CPUs
+      -  ``determinedai/pytorch-ngc-dev:8b3bea3``
+   -  -  NVIDIA GPUs
+      -  ``determinedai/pytorch-ngc-dev:8b3bea3``
+   -  -  AMD GPUs
+      -  ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4``
+
+.. _ngc-version:
+
+NGC Version
+===========
+
+By default, a suitable NGC container version is used in our images. Users can select a different
+version of NGC containers to build images from. Versions are listed on the `NVIDIA Frameworks site
+<https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html>`__. Once a suitable
+version is selected, users can rebuild these images by cloning the `MLDE environments repo
+<https://github.com/determined-ai/environments>`__ and modifying either the
+``NGC_PYTORCH_VERSION`` or ``NGC_TENSORFLOW_VERSION`` variable in the Makefile, then running
+``make build-pytorch-ngc`` or ``make build-tensorflow-ngc``, respectively.
 
 .. _custom-docker-images:
 
@@ -132,7 +148,7 @@ Example Dockerfile that installs custom ``conda``-, ``pip``-, and ``apt``-based
 .. code:: bash
 
    # Determined Image
-   FROM determinedai/pytorch-tensorflow-cuda-dev:8b3bea3
+   FROM determinedai/tensorflow-ngc-dev:8b3bea3
 
    # Custom Configuration
    RUN apt-get update && \
@@ -195,7 +211,7 @@ environments using :ref:`custom images <custom-docker-images>`:
 .. code:: bash
 
    # Determined Image
-   FROM determinedai/pytorch-tensorflow-cpu-dev:8b3bea3
+   FROM determinedai/pytorch-ngc-dev:8b3bea3
 
    # Create a virtual environment
    RUN conda create -n myenv python=3.8

diff --git a/docs/model-dev-guide/prepare-container/set-environment-images.rst b/docs/model-dev-guide/prepare-container/set-environment-images.rst
@@ -8,10 +8,10 @@ Determined launches workloads using Docker containers. By default, workloads exe
 Determined-provided container that includes common deep learning libraries and frameworks. The
 default containers can be found on the Determined Docker Hub with tags for each Determined version:
 
--  `Default containers for CPU training
-   <https://hub.docker.com/r/determinedai/environments/tags?page=1&name=cpu>`__
--  `Default containers for GPU training
-   <https://hub.docker.com/r/determinedai/environments/tags?page=1&name=gpu>`__
+-  `Default containers for CPU and GPU training
+   <https://hub.docker.com/r/determinedai/pytorch-ngc>`__
+-  `Containers for TensorFlow CPU and GPU training
+   <https://hub.docker.com/r/determinedai/tensorflow-ngc>`__
 
 By default, Determined will use the tag corresponding to your cluster's version. To specify a
 different image from this default, update your job configuration to include:
@@ -20,8 +20,8 @@ different image from this default, update your job configuration to include:
 
    environment:
      image:
-       cpu: # full CPU image path, e.g., determined/environments/<tag>
-       gpu: # full GPU image path, e.g., determined/environments/<tag>
+       cpu: # full CPU image path, e.g., determinedai/tensorflow-ngc:<tag>
+       gpu: # full GPU image path, e.g., determinedai/tensorflow-ngc:<tag>
 
 If one of the images above contain your required libraries, there is no additional environment
 preparation needed.

diff --git a/docs/reference/deploy/helm-config-reference.rst b/docs/reference/deploy/helm-config-reference.rst
@@ -194,13 +194,13 @@
 
    -  ``cpuImage``: Sets the default Docker image for all non-GPU tasks. If a Docker image is
       specified in the :ref:`experiment config <exp-environment-image>` this default is overriden.
-      Defaults to: ``determinedai/pytorch-tensorflow-cpu-dev:8b3bea3``.
+      Defaults to: ``determinedai/pytorch-ngc-dev:8b3bea3``.
 
    -  ``startupHook``: An optional inline script that will be executed as part of task set up.
 
    -  ``gpuImage``: Sets the default Docker image for all GPU tasks. If a Docker image is specified
       in the :ref:`experiment config <exp-environment-image>` this default is overriden. Defaults
-      to: ``determinedai/pytorch-tensorflow-cuda-dev:8b3bea3``.
+      to: ``determinedai/pytorch-ngc-dev:8b3bea3``.
 
    -  ``logPolicies``: Sets log policies for trials. For details, visit :ref:`log_policies
       <experiment-config-min-validation-period>`.

diff --git a/docs/reference/deploy/master-config-reference.rst b/docs/reference/deploy/master-config-reference.rst
@@ -89,9 +89,12 @@ configure different container images for NVIDIA GPU tasks using the ``cuda`` key
 Determined 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using the ``rocm`` key.
 Default values:
 
--  ``determinedai/pytorch-tensorflow-cuda-dev:8b3bea3`` for NVIDIA GPUs.
+-  ``determinedai/pytorch-ngc-dev:8b3bea3`` for NVIDIA GPUs and for CPUs.
 -  ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm.
--  ``determinedai/pytorch-tensorflow-cpu-dev:8b3bea3`` for CPUs.
+
+For TensorFlow users, we provide an image that must be referenced in the experiment configuration:
+
+-  ``determinedai/tensorflow-ngc-dev:8b3bea3`` for NVIDIA GPUs and for CPUs.
 
 ``environment_variables``
 =========================

diff --git a/docs/reference/experiment-config-reference.rst b/docs/reference/experiment-config-reference.rst
@@ -1333,10 +1333,13 @@ Optional. The Docker image to use when executing the workload. This image must b
 container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using
 ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values:
 
--  ``determinedai/pytorch-tensorflow-cuda-dev:8b3bea3`` for NVIDIA GPUs.
--  ``determinedai/pytorch-tensorflow-cpu-dev:8b3bea3`` for CPUs.
+-  ``determinedai/pytorch-ngc-dev:8b3bea3`` for NVIDIA GPUs and for CPUs.
 -  ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm.
 
+For TensorFlow users, we provide an image that must be referenced in the experiment configuration:
+
+-  ``determinedai/tensorflow-ngc-dev:8b3bea3`` for NVIDIA GPUs and for CPUs.
+
 When the cluster is configured with :ref:`resource_manager.type: slurm
 <cluster-configuration-slurm>` and ``container_run_type: singularity``, images are executed using
 the Singularity container runtime which provides additional options for specifying the container

diff --git a/docs/reference/job-config-reference.rst b/docs/reference/job-config-reference.rst
@@ -45,9 +45,13 @@ The following configuration settings are supported:
       different container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6),
       CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values:
 
-      -  ``determinedai/pytorch-tensorflow-cuda-dev:8b3bea3`` for NVIDIA GPUs.
+      -  ``determinedai/pytorch-ngc-dev:8b3bea3`` for NVIDIA GPUs and for CPUs.
       -  ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm.
-      -  ``determinedai/pytorch-tensorflow-cpu-dev:8b3bea3`` for CPUs.
+
+      For TensorFlow users, we provide an image that must be referenced in the experiment
+      configuration:
+
+      -  ``determinedai/tensorflow-ngc-dev:8b3bea3`` for NVIDIA GPUs and for CPUs.
 
    -  ``force_pull_image``: Forcibly pull the image from the Docker registry and bypass the Docker
       cache. Defaults to ``false``.

diff --git a/docs/release-notes/framework-splitting.rst b/docs/release-notes/framework-splitting.rst
@@ -0,0 +1,11 @@
+:orphan:
+
+**Breaking Change**
+
+-  Images: The default environment includes images that support PyTorch. TensorFlow users must
+   configure their experiments to target our non-default TensorFlow images. Details on this process
+   can be found at :ref:`set-environment-images`.
+
+-  Images: Our new default images are based on NVIDIA NGC. While we provide a recommended NGC
+   version, users have the flexibility to build their own images using any NGC version that meets
+   their specific requirements. For more information, visit :ref:`ngc-version`.
diff --git a/docs/setup-cluster/deploy-cluster/slurm/singularity.rst b/docs/setup-cluster/deploy-cluster/slurm/singularity.rst
@@ -23,15 +23,18 @@ container runtime in use.
 Each version of Determined utilizes specifically-tagged Docker containers. The image tags referenced
 by default in this version of Determined are described below.
 
-+-------------+--------------------------------------------------------------------------+
-| Environment | File Name                                                                |
-+=============+==========================================================================+
-| CPUs        | ``determinedai/pytorch-tensorflow-cpu-dev:8b3bea3``                      |
-+-------------+--------------------------------------------------------------------------+
-| NVIDIA GPUs | ``determinedai/pytorch-tensorflow-cuda-dev:8b3bea3``                     |
-+-------------+--------------------------------------------------------------------------+
-| AMD GPUs    | ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512``  |
-+-------------+--------------------------------------------------------------------------+
+.. list-table::
+   :widths: 25 75
+   :header-rows: 1
+
+   -  -  Environment
+      -  File Name
+   -  -  CPUs
+      -  ``determinedai/pytorch-ngc-dev:8b3bea3``
+   -  -  NVIDIA GPUs
+      -  ``determinedai/pytorch-ngc-dev:8b3bea3``
+   -  -  AMD GPUs
+      -  ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512``
 
 See :ref:`set-environment-images` for the images Docker Hub location, and add each tagged image
 needed by your experiments to the image cache.

diff --git a/docs/setup-cluster/gcp/install-gcp.rst b/docs/setup-cluster/gcp/install-gcp.rst
@@ -406,5 +406,5 @@ This command line will spin up a cluster of up to 2 A100s in the ``us-central1-c
       --compute-agent-instance-type a2-highgpu-1g --gpu-num 1 \
       --gpu-type nvidia-tesla-a100 \
       --region us-central1 --zone us-central1-c \
-      --gpu-env-image determinedai/pytorch-tensorflow-cuda-dev:8b3bea3 \
-      --cpu-env-image determinedai/pytorch-tensorflow-cpu-dev:8b3bea3
+      --gpu-env-image determinedai/pytorch-ngc-dev:8b3bea3 \
+      --cpu-env-image determinedai/pytorch-ngc-dev:8b3bea3