Skip to content

Commit

Permalink
feat: add helm master level config for tcd startup hooks (#9135)
Browse files Browse the repository at this point in the history
(cherry picked from commit b109108)
  • Loading branch information
hamidzr authored and determined-ci committed Apr 11, 2024
1 parent a6ae2aa commit 990fbfb
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -872,7 +872,7 @@ commands:
name: Get Kubeconfig
- run:
command: |
echo 'export HELM_VALUES="detVersion=<<parameters.det-version>>,maxSlotsPerPod=<<parameters.max-slots-per-pod>>,checkpointStorage.type=gcs,checkpointStorage.bucket=<<parameters.storage-bucket>>"' >> "$BASH_ENV"
echo 'export HELM_VALUES="detVersion=<<parameters.det-version>>,maxSlotsPerPod=<<parameters.max-slots-per-pod>>,checkpointStorage.type=gcs,checkpointStorage.bucket=<<parameters.storage-bucket>>,taskContainerDefaults.startupHook=echo hello from master tcd startup hook"' >> "$BASH_ENV"
name: Prepare helm overrides
- when:
condition: <<parameters.gpus-per-machine>>
Expand Down
2 changes: 2 additions & 0 deletions docs/reference/deploy/helm-config-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@
specified in the :ref:`experiment config <exp-environment-image>` this default is overriden.
Defaults to: ``determinedai/environments:py-3.9-pytorch-1.12-tf-2.11-cpu-0.30.1``.

- ``startupHook``: An optional inline script that will be executed as part of task set up.

- ``gpuImage``: Sets the default Docker image for all GPU tasks. If a Docker image is specified
in the :ref:`experiment config <exp-environment-image>` this default is overriden. Defaults
to: ``determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-0.30.1``.
Expand Down
47 changes: 46 additions & 1 deletion e2e_tests/tests/api_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import functools
import uuid
from typing import Callable, Optional, Sequence, Tuple, TypeVar
from typing import Any, Callable, Dict, Optional, Sequence, Tuple, TypeVar

import pytest

Expand Down Expand Up @@ -164,6 +164,15 @@ def list_ntsc(
F = TypeVar("F", bound=Callable)


@functools.lru_cache(maxsize=1)
def _get_master_config() -> Optional[Dict[str, Any]]:
try:
sess = admin_session()
return bindings.get_GetMasterConfig(sess).config
except (errors.APIException, errors.MasterNotFoundException):
return None


@functools.lru_cache(maxsize=1)
def _get_is_k8s() -> Optional[bool]:
try:
Expand Down Expand Up @@ -324,6 +333,42 @@ def decorator(f: F) -> F:
return decorator


def skipif_unexpected_master_config(
expected: Callable[[dict], bool], reason: str = "unexpected master config"
) -> Callable[[F], F]:
def decorator(f: F) -> F:
mc = _get_master_config()
if mc is None:
return f
if not expected(mc):
return pytest.mark.skipif(True, reason=reason)(f) # type: ignore
return f

return decorator


def skipif_missing_startup_hook(
reason: str = "tcd startup hook is required for this test",
) -> Callable[[F], F]:
"""skip if the backing cluster can be missing TCD startup hooks for some of the workloads"""

def check_startup_hook(mc: dict) -> bool:
assert isinstance(mc, dict)

def has_hook(conf_with_tcd: dict) -> bool:
hook = (conf_with_tcd.get("task_container_defaults") or {}).get("startup_hook")
return isinstance(hook, str) and hook != ""

if has_hook(mc):
return True
pools = mc.get("resource_pools", [])
for p in pools:
print(p)
return len(pools) > 0 and all(has_hook(pool) for pool in pools)

return skipif_unexpected_master_config(check_startup_hook, reason=reason)


@functools.lru_cache(maxsize=1)
def _get_strict_q() -> Optional[bool]:
sess = api.UnauthSession(conf.make_master_url(), cert(), max_retries=0)
Expand Down
24 changes: 18 additions & 6 deletions e2e_tests/tests/cluster/test_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,15 @@ def _test_trial_logs(log_regex: re.Pattern) -> None:


@pytest.mark.e2e_cpu
@api_utils.skipif_missing_startup_hook()
@pytest.mark.timeout(10 * 60)
def test_tcd_startup_hook_trial_combined() -> None:
_test_trial_logs(re.compile("^.*hello from rp tcd startup hook.*$"))


@pytest.mark.e2e_cpu_elastic
@pytest.mark.e2e_gpu # Note, `e2e_gpu and not gpu_required` hits k8s cpu tests.
@api_utils.skipif_missing_startup_hook()
@pytest.mark.timeout(10 * 60)
def test_tcd_startup_hook_trial_master() -> None:
_test_trial_logs(re.compile("^.*hello from master tcd startup hook.*$"))
Expand Down Expand Up @@ -144,15 +147,24 @@ def do_check_logs() -> None:


@pytest.mark.e2e_cpu
def test_tcd_startup_hook_task_combined() -> None:
for ntsc in ["command", "notebook", "shell"]:
_test_task_logs(ntsc, {}, re.compile("^.*hello from rp tcd startup hook.*$"))
@api_utils.skipif_missing_startup_hook()
@pytest.mark.parametrize(
"task_type",
["command", "notebook", "shell", "tensorboard"],
)
def test_tcd_startup_hook_task_combined(task_type: str) -> None:
_test_task_logs(task_type, {}, re.compile("^.*hello from rp tcd startup hook.*$"))


@pytest.mark.e2e_cpu_elastic
def test_tcd_startup_hook_task_master() -> None:
for ntsc in ["command", "notebook", "shell"]:
_test_task_logs(ntsc, {}, re.compile("^.*hello from master tcd startup hook.*$"))
@pytest.mark.e2e_gpu # Note, `e2e_gpu and not gpu_required` hits k8s cpu tests.
@api_utils.skipif_missing_startup_hook()
@pytest.mark.parametrize(
"task_type",
["command", "notebook", "shell", "tensorboard"],
)
def test_tcd_startup_hook_task_master(task_type: str) -> None:
_test_task_logs(task_type, {}, re.compile("^.*hello from master tcd startup hook.*$"))


@pytest.mark.e2e_cpu
Expand Down
3 changes: 3 additions & 0 deletions helm/charts/determined/templates/master-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ stringData:
{{ if .Values.taskContainerDefaults -}}
task_container_defaults:
{{- if .Values.taskContainerDefaults.startupHook }}
startup_hook: {{ .Values.taskContainerDefaults.startupHook | quote}}
{{- end }}
{{- if .Values.taskContainerDefaults.networkMode }}
network_mode: {{ .Values.taskContainerDefaults.networkMode }}
{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions helm/charts/determined/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,9 @@ taskContainerDefaults:
# action:
# type: exclude_node

# Configure an optional inline script that will be executed as part of the task setup process.
# startupHook: echo "hello from master startup hook"

## Configure whether we collect anonymous information about the usage of Determined.
telemetry:
enabled: true
Expand Down

0 comments on commit 990fbfb

Please sign in to comment.