Skip to content

Commit

Permalink
test: collect det task logs as artifacts for ci jobs (#9459)
Browse files Browse the repository at this point in the history
  • Loading branch information
hamidzr committed Jun 12, 2024
1 parent e3d01c1 commit d0d30cf
Show file tree
Hide file tree
Showing 2 changed files with 171 additions and 0 deletions.
48 changes: 48 additions & 0 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,38 @@ commands:
- run: git remote add upstream https://github.com/determined-ai/determined
- run: tools/scripts/retry.sh git fetch upstream

collect-task-logs:
  parameters:
    master_address:
      type: string
      default: "http://localhost:8080"
    store_path:
      type: string
      default: "/tmp/artifacts/logs"
  description: Collect logs from the cluster tasks.
  steps:
    - run:
        name: "Ensure necessary Python packages are available."
        command: |
          # Install each package only when it cannot already be imported.
          # $pkg_names is intentionally unquoted so it splits into words.
          pkg_names="fire determined"
          for pkg_name in $pkg_names; do
            if ! python -c "import $pkg_name" 2>/dev/null; then
              # `python -m pip` installs into the same interpreter that
              # performed the import check and later runs the script.
              python -m pip install "$pkg_name"
            fi
          done
    - run:
        name: "Collect logs and calculate statistics"
        command: |
          # Quote every expansion of the target directory so a store_path
          # containing spaces or glob characters is handled as one argument.
          target_dir="<< parameters.store_path >>"
          mkdir -p "$target_dir"
          python .circleci/scripts/collect_logs.py --username determined --password "${INITIAL_USER_PASSWORD}" --mlde-host "<< parameters.master_address >>" save_all_logs "$target_dir"
          echo "collected logs at $target_dir"
          log_count=$(find "$target_dir" -type f | wc -l)
          total_size=$(du -sh "$target_dir" | cut -f1)
          echo "Number of log files collected: $log_count"
          echo "Total size of collected logs: $total_size"
skip-if-only-dir:
parameters:
dir:
Expand Down Expand Up @@ -585,6 +617,9 @@ commands:
wait-for-master:
type: boolean
default: true
collect-det-job-logs:
type: boolean
default: true
steps:
# Wait for master before splitting tests, since so many splits depend on
# asking master for its configuration in order to apply skipifs.
Expand Down Expand Up @@ -647,6 +682,13 @@ commands:
- upload-test-job:
only_on_branch: main
test_results_path: <<parameters.junit-path>>
- when:
condition: <<parameters.collect-det-job-logs>>
steps:
- collect-task-logs:
master_address: "<<parameters.master-scheme>>://<<parameters.master-host>>:<<parameters.master-port>>"
- store_artifacts:
path: /tmp/artifacts/logs

run-det-deploy-tests:
parameters:
Expand Down Expand Up @@ -3213,6 +3255,9 @@ jobs:
resource-class:
type: string
default: xlarge
collect-det-job-logs:
type: boolean
default: true
machine:
image: <<pipeline.parameters.machine-image>>
resource_class: <<parameters.resource-class>>
Expand Down Expand Up @@ -3266,6 +3311,7 @@ jobs:
managed-devcluster: <<parameters.managed-devcluster>>
extra-pytest-flags: <<parameters.extra-pytest-flags>>
wait-for-master: <<parameters.wait-for-master>>
collect-det-job-logs: <<parameters.collect-det-job-logs>>

- store_test_results:
path: /tmp/test-results/
Expand Down Expand Up @@ -4564,6 +4610,7 @@ workflows:
# Managed devcluster restarts the master over the course of the tests,
# so `compare_stats` cannot get the full logs from `det master logs`.
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false

- test-e2e:
name: test-e2e-multi-k8s
Expand Down Expand Up @@ -4649,6 +4696,7 @@ workflows:
target-stage: agent
wait-for-master: false
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false

- test-e2e:
name: test-e2e-saml
Expand Down
123 changes: 123 additions & 0 deletions .circleci/scripts/collect_logs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python

import warnings
from pathlib import Path
from typing import List, Union

import requests

from determined.common import api
from determined.common.api import bindings as b
from determined.common.api import certs

Task = Union[b.v1Command, b.v1Notebook, b.v1Shell, b.v1Tensorboard]


def obtain_token(
    username: str, password: str, master_address: str, timeout: float = 60.0
) -> str:
    """
    Gets a Determined token without using a Session.

    Args:
        username: user to log in as.
        password: password for that user.
        master_address: base URL of the master, e.g. "http://localhost:8080".
            A trailing slash is tolerated.
        timeout: seconds to wait for the master to respond. Without a timeout
            the request could block forever if the master is unreachable.

    Returns:
        The value of the "token" field in the login response.

    Raises:
        requests.HTTPError: if the master answers with an error status.
        requests.Timeout: if the master does not answer within ``timeout``.
    """
    # Strip a trailing slash so the URL never contains "//api/v1/...".
    base_url = master_address.rstrip("/")
    response = requests.post(
        f"{base_url}/api/v1/auth/login",
        json={"username": username, "password": password},
        # Certificate verification is deliberately disabled here.
        verify=False,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["token"]


class CliBase:
    """
    Developer-only CLI.

    Logs in once at construction time and keeps the resulting token and an
    API session on the instance for subclasses to use.
    """

    def __init__(self, username: str, password: str, mlde_host: str = "http://localhost:8080"):
        self.username, self.password, self.mlde_host = username, password, mlde_host
        # Fetch the token first; the session below is built around it.
        self.token = obtain_token(username, password, master_address=mlde_host)
        self.session = api.Session(
            mlde_host,
            username=username,
            token=self.token,
            # Certificate verification is switched off for this session.
            cert=certs.Cert(noverify=True),
        )


# Keep FutureWarning noise raised from determined modules out of the output.
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="determined.*",
)


class Cli(CliBase):
    """
    Commands for listing experiments and tasks and saving their logs to disk.
    """

    def get_experiments(self, just_active: bool = True) -> List[b.v1Experiment]:
        """
        Return unarchived experiments.

        When ``just_active`` is true only experiments in a non-terminal state
        (active, paused or running) are requested; otherwise no state filter
        is applied.
        """
        non_terminal_states = [
            b.experimentv1State.ACTIVE,
            b.experimentv1State.PAUSED,
            b.experimentv1State.RUNNING,
        ]
        states = non_terminal_states if just_active else None
        # NOTE(review): no explicit limit is passed, so the server's default
        # page size presumably applies -- confirm that large clusters are not
        # silently truncated.
        resp = b.get_GetExperiments(self.session, archived=False, states=states)
        return resp.experiments

    def get_experiments_ids(self, just_active: bool = True) -> List[int]:
        """Return the ids of the experiments selected by ``get_experiments``."""
        return [exp.id for exp in self.get_experiments(just_active)]

    def get_trial_logs(self, trial_id: int):
        """Lazily yield the message of every log entry of the given trial."""
        return (log.message for log in b.get_TrialLogs(self.session, trialId=trial_id))

    def get_single_trial_exp_logs(self, exp_id: int):
        """
        Get logs for a single trial experiment.

        Only the first trial of the experiment is read. An experiment without
        any trials yields no log lines instead of raising ``IndexError``.
        """
        trials = b.get_GetExperimentTrials(self.session, experimentId=exp_id).trials
        if not trials:
            return iter(())
        return self.get_trial_logs(trials[0].id)

    def _log_file(self, output_path: Path, name: str) -> Path:
        """
        Build the path of a log file called ``name`` directly inside ``output_path``.

        ``clean_os_path`` keeps "/" characters, which would make the name point
        into a subdirectory that does not exist, so they are flattened here.
        """
        return output_path / self.clean_os_path(name).replace("/", "_")

    @staticmethod
    def _write_messages(output_file: Path, messages) -> None:
        """Write every message to ``output_file``, replacing any previous content."""
        # Explicit UTF-8 so non-ASCII log lines do not depend on the CI locale.
        with output_file.open("w", encoding="utf-8") as f:
            f.writelines(messages)

    def save_single_trial_experiment_logs(self, output_dir: str, just_active: bool = False):
        """
        Save logs from first trial of each experiment to a given directory.

        Files are named ``<experiment id>-<experiment name>.log`` after
        sanitizing the name the same way ``save_all_logs`` does.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        for exp in self.get_experiments(just_active):
            output_file = self._log_file(output_path, f"{exp.id}-{exp.name}.log")
            print(f"Saving logs {output_file}")
            self._write_messages(output_file, self.get_single_trial_exp_logs(exp.id))

    def get_tasks(self) -> List[Task]:
        """Return all commands, notebooks, shells and tensorboards, in that order."""
        tasks: List[Task] = []
        tasks.extend(b.get_GetCommands(self.session).commands)
        tasks.extend(b.get_GetNotebooks(self.session).notebooks)
        tasks.extend(b.get_GetShells(self.session).shells)
        tasks.extend(b.get_GetTensorboards(self.session).tensorboards)
        return tasks

    def clean_os_path(self, s: str) -> str:
        """
        Replace every character that is not alphanumeric or one of "/-_."
        with an underscore.
        """
        return "".join([c if (c.isalnum() or c in "/-_.") else "_" for c in s])

    def save_all_logs(self, output_dir: str, just_active: bool = False):
        """
        Save all task logs to a given directory.

        One file is written per trial of every selected experiment
        (``exp<id>-trial<id>-<name>.log``) and one per command, notebook,
        shell and tensorboard (``task-<id>-<description>.log``).
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        for exp in self.get_experiments(just_active):
            for trial in b.get_GetExperimentTrials(self.session, experimentId=exp.id).trials:
                output_file = self._log_file(
                    output_path, f"exp{exp.id}-trial{trial.id}-{exp.name}.log"
                )
                print(f"Saving {output_file}")
                self._write_messages(output_file, self.get_trial_logs(trial.id))
        for task in self.get_tasks():
            output_file = self._log_file(output_path, f"task-{task.id}-{task.description}.log")
            print(f"Saving {output_file}")
            self._write_messages(
                output_file,
                (log.message for log in b.get_TaskLogs(self.session, taskId=task.id)),
            )


if __name__ == "__main__":
    # fire is only needed when running as a script, so it is imported here
    # rather than at module import time.
    from fire import Fire

    # Expose the public methods of Cli as command-line subcommands.
    Fire(Cli)

0 comments on commit d0d30cf

Please sign in to comment.