[hud][ch] ttrs percentiles (#5616)
This query is also used on the kpis page but I didn't transition that
one over yet
clee2000 authored Sep 4, 2024
1 parent ccfc962 commit 933cc44
Showing 3 changed files with 163 additions and 143 deletions.
2 changes: 1 addition & 1 deletion torchci/clickhouse_queries/ttrs_percentiles/params.json
@@ -4,4 +4,4 @@
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)",
"workflow": "String"
- }
+ }
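These typed entries are ClickHouse query-parameter declarations: the query body consumes them as {name: Type} placeholders. A minimal sketch of the mechanism (illustrative only, assuming the default.workflow_run table and run_started_at column that appear elsewhere in this diff):

-- Not part of this commit; shows how the declared params are referenced.
SELECT count() AS matching_runs
FROM default.workflow_run
WHERE name = {workflow: String}
  AND run_started_at > {startTime: DateTime64(3)}
  AND run_started_at < {stopTime: DateTime64(3)}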
232 changes: 116 additions & 116 deletions torchci/clickhouse_queries/ttrs_percentiles/query.sql
@@ -1,66 +1,53 @@
- -- !!! Query is not converted to CH syntax yet. Delete this line when it gets converted
-- This query is used to compute the TTRS KPI for the pytorch/pytorch repo.
--
-- Results are displayed on HUD in two views:
-- The kpi view, where percentile_to_get should be left at zero in order to get the default percentiles
-- The metrics view, where the percentile_to_get and one_bucket should be set in order to get just the desired percentile
--
-- This query has two special params:
- -- percentile_to_get: When set, it returns only the specified percentile. Otherwise it returns
- -- p25, p50, p75 and p90 percentiles.
+ -- percentile_to_get: Custom percentile to get
-- one_bucket: When set to false, buckets data into weekly percentiles. When true, it treats
-- entire time range as one big bucket and returns percentiles accordingly

WITH
- -- All the percentiles that we want the query to determine
- percentiles_desired AS (
- SELECT
- CONCAT('p', n.percentile) as percentile,
- n.percentile / 100.0 as percentile_num
- FROM UNNEST(ARRAY_CREATE(25, 50, 75, 90) AS percentile) AS n
- UNION ALL
- -- if percentile_to_get is specified, we get and only return that percentile
- SELECT
- CONCAT(
- 'p',
- CAST(
- ROUND(: percentile_to_get * 100) AS STRING
- )
- ),
- : percentile_to_get
- WHERE
- : percentile_to_get > 0
- ),
-- Get all PRs that were merged into master, and get all the SHAs for commits from that PR which CI jobs ran against
-- We need the shas because some jobs (like trunk) don't have a PR they explicitly ran against, but they _were_ run against
-- a commit from a PR
pr_shas AS (
SELECT
- r.pull_requests[1].number AS pr_number,
+ r.pull_requests[1].'number' AS pr_number,
CONCAT(
'https://github.com/pytorch/pytorch/pull/',
- r.pull_requests[1].number
+ r.pull_requests[1].'number'
) AS url,
- j.head_sha AS sha,
+ j.head_sha AS sha
FROM
- commons.workflow_job j
- INNER JOIN commons.workflow_run r ON j.run_id = r.id
+ default.workflow_job j final
+ INNER JOIN default.workflow_run r final ON j.run_id = r.id
WHERE
1 = 1
- AND j._event_time > PARSE_DATETIME_ISO8601(: startTime)
- AND r._event_time > PARSE_DATETIME_ISO8601(: startTime)
- AND j._event_time < PARSE_DATETIME_ISO8601(: stopTime)
- AND r._event_time < PARSE_DATETIME_ISO8601(: stopTime)
+ and j.id in (
+ select id from
+ materialized_views.workflow_job_by_started_at
+ where started_at > {startTime: DateTime64(3)}
+ and started_at < {stopTime: DateTime64(3)}
+ )
+ and r.id in (
+ select id from
+ materialized_views.workflow_run_by_run_started_at
+ where run_started_at > {startTime: DateTime64(3)}
+ and run_started_at < {stopTime: DateTime64(3)}
+ )
AND LENGTH(r.pull_requests) = 1
- AND r.pull_requests[1].head.repo.name = 'pytorch'
+ AND r.pull_requests[1].'head'.'repo'.'name' = 'pytorch'
AND r.name IN ('pull', 'trunk', 'Lint') -- Ensure we don't pull in random PRs we don't care about
AND r.head_branch NOT IN (
'master', 'main', 'nightly', 'viable/strict'
) -- Only measure TTRS against PRs
AND (
- r.pull_requests[1].base.ref = 'master'
- OR r.pull_requests[1].base.ref = 'main'
- OR r.pull_requests[1].base.ref like 'gh/%/base'
+ r.pull_requests[1].'base'.'ref' = 'master'
+ OR r.pull_requests[1].'base'.'ref' = 'main'
+ OR r.pull_requests[1].'base'.'ref' like 'gh/%/base'
)
GROUP BY
pr_number,
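The hunk above captures the core conversion pattern of this commit: Rockset's _event_time predicates become id prefilters through time-keyed materialized views, and final is added so tables that keep multiple row versions deduplicate at read time. A condensed sketch of the same shape (assumed schema, illustrative only):

-- Not part of this commit; sketches the MV-prefilter-plus-FINAL idiom.
SELECT j.id, j.head_sha
FROM default.workflow_job j FINAL
WHERE j.id IN (
    -- narrow by time first, using the materialized view as a cheap index
    SELECT id
    FROM materialized_views.workflow_job_by_started_at
    WHERE started_at > {startTime: DateTime64(3)}
      AND started_at < {stopTime: DateTime64(3)}
)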
@@ -71,58 +58,74 @@ pr_shas AS (
-- Open PRs can be noisy experiments which were never meant to be merged.
merged_pr_shas AS (
SELECT
- DISTINCT s.pr_number,
- s.url,
+ DISTINCT pr.number as pr_number,
+ s.url as url,
s.sha
FROM
- pr_shas s
- INNER JOIN commons.pull_request pr ON s.pr_number = pr.number
+ default.pull_request as pr final array
+ join pr.labels as label
+ join pr_shas s on pr_shas.pr_number = pr.number
WHERE
- pr.closed_at IS NOT NULL -- Ensure the PR was actaully merged
- AND 'Merged' IN (
- SELECT
- name
- FROM
- UNNEST(pr.labels)
- )
+ pr.closed_at != '' -- Ensure the PR was actually merged
+ AND label.name = 'Merged'
),
-- Get all the workflows run against the PR and find the steps & stats we care about
commit_job_durations AS (
SELECT
- s.pr_number,
- j.steps,
- js.name AS step_name,
- js.conclusion AS step_conclusion,
- PARSE_TIMESTAMP_ISO8601(js.completed_at) AS failure_time,
- PARSE_TIMESTAMP_ISO8601(js.started_at) AS start_time,
+ s.pr_number as pr_number,
+ j.steps as steps,
r.name AS workflow_name,
j.name AS job_name,
r.html_url AS workflow_url,
-- for debugging
- s.sha,
+ s.sha as sha,
j.conclusion AS conclusion,
j.conclusion = 'cancelled' AS was_cancelled,
-- For convenience
- j.run_attempt,
- -- the attemp number this job was run on
+ j.run_attempt as run_attempt,
+ -- the attempt number this job was run on
r.run_attempt AS total_attempts,
r.id AS workflow_run_id,
- s.url -- for debugging
+ s.url as url -- for debugging
FROM
- commons.workflow_job j
- INNER JOIN merged_pr_shas s ON j.head_sha = s.sha HINT(join_strategy = lookup)
- CROSS JOIN UNNEST (j.steps) js
- INNER JOIN commons.workflow_run r ON j.run_id = r.id
+ default.workflow_job j final
+ JOIN merged_pr_shas s ON j.head_sha = s.sha
+ JOIN default.workflow_run r final ON j.run_id = r.id
WHERE
- 1 = 1
- AND r.name = :workflow -- Stick to pull workflows to reduce noise. Trendlines are the same within other workflows
+ r.name = {workflow: String} -- Stick to pull workflows to reduce noise. Trendlines are the same within other workflows
+ and j.id in (
+ select id from materialized_views.workflow_job_by_head_sha mv
+ where mv.head_sha in (select sha from merged_pr_shas)
+ )
AND j.conclusion = 'failure' -- we just care about failed jobs
- AND js.conclusion = 'failure'
AND j.run_attempt = 1 -- only look at the first run attempt since reruns will either 1) succeed, so are irrelevant or 2) repro the failure, biasing our data
and j.name NOT LIKE 'lintrunner%'
and j.name NOT LIKE '%unstable%' -- The PR doesn't wait for unstable jobs, so they should be excluded when computing TTRS
- and js.name LIKE 'Test%' -- Only consider test steps
- ),
+ ),
+ commit_job_durations_steps as (
+ SELECT
+ j.pr_number,
+ js.'name' AS step_name,
+ js.'conclusion' AS step_conclusion,
+ js.'completed_at' AS failure_time,
+ js.'started_at' AS start_time,
+ j.workflow_name,
+ j.job_name,
+ j.workflow_url,
+ j.sha,
+ j.conclusion,
+ j.conclusion = 'cancelled' AS was_cancelled,
+ j.run_attempt,
+ j.total_attempts,
+ j.workflow_run_id,
+ j.url -- for debugging
+ FROM
+ commit_job_durations j
+ array JOIN j.steps as js
+ WHERE
+ js.'conclusion' = 'failure'
+ and js.'name' LIKE 'Test%' -- Only consider test steps
+ ),
-- Refine our measurements to only collect the first red signal per workflow
-- Gets the earliest TTRS across each workflow within the same commit
workflow_failure AS (
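The new commit_job_durations_steps CTE above is the ClickHouse replacement for Rockset's CROSS JOIN UNNEST: ARRAY JOIN expands the j.steps array into one row per step, and fields of the expanded tuple are read with the .'field' syntax seen in the diff. A stripped-down sketch (assumed schema, illustrative only):

-- Not part of this commit; shows the ARRAY JOIN idiom in isolation.
SELECT
    j.name AS job_name,
    js.'name' AS step_name,           -- tuple-field access on the array element
    js.'conclusion' AS step_conclusion
FROM default.workflow_job j FINAL
ARRAY JOIN j.steps AS js              -- one output row per element of j.steps
WHERE js.'conclusion' = 'failure'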
@@ -138,12 +141,13 @@ workflow_failure AS (
PARTITION BY d.pr_number, d.sha, d.workflow_run_id
ORDER BY d.failure_time
) as workflow_name,
- DURATION_SECONDS(
- FIRST_VALUE(d.failure_time) OVER(
+ date_diff(
+ 'second',
+ FIRST_VALUE(d.start_time) OVER(
PARTITION BY d.pr_number, d.sha, d.workflow_run_id
ORDER BY d.failure_time
- ) -
- FIRST_VALUE(d.start_time) OVER(
+ ),
+ FIRST_VALUE(d.failure_time) OVER(
PARTITION BY d.pr_number, d.sha, d.workflow_run_id
ORDER BY d.failure_time
)
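The window arithmetic being converted in this hunk: within each (pr_number, sha, workflow_run_id) partition, ordered by failure_time, FIRST_VALUE picks the earliest failing step, and Rockset's DURATION_SECONDS over a subtraction becomes date_diff('second', start, failure). A condensed equivalent using a named window (illustrative only; the real query repeats the OVER clause inline, and the conversion of seconds to ttrs_mins is assumed):

-- Not part of this commit; same first-red-signal computation, compressed.
SELECT
    pr_number,
    date_diff(
        'second',
        FIRST_VALUE(start_time) OVER w,   -- start of the earliest-failing step
        FIRST_VALUE(failure_time) OVER w  -- time of the first red signal
    ) / 60.0 AS ttrs_mins
FROM commit_job_durations_steps
WINDOW w AS (
    PARTITION BY pr_number, sha, workflow_run_id
    ORDER BY failure_time
)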
@@ -159,88 +163,84 @@ workflow_failure AS (
FIRST_VALUE(d.failure_time) OVER(
PARTITION BY d.pr_number, d.sha, d.workflow_run_id
ORDER BY d.failure_time
- ) as failure_time,
+ ) as failure_time
FROM
- commit_job_durations d
+ commit_job_durations_steps d
),
workflow_failure_buckets AS (
SELECT
-- When :one_bucket is set to true, we want the ttrs percentile over all the data
DATE_TRUNC(
'week',
IF(
- : one_bucket,
- CURRENT_DATETIME(),
+ {one_bucket: Bool},
+ now(),
start_time
)
) AS bucket,
*
FROM
workflow_failure
),
-- Within each bucket, figure out what percentile duration and num_commits each PR falls under
percentiles AS (
SELECT
- bucket,
- ttrs_mins,
- workflow_url,
- PERCENT_RANK() OVER(
- PARTITION BY bucket
- ORDER by
- ttrs_mins
- ) AS percentile,
- sha,
- FROM
- workflow_failure_buckets
- ),
- -- Take the full list of percentiles and get just the ones we care about
- ttrs_percentile AS (
- SELECT
- p.bucket,
- pd.percentile,
- MIN(p.ttrs_mins) AS ttrs_mins
+ w.bucket,
+ quantileExact({percentile_to_get: Float32})(w.ttrs_mins) as custom,
+ quantileExact(.25)(w.ttrs_mins) as p25,
+ quantileExact(.5)(w.ttrs_mins) as p50,
+ quantileExact(.75)(w.ttrs_mins) as p75,
+ quantileExact(.9)(w.ttrs_mins) as p90
FROM
- percentiles p CROSS
- JOIN percentiles_desired pd
- WHERE
- 1 = 1
- AND p.percentile >= pd.percentile_num
- AND (
- : percentile_to_get <= 0
- OR pd.percentile_num = : percentile_to_get
- )
- GROUP BY
- p.bucket,
- pd.percentile
+ workflow_failure_buckets w
+ group by w.bucket
),

kpi_results AS (
SELECT
- FORMAT_TIMESTAMP('%Y-%m-%d', d.bucket) AS bucket,
+ formatDateTime(d.bucket, '%Y-%m-%d') AS bucket,
-- rolling average
(
- ROUND(AVG(ttrs_mins) OVER(
- PARTITION BY percentile
+ ROUND(AVG(custom) OVER(
ORDER BY
-- Average over this many + 1 buckets (two weeks)
bucket ROWS 0 PRECEDING
))
- ) AS ttrs_mins,
- d.percentile
+ ) AS custom,
+ (
+ ROUND(AVG(p25) OVER(
+ ORDER BY
+ bucket ROWS 0 PRECEDING
+ ))
+ ) AS p25,
+ (
+ ROUND(AVG(p50) OVER(
+ ORDER BY
+ bucket ROWS 0 PRECEDING
+ ))
+ ) AS p50,
+ (
+ ROUND(AVG(p75) OVER(
+ ORDER BY
+ bucket ROWS 0 PRECEDING
+ ))
+ ) AS p75,
+ (
+ ROUND(AVG(p90) OVER(
+ ORDER BY
+ bucket ROWS 0 PRECEDING
+ ))
+ ) AS p90
FROM
- ttrs_percentile d
+ percentiles d
WHERE
- : one_bucket
+ {one_bucket: Bool}
OR (
d.bucket < CURRENT_TIMESTAMP() - INTERVAL 1 WEEK
) -- discard the latest bucket, which will have noisy, partial data
ORDER BY
bucket ASC,
ttrs_mins
)
SELECT
*
FROM
kpi_results
ORDER BY
- bucket DESC,
- ttrs_mins DESC
+ bucket DESC
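Stepping back, the rewrite collapses the old percentiles_desired/PERCENT_RANK machinery into quantileExact aggregates: one call per desired level in a single GROUP BY pass, with the custom level driven directly by the {percentile_to_get: Float32} parameter. A minimal sketch of the idiom (illustrative only; mirrors the percentiles CTE in the diff):

-- Not part of this commit; quantilesExact (plural) returns all levels at once.
SELECT
    bucket,
    quantileExact(0.5)(ttrs_mins) AS p50,
    quantileExact({percentile_to_get: Float32})(ttrs_mins) AS custom,
    quantilesExact(0.25, 0.5, 0.75, 0.9)(ttrs_mins) AS p_all  -- array of four levels
FROM workflow_failure_buckets
GROUP BY bucket

One quirk to note in kpi_results: the frame "bucket ROWS 0 PRECEDING" averages only the current bucket, so per the adjacent comment, widening it to ROWS N PRECEDING is what would turn the rolling average back into a multi-week window.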