From aab94e1bcdd1eb8133ec61dd8708e34a14d59c8e Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Fri, 1 Nov 2024 17:19:06 -0500 Subject: [PATCH] Migrate lf rollover percentage query to CH (#5847) Removed references to the obsolete amz2023 runner prefixes Validation: Ensured the data in both CH and Rockset versions of the query return the same data and the charts look the same --- .../lf_rollover_health/params.json | 3 +- .../lf_rollover_percentage/params.json | 3 + .../lf_rollover_percentage/query.sql | 90 +++++++++++++++++++ torchci/pages/metrics.tsx | 19 ++-- 4 files changed, 106 insertions(+), 9 deletions(-) create mode 100644 torchci/clickhouse_queries/lf_rollover_percentage/params.json create mode 100644 torchci/clickhouse_queries/lf_rollover_percentage/query.sql diff --git a/torchci/clickhouse_queries/lf_rollover_health/params.json b/torchci/clickhouse_queries/lf_rollover_health/params.json index 76dfd33584..79575a60ff 100644 --- a/torchci/clickhouse_queries/lf_rollover_health/params.json +++ b/torchci/clickhouse_queries/lf_rollover_health/params.json @@ -1,4 +1,3 @@ { - "days_ago": "Int64", - "granularity": "String" + "days_ago": "Int64" } \ No newline at end of file diff --git a/torchci/clickhouse_queries/lf_rollover_percentage/params.json b/torchci/clickhouse_queries/lf_rollover_percentage/params.json new file mode 100644 index 0000000000..79575a60ff --- /dev/null +++ b/torchci/clickhouse_queries/lf_rollover_percentage/params.json @@ -0,0 +1,3 @@ +{ + "days_ago": "Int64" +} \ No newline at end of file diff --git a/torchci/clickhouse_queries/lf_rollover_percentage/query.sql b/torchci/clickhouse_queries/lf_rollover_percentage/query.sql new file mode 100644 index 0000000000..e8c46a7238 --- /dev/null +++ b/torchci/clickhouse_queries/lf_rollover_percentage/query.sql @@ -0,0 +1,90 @@ +WITH + normalized_jobs AS ( + SELECT + l AS label, + extract(j.name, '[^,]*') AS job_name, -- Remove shard number and label from job names + j.workflow_name, + 
toStartOfInterval(j.started_at, INTERVAL 1 HOUR) AS bucket + FROM + -- Deliberately not adding FINAL to this workflow_job. + -- Risks of not using it: + -- - You may get duplicate records for rows that were updated corresponding to their + -- before/after states, but as long as there’s some mechanism in the query to account + -- for that it’s okay (we check for j.status = 'completed'). + -- - In the worst case scenario, you may only see the ‘old’ version of the records for some rows + -- Costs of using it: + -- - Query processing time increases from ~5 -> 16 seconds + -- - Memory usage grows from 7.5 GB -> 32 GB + -- So skipping FINAL is worth the tradeoff for this query. + workflow_job AS j + ARRAY JOIN j.labels as l + WHERE + j.created_at > now() - INTERVAL {days_ago: Int64} DAY + AND j.status = 'completed' + AND l != 'self-hosted' + AND l NOT LIKE 'lf.c.%' + AND l NOT LIKE '%canary%' + ), + lf_jobs AS ( + SELECT + DISTINCT j.job_name + FROM + normalized_jobs AS j + WHERE + j.label LIKE 'lf%' + ), + -- filter jobs down to the ones that ran in both + -- LF and Meta fleets + comparable_jobs AS ( + SELECT + j.bucket, + j.label, + j.job_name, + -- Remove shard number and label from job names + j.workflow_name + FROM + normalized_jobs AS j + INNER JOIN + lf_jobs AS lfj ON j.job_name = lfj.job_name + ), + success_stats AS ( + SELECT + bucket, + count(*) AS group_size, + job_name, + workflow_name, + label, + if(substring(label, 1, 3) = 'lf.', True, False) AS lf_fleet + FROM + comparable_jobs + GROUP BY + bucket, job_name, workflow_name, label + ), + comparison_stats AS ( + SELECT + lf.bucket, + SUM(lf.group_size + m.group_size) AS total_jobs, + SUM(m.group_size) AS compliment_jobs, + SUM(lf.group_size) AS counted_jobs, + m.lf_fleet AS c_fleet, + lf.lf_fleet AS m_fleet, + CAST(SUM(lf.group_size) AS Float32) / SUM(lf.group_size + m.group_size) * 100 AS percentage, + IF(lf.lf_fleet, 'Linux Foundation', 'Meta') AS fleet + FROM + success_stats AS lf + INNER JOIN + success_stats AS m ON 
lf.bucket = m.bucket + WHERE + lf.job_name = m.job_name + AND lf.workflow_name = m.workflow_name + AND ( + (lf.lf_fleet = 1 AND m.lf_fleet = 0) + OR (lf.lf_fleet = 0 AND m.lf_fleet = 1) + ) + AND lf.group_size > 3 + AND m.group_size > 3 + GROUP BY + lf.bucket, lf.lf_fleet, m.lf_fleet + ) +SELECT * FROM comparison_stats +ORDER BY bucket DESC, fleet \ No newline at end of file diff --git a/torchci/pages/metrics.tsx b/torchci/pages/metrics.tsx index 49b2810f5b..440b8b9d6c 100644 --- a/torchci/pages/metrics.tsx +++ b/torchci/pages/metrics.tsx @@ -1230,18 +1230,23 @@ export default function Page() { title={"Percentage of jobs rolled over to Linux Foundation"} queryName={"lf_rollover_percentage"} queryCollection={"metrics"} - queryParams={[ - { - name: "days_ago", - type: "int", - value: timeRange, - }, - ]} + queryParams={ + useClickHouse + ? { ...timeParamsClickHouse, days_ago: timeRange } + : [ + { + name: "days_ago", + type: "int", + value: timeRange, + }, + ] + } granularity={"hour"} timeFieldName={"bucket"} yAxisFieldName={"percentage"} groupByFieldName={"fleet"} yAxisRenderer={(value) => value.toFixed(2).toString() + "%"} + useClickHouse={useClickHouse} />