Skip to content

Commit

Permalink
[BE] Put all the flaky test related queries into one subfolder, move …
Browse files Browse the repository at this point in the history
…adhoc queries to saved queries (#6321)

* Puts queries related to flaky tests into a subfolder for better
organization
* Renames some of the queries to be more helpful
* Moves some adhoc queries to named/saved queries
  • Loading branch information
clee2000 authored Feb 24, 2025
1 parent 4fd3541 commit 8c386a1
Show file tree
Hide file tree
Showing 19 changed files with 187 additions and 148 deletions.
2 changes: 0 additions & 2 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,6 @@ exclude_patterns = [
"torchci/clickhouse_queries/duration_job_per_owning_account/query.sql",
"torchci/clickhouse_queries/reverts/query.sql",
"torchci/clickhouse_queries/cost_job_per_job_name/query.sql",
"torchci/clickhouse_queries/disabled_non_flaky_tests/query.sql",
"torchci/clickhouse_queries/queued_jobs_by_label/query.sql",
"torchci/clickhouse_queries/master_commit_red_avg/query.sql",
"torchci/clickhouse_queries/torchbench_userbenchmark_list_commits/query.sql",
Expand All @@ -426,7 +425,6 @@ exclude_patterns = [
"torchci/clickhouse_queries/job_duration_percentile/query.sql",
"torchci/clickhouse_queries/master_commit_red/query.sql",
"torchci/clickhouse_queries/testStatsDistinctCount/query.sql",
"torchci/clickhouse_queries/flaky_tests/query.sql",
"torchci/clickhouse_queries/cost_job_per_workflow_name/query.sql",
"torchci/clickhouse_queries/unique_repos_in_runnercost/query.sql",
"torchci/clickhouse_queries/pr_commits/query.sql",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"params": {
"name": "String",
"classname": "String",
"invoking_file": "String",
"file": "String",
"numHours": "Int32"
},
"tests": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- For one specific test (identified by name, classname, invoking_file, file),
-- find jobs in the last {numHours} hours where the test was flaky across
-- file-level reruns: within a single job it both failed on every attempt of
-- at least one run and passed cleanly on at least one other run.
select
name,
classname as suite,
file,
invoking_file,
job_id,
-- HAVING guarantees at least one passing run in this job, so count one green
1 as numGreen,
-- total number of rerun entries across all records for this test in this job
SUM(LENGTH(rerun)) as numRed,
any(rerun[1].'text') as sampleTraceback
FROM
default.test_run_s3
where
name = {name: String}
and classname = {classname: String}
and invoking_file = {invoking_file: String}
and file = {file: String}
-- ignore records where the test was skipped
and LENGTH(skipped) = 0
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
GROUP BY
name,
suite,
file,
invoking_file,
job_id
HAVING
-- succeeded at least once
MIN(LENGTH(failure) + LENGTH(error)) = 0
-- failed completely at least once
and MAX(LENGTH(failure) + LENGTH(error)) != 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"params": {
"numHours": "Int32"
},
"tests": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Every distinct test that failed or errored at least once within the last
-- {numHours} hours.  Used as the seed list for the per-test file-level rerun
-- check (see flaky_tests/across_file_reruns/check_every_test).
select
DISTINCT name,
file,
invoking_file,
classname
from
default.test_run_s3
where
(
LENGTH(failure) != 0
or LENGTH(error) != 0
)
-- file can be empty for some records; exclude them
and file != ''
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"params": {
"job_ids": "Array(Int64)"
},
"tests": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Look up job and workflow metadata for a list of job ids.  Jobs whose name
-- contains 'rerun_disabled_tests' are excluded (those jobs deliberately rerun
-- disabled tests -- NOTE(review): presumably to keep them out of flakiness
-- stats; confirm against callers).
with jobs as (
select
name,
id,
run_id,
run_attempt,
html_url
-- final: deduplicate rows for the same job id (engine-dependent; see table
-- definition)
from default.workflow_job final
where
id in {job_ids: Array(Int64)}
and name not like '%rerun_disabled_tests%'
)
select
j.name as name,
w.name as workflow_name,
j.id as id,
w.id as workflow_id,
w.head_branch as head_branch,
j.run_attempt as run_attempt,
j.html_url as html_url
from
default.workflow_run w final join jobs j on w.id = j.run_id
where
-- redundant with the join condition, but prunes workflow_run before the join
-- (NOTE(review): looks like a performance aid -- verify)
w.id in (select run_id from jobs)
16 changes: 16 additions & 0 deletions torchci/clickhouse_queries/flaky_tests/ind_info/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"params": {
"limit": "Int32",
"name": "String",
"suite": "String",
"file": "String"
},
"tests": [
{
"limit": 100,
"name": "test_non_contiguous_tensors_nn_ConvTranspose1d_cuda_complex32",
"suite": "TestModuleCUDA",
"file": "test_modules.py"
}
]
}
54 changes: 54 additions & 0 deletions torchci/clickhouse_queries/flaky_tests/ind_info/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
-- File location is misleading, this is actually just any failed test, not
-- necessarily flaky.
-- Returns up to {limit} of the most recent failed runs of one specific test
-- ({name} / {suite} / {file}), joined with the failing job's metadata and its
-- torchci log classification (matched line, line number, regex captures).
WITH failed_test_runs AS (
    SELECT
        t.name AS name,
        t.classname AS classname,
        t.file AS file,
        t.invoking_file AS invoking_file,
        t.job_id
    FROM default.failed_test_runs AS t
    WHERE
        t.name = {name: String}
        AND t.classname = {suite: String}
        AND t.file = {file: String}
),

-- Metadata for the jobs those failed runs belonged to.
failed_jobs AS (
    SELECT
        j.conclusion AS conclusion,
        j.id AS id,
        j.run_id AS run_id,
        j.name AS name,
        j.html_url AS html_url,
        j.started_at AS started_at,
        tupleElement(j.torchci_classification, 'line') AS line,
        tupleElement(j.torchci_classification, 'line_num') AS line_num,
        tupleElement(j.torchci_classification, 'captures') AS captures,
        j.head_sha AS head_sha
    FROM default.workflow_job AS j
    WHERE
        j.id IN (SELECT t.job_id FROM failed_test_runs t)
)

SELECT DISTINCT
    t.name AS name,
    t.classname AS classname,
    t.file AS file,
    t.invoking_file AS invoking_file,
    j.conclusion AS conclusion,
    j.id AS job_id,
    j.name AS job_name,
    j.html_url AS job_url,
    j.started_at AS job_started_at,
    j.line AS line,
    j.line_num AS line_num,
    j.captures AS captures,
    w.head_branch AS head_branch,
    j.head_sha AS head_sha
FROM failed_jobs AS j
INNER JOIN failed_test_runs AS t ON j.id = t.job_id
INNER JOIN default.workflow_run AS w ON w.id = j.run_id
ORDER BY j.started_at DESC
LIMIT
    {limit: Int32}
2 changes: 1 addition & 1 deletion torchci/lib/fetchDisabledNonFlakyTests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { DisabledNonFlakyTestData } from "./types";
export default async function fetchDisabledNonFlakyTests(): Promise<
DisabledNonFlakyTestData[]
> {
return await queryClickhouseSaved("disabled_non_flaky_tests", {
return await queryClickhouseSaved("flaky_tests/disabled_non_flaky_tests", {
max_num_red: 0,
min_num_green: 150,
});
Expand Down
115 changes: 24 additions & 91 deletions torchci/lib/fetchFlakyTests.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { queryClickhouse, queryClickhouseSaved } from "./clickhouse";
import { queryClickhouseSaved } from "./clickhouse";
import { FlakyTestData } from "./types";

export default async function fetchFlakyTests(
Expand All @@ -7,7 +7,7 @@ export default async function fetchFlakyTests(
testSuite: string = "%",
testFile: string = "%"
): Promise<FlakyTestData[]> {
return queryClickhouseSaved("flaky_tests", {
return queryClickhouseSaved("flaky_tests/in_subprocess", {
numHours,
name: testName,
suite: testSuite,
Expand All @@ -25,86 +25,13 @@ export async function fetchFlakyTestsAcrossJobs(): Promise<FlakyTestData[]> {
export async function fetchFlakyTestsAcrossFileReruns(
numHours: string = "3"
): Promise<FlakyTestData[]> {
const failedTestsQuery = `
select
DISTINCT name,
file,
invoking_file,
classname
from
default .test_run_s3
where
(
LENGTH(failure) != 0
or LENGTH(error) != 0
)
and file != ''
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
`;

const checkEveryTestQuery = `
select
name,
classname as suite,
file,
invoking_file,
job_id,
1 as numGreen,
SUM(LENGTH(rerun)) as numRed,
any(rerun[1].'text') as sampleTraceback
FROM
default.test_run_s3
where
name = {name: String}
and classname = {classname: String}
and invoking_file = {invoking_file: String}
and file = {file: String}
and LENGTH(skipped) = 0
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
GROUP BY
name,
suite,
file,
invoking_file,
job_id
HAVING
-- succeded at least once
MIN(LENGTH(failure) + LENGTH(error)) = 0
-- failed completely at least once
and MAX(LENGTH(failure) + LENGTH(error)) != 0
`;

const workflowJobInfoQuery = `
with jobs as (
select
name,
id,
run_id,
run_attempt,
html_url
from default.workflow_job final
where
id in {job_ids: Array(Int64)}
and name not like '%rerun_disabled_tests%'
)
select
j.name as name,
w.name as workflow_name,
j.id as id,
w.id as workflow_id,
w.head_branch as head_branch,
j.run_attempt as run_attempt,
j.html_url as html_url
from
default.workflow_run w final join jobs j on w.id = j.run_id
where
w.id in (select run_id from jobs)
`;

// Get every distinct failed test on master in the past numHours (usually not a lot)
const failedTestsResults = await queryClickhouse(failedTestsQuery, {
numHours,
});
const failedTestsResults = await queryClickhouseSaved(
"flaky_tests/across_file_reruns/failed_tests",
{
numHours,
}
);

// For every failed test, query the database for jobs that had file level reruns of
// the test in the past numHours. Do this separately because a join on
Expand All @@ -117,13 +44,16 @@ where
rerunTestsUnflattened.push(
await Promise.all(
failedTestsResults.slice(i, i + 25).map(async (e) => {
return await queryClickhouse(checkEveryTestQuery, {
name: e.name,
classname: e.classname,
invoking_file: e.invoking_file,
file: e.file,
numHours,
});
return await queryClickhouseSaved(
"flaky_tests/across_file_reruns/check_every_test",
{
name: e.name,
classname: e.classname,
invoking_file: e.invoking_file,
file: e.file,
numHours,
}
);
})
)
);
Expand All @@ -132,9 +62,12 @@ where

// Query for info about the workflow job. This could be done with the
// previous query but I think this is less resource intense?
const workflowJobInfo = await queryClickhouse(workflowJobInfoQuery, {
job_ids: rerunTests.map((e) => e.job_id),
});
const workflowJobInfo = await queryClickhouseSaved(
"flaky_tests/across_file_reruns/workflow_job_info",
{
job_ids: rerunTests.map((e) => e.job_id),
}
);

const workflowJobMap = new Map(workflowJobInfo.map((e) => [e.id, e]));
const rerunTestsMap: Map<string, FlakyTestData> = rerunTests.reduce(
Expand Down
2 changes: 1 addition & 1 deletion torchci/pages/api/flaky-tests/disable.ts
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ To find relevant log snippets:
}

if (test.numRed === undefined) {
// numRed === undefined indicates that is from the 'flaky_tests_across_jobs' query
// numRed === undefined indicates that it is from the 'flaky_tests/across_jobs' query
numRedGreen = `Over the past ${NUM_HOURS_ACROSS_JOBS} hours, it has flakily failed in ${test.workflowIds.length} workflow(s).`;
examplesURL = `https://hud.pytorch.org/failure/${test.name}`;
debuggingSteps = `**Debugging instructions (after clicking on the recent samples link):**
Expand Down
Loading

0 comments on commit 8c386a1

Please sign in to comment.