Skip to content

Commit

Permalink
[BE] Put all the flaky test related queries into one subfolder, move …
Browse files Browse the repository at this point in the history
…adhoc queries to saved queries (#6321)

* Puts queries related to flaky tests into a subfolder for better
organization
* Renames some of the queries to be more helpful
* Moves some adhoc queries to named/saved queries
  • Loading branch information
clee2000 authored Feb 24, 2025
1 parent 4fd3541 commit 8c386a1
Show file tree
Hide file tree
Showing 19 changed files with 187 additions and 148 deletions.
2 changes: 0 additions & 2 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,6 @@ exclude_patterns = [
"torchci/clickhouse_queries/duration_job_per_owning_account/query.sql",
"torchci/clickhouse_queries/reverts/query.sql",
"torchci/clickhouse_queries/cost_job_per_job_name/query.sql",
"torchci/clickhouse_queries/disabled_non_flaky_tests/query.sql",
"torchci/clickhouse_queries/queued_jobs_by_label/query.sql",
"torchci/clickhouse_queries/master_commit_red_avg/query.sql",
"torchci/clickhouse_queries/torchbench_userbenchmark_list_commits/query.sql",
Expand All @@ -426,7 +425,6 @@ exclude_patterns = [
"torchci/clickhouse_queries/job_duration_percentile/query.sql",
"torchci/clickhouse_queries/master_commit_red/query.sql",
"torchci/clickhouse_queries/testStatsDistinctCount/query.sql",
"torchci/clickhouse_queries/flaky_tests/query.sql",
"torchci/clickhouse_queries/cost_job_per_workflow_name/query.sql",
"torchci/clickhouse_queries/unique_repos_in_runnercost/query.sql",
"torchci/clickhouse_queries/pr_commits/query.sql",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"params": {
"name": "String",
"classname": "String",
"invoking_file": "String",
"file": "String",
"numHours": "Int32"
},
"tests": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- For one specific test (identified by name, classname, invoking_file, file),
-- find jobs in the last {numHours} hours where the test was flaky across
-- file-level reruns: within a single job it both failed on every attempt of
-- at least one run and passed cleanly on at least one other run.
select
name,
classname as suite,
file,
invoking_file,
job_id,
-- HAVING guarantees at least one passing run in this job, so count one green
1 as numGreen,
-- total number of rerun entries across all records for this test in this job
SUM(LENGTH(rerun)) as numRed,
any(rerun[1].'text') as sampleTraceback
FROM
default.test_run_s3
where
name = {name: String}
and classname = {classname: String}
and invoking_file = {invoking_file: String}
and file = {file: String}
-- ignore records where the test was skipped
and LENGTH(skipped) = 0
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
GROUP BY
name,
suite,
file,
invoking_file,
job_id
HAVING
-- succeeded at least once
MIN(LENGTH(failure) + LENGTH(error)) = 0
-- failed completely at least once
and MAX(LENGTH(failure) + LENGTH(error)) != 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"params": {
"numHours": "Int32"
},
"tests": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
-- Every distinct test that failed or errored at least once within the last
-- {numHours} hours.  Used as the seed list for the per-test file-level rerun
-- check (see flaky_tests/across_file_reruns/check_every_test).
select
DISTINCT name,
file,
invoking_file,
classname
from
default.test_run_s3
where
(
LENGTH(failure) != 0
or LENGTH(error) != 0
)
-- file can be empty for some records; exclude them
and file != ''
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"params": {
"job_ids": "Array(Int64)"
},
"tests": []
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- Look up job and workflow metadata for a list of job ids.  Jobs whose name
-- contains 'rerun_disabled_tests' are excluded (those jobs deliberately rerun
-- disabled tests -- NOTE(review): presumably to keep them out of flakiness
-- stats; confirm against callers).
with jobs as (
select
name,
id,
run_id,
run_attempt,
html_url
-- final: deduplicate rows for the same job id (engine-dependent; see table
-- definition)
from default.workflow_job final
where
id in {job_ids: Array(Int64)}
and name not like '%rerun_disabled_tests%'
)
select
j.name as name,
w.name as workflow_name,
j.id as id,
w.id as workflow_id,
w.head_branch as head_branch,
j.run_attempt as run_attempt,
j.html_url as html_url
from
default.workflow_run w final join jobs j on w.id = j.run_id
where
-- redundant with the join condition, but prunes workflow_run before the join
-- (NOTE(review): looks like a performance aid -- verify)
w.id in (select run_id from jobs)
16 changes: 16 additions & 0 deletions torchci/clickhouse_queries/flaky_tests/ind_info/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"params": {
"limit": "Int32",
"name": "String",
"suite": "String",
"file": "String"
},
"tests": [
{
"limit": 100,
"name": "test_non_contiguous_tensors_nn_ConvTranspose1d_cuda_complex32",
"suite": "TestModuleCUDA",
"file": "test_modules.py"
}
]
}
54 changes: 54 additions & 0 deletions torchci/clickhouse_queries/flaky_tests/ind_info/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
-- File location is misleading, this is actually just any failed test, not
-- necessarily flaky.
-- Returns up to {limit} of the most recent failed runs of one specific test
-- ({name} / {suite} / {file}), joined with the failing job's metadata and its
-- torchci log classification (matched line, line number, regex captures).
WITH failed_test_runs AS (
    SELECT
        t.name AS name,
        t.classname AS classname,
        t.file AS file,
        t.invoking_file AS invoking_file,
        t.job_id
    FROM default.failed_test_runs AS t
    WHERE
        t.name = {name: String}
        AND t.classname = {suite: String}
        AND t.file = {file: String}
),

-- Metadata for the jobs those failed runs belonged to.
failed_jobs AS (
    SELECT
        j.conclusion AS conclusion,
        j.id AS id,
        j.run_id AS run_id,
        j.name AS name,
        j.html_url AS html_url,
        j.started_at AS started_at,
        tupleElement(j.torchci_classification, 'line') AS line,
        tupleElement(j.torchci_classification, 'line_num') AS line_num,
        tupleElement(j.torchci_classification, 'captures') AS captures,
        j.head_sha AS head_sha
    FROM default.workflow_job AS j
    WHERE
        j.id IN (SELECT t.job_id FROM failed_test_runs t)
)

SELECT DISTINCT
    t.name AS name,
    t.classname AS classname,
    t.file AS file,
    t.invoking_file AS invoking_file,
    j.conclusion AS conclusion,
    j.id AS job_id,
    j.name AS job_name,
    j.html_url AS job_url,
    j.started_at AS job_started_at,
    j.line AS line,
    j.line_num AS line_num,
    j.captures AS captures,
    w.head_branch AS head_branch,
    j.head_sha AS head_sha
FROM failed_jobs AS j
INNER JOIN failed_test_runs AS t ON j.id = t.job_id
INNER JOIN default.workflow_run AS w ON w.id = j.run_id
ORDER BY j.started_at DESC
LIMIT
    {limit: Int32}
2 changes: 1 addition & 1 deletion torchci/lib/fetchDisabledNonFlakyTests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { DisabledNonFlakyTestData } from "./types";
export default async function fetchDisabledNonFlakyTests(): Promise<
DisabledNonFlakyTestData[]
> {
return await queryClickhouseSaved("disabled_non_flaky_tests", {
return await queryClickhouseSaved("flaky_tests/disabled_non_flaky_tests", {
max_num_red: 0,
min_num_green: 150,
});
Expand Down
115 changes: 24 additions & 91 deletions torchci/lib/fetchFlakyTests.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { queryClickhouse, queryClickhouseSaved } from "./clickhouse";
import { queryClickhouseSaved } from "./clickhouse";
import { FlakyTestData } from "./types";

export default async function fetchFlakyTests(
Expand All @@ -7,7 +7,7 @@ export default async function fetchFlakyTests(
testSuite: string = "%",
testFile: string = "%"
): Promise<FlakyTestData[]> {
return queryClickhouseSaved("flaky_tests", {
return queryClickhouseSaved("flaky_tests/in_subprocess", {
numHours,
name: testName,
suite: testSuite,
Expand All @@ -25,86 +25,13 @@ export async function fetchFlakyTestsAcrossJobs(): Promise<FlakyTestData[]> {
export async function fetchFlakyTestsAcrossFileReruns(
numHours: string = "3"
): Promise<FlakyTestData[]> {
const failedTestsQuery = `
select
DISTINCT name,
file,
invoking_file,
classname
from
default .test_run_s3
where
(
LENGTH(failure) != 0
or LENGTH(error) != 0
)
and file != ''
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
`;

const checkEveryTestQuery = `
select
name,
classname as suite,
file,
invoking_file,
job_id,
1 as numGreen,
SUM(LENGTH(rerun)) as numRed,
any(rerun[1].'text') as sampleTraceback
FROM
default.test_run_s3
where
name = {name: String}
and classname = {classname: String}
and invoking_file = {invoking_file: String}
and file = {file: String}
and LENGTH(skipped) = 0
and time_inserted > (CURRENT_TIMESTAMP() - interval {numHours: Int64} hour)
GROUP BY
name,
suite,
file,
invoking_file,
job_id
HAVING
-- succeded at least once
MIN(LENGTH(failure) + LENGTH(error)) = 0
-- failed completely at least once
and MAX(LENGTH(failure) + LENGTH(error)) != 0
`;

const workflowJobInfoQuery = `
with jobs as (
select
name,
id,
run_id,
run_attempt,
html_url
from default.workflow_job final
where
id in {job_ids: Array(Int64)}
and name not like '%rerun_disabled_tests%'
)
select
j.name as name,
w.name as workflow_name,
j.id as id,
w.id as workflow_id,
w.head_branch as head_branch,
j.run_attempt as run_attempt,
j.html_url as html_url
from
default.workflow_run w final join jobs j on w.id = j.run_id
where
w.id in (select run_id from jobs)
`;

// Get every distinct failed test on master in the past numHours (usually not a lot)
const failedTestsResults = await queryClickhouse(failedTestsQuery, {
numHours,
});
const failedTestsResults = await queryClickhouseSaved(
"flaky_tests/across_file_reruns/failed_tests",
{
numHours,
}
);

// For every failed test, query the database for jobs that had file level reruns of
// the test in the past numHours. Do this separately because a join on
Expand All @@ -117,13 +44,16 @@ where
rerunTestsUnflattened.push(
await Promise.all(
failedTestsResults.slice(i, i + 25).map(async (e) => {
return await queryClickhouse(checkEveryTestQuery, {
name: e.name,
classname: e.classname,
invoking_file: e.invoking_file,
file: e.file,
numHours,
});
return await queryClickhouseSaved(
"flaky_tests/across_file_reruns/check_every_test",
{
name: e.name,
classname: e.classname,
invoking_file: e.invoking_file,
file: e.file,
numHours,
}
);
})
)
);
Expand All @@ -132,9 +62,12 @@ where

// Query for info about the workflow job. This could be done with the
// previous query but I think this is less resource intense?
const workflowJobInfo = await queryClickhouse(workflowJobInfoQuery, {
job_ids: rerunTests.map((e) => e.job_id),
});
const workflowJobInfo = await queryClickhouseSaved(
"flaky_tests/across_file_reruns/workflow_job_info",
{
job_ids: rerunTests.map((e) => e.job_id),
}
);

const workflowJobMap = new Map(workflowJobInfo.map((e) => [e.id, e]));
const rerunTestsMap: Map<string, FlakyTestData> = rerunTests.reduce(
Expand Down
2 changes: 1 addition & 1 deletion torchci/pages/api/flaky-tests/disable.ts
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ To find relevant log snippets:
}

if (test.numRed === undefined) {
// numRed === undefined indicates that is from the 'flaky_tests_across_jobs' query
// numRed === undefined indicates that it is from the 'flaky_tests/across_jobs' query
numRedGreen = `Over the past ${NUM_HOURS_ACROSS_JOBS} hours, it has flakily failed in ${test.workflowIds.length} workflow(s).`;
examplesURL = `https://hud.pytorch.org/failure/${test.name}`;
debuggingSteps = `**Debugging instructions (after clicking on the recent samples link):**
Expand Down
Loading

0 comments on commit 8c386a1

Please sign in to comment.