Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SQLFluff linting of existing ClickHouse queries, pt. 2 #6311

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 0 additions & 52 deletions .lintrunner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -366,58 +366,6 @@ exclude_patterns = [
# and want to exclude on purpose, add it above this comment instead of below
# so we can track which ones are explicitly excluded and which are excluded
# due to history
"torchci/clickhouse_queries/job_duration_avg/query.sql",
"torchci/clickhouse_queries/cost_job_per_runner_type/query.sql",
"torchci/clickhouse_queries/num_commits_master/query.sql",
"torchci/clickhouse_queries/torchbench_list_userbenchmarks/query.sql",
"torchci/clickhouse_queries/lf_rollover_percentage/query.sql",
"torchci/clickhouse_queries/flaky_tests_across_jobs/query.sql",
"torchci/clickhouse_queries/tts_duration_historical/query.sql",
"torchci/clickhouse_queries/test_time_per_file/query.sql",
"torchci/clickhouse_queries/testStatsSearch/query.sql",
"torchci/clickhouse_queries/cost_job_per_gpu/query.sql",
"torchci/clickhouse_queries/correlation_matrix/query.sql",
"torchci/clickhouse_queries/disabled_test_total/query.sql",
"torchci/clickhouse_queries/query_execution_metrics/query.sql",
"torchci/clickhouse_queries/compilers_benchmark_performance/query.sql",
"torchci/clickhouse_queries/ttrs_percentiles/query.sql",
"torchci/clickhouse_queries/flaky_workflows_jobs/query.sql",
"torchci/clickhouse_queries/strict_lag_sec/query.sql",
"torchci/clickhouse_queries/master_commit_red_percent/query.sql",
"torchci/clickhouse_queries/cost_job_per_repo/query.sql",
"torchci/clickhouse_queries/queue_times_historical/query.sql",
"torchci/clickhouse_queries/test_time_per_class_periodic_jobs/query.sql",
"torchci/clickhouse_queries/runner_utilization_by_repo/query.sql",
"torchci/clickhouse_queries/time_to_signal/query.sql",
"torchci/clickhouse_queries/validation_jobs_red_past_day/query.sql",
"torchci/clickhouse_queries/duration_job_per_runner_type/query.sql",
"torchci/clickhouse_queries/workflow_duration_avg/query.sql",
"torchci/clickhouse_queries/get_workflow_jobs/query.sql",
"torchci/clickhouse_queries/duration_job_per_platform/query.sql",
"torchci/clickhouse_queries/queued_jobs/query.sql",
"torchci/clickhouse_queries/issue_query/query.sql",
"torchci/clickhouse_queries/testStats3d/query.sql",
"torchci/clickhouse_queries/failed_workflow_jobs/query.sql",
"torchci/clickhouse_queries/disabled_test_labels/query.sql",
"torchci/clickhouse_queries/cost_job_per_platform/query.sql",
"torchci/clickhouse_queries/query_execution_metrics_individual/query.sql",
"torchci/clickhouse_queries/runner_utilization_by_activity/query.sql",
"torchci/clickhouse_queries/commit_jobs_batch_query/query.sql",
"torchci/clickhouse_queries/commit_failed_jobs/query.sql",
"torchci/clickhouse_queries/pr_merge_commits/query.sql",
"torchci/clickhouse_queries/hud_query/query.sql",
"torchci/clickhouse_queries/test_time_per_file_periodic_jobs/query.sql",
"torchci/clickhouse_queries/lf_rollover_health/query.sql",
"torchci/clickhouse_queries/runner_utilization/query.sql",
"torchci/clickhouse_queries/test_time_per_class/query.sql",
"torchci/clickhouse_queries/cost_job_per_provider/query.sql",
"torchci/clickhouse_queries/disabled_test_historical/query.sql",
"torchci/clickhouse_queries/duration_job_per_provider/query.sql",
"torchci/clickhouse_queries/nightly_jobs_red_by_name/query.sql",
"torchci/clickhouse_queries/cost_job_per_owning_account/query.sql",
"torchci/clickhouse_queries/filter_forced_merge_pr/query.sql",
"torchci/clickhouse_queries/nightly_jobs_red_past_day/query.sql",
"torchci/clickhouse_queries/duration_job_per_repo/query.sql",
"torchci/clickhouse_queries/torchao_query_branches/query.sql",
"torchci/clickhouse_queries/nightly_jobs_red/query.sql",
"torchci/clickhouse_queries/compilers_benchmark_performance_branches/query.sql",
Expand Down
5 changes: 2 additions & 3 deletions torchci/clickhouse_queries/disabled_test_total/query.sql
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
SELECT
COUNT(issues.title) as number_of_open_disabled_tests
SELECT COUNT(issues.title) AS number_of_open_disabled_tests
FROM
default.issues final
default.issues FINAL
WHERE
issues.title LIKE '%DISABLED%'
AND issues.state = {state: String}
14 changes: 7 additions & 7 deletions torchci/clickhouse_queries/get_workflow_jobs/query.sql
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
-- A simple query to get a job by name
SELECT DISTINCT
job.id,
job.name
job.id,
job.name
FROM
default.workflow_job job FINAL
INNER JOIN workflow_run workflow FINAL on workflow.id = job.run_id
default.workflow_job job FINAL
INNER JOIN workflow_run workflow FINAL ON workflow.id = job.run_id
WHERE
workflow.id = { workflowId: Int64 }
AND job.name LIKE { jobName: String }
workflow.id = { workflowId: Int64 }
AND job.name LIKE { jobName: String }
ORDER BY
job.name
job.name
26 changes: 15 additions & 11 deletions torchci/clickhouse_queries/job_duration_avg/query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,29 @@ SELECT
job.started_at,
job.completed_at
)
) as duration_sec,
COUNT(*) as count,
CONCAT(workflow.name, ' / ', job.name) as name
) AS duration_sec,
COUNT(*) AS count,
CONCAT(workflow.name, ' / ', job.name) AS name
FROM
default.workflow_job job final
JOIN default.workflow_run workflow final on workflow.id = job.run_id
default.workflow_job job FINAL
JOIN default.workflow_run workflow FINAL ON workflow.id = job.run_id
WHERE
job.name != 'ciflow_should_run'
AND job.name != 'generate-test-matrix'
AND workflow.repository.'full_name' = 'pytorch/pytorch'
AND job.created_at >= {startTime: DateTime64(3)}
AND job.created_at < {stopTime: DateTime64(3)}
AND job.id in (
select id from materialized_views.workflow_job_by_created_at
WHERE created_at >= {startTime: DateTime64(3)} and created_at < {stopTime: DateTime64(3)}
AND job.id IN (
SELECT id FROM materialized_views.workflow_job_by_created_at
WHERE
created_at >= {startTime: DateTime64(3)}
AND created_at < {stopTime: DateTime64(3)}
)
AND workflow.id in (
select id from materialized_views.workflow_run_by_created_at
WHERE created_at >= {startTime: DateTime64(3)} and created_at < {stopTime: DateTime64(3)}
AND workflow.id IN (
SELECT id FROM materialized_views.workflow_run_by_created_at
WHERE
created_at >= {startTime: DateTime64(3)}
AND created_at < {stopTime: DateTime64(3)}
)
AND workflow.head_branch LIKE {branch: String}
AND workflow.run_attempt = 1
Expand Down
38 changes: 19 additions & 19 deletions torchci/clickhouse_queries/nightly_jobs_red_by_name/query.sql
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
SELECT
COUNT(*) AS COUNT,
workflow.name as name
FROM
workflow_job job
JOIN workflow_run workflow on workflow.id = job.run_id
JOIN push ON workflow.head_commit.'id' = push.head_commit.'id'
WHERE
job.name NOT LIKE '%generate-matrix%'
AND job.name NOT LIKE '%unittests%'
AND workflow.name NOT IN ('cron', 'Bandit', 'tests', 'Lint')
AND push.ref = 'refs/heads/nightly'
AND push.repository.'owner'.'name' = 'pytorch'
AND push.repository.'name' IN ('pytorch', 'vision', 'audio')
AND job.created_at >= {startTime: DateTime64(3)}
AND job.created_at < {stopTime: DateTime64(3)}
AND job.conclusion IN ('failure', 'timed_out', 'cancelled')
GROUP BY
workflow.name
SELECT
COUNT(*) AS COUNT,
WORKFLOW.NAME AS NAME
FROM
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@clee2000 is this the correct behavior? I'm not very familiar with ClickHouse, so I'm not sure whether the upper-cased identifiers are still valid syntax.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

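No, I don't think any of the capitalization changes it made here should be applied. ClickHouse identifiers (table, column, and alias names) are case-sensitive, so upper-casing them changes which objects the query refers to.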

WORKFLOW_JOB JOB
JOIN WORKFLOW_RUN WORKFLOW ON WORKFLOW.ID = JOB.RUN_ID
JOIN PUSH ON WORKFLOW.HEAD_COMMIT.'id' = PUSH.HEAD_COMMIT.'id'
WHERE
JOB.NAME NOT LIKE '%generate-matrix%'
AND JOB.NAME NOT LIKE '%unittests%'
AND WORKFLOW.NAME NOT IN ('cron', 'Bandit', 'tests', 'Lint')
AND PUSH.REF = 'refs/heads/nightly'
AND PUSH.REPOSITORY.'owner'.'name' = 'pytorch'
AND PUSH.REPOSITORY.'name' IN ('pytorch', 'vision', 'audio')
AND JOB.CREATED_AT >= {startTime: DateTime64(3)}
AND JOB.CREATED_AT < {stopTime: DateTime64(3)}
AND JOB.CONCLUSION IN ('failure', 'timed_out', 'cancelled')
GROUP BY
WORKFLOW.NAME
28 changes: 14 additions & 14 deletions torchci/clickhouse_queries/nightly_jobs_red_past_day/query.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
SELECT
COUNT(*) AS COUNT,
job.name as name
COUNT(*) AS COUNT,
JOB.NAME AS NAME
FROM
workflow_job job
JOIN workflow_run workflow ON workflow.id = job.run_id
join push on push.head_commit.'id' = workflow.head_commit.'id'
WORKFLOW_JOB JOB
JOIN WORKFLOW_RUN WORKFLOW ON WORKFLOW.ID = JOB.RUN_ID
JOIN PUSH ON PUSH.HEAD_COMMIT.'id' = WORKFLOW.HEAD_COMMIT.'id'
WHERE
job.name NOT LIKE '%generate-matrix%'
AND job.name NOT LIKE '%unittests%'
AND workflow.name NOT IN ('cron', 'Bandit', 'tests')
AND push.ref = 'refs/heads/nightly'
AND push.repository.'owner'.'name' = 'pytorch'
AND push.repository.'name' = {repo: String }
AND job.conclusion IN ('failure', 'timed_out', 'cancelled')
AND job.completed_at >= today() - 1
GROUP BY job.name
JOB.NAME NOT LIKE '%generate-matrix%'
AND JOB.NAME NOT LIKE '%unittests%'
AND WORKFLOW.NAME NOT IN ('cron', 'Bandit', 'tests')
AND PUSH.REF = 'refs/heads/nightly'
AND PUSH.REPOSITORY.'owner'.'name' = 'pytorch'
AND PUSH.REPOSITORY.'name' = {repo: String }
AND JOB.CONCLUSION IN ('failure', 'timed_out', 'cancelled')
AND JOB.COMPLETED_AT >= today() - 1
GROUP BY JOB.NAME
ORDER BY COUNT;
9 changes: 4 additions & 5 deletions torchci/clickhouse_queries/num_commits_master/query.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
select
SUM(LENGTH(p.commits)) as num
from
SELECT SUM(LENGTH(p.commits)) AS num
FROM
push p
where
WHERE
p.repository.full_name = 'pytorch/pytorch'
and p.ref = 'refs/heads/main'
AND p.ref = 'refs/heads/main'
AND p.head_commit.'timestamp' >= {startTime: DateTime64(3)}
AND p.head_commit.'timestamp' < {stopTime: DateTime64(3)}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ SELECT
q.time
) AS granularity_bucket,
/* misnomer, this is the max queue time, not the avg queue time */
AVG(q.avg_queue_s) as avg_queue_s,
AVG(q.avg_queue_s) AS avg_queue_s,
q.machine_type
FROM
default.queue_times_historical q
Expand Down
80 changes: 42 additions & 38 deletions torchci/clickhouse_queries/queued_jobs/query.sql
Original file line number Diff line number Diff line change
@@ -1,43 +1,47 @@
--- This query is used by HUD metrics page to get the list of queued jobs
with possible_queued_jobs as (
select id, run_id
from default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
where status = 'queued'
AND created_at < (CURRENT_TIMESTAMP() - INTERVAL 5 MINUTE)
AND created_at > (CURRENT_TIMESTAMP() - INTERVAL 1 WEEK)
select
id,
run_id
from default.workflow_job -- FINAL not needed since we just use this to filter a table that has already been FINALed
where
status = 'queued'
and created_at < (CURRENT_TIMESTAMP() - interval 5 minute)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@clee2000 do you know whether we want to keep this behavior, where a file that starts with lowercase SQL keywords gets all of its other keywords converted to lowercase as well?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO it would be better to always stick to either upper case or lower case

and created_at > (CURRENT_TIMESTAMP() - interval 1 week)
)
SELECT
DATE_DIFF(
'second',
job.created_at,
CURRENT_TIMESTAMP()
) AS queue_s,
CONCAT(workflow.name, ' / ', job.name) AS name,
job.html_url,
IF(
LENGTH(job.labels) = 0,
'N/A',

select
DATE_DIFF(
'second',
job.created_at,
CURRENT_TIMESTAMP()
) as queue_s,
CONCAT(workflow.name, ' / ', job.name) as name,
job.html_url,
IF(
LENGTH(job.labels) > 1,
job.labels[2],
job.labels[1]
)
) AS machine_type
FROM
default.workflow_job job final
JOIN default.workflow_run workflow final ON workflow.id = job.run_id
WHERE
job.id in (select id from possible_queued_jobs)
and workflow.id in (select run_id from possible_queued_jobs)
and workflow.repository.'full_name' = 'pytorch/pytorch'
AND job.status = 'queued'
/* These two conditions are workarounds for GitHub's broken API. Sometimes */
/* jobs get stuck in a permanently "queued" state but definitely ran. We can */
/* detect this by looking at whether any steps executed (if there were, */
/* obviously the job started running), and whether the workflow was marked as */
/* complete (somehow more reliable than the job-level API) */
AND LENGTH(job.steps) = 0
AND workflow.status != 'completed'
ORDER BY
queue_s DESC
LENGTH(job.labels) = 0,
'N/A',
IF(
LENGTH(job.labels) > 1,
job.labels[2],
job.labels[1]
)
) as machine_type
from
default.workflow_job job final
join default.workflow_run workflow final on workflow.id = job.run_id
where
job.id in (select id from possible_queued_jobs)
and workflow.id in (select run_id from possible_queued_jobs)
and workflow.repository.'full_name' = 'pytorch/pytorch'
and job.status = 'queued'
/* These two conditions are workarounds for GitHub's broken API. Sometimes */
/* jobs get stuck in a permanently "queued" state but definitely ran. We can */
/* detect this by looking at whether any steps executed (if there were, */
/* obviously the job started running), and whether the workflow was marked as */
/* complete (somehow more reliable than the job-level API) */
and LENGTH(job.steps) = 0
and workflow.status != 'completed'
order by
queue_s desc
settings allow_experimental_analyzer = 1;
20 changes: 10 additions & 10 deletions torchci/clickhouse_queries/strict_lag_sec/query.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
WITH master as (
SELECT
push.head_commit.timestamp as master
WITH master AS (
SELECT push.head_commit.timestamp AS master
FROM
push
WHERE
Expand All @@ -9,12 +8,13 @@ WITH master as (
AND push.repository.name = {repo: String }
AND push.head_commit.id != ''
ORDER BY
push.head_commit.timestamp desc
push.head_commit.timestamp DESC
LIMIT
1
), strict as (
SELECT
push.head_commit.timestamp as strict
),

strict AS (
SELECT push.head_commit.timestamp AS strict
FROM
push
WHERE
Expand All @@ -23,12 +23,12 @@ WITH master as (
AND push.repository.name = {repo: String }
AND push.head_commit.id != ''
ORDER BY
push.head_commit.timestamp desc
push.head_commit.timestamp DESC
LIMIT
1
)
SELECT
DATE_DIFF('second', strict, master) as strict_lag_sec

SELECT DATE_DIFF('second', strict, master) AS strict_lag_sec
FROM
master,
strict
34 changes: 17 additions & 17 deletions torchci/clickhouse_queries/testStatsSearch/query.sql
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
select
t.name,
t.classname,
t.file,
t.invoking_file,
maxMerge(t.last_run) as last_run
t.name,
t.classname,
t.file,
t.invoking_file,
maxMerge(t.last_run) as last_run
from
tests.distinct_names t
tests.distinct_names t
where
t.name like {name: String}
and t.classname like {suite: String}
and t.file like {file: String}
t.name like {name: String}
and t.classname like {suite: String}
and t.file like {file: String}
group by
t.name,
t.classname,
t.file,
t.invoking_file
t.name,
t.classname,
t.file,
t.invoking_file
order by
t.name, t.classname, t.file, t.invoking_file
t.name, t.classname, t.file, t.invoking_file
limit
{per_page: Int}
offset
{offset: Int}
{per_page: Int}
offset
{offset: Int}
Loading
Loading