
Commit

#12840: Add more handling for multiple attempts by restricting the space of `github_job_id`s we're looking at to only the ones in the workflow run attempt in question (#12858)

* #12840: Add support for artifacts from multiple attempts by downloading all of them, so the system doesn't get confused when searching for artifacts that were generated in a previous attempt but weren't carried over.

* #12840: Instead of trying to match every job ID found in the downloaded logs to a test report, only match the ones for the requested jobs. This keeps us from handling more than we need to and avoids the current issue where multiple attempts of a workflow run give multiple github_job_ids the same uuid, so the match fails later and we lose data (see the sketch below).
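
The sketch below is a minimal, hypothetical illustration of that collision (the job IDs and uuid values are invented; this is not the repository's code): keying the log-to-report lookup by uuid lets a later attempt overwrite an earlier job's entry, while keying it by `github_job_id` keeps every requested job addressable.

```python
# Minimal illustration with invented data: two attempts of one workflow run
# reuse the carried-over artifact uuid, but each attempt has its own job ID.
job_logs = {
    111111111: "aaaa-bbbb",  # attempt 1
    222222222: "aaaa-bbbb",  # attempt 2, same uuid carried over
}

# Old direction (uuid -> github_job_id): the repeated uuid overwrites the
# earlier entry, so job 111111111 can no longer be matched to a report.
uuid_to_job_id = {}
for job_id, uuid in job_logs.items():
    uuid_to_job_id[uuid] = job_id
assert uuid_to_job_id == {"aaaa-bbbb": 222222222}  # job 111111111 is lost

# New direction (github_job_id -> uuid): every requested job keeps its own
# entry, and the caller only looks up the job IDs it actually asked for.
job_id_to_uuid = {job_id: uuid for job_id, uuid in job_logs.items()}
assert job_id_to_uuid == {111111111: "aaaa-bbbb", 222222222: "aaaa-bbbb"}
```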
tt-rkim authored Sep 18, 2024
1 parent fd38b29 commit ea8522c
Showing 3 changed files with 65 additions and 16 deletions.
12 changes: 11 additions & 1 deletion infra/data_collection/cicd.py
@@ -41,12 +41,22 @@ def create_cicd_json_for_data_analysis(

    workflow_outputs_dir = get_workflow_outputs_dir()

-    github_job_id_to_test_reports = get_github_job_id_to_test_reports(workflow_outputs_dir, github_pipeline_id)
+    github_job_ids = []
+    for raw_job in raw_jobs:
+        github_job_id = int(raw_job["github_job_id"])
+        github_job_ids.append(github_job_id)

+    github_job_id_to_test_reports = get_github_job_id_to_test_reports(
+        workflow_outputs_dir, github_pipeline_id, github_job_ids
+    )

    jobs = []

    for raw_job in raw_jobs:
        github_job_id = raw_job["github_job_id"]

        logger.info(f"Processing raw GitHub job {github_job_id}")

        test_report_exists = github_job_id in github_job_id_to_test_reports
        if test_report_exists:
            test_report_path = github_job_id_to_test_reports[github_job_id]
13 changes: 9 additions & 4 deletions infra/data_collection/github/download_cicd_logs_and_artifacts.sh
@@ -27,13 +27,18 @@ download_artifacts() {
download_logs_for_all_jobs() {
    local repo=$1
    local workflow_run_id=$2
-    local attempt_number=$3
+    local max_attempts=$3

+    echo "[info] downloading logs for job with id $job_id for all attempts up to $max_attempts"
+    for attempt_number in $(seq 1 $max_attempts); do
+        echo "[Info] Downloading for attempt $attempt_number"

-    gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
-        echo "[Info] Download logs for job with ID $job_id"
-        gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
+            echo "[info] download logs for job with id $job_id, attempt number $attempt_number"
+            gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        done
    done

}

main() {
56 changes: 45 additions & 11 deletions infra/data_collection/github/workflows.py
@@ -5,8 +5,9 @@
import pathlib
from datetime import datetime
from functools import partial
+from typing import List

-from toolz.dicttoolz import keymap
+from loguru import logger

from infra.data_collection import junit_xml_utils, pydantic_models

@@ -52,39 +53,72 @@ def search_for_uuid_in_log_file_(log_file):
    return None


-def get_workflow_run_uuids_to_github_job_ids_(workflow_outputs_dir, workflow_run_id: int):
+def get_github_job_ids_to_workflow_run_uuids_(workflow_outputs_dir, workflow_run_id: int):
+    """
+    We know we can get a proper mapping of github_job_id -> uuid because based on anecdotal testing,
+    it seems that the same GitHub job ID does not repeat for attempted runs, but because artifacts
+    carry over, uuid will repeat. We want to ensure that the set of github_job_ids is fully captured, and
+    it doesn't matter if the full set of uuids is.
+    """
    logs_dir = workflow_outputs_dir / str(workflow_run_id) / "logs"

    log_files = logs_dir.glob("*.log")

-    workflow_run_uuids_to_github_job_ids = {}
+    github_job_ids_to_workflow_run_uuids = {}
    for log_file in log_files:
        artifact_uuid_name = search_for_uuid_in_log_file_(log_file)
        if artifact_uuid_name:
            uuid = artifact_uuid_name.replace("test_reports_", "")
            github_job_id = log_file.name.replace(".log", "")
            assert github_job_id.isnumeric(), f"{github_job_id}"
            github_job_id = int(github_job_id)
-            workflow_run_uuids_to_github_job_ids[uuid] = github_job_id
-    return workflow_run_uuids_to_github_job_ids
+            github_job_ids_to_workflow_run_uuids[github_job_id] = uuid
+    return github_job_ids_to_workflow_run_uuids


-def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int):
+def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int, github_job_ids: List[int]):
    workflow_run_uuids_to_test_report_paths = get_workflow_run_uuids_to_test_reports_paths_(
        workflow_outputs_dir, workflow_run_id
    )

-    workflow_run_uuids_to_github_job_ids = get_workflow_run_uuids_to_github_job_ids_(
+    github_job_ids_to_workflow_run_uuids = get_github_job_ids_to_workflow_run_uuids_(
        workflow_outputs_dir, workflow_run_id
    )

-    get_github_job_id_from_uuid = lambda uuid_: workflow_run_uuids_to_github_job_ids[uuid_]
-    github_job_id_to_test_reports = keymap(get_github_job_id_from_uuid, workflow_run_uuids_to_test_report_paths)
+    github_job_id_to_test_reports = {}
+    for github_job_id in github_job_ids:
+        # It's possible the upload didn't go through, but if the test report
+        # for a uuid exists, then that means the test must have printed a uuid
+        # Unless the log file itself is gone because it's a stale pipeline

+        # It could be a good idea to throw an exception for either continue
+        # step in this loop and have the caller handle it. The key reason to do
+        # that in our case would be to potentially snuff out infrastructure
+        # issues and surface that at the job level.

+        if github_job_id not in github_job_ids_to_workflow_run_uuids:
+            """
+            The potential infra failure is: internal error or logs were never pushed up because the runner died
+            """
+            logger.warning(
+                f"No uuid was found for job {github_job_id}, meaning either the log file is not present or it doesn't have a uuid printed"
+            )
+            continue

+        uuid = github_job_ids_to_workflow_run_uuids[github_job_id]

+        if uuid not in workflow_run_uuids_to_test_report_paths:
+            logger.warning(
+                f"uuid {uuid} for job {github_job_id} does not have a matching test report path, usually meaning the report doesn't exist"
+            )
+            continue

+        github_job_id_to_test_reports[github_job_id] = workflow_run_uuids_to_test_report_paths[uuid]

    return github_job_id_to_test_reports


-def get_pydantic_test_from_pytest_testcase(testcase, default_timestamp=datetime.now()):
+def get_pydantic_test_from_pytest_testcase_(testcase, default_timestamp=datetime.now()):
    skipped = junit_xml_utils.get_pytest_testcase_is_skipped(testcase)
    failed = junit_xml_utils.get_pytest_testcase_is_failed(testcase)
    error = junit_xml_utils.get_pytest_testcase_is_error(testcase)
@@ -163,7 +197,7 @@ def get_tests_from_test_report_path(test_report_path):
        testsuite = report_root[0]
        default_timestamp = datetime.strptime(testsuite.attrib["timestamp"], "%Y-%m-%dT%H:%M:%S.%f")

-        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase, default_timestamp=default_timestamp)
+        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase_, default_timestamp=default_timestamp)

        return list(map(get_pydantic_test, testsuite))
    else:
