diff --git a/infra/data_collection/cicd.py b/infra/data_collection/cicd.py
index 58113d7aae9..64ef9f4234a 100644
--- a/infra/data_collection/cicd.py
+++ b/infra/data_collection/cicd.py
@@ -41,12 +41,22 @@ def create_cicd_json_for_data_analysis(
 
     workflow_outputs_dir = get_workflow_outputs_dir()
 
-    github_job_id_to_test_reports = get_github_job_id_to_test_reports(workflow_outputs_dir, github_pipeline_id)
+    github_job_ids = []
+    for raw_job in raw_jobs:
+        github_job_id = int(raw_job["github_job_id"])
+        github_job_ids.append(github_job_id)
+
+    github_job_id_to_test_reports = get_github_job_id_to_test_reports(
+        workflow_outputs_dir, github_pipeline_id, github_job_ids
+    )
 
     jobs = []
 
     for raw_job in raw_jobs:
         github_job_id = raw_job["github_job_id"]
+
+        logger.info(f"Processing raw GitHub job {github_job_id}")
+
         test_report_exists = github_job_id in github_job_id_to_test_reports
         if test_report_exists:
             test_report_path = github_job_id_to_test_reports[github_job_id]
diff --git a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh
index a30efe16375..be553bd421b 100755
--- a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh
+++ b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh
@@ -27,13 +27,18 @@ download_artifacts() {
 
 download_logs_for_all_jobs() {
     local repo=$1
    local workflow_run_id=$2
-    local attempt_number=$3
+    local max_attempts=$3
+    echo "[info] downloading logs for all jobs in run $workflow_run_id for all attempts up to $max_attempts"
+    for attempt_number in $(seq 1 $max_attempts); do
+        echo "[Info] Downloading for attempt $attempt_number"
 
-    gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
-        echo "[Info] Download logs for job with ID $job_id"
-        gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
+            echo "[info] download logs for job with id $job_id, attempt number $attempt_number"
+            gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        done
     done
+
 }
 
 main() {
diff --git a/infra/data_collection/github/workflows.py b/infra/data_collection/github/workflows.py
index 094491f35e1..da6da23ad65 100644
--- a/infra/data_collection/github/workflows.py
+++ b/infra/data_collection/github/workflows.py
@@ -5,8 +5,9 @@
 import pathlib
 from datetime import datetime
 from functools import partial
+from typing import List
 
-from toolz.dicttoolz import keymap
+from loguru import logger
 
 from infra.data_collection import junit_xml_utils, pydantic_models
 
@@ -52,12 +53,18 @@ def search_for_uuid_in_log_file_(log_file):
 
     return None
 
 
-def get_workflow_run_uuids_to_github_job_ids_(workflow_outputs_dir, workflow_run_id: int):
+def get_github_job_ids_to_workflow_run_uuids_(workflow_outputs_dir, workflow_run_id: int):
+    """
+    We know we can get a proper mapping of github_job_id -> uuid because, based on anecdotal testing,
+    the same GitHub job ID does not seem to repeat across attempted runs, whereas uuids will repeat
+    because artifacts carry over. We want to ensure that the set of github_job_ids is fully captured;
+    it doesn't matter whether the full set of uuids is.
+ """ logs_dir = workflow_outputs_dir / str(workflow_run_id) / "logs" log_files = logs_dir.glob("*.log") - workflow_run_uuids_to_github_job_ids = {} + github_job_ids_to_workflow_run_uuids = {} for log_file in log_files: artifact_uuid_name = search_for_uuid_in_log_file_(log_file) if artifact_uuid_name: @@ -65,26 +72,53 @@ def get_workflow_run_uuids_to_github_job_ids_(workflow_outputs_dir, workflow_run github_job_id = log_file.name.replace(".log", "") assert github_job_id.isnumeric(), f"{github_job_id}" github_job_id = int(github_job_id) - workflow_run_uuids_to_github_job_ids[uuid] = github_job_id - return workflow_run_uuids_to_github_job_ids + github_job_ids_to_workflow_run_uuids[github_job_id] = uuid + return github_job_ids_to_workflow_run_uuids -def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int): +def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int, github_job_ids: List[int]): workflow_run_uuids_to_test_report_paths = get_workflow_run_uuids_to_test_reports_paths_( workflow_outputs_dir, workflow_run_id ) - workflow_run_uuids_to_github_job_ids = get_workflow_run_uuids_to_github_job_ids_( + github_job_ids_to_workflow_run_uuids = get_github_job_ids_to_workflow_run_uuids_( workflow_outputs_dir, workflow_run_id ) - get_github_job_id_from_uuid = lambda uuid_: workflow_run_uuids_to_github_job_ids[uuid_] - github_job_id_to_test_reports = keymap(get_github_job_id_from_uuid, workflow_run_uuids_to_test_report_paths) + github_job_id_to_test_reports = {} + for github_job_id in github_job_ids: + # It's possible the upload didn't go through, but if the test report + # for a uuid exists, then that means the test must have printed a uuid + # Unless the log file itself is gone because it's a stale pipeline + + # It could be a good idea to throw an exception for either continue + # step in this loop and have the caller handle it. The key reason to do + # that in our case would be to potentially snuff out infrastructure + # issues and surface that at the job level. 
+        if github_job_id not in github_job_ids_to_workflow_run_uuids:
+            """
+            The potential infra failure is: internal error or logs were never pushed up because the runner died
+            """
+            logger.warning(
+                f"No uuid was found for job {github_job_id}, meaning either the log file is not present or it doesn't have a uuid printed"
+            )
+            continue
+
+        uuid = github_job_ids_to_workflow_run_uuids[github_job_id]
+
+        if uuid not in workflow_run_uuids_to_test_report_paths:
+            logger.warning(
+                f"uuid {uuid} for job {github_job_id} does not have a matching test report path, usually meaning the report doesn't exist"
+            )
+            continue
+
+        github_job_id_to_test_reports[github_job_id] = workflow_run_uuids_to_test_report_paths[uuid]
 
     return github_job_id_to_test_reports
 
 
-def get_pydantic_test_from_pytest_testcase(testcase, default_timestamp=datetime.now()):
+def get_pydantic_test_from_pytest_testcase_(testcase, default_timestamp=datetime.now()):
     skipped = junit_xml_utils.get_pytest_testcase_is_skipped(testcase)
     failed = junit_xml_utils.get_pytest_testcase_is_failed(testcase)
     error = junit_xml_utils.get_pytest_testcase_is_error(testcase)
@@ -163,7 +197,7 @@ def get_tests_from_test_report_path(test_report_path):
         testsuite = report_root[0]
         default_timestamp = datetime.strptime(testsuite.attrib["timestamp"], "%Y-%m-%dT%H:%M:%S.%f")
 
-        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase, default_timestamp=default_timestamp)
+        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase_, default_timestamp=default_timestamp)
 
         return list(map(get_pydantic_test, testsuite))
     else: