
Commit

#12840: Add more handling for multiple attempts by restricting the space of `github_job_id`s we're looking at to only the ones in the workflow run attempt in question (#12858)

* #12840: Add support for artifacts from multiple attempts by downloading all of them, so the system doesn't get confused when searching for artifacts that were generated in a previous attempt but weren't carried over.

* #12840: Instead of trying to match every job ID found in the downloaded logs to a test report, only match the ones for the requested jobs. This keeps us from handling more than we need to and avoids the current issue where multiple attempts of a workflow run give multiple github_job_ids the same uuid, so the match fails later and we lose data (see the sketch below).
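
The sketch below is a minimal, hypothetical illustration of that collision (the job IDs and uuid values are invented; this is not the repository's code): keying the log-to-report lookup by uuid lets a later attempt overwrite an earlier job's entry, while keying it by `github_job_id` keeps every requested job addressable.

```python
# Minimal illustration with invented data: two attempts of one workflow run
# reuse the carried-over artifact uuid, but each attempt has its own job ID.
job_logs = {
    111111111: "aaaa-bbbb",  # attempt 1
    222222222: "aaaa-bbbb",  # attempt 2, same uuid carried over
}

# Old direction (uuid -> github_job_id): the repeated uuid overwrites the
# earlier entry, so job 111111111 can no longer be matched to a report.
uuid_to_job_id = {}
for job_id, uuid in job_logs.items():
    uuid_to_job_id[uuid] = job_id
assert uuid_to_job_id == {"aaaa-bbbb": 222222222}  # job 111111111 is lost

# New direction (github_job_id -> uuid): every requested job keeps its own
# entry, and the caller only looks up the job IDs it actually asked for.
job_id_to_uuid = {job_id: uuid for job_id, uuid in job_logs.items()}
assert job_id_to_uuid == {111111111: "aaaa-bbbb", 222222222: "aaaa-bbbb"}
```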
tt-rkim authored Sep 18, 2024
1 parent fd38b29 commit ea8522c
Showing 3 changed files with 65 additions and 16 deletions.
12 changes: 11 additions & 1 deletion infra/data_collection/cicd.py
@@ -41,12 +41,22 @@ def create_cicd_json_for_data_analysis(

    workflow_outputs_dir = get_workflow_outputs_dir()

-    github_job_id_to_test_reports = get_github_job_id_to_test_reports(workflow_outputs_dir, github_pipeline_id)
+    github_job_ids = []
+    for raw_job in raw_jobs:
+        github_job_id = int(raw_job["github_job_id"])
+        github_job_ids.append(github_job_id)

+    github_job_id_to_test_reports = get_github_job_id_to_test_reports(
+        workflow_outputs_dir, github_pipeline_id, github_job_ids
+    )

    jobs = []

    for raw_job in raw_jobs:
        github_job_id = raw_job["github_job_id"]

        logger.info(f"Processing raw GitHub job {github_job_id}")

        test_report_exists = github_job_id in github_job_id_to_test_reports
        if test_report_exists:
            test_report_path = github_job_id_to_test_reports[github_job_id]
13 changes: 9 additions & 4 deletions infra/data_collection/github/download_cicd_logs_and_artifacts.sh
@@ -27,13 +27,18 @@ download_artifacts() {
download_logs_for_all_jobs() {
    local repo=$1
    local workflow_run_id=$2
-    local attempt_number=$3
+    local max_attempts=$3

+    echo "[info] downloading logs for job with id $job_id for all attempts up to $max_attempts"
+    for attempt_number in $(seq 1 $max_attempts); do
+        echo "[Info] Downloading for attempt $attempt_number"

-    gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
-        echo "[Info] Download logs for job with ID $job_id"
-        gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
+            echo "[info] download logs for job with id $job_id, attempt number $attempt_number"
+            gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        done
    done

}

main() {
56 changes: 45 additions & 11 deletions infra/data_collection/github/workflows.py
@@ -5,8 +5,9 @@
import pathlib
from datetime import datetime
from functools import partial
+from typing import List

-from toolz.dicttoolz import keymap
+from loguru import logger

from infra.data_collection import junit_xml_utils, pydantic_models

@@ -52,39 +53,72 @@ def search_for_uuid_in_log_file_(log_file):
    return None


-def get_workflow_run_uuids_to_github_job_ids_(workflow_outputs_dir, workflow_run_id: int):
+def get_github_job_ids_to_workflow_run_uuids_(workflow_outputs_dir, workflow_run_id: int):
+    """
+    We know we can get a proper mapping of github_job_id -> uuid because based on anecdotal testing,
+    it seems that the same GitHub job ID does not repeat for attempted runs, but because artifacts
+    carry over, uuid will repeat. We want to ensure that the set of github_job_ids is fully captured, and
+    it doesn't matter if the full set of uuids is.
+    """
    logs_dir = workflow_outputs_dir / str(workflow_run_id) / "logs"

    log_files = logs_dir.glob("*.log")

-    workflow_run_uuids_to_github_job_ids = {}
+    github_job_ids_to_workflow_run_uuids = {}
    for log_file in log_files:
        artifact_uuid_name = search_for_uuid_in_log_file_(log_file)
        if artifact_uuid_name:
            uuid = artifact_uuid_name.replace("test_reports_", "")
            github_job_id = log_file.name.replace(".log", "")
            assert github_job_id.isnumeric(), f"{github_job_id}"
            github_job_id = int(github_job_id)
-            workflow_run_uuids_to_github_job_ids[uuid] = github_job_id
-    return workflow_run_uuids_to_github_job_ids
+            github_job_ids_to_workflow_run_uuids[github_job_id] = uuid
+    return github_job_ids_to_workflow_run_uuids


-def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int):
+def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int, github_job_ids: List[int]):
    workflow_run_uuids_to_test_report_paths = get_workflow_run_uuids_to_test_reports_paths_(
        workflow_outputs_dir, workflow_run_id
    )

-    workflow_run_uuids_to_github_job_ids = get_workflow_run_uuids_to_github_job_ids_(
+    github_job_ids_to_workflow_run_uuids = get_github_job_ids_to_workflow_run_uuids_(
        workflow_outputs_dir, workflow_run_id
    )

-    get_github_job_id_from_uuid = lambda uuid_: workflow_run_uuids_to_github_job_ids[uuid_]
-    github_job_id_to_test_reports = keymap(get_github_job_id_from_uuid, workflow_run_uuids_to_test_report_paths)
+    github_job_id_to_test_reports = {}
+    for github_job_id in github_job_ids:
+        # It's possible the upload didn't go through, but if the test report
+        # for a uuid exists, then that means the test must have printed a uuid
+        # Unless the log file itself is gone because it's a stale pipeline

+        # It could be a good idea to throw an exception for either continue
+        # step in this loop and have the caller handle it. The key reason to do
+        # that in our case would be to potentially snuff out infrastructure
+        # issues and surface that at the job level.

+        if github_job_id not in github_job_ids_to_workflow_run_uuids:
+            """
+            The potential infra failure is: internal error or logs were never pushed up because the runner died
+            """
+            logger.warning(
+                f"No uuid was found for job {github_job_id}, meaning either the log file is not present or it doesn't have a uuid printed"
+            )
+            continue

+        uuid = github_job_ids_to_workflow_run_uuids[github_job_id]

+        if uuid not in workflow_run_uuids_to_test_report_paths:
+            logger.warning(
+                f"uuid {uuid} for job {github_job_id} does not have a matching test report path, usually meaning the report doesn't exist"
+            )
+            continue

+        github_job_id_to_test_reports[github_job_id] = workflow_run_uuids_to_test_report_paths[uuid]

    return github_job_id_to_test_reports


-def get_pydantic_test_from_pytest_testcase(testcase, default_timestamp=datetime.now()):
+def get_pydantic_test_from_pytest_testcase_(testcase, default_timestamp=datetime.now()):
    skipped = junit_xml_utils.get_pytest_testcase_is_skipped(testcase)
    failed = junit_xml_utils.get_pytest_testcase_is_failed(testcase)
    error = junit_xml_utils.get_pytest_testcase_is_error(testcase)
@@ -163,7 +197,7 @@ def get_tests_from_test_report_path(test_report_path):
        testsuite = report_root[0]
        default_timestamp = datetime.strptime(testsuite.attrib["timestamp"], "%Y-%m-%dT%H:%M:%S.%f")

-        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase, default_timestamp=default_timestamp)
+        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase_, default_timestamp=default_timestamp)

        return list(map(get_pydantic_test, testsuite))
    else:
