diff --git a/infra/data_collection/cicd.py b/infra/data_collection/cicd.py
index 58113d7aae9..64ef9f4234a 100644
--- a/infra/data_collection/cicd.py
+++ b/infra/data_collection/cicd.py
@@ -41,12 +41,22 @@ def create_cicd_json_for_data_analysis(
 
     workflow_outputs_dir = get_workflow_outputs_dir()
 
-    github_job_id_to_test_reports = get_github_job_id_to_test_reports(workflow_outputs_dir, github_pipeline_id)
+    github_job_ids = []
+    for raw_job in raw_jobs:
+        github_job_id = int(raw_job["github_job_id"])
+        github_job_ids.append(github_job_id)
+
+    github_job_id_to_test_reports = get_github_job_id_to_test_reports(
+        workflow_outputs_dir, github_pipeline_id, github_job_ids
+    )
 
     jobs = []
 
     for raw_job in raw_jobs:
         github_job_id = raw_job["github_job_id"]
+
+        logger.info(f"Processing raw GitHub job {github_job_id}")
+
         test_report_exists = github_job_id in github_job_id_to_test_reports
         if test_report_exists:
             test_report_path = github_job_id_to_test_reports[github_job_id]
diff --git a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh
index a30efe16375..be553bd421b 100755
--- a/infra/data_collection/github/download_cicd_logs_and_artifacts.sh
+++ b/infra/data_collection/github/download_cicd_logs_and_artifacts.sh
@@ -27,13 +27,18 @@ download_artifacts() {
 
 download_logs_for_all_jobs() {
     local repo=$1
    local workflow_run_id=$2
-    local attempt_number=$3
+    local max_attempts=$3
+    echo "[info] downloading logs for all jobs in run $workflow_run_id for all attempts up to $max_attempts"
+    for attempt_number in $(seq 1 $max_attempts); do
+        echo "[Info] Downloading for attempt $attempt_number"
 
-    gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
-        echo "[Info] Download logs for job with ID $job_id"
-        gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        gh api /repos/$repo/actions/runs/$workflow_run_id/attempts/$attempt_number/jobs --paginate | jq '.jobs[].id' | while read -r job_id; do
+            echo "[info] download logs for job with id $job_id, attempt number $attempt_number"
+            gh api /repos/$repo/actions/jobs/$job_id/logs > generated/cicd/$workflow_run_id/logs/$job_id.log
+        done
     done
+
 }
 
 main() {
diff --git a/infra/data_collection/github/workflows.py b/infra/data_collection/github/workflows.py
index 094491f35e1..da6da23ad65 100644
--- a/infra/data_collection/github/workflows.py
+++ b/infra/data_collection/github/workflows.py
@@ -5,8 +5,9 @@
 import pathlib
 from datetime import datetime
 from functools import partial
+from typing import List
 
-from toolz.dicttoolz import keymap
+from loguru import logger
 
 from infra.data_collection import junit_xml_utils, pydantic_models
 
@@ -52,12 +53,18 @@ def search_for_uuid_in_log_file_(log_file):
 
     return None
 
 
-def get_workflow_run_uuids_to_github_job_ids_(workflow_outputs_dir, workflow_run_id: int):
+def get_github_job_ids_to_workflow_run_uuids_(workflow_outputs_dir, workflow_run_id: int):
+    """
+    We know we can get a proper mapping of github_job_id -> uuid because, based on anecdotal testing,
+    the same GitHub job ID does not seem to repeat across attempted runs, whereas uuids will repeat
+    because artifacts carry over. We want to ensure that the set of github_job_ids is fully captured;
+    it doesn't matter whether the full set of uuids is.
+ """ logs_dir = workflow_outputs_dir / str(workflow_run_id) / "logs" log_files = logs_dir.glob("*.log") - workflow_run_uuids_to_github_job_ids = {} + github_job_ids_to_workflow_run_uuids = {} for log_file in log_files: artifact_uuid_name = search_for_uuid_in_log_file_(log_file) if artifact_uuid_name: @@ -65,26 +72,53 @@ def get_workflow_run_uuids_to_github_job_ids_(workflow_outputs_dir, workflow_run github_job_id = log_file.name.replace(".log", "") assert github_job_id.isnumeric(), f"{github_job_id}" github_job_id = int(github_job_id) - workflow_run_uuids_to_github_job_ids[uuid] = github_job_id - return workflow_run_uuids_to_github_job_ids + github_job_ids_to_workflow_run_uuids[github_job_id] = uuid + return github_job_ids_to_workflow_run_uuids -def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int): +def get_github_job_id_to_test_reports(workflow_outputs_dir, workflow_run_id: int, github_job_ids: List[int]): workflow_run_uuids_to_test_report_paths = get_workflow_run_uuids_to_test_reports_paths_( workflow_outputs_dir, workflow_run_id ) - workflow_run_uuids_to_github_job_ids = get_workflow_run_uuids_to_github_job_ids_( + github_job_ids_to_workflow_run_uuids = get_github_job_ids_to_workflow_run_uuids_( workflow_outputs_dir, workflow_run_id ) - get_github_job_id_from_uuid = lambda uuid_: workflow_run_uuids_to_github_job_ids[uuid_] - github_job_id_to_test_reports = keymap(get_github_job_id_from_uuid, workflow_run_uuids_to_test_report_paths) + github_job_id_to_test_reports = {} + for github_job_id in github_job_ids: + # It's possible the upload didn't go through, but if the test report + # for a uuid exists, then that means the test must have printed a uuid + # Unless the log file itself is gone because it's a stale pipeline + + # It could be a good idea to throw an exception for either continue + # step in this loop and have the caller handle it. The key reason to do + # that in our case would be to potentially snuff out infrastructure + # issues and surface that at the job level. 
+        if github_job_id not in github_job_ids_to_workflow_run_uuids:
+            """
+            The potential infra failure is: internal error or logs were never pushed up because the runner died
+            """
+            logger.warning(
+                f"No uuid was found for job {github_job_id}, meaning either the log file is not present or it doesn't have a uuid printed"
+            )
+            continue
+
+        uuid = github_job_ids_to_workflow_run_uuids[github_job_id]
+
+        if uuid not in workflow_run_uuids_to_test_report_paths:
+            logger.warning(
+                f"uuid {uuid} for job {github_job_id} does not have a matching test report path, usually meaning the report doesn't exist"
+            )
+            continue
+
+        github_job_id_to_test_reports[github_job_id] = workflow_run_uuids_to_test_report_paths[uuid]
 
     return github_job_id_to_test_reports
 
 
-def get_pydantic_test_from_pytest_testcase(testcase, default_timestamp=datetime.now()):
+def get_pydantic_test_from_pytest_testcase_(testcase, default_timestamp=datetime.now()):
     skipped = junit_xml_utils.get_pytest_testcase_is_skipped(testcase)
     failed = junit_xml_utils.get_pytest_testcase_is_failed(testcase)
     error = junit_xml_utils.get_pytest_testcase_is_error(testcase)
@@ -163,7 +197,7 @@ def get_tests_from_test_report_path(test_report_path):
         testsuite = report_root[0]
         default_timestamp = datetime.strptime(testsuite.attrib["timestamp"], "%Y-%m-%dT%H:%M:%S.%f")
 
-        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase, default_timestamp=default_timestamp)
+        get_pydantic_test = partial(get_pydantic_test_from_pytest_testcase_, default_timestamp=default_timestamp)
 
         return list(map(get_pydantic_test, testsuite))
     else: