From da182aaa537367c85625f37160da570d9952c975 Mon Sep 17 00:00:00 2001
From: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
Date: Tue, 4 Jun 2024 14:39:52 +1000
Subject: [PATCH] chore: validate input repo, commit, provenance to ensure they
 match (#739)

Signed-off-by: Ben Selwyn-Smith <benselwynsmith@googlemail.com>
---
 scripts/dev_scripts/integration_tests.sh      |  14 +-
 src/macaron/json_tools.py                     |  41 +++--
 src/macaron/repo_finder/commit_finder.py      |   2 +-
 .../repo_finder/provenance_extractor.py       | 155 ++++++++++++++++--
 .../repo_finder/repo_finder_deps_dev.py       |   2 +-
 src/macaron/slsa_analyzer/analyzer.py         |  94 ++++++++---
 .../package_registry/jfrog_maven_registry.py  |   4 +-
 .../expectations/cue/cue_validator.py         |   4 +-
 .../provenance/intoto/__init__.py             |   2 +-
 .../provenance/intoto/v01/__init__.py         |   2 +-
 .../provenance/intoto/v1/__init__.py          |   2 +-
 .../slsa_analyzer/provenance/loader.py        |   3 +-
 src/macaron/util.py                           |  16 --
 src/macaron/vsa/vsa.py                        |   2 +-
 ...verifier_explicitly_provided_cue_PASS.json |  90 +++++-----
 .../repo_finder/test_provenance_extractor.py  |  26 ++-
 .../provenance/intoto/v01/test_validate.py    |   4 +-
 .../provenance/intoto/v1/test_validate.py     |   2 +-
 tests/slsa_analyzer/test_analyze_context.py   |   2 +-
 tests/vsa/test_compare_vsa.py                 |   2 +-
 20 files changed, 329 insertions(+), 140 deletions(-)

diff --git a/scripts/dev_scripts/integration_tests.sh b/scripts/dev_scripts/integration_tests.sh
index 839803ecd..8ad4eaf83 100755
--- a/scripts/dev_scripts/integration_tests.sh
+++ b/scripts/dev_scripts/integration_tests.sh
@@ -707,7 +707,7 @@ JSON_RESULT=$WORKSPACE/output/reports/github_com/slsa-framework/slsa-verifier/sl
 EXPECTATION_FILE=$WORKSPACE/tests/slsa_analyzer/provenance/expectations/cue/resources/valid_expectations/slsa_verifier_PASS.cue
 DEFAULTS_FILE=$WORKSPACE/tests/e2e/defaults/slsa_verifier.ini
 PROVENANCE_FILE=$WORKSPACE/tests/slsa_analyzer/provenance/resources/valid_provenances/slsa-verifier-linux-amd64.intoto.jsonl
-$RUN_MACARON -dp $DEFAULTS_FILE analyze -pe $EXPECTATION_FILE -pf $PROVENANCE_FILE -rp https://github.com/slsa-framework/slsa-verifier -b main -d fc50b662fcfeeeb0e97243554b47d9b20b14efac --skip-deps || log_fail
+$RUN_MACARON -dp $DEFAULTS_FILE analyze -pe $EXPECTATION_FILE -pf $PROVENANCE_FILE -rp https://github.com/slsa-framework/slsa-verifier -d 6fb4f7e2dd9c2f5d4f55fa88f6796278a7bba6d6 --skip-deps || log_fail
 
 check_or_update_expected_output $COMPARE_JSON_OUT $JSON_RESULT $JSON_EXPECTED || log_fail
 
@@ -719,7 +719,7 @@ JSON_RESULT=$WORKSPACE/output/reports/github_com/slsa-framework/slsa-verifier/sl
 EXPECTATION_FILE=$WORKSPACE/tests/slsa_analyzer/provenance/expectations/cue/resources/valid_expectations/slsa_verifier_PASS.cue
 DEFAULTS_FILE=$WORKSPACE/tests/e2e/defaults/allow_url_link_github.ini
 PROVENANCE_FILE=$WORKSPACE/tests/slsa_analyzer/provenance/resources/valid_provenances/slsa-verifier-linux-amd64.intoto.jsonl
-$RUN_MACARON -dp $DEFAULTS_FILE analyze -pe $EXPECTATION_FILE -pf $PROVENANCE_FILE -rp https://github.com/slsa-framework/slsa-verifier -b main -d fc50b662fcfeeeb0e97243554b47d9b20b14efac --skip-deps || log_fail
+$RUN_MACARON -dp $DEFAULTS_FILE analyze -pe $EXPECTATION_FILE -pf $PROVENANCE_FILE -rp https://github.com/slsa-framework/slsa-verifier -d 6fb4f7e2dd9c2f5d4f55fa88f6796278a7bba6d6 --skip-deps || log_fail
 
 check_or_update_expected_output $COMPARE_JSON_OUT $JSON_RESULT $JSON_EXPECTED || log_fail
 
@@ -762,7 +762,7 @@ check_or_update_expected_output $COMPARE_POLICIES $POLICY_RESULT $POLICY_EXPECTE
 
 echo -e "\n----------------------------------------------------------------------------------"
 echo "behnazh-w/example-maven-app as a local and remote repository"
-echo "Test the Witness and GitHub provenances as an input, Cue expectation validation, Policy CLI and VSA generation."
+echo "Test the Witness and GitHub provenances as an input, Cue expectation validation, Policy CLI and VSA generation, User input vs. provenance."
 echo -e "----------------------------------------------------------------------------------\n"
 RUN_POLICY="macaron verify-policy"
 POLICY_FILE=$WORKSPACE/tests/policy_engine/resources/policies/example-maven-project/policy.dl
@@ -794,6 +794,14 @@ $RUN_POLICY -f $POLICY_FILE -d "$WORKSPACE/output/macaron.db" || log_fail
 check_or_update_expected_output "$COMPARE_POLICIES" "$POLICY_RESULT" "$POLICY_EXPECTED" || log_fail
 check_or_update_expected_output "$COMPARE_VSA" "$VSA_RESULT" "$VSA_PAYLOAD_EXPECTED" || log_fail
 
+# Validate user input of repo and commit vs provenance.
+$RUN_MACARON analyze -pf $GITHUB_PROVENANCE_FILE -rp https://github.com/behnazh-w/example-maven-app -d 2deca75ed5dd365eaf1558a82347b1f11306135f --skip-deps || log_fail
+
+# Validate user input of repo and commit (via purl) vs provenance.
+$RUN_MACARON analyze -pf $GITHUB_PROVENANCE_FILE -purl pkg:github/behnazh-w/example-maven-app@2deca75 --skip-deps || log_fail
+
+# Validate user input of repo and commit (via purl with tag) vs provenance.
+$RUN_MACARON analyze -pf $GITHUB_PROVENANCE_FILE -purl pkg:github/behnazh-w/example-maven-app@1.0 --skip-deps || log_fail
 
 # Testing the Repo Finder's remote calls.
 # This requires the 'packageurl' Python module
diff --git a/src/macaron/json_tools.py b/src/macaron/json_tools.py
index c38ebe15f..4b4aef98c 100644
--- a/src/macaron/json_tools.py
+++ b/src/macaron/json_tools.py
@@ -3,26 +3,26 @@
 
 """This module provides utility functions for JSON data."""
 import logging
+from collections.abc import Sequence
 from typing import TypeVar
 
-from macaron.util import JsonType
-
+JsonType = int | float | str | None | bool | list["JsonType"] | dict[str, "JsonType"]
 T = TypeVar("T", bound=JsonType)
 
 logger: logging.Logger = logging.getLogger(__name__)
 
 
-def json_extract(entry: JsonType, keys: list[str], type_: type[T]) -> T | None:
+def json_extract(entry: dict | list, keys: Sequence[str | int], type_: type[T]) -> T | None:
     """Return the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
 
     The value must be of the passed type.
 
     Parameters
     ----------
-    entry: JsonType
+    entry: dict | list
         An entry point into a JSON structure.
-    keys: list[str]
-        The list of depth-sequential keys within the JSON.
+    keys: Sequence[str | int]
+        The sequence of depth-sequential keys within the JSON. Can be dict keys or list indices.
     type: type[T]
         The type to check the value against and return it as.
 
@@ -31,19 +31,28 @@ def json_extract(entry: JsonType, keys: list[str], type_: type[T]) -> T | None:
     T | None:
         The found value as the type of the type parameter.
     """
-    target = entry
-
-    for index, key in enumerate(keys):
-        if not isinstance(target, dict):
-            logger.debug("Expect the value .%s to be a dict.", ".".join(keys[:index]))
+    target: JsonType = entry
+    for key in keys:
+        if isinstance(target, dict) and isinstance(key, str):
+            if key not in target:
+                logger.debug("JSON key '%s' not found in dict target.", key)
+                return None
+        elif isinstance(target, list) and isinstance(key, int):
+            if key < 0 or key >= len(target):
+                logger.debug("JSON list index '%s' is outside of list bounds %s.", key, len(target))
+                return None
+        else:
+            logger.debug("Cannot index '%s' (type: %s) in target (type: %s).", key, type(key), type(target))
             return None
-        if key not in target:
-            logger.debug("JSON key '%s' not found in .%s", key, ".".join(keys[:index]))
-            return None
-        target = target[key]
+
+        # If statement required for mypy to not complain. The else case can never happen because of the above if block.
+        if isinstance(target, dict) and isinstance(key, str):
+            target = target[key]
+        elif isinstance(target, list) and isinstance(key, int):
+            target = target[key]
 
     if isinstance(target, type_):
         return target
 
-    logger.debug("Expect the value .%s to be of type %s", ".".join(keys), type_)
+    logger.debug("Found value of incorrect type: %s instead of %s.", type(target), type_)
     return None
diff --git a/src/macaron/repo_finder/commit_finder.py b/src/macaron/repo_finder/commit_finder.py
index b4610a36f..e6d4f2e66 100644
--- a/src/macaron/repo_finder/commit_finder.py
+++ b/src/macaron/repo_finder/commit_finder.py
@@ -199,7 +199,7 @@ def extract_commit_from_version(git_obj: Git, version: str) -> str | None:
     if 7 <= len(version) <= 40 and re.match(hex_only_pattern, version):
         try:
             commit = git_obj.get_commit(version)
-        except BadName as error:
+        except (BadName, ValueError) as error:
             logger.debug("Failed to retrieve commit: %s", error)
 
     if not commit:
diff --git a/src/macaron/repo_finder/provenance_extractor.py b/src/macaron/repo_finder/provenance_extractor.py
index b66bbe14d..c327154ab 100644
--- a/src/macaron/repo_finder/provenance_extractor.py
+++ b/src/macaron/repo_finder/provenance_extractor.py
@@ -3,11 +3,20 @@
 
 """This module contains methods for extracting repository and commit metadata from provenance files."""
 import logging
+import urllib.parse
+
+from packageurl import PackageURL
+from pydriller import Git
 
 from macaron.errors import ProvenanceError
-from macaron.json_tools import json_extract
+from macaron.json_tools import JsonType, json_extract
+from macaron.repo_finder.commit_finder import (
+    AbstractPurlType,
+    determine_abstract_purl_type,
+    extract_commit_from_version,
+)
+from macaron.repo_finder.repo_finder import to_domain_from_known_purl_types
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload
-from macaron.util import JsonType
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -67,16 +76,8 @@ def _extract_from_slsa_v01(payload: InTotoV01Payload) -> tuple[str | None, str |
     if not list_index:
         return None, None
 
-    material_list = json_extract(predicate, ["materials"], list)
-    if not material_list:
-        return None, None
-
-    if list_index >= len(material_list):
-        logger.debug("Material list index outside of material list bounds.")
-        return None, None
-
-    material = material_list[list_index]
-    if not material or not isinstance(material, dict):
+    material = json_extract(predicate, ["materials", list_index], dict)
+    if not material:
         logger.debug("Indexed material list entry is invalid.")
         return None, None
 
@@ -232,3 +233,133 @@ def _clean_spdx(uri: str) -> str:
     """
     url, _, _ = uri.lstrip("git+").rpartition("@")
     return url
+
+
+def check_if_input_repo_commit_provenance_conflict(
+    repo_path_input: str | None,
+    digest_input: str | None,
+    provenance_repo_url: str | None,
+    provenance_commit_digest: str | None,
+) -> bool:
+    """Test if the input repo and commit conflict with the contents of the provenance.
+
+    Parameters
+    ----------
+    repo_path_input: str | None
+        The repo URL from input.
+    digest_input: str | None
+        The digest from input.
+    provenance_repo_url: str | None
+        The repo URL from provenance.
+    provenance_commit_digest: str | None
+        The commit digest from provenance.
+
+    Returns
+    -------
+    bool
+        True if there is a conflict between the inputs, False otherwise, or if the comparison cannot be performed.
+    """
+    # Check the provenance repo against the input repo.
+    if repo_path_input and provenance_repo_url and repo_path_input != provenance_repo_url:
+        logger.debug(
+            "The repository URL from input does not match what exists in the provenance. "
+            "Input Repo: %s, Provenance Repo: %s.",
+            repo_path_input,
+            provenance_repo_url,
+        )
+        return True
+
+    # Check the provenance commit against the input commit.
+    if digest_input and provenance_commit_digest and digest_input != provenance_commit_digest:
+        logger.debug(
+            "The commit digest from input does not match what exists in the provenance. "
+            "Input Commit: %s, Provenance Commit: %s.",
+            digest_input,
+            provenance_commit_digest,
+        )
+        return True
+
+    return False
+
+
+def check_if_input_purl_provenance_conflict(
+    git_obj: Git,
+    repo_path_input: bool,
+    digest_input: bool,
+    provenance_repo_url: str | None,
+    provenance_commit_digest: str | None,
+    purl: PackageURL,
+) -> bool:
+    """Test if the input repository type PURL's repo and commit conflict with the contents of the provenance.
+
+    Parameters
+    ----------
+    git_obj: Git
+        The Git object.
+    repo_path_input: bool
+        True if there is a repo as input.
+    digest_input: bool
+        True if there is a commit as input.
+    provenance_repo_url: str | None
+        The repo url from provenance.
+    provenance_commit_digest: str | None
+        The commit digest from provenance.
+    purl: PackageURL
+        The input repository PURL.
+
+    Returns
+    -------
+    bool
+        True if there is a conflict between the inputs, False otherwise, or if the comparison cannot be performed.
+    """
+    if determine_abstract_purl_type(purl) != AbstractPurlType.REPOSITORY:
+        return False
+
+    # Check the PURL repo against the provenance.
+    if not repo_path_input and provenance_repo_url:
+        if not check_if_repository_purl_and_url_match(provenance_repo_url, purl):
+            logger.debug(
+                "The repo url passed via purl input does not match what exists in the provenance. "
+                "Purl: %s, Provenance: %s.",
+                purl,
+                provenance_repo_url,
+            )
+            return True
+
+    # Check the PURL commit against the provenance.
+    if not digest_input and provenance_commit_digest and purl.version:
+        purl_commit = extract_commit_from_version(git_obj, purl.version)
+        if purl_commit and purl_commit != provenance_commit_digest:
+            logger.debug(
+                "The commit digest passed via purl input does not match what exists in the "
+                "provenance. Purl Commit: %s, Provenance Commit: %s.",
+                purl_commit,
+                provenance_commit_digest,
+            )
+            return True
+
+    return False
+
+
+def check_if_repository_purl_and_url_match(url: str, repo_purl: PackageURL) -> bool:
+    """Compare a repository PURL and URL for equality.
+
+    Parameters
+    ----------
+    url: str
+        The URL.
+    repo_purl: PackageURL
+        A PURL that is of the repository abstract type. E.g. GitHub.
+
+    Returns
+    -------
+    bool
+        True if the two inputs match, case-insensitively, in terms of URL netloc/domain and path.
+    """
+    expanded_purl_type = to_domain_from_known_purl_types(repo_purl.type)
+    parsed_url = urllib.parse.urlparse(url)
+    purl_path = f"{repo_purl.namespace}/{repo_purl.name}" if repo_purl.namespace else repo_purl.name
+    purl_location = f"{expanded_purl_type or repo_purl.type}/{purl_path}"
+    # Note that the urllib method includes the "/" before path while the PURL method does not.
+    # Lowercase both sides: lowercasing only the URL would reject any PURL containing uppercase characters.
+    return f"{parsed_url.hostname}{parsed_url.path}".lower() == purl_location.lower()
diff --git a/src/macaron/repo_finder/repo_finder_deps_dev.py b/src/macaron/repo_finder/repo_finder_deps_dev.py
index c7de6107f..7f2266051 100644
--- a/src/macaron/repo_finder/repo_finder_deps_dev.py
+++ b/src/macaron/repo_finder/repo_finder_deps_dev.py
@@ -9,7 +9,7 @@
 
 from packageurl import PackageURL
 
-from macaron.repo_finder.provenance_extractor import json_extract
+from macaron.json_tools import json_extract
 from macaron.repo_finder.repo_finder_base import BaseRepoFinder
 from macaron.repo_finder.repo_validator import find_valid_repository_url
 from macaron.util import send_get_http_raw
diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
index 8af5cb848..dd5d2cf5a 100644
--- a/src/macaron/slsa_analyzer/analyzer.py
+++ b/src/macaron/slsa_analyzer/analyzer.py
@@ -35,7 +35,11 @@
 from macaron.output_reporter.results import Record, Report, SCMStatus
 from macaron.repo_finder import repo_finder
 from macaron.repo_finder.commit_finder import find_commit
-from macaron.repo_finder.provenance_extractor import extract_repo_and_commit_from_provenance
+from macaron.repo_finder.provenance_extractor import (
+    check_if_input_purl_provenance_conflict,
+    check_if_input_repo_commit_provenance_conflict,
+    extract_repo_and_commit_from_provenance,
+)
 from macaron.repo_finder.provenance_finder import ProvenanceFinder
 from macaron.slsa_analyzer import git_url
 from macaron.slsa_analyzer.analyze_context import AnalyzeContext
@@ -322,6 +326,8 @@ def run_single(
             provenance_payload = ProvenanceFinder().find_provenance(parsed_purl)
 
         # Try to extract the repository URL and commit digest from the Provenance, if it exists.
+        repo_path_input: str | None = config.get_value("path")
+        digest_input: str | None = config.get_value("digest")
         provenance_repo_url = provenance_commit_digest = None
         if provenance_payload:
             try:
@@ -331,6 +337,17 @@ def run_single(
             except ProvenanceError as error:
                 logger.debug("Failed to extract repo or commit from provenance: %s", error)
 
+            # Try to validate the input repo and/or commit against provenance contents.
+            if (provenance_repo_url or provenance_commit_digest) and check_if_input_repo_commit_provenance_conflict(
+                repo_path_input, digest_input, provenance_repo_url, provenance_commit_digest
+            ):
+                return Record(
+                    record_id=repo_id,
+                    description="Input mismatch between repo/commit and provenance.",
+                    pre_config=config,
+                    status=SCMStatus.ANALYSIS_FAILED,
+                )
+
         # Create the analysis target.
         available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname]
         try:
@@ -345,11 +362,40 @@ def run_single(
                 status=SCMStatus.ANALYSIS_FAILED,
             )
 
+        # Prepare the repo.
+        git_obj = None
+        if analysis_target.repo_path:
+            git_obj = self._prepare_repo(
+                os.path.join(self.output_path, self.GIT_REPOS_DIR),
+                analysis_target.repo_path,
+                analysis_target.branch,
+                analysis_target.digest,
+                analysis_target.parsed_purl,
+            )
+
+        # Check if only one of the repo or digest came from direct input.
+        if git_obj and (provenance_repo_url or provenance_commit_digest) and parsed_purl:
+            if check_if_input_purl_provenance_conflict(
+                git_obj,
+                bool(repo_path_input),
+                bool(digest_input),
+                provenance_repo_url,
+                provenance_commit_digest,
+                parsed_purl,
+            ):
+                return Record(
+                    record_id=repo_id,
+                    description="Input mismatch between repo/commit (purl) and provenance.",
+                    pre_config=config,
+                    status=SCMStatus.ANALYSIS_FAILED,
+                )
+
         # Create the component.
         try:
             component = self.add_component(
                 analysis,
                 analysis_target,
+                git_obj,
                 existing_records,
                 provenance_payload,
             )
@@ -507,6 +553,7 @@ def add_component(
         self,
         analysis: Analysis,
         analysis_target: AnalysisTarget,
+        git_obj: Git | None,
         existing_records: dict[str, Record] | None = None,
         provenance_payload: InTotoPayload | None = None,
     ) -> Component:
@@ -521,6 +568,8 @@ def add_component(
             The current analysis instance.
         analysis_target: AnalysisTarget
             The target of this analysis.
+        git_obj: Git | None
+            The pydriller.Git object of the repository.
         existing_records : dict[str, Record] | None
             The mapping of existing records that the analysis has run successfully.
         provenance_payload: InTotoVPayload | None
@@ -539,32 +588,23 @@ def add_component(
             The component is already analyzed in the same session.
         """
         # Note: the component created in this function will be added to the database.
-        repository = None
-        if analysis_target.repo_path:
-            git_obj = self._prepare_repo(
-                os.path.join(self.output_path, self.GIT_REPOS_DIR),
-                analysis_target.repo_path,
-                analysis_target.branch,
-                analysis_target.digest,
-                analysis_target.parsed_purl,
-            )
-            if git_obj:
-                # TODO: use both the repo URL and the commit hash to check.
-                if (
-                    existing_records
-                    and (existing_record := existing_records.get(git_url.get_remote_origin_of_local_repo(git_obj)))
-                    is not None
-                ):
-                    raise DuplicateCmpError(
-                        f"{analysis_target.repo_path} is already analyzed.", context=existing_record.context
-                    )
+        if git_obj:
+            # TODO: use both the repo URL and the commit hash to check.
+            if (
+                existing_records
+                and (existing_record := existing_records.get(git_url.get_remote_origin_of_local_repo(git_obj)))
+                is not None
+            ):
+                raise DuplicateCmpError(
+                    f"{analysis_target.repo_path} is already analyzed.", context=existing_record.context
+                )
 
-                repository = self.add_repository(analysis_target.branch, git_obj)
-            else:
-                # We cannot prepare the repository even though we have successfully resolved the repository path for the
-                # software component. If this happens, we don't raise error and treat the software component as if it
-                # does not have any ``Repository`` attached to it.
-                repository = None
+            repository = self.add_repository(analysis_target.branch, git_obj)
+        else:
+            # We cannot prepare the repository even though we have successfully resolved the repository path for the
+            # software component. If this happens, we don't raise error and treat the software component as if it
+            # does not have any ``Repository`` attached to it.
+            repository = None
 
         if not analysis_target.parsed_purl:
             # If the PURL is not available. This will only mean that the user don't provide PURL but only provide the
@@ -923,7 +963,7 @@ def _resolve_local_path(start_dir: str, local_path: str) -> str:
             The resolved path in canonical form or an empty string if errors.
         """
         # Resolve the path by joining dir and path.
-        # Because strict mode is enabled, if a path doesn’t exist or a symlink loop
+        # Because strict mode is enabled, if a path doesn't exist or a symlink loop
         # is encountered, OSError is raised.
         # ValueError is raised if we use both relative and absolute paths in os.path.commonpath.
         try:
diff --git a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
index ce52a6595..1c78d4409 100644
--- a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
+++ b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """Assets on a package registry."""
@@ -14,11 +14,11 @@
 
 from macaron.config.defaults import defaults
 from macaron.errors import ConfigurationError
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool
 from macaron.slsa_analyzer.build_tool.gradle import Gradle
 from macaron.slsa_analyzer.build_tool.maven import Maven
 from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
-from macaron.util import JsonType
 
 logger: logging.Logger = logging.getLogger(__name__)
 
diff --git a/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py b/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py
index 6feaeab15..70e203af8 100644
--- a/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py
+++ b/src/macaron/slsa_analyzer/provenance/expectations/cue/cue_validator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """The cue module invokes the CUE schema validator."""
@@ -10,7 +10,7 @@
 
 from macaron import MACARON_PATH
 from macaron.errors import CUEExpectationError, CUERuntimeError
-from macaron.util import JsonType
+from macaron.json_tools import JsonType
 
 # Load the CUE shared library.
 cue = ctypes.CDLL(os.path.join(MACARON_PATH, "bin", "cuevalidate.so"))
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/__init__.py
index c82a590fc..03b3f16f4 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/__init__.py
@@ -10,11 +10,11 @@
 
 from packageurl import PackageURL
 
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.provenance.intoto import v01, v1
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.slsa_analyzer.provenance.intoto.v01 import InTotoV01Subject
 from macaron.slsa_analyzer.provenance.intoto.v1 import InTotoV1ResourceDescriptor
-from macaron.util import JsonType
 
 # Type of an in-toto statement.
 # This is currently either a v0.1 statement or v1 statement.
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
index 95fc3b304..94f8b6f78 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v01/__init__.py
@@ -7,8 +7,8 @@
 
 from typing import TypedDict, TypeGuard
 
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
-from macaron.util import JsonType
 
 
 class InTotoV01Statement(TypedDict):
diff --git a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
index 3ffe08bd6..a428c712b 100644
--- a/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
+++ b/src/macaron/slsa_analyzer/provenance/intoto/v1/__init__.py
@@ -8,8 +8,8 @@
 from collections.abc import Callable
 from typing import TypedDict, TypeGuard
 
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
-from macaron.util import JsonType
 
 
 class InTotoV1Statement(TypedDict):
diff --git a/src/macaron/slsa_analyzer/provenance/loader.py b/src/macaron/slsa_analyzer/provenance/loader.py
index cdde8245f..65dfee1bb 100644
--- a/src/macaron/slsa_analyzer/provenance/loader.py
+++ b/src/macaron/slsa_analyzer/provenance/loader.py
@@ -12,9 +12,10 @@
 from urllib.parse import urlparse
 
 from macaron.config.defaults import defaults
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, validate_intoto_payload
 from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError, ValidateInTotoPayloadError
-from macaron.util import JsonType, send_get_http_raw
+from macaron.util import send_get_http_raw
 
 logger: logging.Logger = logging.getLogger(__name__)
 
diff --git a/src/macaron/util.py b/src/macaron/util.py
index 25f876af7..90876b9c0 100644
--- a/src/macaron/util.py
+++ b/src/macaron/util.py
@@ -260,19 +260,3 @@ def copy_file_bulk(file_list: list, src_path: str, target_path: str) -> bool:
                 return False
 
     return True
-
-
-JsonType = int | float | str | None | bool | list["JsonType"] | dict[str, "JsonType"]
-
-
-def get_if_exists(doc: JsonType, path: list[str | int]) -> JsonType | None:
-    """Get a json dict value if it exists."""
-    while len(path) > 0:
-        this = path.pop(0)
-        if isinstance(this, str) and isinstance(doc, dict) and this in doc:
-            doc = doc[this]
-        elif isinstance(this, int) and isinstance(doc, list) and 0 <= this < len(doc):
-            doc = doc[this]
-        else:
-            return None
-    return doc
diff --git a/src/macaron/vsa/vsa.py b/src/macaron/vsa/vsa.py
index f06a948f9..43b9ca156 100644
--- a/src/macaron/vsa/vsa.py
+++ b/src/macaron/vsa/vsa.py
@@ -20,7 +20,7 @@
 
 from macaron.database.database_manager import get_db_manager
 from macaron.database.table_definitions import ProvenanceSubject
-from macaron.util import JsonType
+from macaron.json_tools import JsonType
 
 logger: logging.Logger = logging.getLogger(__name__)
 
diff --git a/tests/e2e/expected_results/slsa-verifier/slsa-verifier_explicitly_provided_cue_PASS.json b/tests/e2e/expected_results/slsa-verifier/slsa-verifier_explicitly_provided_cue_PASS.json
index fd29d3f4c..ef30c99f9 100644
--- a/tests/e2e/expected_results/slsa-verifier/slsa-verifier_explicitly_provided_cue_PASS.json
+++ b/tests/e2e/expected_results/slsa-verifier/slsa-verifier_explicitly_provided_cue_PASS.json
@@ -1,50 +1,50 @@
 {
     "metadata": {
-        "timestamps": "2024-05-07 15:16:38",
+        "timestamps": "2024-05-14 13:23:14",
         "has_passing_check": true,
         "run_checks": [
-            "mcn_build_script_1",
-            "mcn_build_service_1",
-            "mcn_provenance_derived_commit_1",
-            "mcn_trusted_builder_level_three_1",
+            "mcn_provenance_witness_level_one_1",
             "mcn_provenance_derived_repo_1",
             "mcn_build_as_code_1",
             "mcn_provenance_available_1",
-            "mcn_infer_artifact_pipeline_1",
-            "mcn_provenance_expectation_1",
             "mcn_version_control_system_1",
-            "mcn_provenance_witness_level_one_1"
+            "mcn_provenance_expectation_1",
+            "mcn_trusted_builder_level_three_1",
+            "mcn_infer_artifact_pipeline_1",
+            "mcn_provenance_derived_commit_1",
+            "mcn_build_script_1",
+            "mcn_build_service_1"
         ],
         "check_tree": {
             "mcn_provenance_available_1": {
-                "mcn_provenance_expectation_1": {},
+                "mcn_provenance_level_three_1": {},
                 "mcn_provenance_witness_level_one_1": {},
-                "mcn_provenance_level_three_1": {}
+                "mcn_provenance_expectation_1": {}
             },
             "mcn_provenance_derived_commit_1": {},
             "mcn_version_control_system_1": {
-                "mcn_build_script_1": {},
                 "mcn_trusted_builder_level_three_1": {
                     "mcn_build_as_code_1": {
-                        "mcn_build_service_1": {},
-                        "mcn_infer_artifact_pipeline_1": {}
+                        "mcn_infer_artifact_pipeline_1": {},
+                        "mcn_build_service_1": {}
                     }
-                }
+                },
+                "mcn_build_script_1": {}
             },
             "mcn_provenance_derived_repo_1": {}
         }
     },
     "target": {
         "info": {
-            "full_name": "pkg:github.com/slsa-framework/slsa-verifier@fc50b662fcfeeeb0e97243554b47d9b20b14efac",
+            "full_name": "pkg:github.com/slsa-framework/slsa-verifier@6fb4f7e2dd9c2f5d4f55fa88f6796278a7bba6d6",
             "local_cloned_path": "git_repos/github_com/slsa-framework/slsa-verifier",
             "remote_path": "https://github.com/slsa-framework/slsa-verifier",
-            "branch": "main",
-            "commit_hash": "fc50b662fcfeeeb0e97243554b47d9b20b14efac",
-            "commit_date": "2022-10-04T01:00:02+00:00"
+            "branch": null,
+            "commit_hash": "6fb4f7e2dd9c2f5d4f55fa88f6796278a7bba6d6",
+            "commit_date": "2022-08-25T11:37:20-05:00"
         },
         "provenances": {
-            "is_inferred": true,
+            "is_inferred": false,
             "content": {
                 "github_actions": [
                     {
@@ -53,24 +53,23 @@
                         "predicateType": "https://slsa.dev/provenance/v0.2",
                         "predicate": {
                             "builder": {
-                                "id": "https://github.com/slsa-framework/slsa-verifier/blob/fc50b662fcfeeeb0e97243554b47d9b20b14efac/.github/workflows/release.yml"
+                                "id": "<URI>"
                             },
-                            "buildType": "Custom github_actions",
+                            "buildType": "<URI>",
                             "invocation": {
                                 "configSource": {
-                                    "uri": "https://github.com/slsa-framework/slsa-verifier@refs/heads/main",
+                                    "uri": "<URI>",
                                     "digest": {
-                                        "sha1": "fc50b662fcfeeeb0e97243554b47d9b20b14efac"
+                                        "sha1": "<STRING>"
                                     },
-                                    "entryPoint": "https://github.com/slsa-framework/slsa-verifier/blob/fc50b662fcfeeeb0e97243554b47d9b20b14efac/.github/workflows/release.yml"
+                                    "entryPoint": "<STRING>"
                                 },
                                 "parameters": {},
                                 "environment": {}
                             },
                             "buildConfig": {
-                                "jobID": "",
-                                "stepID": "",
-                                "stepName": ""
+                                "jobID": "<STRING>",
+                                "stepID": "<STRING>"
                             },
                             "metadata": {
                                 "buildInvocationId": "<STRING>",
@@ -91,15 +90,14 @@
                             ]
                         }
                     }
-                ],
-                "npm Registry": []
+                ]
             }
         },
         "checks": {
             "summary": {
                 "DISABLED": 0,
-                "FAILED": 3,
-                "PASSED": 8,
+                "FAILED": 2,
+                "PASSED": 9,
                 "SKIPPED": 0,
                 "UNKNOWN": 0
             },
@@ -125,9 +123,9 @@
                         "build_tool_name: go",
                         "ci_service_name: github_actions",
                         "language: BuildLanguage.GO",
-                        "build_tool_command: [\"go\", \"build\", \"-mod=vendor\", \"-o\", \"service\", \"./cli/experimental/service/\"]",
+                        "build_tool_command: [\"go\", \"build\", \"-mod=vendor\"]",
                         {
-                            "build_trigger": "https://github.com/slsa-framework/slsa-verifier/blob/fc50b662fcfeeeb0e97243554b47d9b20b14efac/.github/workflows/pre-submit.cli.yml"
+                            "build_trigger": "https://github.com/slsa-framework/slsa-verifier/blob/6fb4f7e2dd9c2f5d4f55fa88f6796278a7bba6d6/.github/workflows/pre-submit.yml"
                         }
                     ],
                     "result_type": "PASSED"
@@ -157,6 +155,17 @@
                     ],
                     "result_type": "PASSED"
                 },
+                {
+                    "check_id": "mcn_provenance_derived_commit_1",
+                    "check_description": "Check whether the commit came from provenance.",
+                    "slsa_requirements": [
+                        "Security - SLSA Level 4"
+                    ],
+                    "justification": [
+                        "commit_info: The commit digest was found from provenance."
+                    ],
+                    "result_type": "PASSED"
+                },
                 {
                     "check_id": "mcn_provenance_derived_repo_1",
                     "check_description": "Check whether the repo came from provenance.",
@@ -164,7 +173,7 @@
                         "Security - SLSA Level 4"
                     ],
                     "justification": [
-                        "repository_url: The repository URL was found from provenance."
+                        "repository_info: The repository URL was found from provenance."
                     ],
                     "result_type": "PASSED"
                 },
@@ -192,7 +201,7 @@
                         "build_tool_name: slsa-framework/slsa-github-generator/.github/workflows/builder_go_slsa3.yml@v1.2.0",
                         "ci_service_name: github_actions",
                         {
-                            "build_trigger": "https://github.com/slsa-framework/slsa-verifier/blob/fc50b662fcfeeeb0e97243554b47d9b20b14efac/.github/workflows/release.yml"
+                            "build_trigger": "https://github.com/slsa-framework/slsa-verifier/blob/6fb4f7e2dd9c2f5d4f55fa88f6796278a7bba6d6/.github/workflows/release.yml"
                         }
                     ],
                     "result_type": "PASSED"
@@ -221,17 +230,6 @@
                     ],
                     "result_type": "FAILED"
                 },
-                {
-                    "check_id": "mcn_provenance_derived_commit_1",
-                    "check_description": "Check whether the commit came from provenance.",
-                    "slsa_requirements": [
-                        "Security - SLSA Level 4"
-                    ],
-                    "justification": [
-                        "commit_digest: The analysis commit did not match the provenance commit."
-                    ],
-                    "result_type": "FAILED"
-                },
                 {
                     "check_id": "mcn_provenance_witness_level_one_1",
                     "check_description": "Check whether the target has a level-1 witness provenance.",
diff --git a/tests/repo_finder/test_provenance_extractor.py b/tests/repo_finder/test_provenance_extractor.py
index ff0914686..704aaaa8f 100644
--- a/tests/repo_finder/test_provenance_extractor.py
+++ b/tests/repo_finder/test_provenance_extractor.py
@@ -5,12 +5,15 @@
 import json
 
 import pytest
+from packageurl import PackageURL
 
 from macaron.errors import ProvenanceError
-from macaron.json_tools import json_extract
-from macaron.repo_finder.provenance_extractor import extract_repo_and_commit_from_provenance
+from macaron.json_tools import JsonType, json_extract
+from macaron.repo_finder.provenance_extractor import (
+    check_if_repository_purl_and_url_match,
+    extract_repo_and_commit_from_provenance,
+)
 from macaron.slsa_analyzer.provenance.intoto import validate_intoto_payload
-from macaron.util import JsonType
 
 
 @pytest.fixture(name="slsa_v1_gcb_1_provenance")
@@ -496,6 +499,21 @@ def test_invalid_type_payloads(type_: str, predicate_type: str) -> None:
         _test_extract_repo_and_commit_from_provenance(payload)
 
 
+@pytest.mark.parametrize(
+    ("url", "purl_string", "expected"),
+    [
+        ("https://github.com:9000/oracle/macaron", "pkg:github/oracle/macaron", True),
+        ("http://user:pass@github.com/oracle/macaron", "pkg:github.com/oracle/macaron", True),
+        ("https://bitbucket.org:9000/example/test", "pkg:bitbucket/example/test", True),
+        ("http://bitbucket.org/example;key1=1?key2=2#key3=3", "pkg:bitbucket.org/example", True),
+    ],
+)
+def test_compare_purl_and_url(url: str, purl_string: str, expected: bool) -> None:
+    """Test that repository PURLs match URLs containing ports, credentials, or params/query/fragment parts."""
+    purl = PackageURL.from_string(purl_string)
+    assert expected == check_if_repository_purl_and_url_match(url, purl)
+
+
 def _test_extract_repo_and_commit_from_provenance(
     payload: dict[str, JsonType], expected_repo: str | None = None, expected_commit: str | None = None
 ) -> None:
@@ -506,7 +524,7 @@ def _test_extract_repo_and_commit_from_provenance(
     assert expected_commit == commit
 
 
-def _json_modify(entry: JsonType, keys: list[str], new_value: JsonType) -> None:
+def _json_modify(entry: dict | list, keys: list[str], new_value: JsonType) -> None:
     """Modify the value found by following the list of depth-sequential keys inside the passed JSON dictionary.
 
     The found value will be overwritten by the `new_value` parameter.
diff --git a/tests/slsa_analyzer/provenance/intoto/v01/test_validate.py b/tests/slsa_analyzer/provenance/intoto/v01/test_validate.py
index 2e438a484..99d8f4032 100644
--- a/tests/slsa_analyzer/provenance/intoto/v01/test_validate.py
+++ b/tests/slsa_analyzer/provenance/intoto/v01/test_validate.py
@@ -1,13 +1,13 @@
-# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """Tests for validation of in-toto attestation version 0.1."""
 
 import pytest
 
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.slsa_analyzer.provenance.intoto.v01 import validate_intoto_statement, validate_intoto_subject
-from macaron.util import JsonType
 
 
 @pytest.mark.parametrize(
diff --git a/tests/slsa_analyzer/provenance/intoto/v1/test_validate.py b/tests/slsa_analyzer/provenance/intoto/v1/test_validate.py
index ca03668a5..44ea4d0a3 100644
--- a/tests/slsa_analyzer/provenance/intoto/v1/test_validate.py
+++ b/tests/slsa_analyzer/provenance/intoto/v1/test_validate.py
@@ -5,9 +5,9 @@
 
 import pytest
 
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.provenance.intoto.errors import ValidateInTotoPayloadError
 from macaron.slsa_analyzer.provenance.intoto.v1 import validate_intoto_statement
-from macaron.util import JsonType
 
 
 @pytest.mark.parametrize(
diff --git a/tests/slsa_analyzer/test_analyze_context.py b/tests/slsa_analyzer/test_analyze_context.py
index 110d6aea5..dd33bda50 100644
--- a/tests/slsa_analyzer/test_analyze_context.py
+++ b/tests/slsa_analyzer/test_analyze_context.py
@@ -9,13 +9,13 @@
 from unittest.mock import MagicMock
 
 from macaron.code_analyzer.call_graph import BaseNode, CallGraph
+from macaron.json_tools import JsonType
 from macaron.slsa_analyzer.asset import VirtualReleaseAsset
 from macaron.slsa_analyzer.ci_service.github_actions.github_actions_ci import GitHubActions
 from macaron.slsa_analyzer.provenance.intoto import validate_intoto_payload
 from macaron.slsa_analyzer.provenance.slsa import SLSAProvenanceData
 from macaron.slsa_analyzer.slsa_req import ReqName, SLSAReqStatus
 from macaron.slsa_analyzer.specs.ci_spec import CIInfo
-from macaron.util import JsonType
 from tests.conftest import MockAnalyzeContext
 
 
diff --git a/tests/vsa/test_compare_vsa.py b/tests/vsa/test_compare_vsa.py
index a49db8d3f..7fe9b7281 100644
--- a/tests/vsa/test_compare_vsa.py
+++ b/tests/vsa/test_compare_vsa.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from macaron.util import JsonType
+from macaron.json_tools import JsonType
 from tests.vsa.compare_vsa import compare_json, skip_compare