From a74b61d5b916d3c070bf0ea2b1093a0678a46f1c Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Mon, 10 Feb 2025 13:42:40 +1000 Subject: [PATCH 1/9] feat: check PyPI registry when deps.dev fails to find a source repository Signed-off-by: Ben Selwyn-Smith --- src/macaron/json_tools.py | 4 +- src/macaron/repo_finder/repo_finder.py | 7 +- src/macaron/repo_finder/repo_finder_enums.py | 14 +++- src/macaron/repo_finder/repo_finder_pypi.py | 70 ++++++++++++++++++++ 4 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 src/macaron/repo_finder/repo_finder_pypi.py diff --git a/src/macaron/json_tools.py b/src/macaron/json_tools.py index 3cd7a7d37..a69b0eaa8 100644 --- a/src/macaron/json_tools.py +++ b/src/macaron/json_tools.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module provides utility functions for JSON data.""" @@ -53,5 +53,5 @@ def json_extract(entry: dict | list, keys: Sequence[str | int], type_: type[T]) if isinstance(entry, type_): return entry - logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type(type_)) + logger.debug("Found value of incorrect type: %s instead of %s.", type(entry), type_) return None diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index a43fadc2d..6c16ac022 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -43,7 +43,7 @@ from macaron.config.defaults import defaults from macaron.config.global_config import global_config from macaron.errors import CloneError, RepoCheckOutError -from macaron.repo_finder import to_domain_from_known_purl_types +from macaron.repo_finder import repo_finder_pypi, to_domain_from_known_purl_types from macaron.repo_finder.commit_finder import find_commit, match_tags from macaron.repo_finder.repo_finder_base import BaseRepoFinder from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder @@ -103,6 +103,11 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder)) found_repo, outcome = repo_finder.find_repo(purl) + if not found_repo and purl.type == "pypi": + found_repo, outcome = repo_finder_pypi.find_repo(purl) + if not found_repo: + logger.debug("Could not find repository from PyPI registry for PURL: %s", purl) + if found_repo or not check_latest_version: return found_repo, outcome diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py index 4d088a5cc..7dff875c6 100644 --- a/src/macaron/repo_finder/repo_finder_enums.py +++ b/src/macaron/repo_finder/repo_finder_enums.py @@ -57,6 +57,15 @@ class RepoFinderInfo(Enum): #: Reported if deps.dev returns data that does not contain the desired SCM URL. E.g. The repository URL. DDEV_NO_URLS = "deps.dev no URLs" + #: Reported if there was an error with the request sent to the PyPI registry. + PYPI_HTTP_ERROR = "PyPI HTTP error" + + #: Reported if there was an error parsing the JSON returned by the PyPI registry. + PYPI_JSON_ERROR = "PyPI JSON error" + + #: Reported if there was no matching URLs in the JSON returned by the PyPI registry. + PYPI_NO_URLS = "PyPI no matching URLs" + #: Reported if the provided PURL did not produce a result, but a more recent version could not be found. NO_NEWER_VERSION = "No newer version than provided which failed" @@ -70,7 +79,10 @@ class RepoFinderInfo(Enum): FOUND_FROM_PARENT = "Found from parent" #: Reported when a repository is found from a more recent version than was provided by the user. - FOUND_FROM_LATEST = "Found form latest" + FOUND_FROM_LATEST = "Found from latest" + + #: Reported when a repository could only be found by checking the PyPI registry JSON. + FOUND_FROM_PYPI = "Found from PyPI" #: Default value. Reported if the Repo Finder was not called. E.g. Because the repository URL was already present. NOT_USED = "Not used" diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py new file mode 100644 index 000000000..40c042415 --- /dev/null +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -0,0 +1,70 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""This module contains the logic for finding repositories of PyPI projects.""" +import logging +import urllib.parse + +from packageurl import PackageURL + +from macaron.errors import InvalidHTTPResponseError +from macaron.json_tools import json_extract +from macaron.repo_finder.repo_finder_enums import RepoFinderInfo + +logger: logging.Logger = logging.getLogger(__name__) + + +def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: + """Retrieve the repository URL that matches the given PyPI PURL. + + Parameters + ---------- + purl : PackageURL + The parsed PURL to convert to the repository path. + + Returns + ------- + tuple[str, RepoFinderOutcome] : + The repository URL for the passed package, if found, and the outcome to report. + """ + # TODO solve circular dependency + from macaron.slsa_analyzer.package_registry import PyPIRegistry # pylint: disable=import-outside-toplevel + + pypi_registry = PyPIRegistry() + pypi_registry.load_defaults() + json_endpoint = f"pypi/{purl.name}/json" + url = urllib.parse.urljoin(pypi_registry.registry_url, json_endpoint) + try: + json = pypi_registry.download_package_json(url) + except InvalidHTTPResponseError as error: + logger.debug(error) + # TODO improve accuracy of this outcome. + return "", RepoFinderInfo.PYPI_HTTP_ERROR + + url_dict = json_extract(json, ["info", "project_urls"], dict) + if not url_dict: + return "", RepoFinderInfo.PYPI_JSON_ERROR + + for url_key in url_dict: + url = url_dict[url_key] + parsed_url = urllib.parse.urlparse(url) + if not parsed_url.hostname: + continue + if not parsed_url.hostname.lower() == "github.com": + continue + split_path = parsed_url.path.split("/") + if not split_path or len(split_path) < 3: + continue + # Fix the URL so that it is the base GitHub URL. E.g. github.com/{owner}/{repo} + fixed_url = urllib.parse.ParseResult( + scheme=parsed_url.scheme, + netloc=parsed_url.netloc, + path=f"{split_path[1]}/{split_path[2]}", + params=parsed_url.params, + query=parsed_url.query, + fragment=parsed_url.fragment, + ).geturl() + logger.debug("Found repository URL from PyPI: %s", fixed_url) + return fixed_url, RepoFinderInfo.FOUND_FROM_PYPI + + return "", RepoFinderInfo.PYPI_NO_URLS From 0c697f0debda03038c38259b7605ce1b580f814b Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Tue, 11 Feb 2025 08:20:04 +1000 Subject: [PATCH 2/9] chore: avoid circular dependency Signed-off-by: Ben Selwyn-Smith --- src/macaron/slsa_analyzer/analyzer.py | 2 +- .../package_registry/jfrog_maven_registry.py | 29 +--------------- .../maven_central_registry.py | 27 +-------------- .../package_registry/npm_registry.py | 34 ++----------------- .../package_registry/package_registry.py | 13 ++++--- .../package_registry/pypi_registry.py | 26 +------------- 6 files changed, 14 insertions(+), 117 deletions(-) diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index fb3adb33b..34bd5bb62 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -986,7 +986,7 @@ def _determine_package_registries(self, analyze_ctx: AnalyzeContext) -> None: ) for package_registry in PACKAGE_REGISTRIES: for build_tool in build_tools: - if package_registry.is_detected(build_tool): + if package_registry.is_detected(build_tool.name): analyze_ctx.dynamic_data["package_registries"].append( PackageRegistryInfo( build_tool=build_tool, diff --git a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py index f7c8c1d00..881b374d7 100644 --- a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py @@ -17,9 +17,6 @@ from macaron.config.defaults import defaults from macaron.errors import ConfigurationError from macaron.json_tools import JsonType -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool -from macaron.slsa_analyzer.build_tool.gradle import Gradle -from macaron.slsa_analyzer.build_tool.maven import Maven from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry logger: logging.Logger = logging.getLogger(__name__) @@ -126,6 +123,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.download_timeout = download_timeout or 120 self.enabled = enabled or False + self.build_tool_names = {"maven", "gradle"} super().__init__("JFrog Maven Registry") def load_defaults(self) -> None: @@ -173,31 +171,6 @@ def load_defaults(self) -> None: self.enabled = True - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts of the repo under analysis can possibly be published to this package registry. - - The detection here is based on the repo's detected build tool. - If the package registry is compatible with the given build tool, it can be a - possible place where the artifacts produced from the repo are published. - - ``JFrogMavenRegistry`` is compatible with Maven and Gradle. - - Parameters - ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. - """ - if not self.enabled: - return False - compatible_build_tool_classes = [Maven, Gradle] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def fetch_artifact_ids(self, group_id: str) -> list[str]: """Get all artifact ids under a group id. diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py index bdafc14eb..8372fb8e3 100644 --- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py @@ -12,9 +12,6 @@ from macaron.config.defaults import defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool -from macaron.slsa_analyzer.build_tool.gradle import Gradle -from macaron.slsa_analyzer.build_tool.maven import Maven from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry from macaron.util import send_get_http_raw @@ -108,6 +105,7 @@ def __init__( self.registry_url_scheme = registry_url_scheme or "" self.registry_url = "" # Created from the registry_url_scheme and registry_url_netloc. self.request_timeout = request_timeout or 10 + self.build_tool_names = {"maven", "gradle"} super().__init__("Maven Central Registry") def load_defaults(self) -> None: @@ -159,29 +157,6 @@ def load_defaults(self) -> None: f"of the .ini configuration file is invalid: {error}", ) from error - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts of the repo under analysis can possibly be published to this package registry. - - The detection here is based on the repo's detected build tools. - If the package registry is compatible with the given build tools, it can be a - possible place where the artifacts produced from the repo are published. - - ``MavenCentralRegistry`` is compatible with Maven and Gradle. - - Parameters - ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. - """ - compatible_build_tool_classes = [Maven, Gradle] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def find_publish_timestamp(self, purl: str) -> datetime: """Make a search request to Maven Central to find the publishing timestamp of an artifact. diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py index d4c97d143..d90fdb062 100644 --- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """The module provides abstractions for the npm package registry.""" @@ -12,9 +12,6 @@ from macaron.config.defaults import defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool -from macaron.slsa_analyzer.build_tool.npm import NPM -from macaron.slsa_analyzer.build_tool.yarn import Yarn from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry from macaron.util import send_get_http_raw @@ -53,6 +50,7 @@ def __init__( self.attestation_endpoint = attestation_endpoint or "" self.request_timeout = request_timeout or 10 self.enabled = enabled + self.build_tool_names = {"npm", "yarn"} super().__init__("npm Registry") def load_defaults(self) -> None: @@ -95,34 +93,6 @@ def load_defaults(self) -> None: f"of the .ini configuration file is invalid: {error}", ) from error - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts under analysis can be published to this package registry. - - The detection here is based on the repo's detected build tools. - If the package registry is compatible with the given build tools, it can be a - possible place where the artifacts are published. - - ``NPMRegistry`` is compatible with npm and Yarn build tools. - - Note: if the npm registry is disabled through the ini configuration, this method returns False. - - Parameters - ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. - """ - if not self.enabled: - logger.debug("Support for the npm registry is disabled.") - return False - compatible_build_tool_classes = [NPM, Yarn] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def download_attestation_payload(self, url: str, download_path: str) -> bool: """Download the npm attestation from npm registry. diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index 146958252..fd943fb3d 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -9,7 +9,6 @@ from macaron.errors import InvalidHTTPResponseError from macaron.json_tools import json_extract -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService logger: logging.Logger = logging.getLogger(__name__) @@ -20,13 +19,14 @@ class PackageRegistry(ABC): def __init__(self, name: str) -> None: self.name = name + self.build_tool_names: set[str] = set() + self.enabled: bool = True @abstractmethod def load_defaults(self) -> None: """Load the .ini configuration for the current package registry.""" - @abstractmethod - def is_detected(self, build_tool: BaseBuildTool) -> bool: + def is_detected(self, build_tool_name: str) -> bool: """Detect if artifacts of the repo under analysis can possibly be published to this package registry. The detection here is based on the repo's detected build tool. @@ -35,8 +35,8 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool: Parameters ---------- - build_tool : BaseBuildTool - A detected build tool of the repository under analysis. + build_tool_name: str + The name of a detected build tool of the repository under analysis. Returns ------- @@ -44,6 +44,9 @@ def is_detected(self, build_tool: BaseBuildTool) -> bool: ``True`` if the repo under analysis can be published to this package registry, based on the given build tool. """ + if not self.enabled: + return False + return build_tool_name in self.build_tool_names def find_publish_timestamp(self, purl: str) -> datetime: """Retrieve the publication timestamp for a package specified by its purl from the deps.dev repository by default. diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 9636ccea7..18daeb0ee 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -21,8 +21,6 @@ from macaron.errors import ConfigurationError, InvalidHTTPResponseError from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime -from macaron.slsa_analyzer.build_tool import Pip, Poetry -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry from macaron.util import send_get_http_raw @@ -75,6 +73,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.enabled = enabled self.registry_url = "" + self.build_tool_names = {"pip", "poetry"} super().__init__("PyPI Registry") def load_defaults(self) -> None: @@ -129,29 +128,6 @@ def load_defaults(self) -> None: f"of the .ini configuration file is invalid: {error}", ) from error - def is_detected(self, build_tool: BaseBuildTool) -> bool: - """Detect if artifacts of the repo under analysis can possibly be published to this package registry. - - The detection here is based on the repo's detected build tools. - If the package registry is compatible with the given build tools, it can be a - possible place where the artifacts produced from the repo are published. - - ``PyPIRegistry`` is compatible with Pip and Poetry. - - Parameters - ---------- - build_tool: BaseBuildTool - A detected build tool of the repository under analysis. - - Returns - ------- - bool - ``True`` if the repo under analysis can be published to this package registry, - based on the given build tool. - """ - compatible_build_tool_classes = [Pip, Poetry] - return any(isinstance(build_tool, build_tool_class) for build_tool_class in compatible_build_tool_classes) - def download_package_json(self, url: str) -> dict: """Download the package JSON metadata from pypi registry. From 8b7b3afca3ff28aff76af0f9dccb27ea4c5eb90f Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Tue, 11 Feb 2025 08:21:23 +1000 Subject: [PATCH 3/9] chore: add alternative find repo for latest purl version also Signed-off-by: Ben Selwyn-Smith --- src/macaron/repo_finder/repo_finder.py | 24 +++++++++++++++++---- src/macaron/repo_finder/repo_finder_pypi.py | 11 +++++----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index 6c16ac022..796b3a21b 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -103,10 +103,8 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, logger.debug("Analyzing %s with Repo Finder: %s", purl, type(repo_finder)) found_repo, outcome = repo_finder.find_repo(purl) - if not found_repo and purl.type == "pypi": - found_repo, outcome = repo_finder_pypi.find_repo(purl) - if not found_repo: - logger.debug("Could not find repository from PyPI registry for PURL: %s", purl) + if not found_repo: + found_repo, outcome = find_repo_alternative(purl, outcome) if found_repo or not check_latest_version: return found_repo, outcome @@ -119,6 +117,12 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, return "", RepoFinderInfo.NO_NEWER_VERSION found_repo, outcome = DepsDevRepoFinder().find_repo(latest_version_purl) + if found_repo: + return found_repo, outcome + + if not found_repo: + found_repo, outcome = find_repo_alternative(latest_version_purl, outcome) + if not found_repo: logger.debug("Could not find repo from latest version of PURL: %s", latest_version_purl) return "", RepoFinderInfo.LATEST_VERSION_INVALID @@ -126,6 +130,18 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, return found_repo, outcome +def find_repo_alternative(purl: PackageURL, outcome: RepoFinderInfo) -> tuple[str, RepoFinderInfo]: + """Use PURL type specific methods to find the repository when the standard methods have failed.""" + found_repo = "" + if purl.type == "pypi": + found_repo, outcome = repo_finder_pypi.find_repo(purl) + + if not found_repo: + logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl) + + return found_repo, outcome + + def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None: """Return the repository path from the PURL string. diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index 40c042415..70722310f 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -10,6 +10,7 @@ from macaron.errors import InvalidHTTPResponseError from macaron.json_tools import json_extract from macaron.repo_finder.repo_finder_enums import RepoFinderInfo +from macaron.slsa_analyzer.package_registry import PyPIRegistry logger: logging.Logger = logging.getLogger(__name__) @@ -27,9 +28,6 @@ def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: tuple[str, RepoFinderOutcome] : The repository URL for the passed package, if found, and the outcome to report. """ - # TODO solve circular dependency - from macaron.slsa_analyzer.package_registry import PyPIRegistry # pylint: disable=import-outside-toplevel - pypi_registry = PyPIRegistry() pypi_registry.load_defaults() json_endpoint = f"pypi/{purl.name}/json" @@ -52,14 +50,15 @@ def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: continue if not parsed_url.hostname.lower() == "github.com": continue - split_path = parsed_url.path.split("/") - if not split_path or len(split_path) < 3: + # The path starts with a "/". + split_path = parsed_url.path[1:].split("/") + if not split_path or len(split_path) < 2: continue # Fix the URL so that it is the base GitHub URL. E.g. github.com/{owner}/{repo} fixed_url = urllib.parse.ParseResult( scheme=parsed_url.scheme, netloc=parsed_url.netloc, - path=f"{split_path[1]}/{split_path[2]}", + path=f"{split_path[0]}/{split_path[1]}", params=parsed_url.params, query=parsed_url.query, fragment=parsed_url.fragment, From ff15d41c20dfeb021d4f91947e8ceb387970c6eb Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Tue, 11 Feb 2025 08:50:53 +1000 Subject: [PATCH 4/9] chore: add integration test Signed-off-by: Ben Selwyn-Smith --- .../cases/repo_finder_pypi/policy.dl | 10 ++++++++++ .../cases/repo_finder_pypi/test.yaml | 20 +++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 tests/integration/cases/repo_finder_pypi/policy.dl create mode 100644 tests/integration/cases/repo_finder_pypi/test.yaml diff --git a/tests/integration/cases/repo_finder_pypi/policy.dl b/tests/integration/cases/repo_finder_pypi/policy.dl new file mode 100644 index 000000000..38b2dd9f4 --- /dev/null +++ b/tests/integration/cases/repo_finder_pypi/policy.dl @@ -0,0 +1,10 @@ +/* Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. */ +/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */ + +#include "prelude.dl" + +Policy("test_policy", component_id, "") :- + check_passed(component_id, "mcn_version_control_system_1"). + +apply_policy_to("test_policy", component_id) :- + is_component(component_id, "pkg:pypi/torch@2.6.0"). diff --git a/tests/integration/cases/repo_finder_pypi/test.yaml b/tests/integration/cases/repo_finder_pypi/test.yaml new file mode 100644 index 000000000..d3cf1c557 --- /dev/null +++ b/tests/integration/cases/repo_finder_pypi/test.yaml @@ -0,0 +1,20 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +description: | + Analyzing a PyPI PURL that is not correctly found by deps.dev and must be sought on the package registry directly. + +tags: +- macaron-python-package + +steps: +- name: Run macaron analyze + kind: analyze + options: + command_args: + - -purl + - pkg:pypi/torch@2.6.0 +- name: Run macaron verify-policy to verify passed/failed checks + kind: verify + options: + policy: policy.dl From 1811253c542abe65171aab86db56b47fbb1dd754 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Tue, 11 Feb 2025 08:57:37 +1000 Subject: [PATCH 5/9] chore: update tests Signed-off-by: Ben Selwyn-Smith --- .../package_registry/test_jfrog_maven_registry.py | 6 +++--- .../package_registry/test_maven_central_registry.py | 6 ++---- .../slsa_analyzer/package_registry/test_npm_registry.py | 9 +++------ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py b/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py index ebb960366..ef7276dcf 100644 --- a/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py +++ b/tests/slsa_analyzer/package_registry/test_jfrog_maven_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Tests for the ``JFrogMavenRegistry`` class.""" @@ -129,12 +129,12 @@ def test_is_detected( expected_result: bool, ) -> None: """Test the ``is_detected`` method.""" - assert jfrog_maven.is_detected(build_tool) == expected_result + assert jfrog_maven.is_detected(build_tool.name) == expected_result # The method always returns False when the jfrog_maven instance is not enabled # (in the ini config). jfrog_maven.enabled = False - assert jfrog_maven.is_detected(build_tool) is False + assert jfrog_maven.is_detected(build_tool.name) is False @pytest.mark.parametrize( diff --git a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py index 8a0287b36..62b9fdca0 100644 --- a/tests/slsa_analyzer/package_registry/test_maven_central_registry.py +++ b/tests/slsa_analyzer/package_registry/test_maven_central_registry.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Tests for the Maven Central registry.""" @@ -14,7 +14,6 @@ from macaron.config.defaults import load_defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry.maven_central_registry import MavenCentralRegistry @@ -124,12 +123,11 @@ def test_load_defaults_with_invalid_config(tmp_path: Path, user_config_input: st ) def test_is_detected( maven_central: MavenCentralRegistry, - build_tools: dict[str, BaseBuildTool], build_tool_name: str, expected_result: bool, ) -> None: """Test the ``is_detected`` method.""" - assert maven_central.is_detected(build_tools[build_tool_name]) == expected_result + assert maven_central.is_detected(build_tool_name) == expected_result @pytest.mark.parametrize( diff --git a/tests/slsa_analyzer/package_registry/test_npm_registry.py b/tests/slsa_analyzer/package_registry/test_npm_registry.py index a6cadb4ba..a180ea78b 100644 --- a/tests/slsa_analyzer/package_registry/test_npm_registry.py +++ b/tests/slsa_analyzer/package_registry/test_npm_registry.py @@ -13,7 +13,6 @@ from macaron.config.defaults import load_defaults from macaron.errors import ConfigurationError, InvalidHTTPResponseError -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.build_tool.npm import NPM from macaron.slsa_analyzer.package_registry.npm_registry import NPMAttestationAsset, NPMRegistry @@ -45,7 +44,7 @@ def test_disable_npm_registry(npm_registry: NPMRegistry, tmp_path: Path, npm_too npm_registry.load_defaults() assert npm_registry.enabled is False - assert npm_registry.is_detected(build_tool=npm_tool) is False + assert npm_registry.is_detected(npm_tool.name) is False @pytest.mark.parametrize( @@ -87,12 +86,10 @@ def test_npm_registry_invalid_config(npm_registry: NPMRegistry, tmp_path: Path, ("maven", False), ], ) -def test_is_detected( - npm_registry: NPMRegistry, build_tools: dict[str, BaseBuildTool], build_tool_name: str, expected: bool -) -> None: +def test_is_detected(npm_registry: NPMRegistry, build_tool_name: str, expected: bool) -> None: """Test that the registry is correctly detected for a build tool.""" npm_registry.load_defaults() - assert npm_registry.is_detected(build_tool=build_tools[build_tool_name]) == expected + assert npm_registry.is_detected(build_tool_name) == expected @pytest.mark.parametrize( From 165f61101848421feeda4fcc73791223fb058261 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Tue, 11 Feb 2025 09:04:38 +1000 Subject: [PATCH 6/9] chore: pass build tool names to super class Signed-off-by: Ben Selwyn-Smith --- .../slsa_analyzer/package_registry/jfrog_maven_registry.py | 3 +-- .../package_registry/maven_central_registry.py | 3 +-- src/macaron/slsa_analyzer/package_registry/npm_registry.py | 3 +-- .../slsa_analyzer/package_registry/package_registry.py | 7 +++++-- .../slsa_analyzer/package_registry/pypi_registry.py | 3 +-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py index 881b374d7..aaac195c8 100644 --- a/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/jfrog_maven_registry.py @@ -123,8 +123,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.download_timeout = download_timeout or 120 self.enabled = enabled or False - self.build_tool_names = {"maven", "gradle"} - super().__init__("JFrog Maven Registry") + super().__init__("JFrog Maven Registry", {"maven", "gradle"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. diff --git a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py index 8372fb8e3..238087a88 100644 --- a/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/maven_central_registry.py @@ -105,8 +105,7 @@ def __init__( self.registry_url_scheme = registry_url_scheme or "" self.registry_url = "" # Created from the registry_url_scheme and registry_url_netloc. self.request_timeout = request_timeout or 10 - self.build_tool_names = {"maven", "gradle"} - super().__init__("Maven Central Registry") + super().__init__("Maven Central Registry", {"maven", "gradle"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. diff --git a/src/macaron/slsa_analyzer/package_registry/npm_registry.py b/src/macaron/slsa_analyzer/package_registry/npm_registry.py index d90fdb062..6f5063e89 100644 --- a/src/macaron/slsa_analyzer/package_registry/npm_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/npm_registry.py @@ -50,8 +50,7 @@ def __init__( self.attestation_endpoint = attestation_endpoint or "" self.request_timeout = request_timeout or 10 self.enabled = enabled - self.build_tool_names = {"npm", "yarn"} - super().__init__("npm Registry") + super().__init__("npm Registry", {"npm", "yarn"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index fd943fb3d..7fbbf4258 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -17,9 +17,9 @@ class PackageRegistry(ABC): """Base package registry class.""" - def __init__(self, name: str) -> None: + def __init__(self, name: str, build_tool_names: set[str]) -> None: self.name = name - self.build_tool_names: set[str] = set() + self.build_tool_names = build_tool_names self.enabled: bool = True @abstractmethod @@ -44,6 +44,9 @@ def is_detected(self, build_tool_name: str) -> bool: ``True`` if the repo under analysis can be published to this package registry, based on the given build tool. """ + print() + print(f"{build_tool_name} in {self.build_tool_names} ?") + print() if not self.enabled: return False return build_tool_name in self.build_tool_names diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 18daeb0ee..95429acb0 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -73,8 +73,7 @@ def __init__( self.request_timeout = request_timeout or 10 self.enabled = enabled self.registry_url = "" - self.build_tool_names = {"pip", "poetry"} - super().__init__("PyPI Registry") + super().__init__("PyPI Registry", {"pip", "poetry"}) def load_defaults(self) -> None: """Load the .ini configuration for the current package registry. From 164bad8e4100ae452e50be700dac43c0a38cb202 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Thu, 13 Feb 2025 14:52:39 +1000 Subject: [PATCH 7/9] chore: reuse PyPI JSON asset Signed-off-by: Ben Selwyn-Smith --- .../metadata/closer_release_join_date.py | 4 +- src/macaron/repo_finder/repo_finder.py | 32 +++++++++-- src/macaron/repo_finder/repo_finder_enums.py | 3 + src/macaron/repo_finder/repo_finder_pypi.py | 42 +++++++++----- src/macaron/slsa_analyzer/analyzer.py | 57 ++++++++++++++----- .../checks/detect_malicious_metadata_check.py | 19 +++++-- .../package_registry/package_registry.py | 3 - .../package_registry/pypi_registry.py | 14 +++-- .../specs/package_registry_spec.py | 5 +- 9 files changed, 128 insertions(+), 51 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py index 83333f3c9..fd556c6de 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Analyzer checks whether the maintainers' join date closer to latest package's release date.""" @@ -95,7 +95,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes The result and related information collected during the analysis. """ maintainers_join_date: list[datetime] | None = self._get_maintainers_join_date( - pypi_package_json.pypi_registry, pypi_package_json.component.name + pypi_package_json.pypi_registry, pypi_package_json.component_name ) latest_release_date: datetime | None = self._get_latest_release_date(pypi_package_json) detail_info: dict[str, JsonType] = { diff --git a/src/macaron/repo_finder/repo_finder.py b/src/macaron/repo_finder/repo_finder.py index 796b3a21b..e1443518f 100644 --- a/src/macaron/repo_finder/repo_finder.py +++ b/src/macaron/repo_finder/repo_finder.py @@ -66,11 +66,14 @@ list_remote_references, resolve_local_path, ) +from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo logger: logging.Logger = logging.getLogger(__name__) -def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, RepoFinderInfo]: +def find_repo( + purl: PackageURL, check_latest_version: bool = True, all_package_registries: list[PackageRegistryInfo] | None = None +) -> tuple[str, RepoFinderInfo]: """Retrieve the repository URL that matches the given PURL. Parameters @@ -79,6 +82,8 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, The parsed PURL to convert to the repository path. check_latest_version: bool A flag that determines whether the latest version of the PURL is also checked. + all_package_registries: list[PackageRegistryInfo] | None + The list of package registries, if any. Returns ------- @@ -104,7 +109,7 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, found_repo, outcome = repo_finder.find_repo(purl) if not found_repo: - found_repo, outcome = find_repo_alternative(purl, outcome) + found_repo, outcome = find_repo_alternative(purl, outcome, all_package_registries) if found_repo or not check_latest_version: return found_repo, outcome @@ -130,11 +135,28 @@ def find_repo(purl: PackageURL, check_latest_version: bool = True) -> tuple[str, return found_repo, outcome -def find_repo_alternative(purl: PackageURL, outcome: RepoFinderInfo) -> tuple[str, RepoFinderInfo]: - """Use PURL type specific methods to find the repository when the standard methods have failed.""" +def find_repo_alternative( + purl: PackageURL, outcome: RepoFinderInfo, all_package_registries: list[PackageRegistryInfo] | None = None +) -> tuple[str, RepoFinderInfo]: + """Use PURL type specific methods to find the repository when the standard methods have failed. + + Parameters + ---------- + purl : PackageURL + The parsed PURL to convert to the repository path. + outcome: RepoFinderInfo + A previous outcome to report if this method does nothing. + all_package_registries: list[PackageRegistryInfo] | None + The list of package registries, if any. + + Returns + ------- + tuple[str, RepoFinderOutcome] : + The repository URL for the passed package, if found, and the outcome to report. + """ found_repo = "" if purl.type == "pypi": - found_repo, outcome = repo_finder_pypi.find_repo(purl) + found_repo, outcome = repo_finder_pypi.find_repo(purl, all_package_registries) if not found_repo: logger.debug("Could not find repository using type specific (%s) methods for PURL: %s", purl.type, purl) diff --git a/src/macaron/repo_finder/repo_finder_enums.py b/src/macaron/repo_finder/repo_finder_enums.py index 7dff875c6..43e8d5e8b 100644 --- a/src/macaron/repo_finder/repo_finder_enums.py +++ b/src/macaron/repo_finder/repo_finder_enums.py @@ -66,6 +66,9 @@ class RepoFinderInfo(Enum): #: Reported if there was no matching URLs in the JSON returned by the PyPI registry. PYPI_NO_URLS = "PyPI no matching URLs" + #: Reported if the PyPI registry is disabled or not present in the list of package registries. + PYPI_NO_REGISTRY = "PyPI registry disabled or absent" + #: Reported if the provided PURL did not produce a result, but a more recent version could not be found. NO_NEWER_VERSION = "No newer version than provided which failed" diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index 70722310f..537e3297d 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -7,39 +7,55 @@ from packageurl import PackageURL -from macaron.errors import InvalidHTTPResponseError -from macaron.json_tools import json_extract from macaron.repo_finder.repo_finder_enums import RepoFinderInfo -from macaron.slsa_analyzer.package_registry import PyPIRegistry +from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, PyPIRegistry +from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset +from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo logger: logging.Logger = logging.getLogger(__name__) -def find_repo(purl: PackageURL) -> tuple[str, RepoFinderInfo]: +def find_repo( + purl: PackageURL, all_package_registries: list[PackageRegistryInfo] | None = None +) -> tuple[str, RepoFinderInfo]: """Retrieve the repository URL that matches the given PyPI PURL. Parameters ---------- purl : PackageURL The parsed PURL to convert to the repository path. + all_package_registries: list[PackageRegistryInfo] | None + The context of the current analysis, if any. Returns ------- tuple[str, RepoFinderOutcome] : The repository URL for the passed package, if found, and the outcome to report. """ - pypi_registry = PyPIRegistry() + pypi_registry = next((registry for registry in PACKAGE_REGISTRIES if isinstance(registry, PyPIRegistry)), None) + if not pypi_registry: + return "", RepoFinderInfo.PYPI_NO_REGISTRY + pypi_registry.load_defaults() - json_endpoint = f"pypi/{purl.name}/json" - url = urllib.parse.urljoin(pypi_registry.registry_url, json_endpoint) - try: - json = pypi_registry.download_package_json(url) - except InvalidHTTPResponseError as error: - logger.debug(error) - # TODO improve accuracy of this outcome. + pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, pypi_registry, {}) + if not pypi_asset.download(dest=""): return "", RepoFinderInfo.PYPI_HTTP_ERROR - url_dict = json_extract(json, ["info", "project_urls"], dict) + if all_package_registries: + # Find the package registry info object that contains the PyPI registry and has the pypi build tool. + registry_info = next( + ( + info + for info in all_package_registries + if info.package_registry == pypi_registry and info.build_tool_name == "pypi" + ), + None, + ) + if registry_info: + # Save the asset for later use. + registry_info.metadata.append(pypi_asset) + + url_dict = pypi_asset.get_project_links() if not url_dict: return "", RepoFinderInfo.PYPI_JSON_ERROR diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index 34bd5bb62..7d9d35312 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -332,6 +332,9 @@ def run_single( status=SCMStatus.ANALYSIS_FAILED, ) + # Pre-populate all package registries so assets can be stored for later. + all_package_registries = self._populate_package_registry_info() + provenance_is_verified = False if not provenance_payload and parsed_purl: # Try to find the provenance file for the parsed PURL. @@ -367,7 +370,12 @@ def run_single( available_domains = [git_service.hostname for git_service in GIT_SERVICES if git_service.hostname] try: analysis_target = Analyzer.to_analysis_target( - config, available_domains, parsed_purl, provenance_repo_url, provenance_commit_digest + config, + available_domains, + parsed_purl, + provenance_repo_url, + provenance_commit_digest, + all_package_registries, ) except InvalidAnalysisTargetError as error: return Record( @@ -459,7 +467,7 @@ def run_single( self._determine_build_tools(analyze_ctx, git_service) if parsed_purl is not None: self._verify_repository_link(parsed_purl, analyze_ctx) - self._determine_package_registries(analyze_ctx) + self._determine_package_registries(analyze_ctx, all_package_registries) if not provenance_payload: # Look for provenance using the CI. @@ -767,6 +775,7 @@ def to_analysis_target( parsed_purl: PackageURL | None, provenance_repo_url: str | None = None, provenance_commit_digest: str | None = None, + all_package_registries: list[PackageRegistryInfo] | None = None, ) -> AnalysisTarget: """Resolve the details of a software component from user input. @@ -783,6 +792,8 @@ def to_analysis_target( The repository URL extracted from provenance, or None if not found or no provenance. provenance_commit_digest: str | None The commit extracted from provenance, or None if not found or no provenance. + all_package_registries: list[PackageRegistryInfo] | None + The list of all package registries. Returns ------- @@ -825,7 +836,9 @@ def to_analysis_target( converted_repo_path = repo_finder.to_repo_path(parsed_purl, available_domains) if converted_repo_path is None: # Try to find repo from PURL - repo, repo_finder_outcome = repo_finder.find_repo(parsed_purl) + repo, repo_finder_outcome = repo_finder.find_repo( + parsed_purl, all_package_registries=all_package_registries + ) return Analyzer.AnalysisTarget( parsed_purl=parsed_purl, @@ -979,20 +992,38 @@ def _determine_ci_services(self, analyze_ctx: AnalyzeContext, git_service: BaseG ) ) - def _determine_package_registries(self, analyze_ctx: AnalyzeContext) -> None: + def _populate_package_registry_info(self) -> list[PackageRegistryInfo]: + """Add all possible package registries to the analysis context.""" + package_registries = [] + for package_registry in PACKAGE_REGISTRIES: + for build_tool in BUILD_TOOLS: + build_tool_name = build_tool.name + if build_tool_name not in package_registry.build_tool_names: + continue + package_registries.append( + PackageRegistryInfo( + build_tool_name=build_tool_name, + package_registry=package_registry, + ) + ) + return package_registries + + def _determine_package_registries( + self, analyze_ctx: AnalyzeContext, all_package_registries: list[PackageRegistryInfo] + ) -> None: """Determine the package registries used by the software component based on its build tools.""" build_tools = ( analyze_ctx.dynamic_data["build_spec"]["tools"] or analyze_ctx.dynamic_data["build_spec"]["purl_tools"] ) - for package_registry in PACKAGE_REGISTRIES: - for build_tool in build_tools: - if package_registry.is_detected(build_tool.name): - analyze_ctx.dynamic_data["package_registries"].append( - PackageRegistryInfo( - build_tool=build_tool, - package_registry=package_registry, - ) - ) + build_tool_names = {build_tool.name for build_tool in build_tools} + relevant_package_registries = [] + for package_registry in all_package_registries: + if package_registry.build_tool_name not in build_tool_names: + continue + relevant_package_registries.append(package_registry) + + # Assign the updated list of registries. + analyze_ctx.dynamic_data["package_registries"] = relevant_package_registries def _verify_repository_link(self, parsed_purl: PackageURL, analyze_ctx: AnalyzeContext) -> None: """Verify whether the claimed repository links back to the artifact.""" diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 0e2fe0039..57111888d 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -26,8 +26,6 @@ from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer from macaron.slsa_analyzer.analyze_context import AnalyzeContext -from macaron.slsa_analyzer.build_tool.pip import Pip -from macaron.slsa_analyzer.build_tool.poetry import Poetry from macaron.slsa_analyzer.checks.base_check import BaseCheck from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService @@ -400,14 +398,23 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: match package_registry_info_entry: # Currently, only PyPI packages are supported. case PackageRegistryInfo( - build_tool=Pip() | Poetry(), + build_tool_name="pip" | "poetry", package_registry=PyPIRegistry() as pypi_registry, ) as pypi_registry_info: - # Create an AssetLocator object for the PyPI package JSON object. - pypi_package_json = PyPIPackageJsonAsset( - component=ctx.component, pypi_registry=pypi_registry, package_json={} + # Retrieve the pre-existing AssetLocator object for the PyPI package JSON object, if it exists. + pypi_package_json = next( + (asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)), + None, ) + if not pypi_package_json: + # Create an AssetLocator object for the PyPI package JSON object. + pypi_package_json = PyPIPackageJsonAsset( + component_name=ctx.component.name, + component_version=ctx.component.version, + pypi_registry=pypi_registry, + package_json={}, + ) pypi_registry_info.metadata.append(pypi_package_json) diff --git a/src/macaron/slsa_analyzer/package_registry/package_registry.py b/src/macaron/slsa_analyzer/package_registry/package_registry.py index 7fbbf4258..9e71fc595 100644 --- a/src/macaron/slsa_analyzer/package_registry/package_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/package_registry.py @@ -44,9 +44,6 @@ def is_detected(self, build_tool_name: str) -> bool: ``True`` if the repo under analysis can be published to this package registry, based on the given build tool. """ - print() - print(f"{build_tool_name} in {self.build_tool_names} ?") - print() if not self.enabled: return False return build_tool_name in self.build_tool_names diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 95429acb0..fbbf87f83 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -17,7 +17,6 @@ from requests import RequestException from macaron.config.defaults import defaults -from macaron.database.table_definitions import Component from macaron.errors import ConfigurationError, InvalidHTTPResponseError from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime @@ -341,8 +340,11 @@ def get_maintainer_join_date(self, username: str) -> datetime | None: class PyPIPackageJsonAsset: """The package JSON hosted on the PyPI registry.""" - #: The target pypi software component. - component: Component + #: The target pypi software component name. + component_name: str + + #: The target pypi software component version. + component_version: str | None #: The pypi registry. pypi_registry: PyPIRegistry @@ -372,7 +374,7 @@ def url(self) -> str: ------- str """ - json_endpoint = f"pypi/{self.component.name}/json" + json_endpoint = f"pypi/{self.component_name}/json" return urllib.parse.urljoin(self.pypi_registry.registry_url, json_endpoint) def download(self, dest: str) -> bool: # pylint: disable=unused-argument @@ -434,8 +436,8 @@ def get_sourcecode_url(self) -> str | None: The URL of the source distribution. """ urls: list | None = None - if self.component.version: - urls = json_extract(self.package_json, ["releases", self.component.version], list) + if self.component_version: + urls = json_extract(self.package_json, ["releases", self.component_version], list) else: # Get the latest version. urls = json_extract(self.package_json, ["urls"], list) diff --git a/src/macaron/slsa_analyzer/specs/package_registry_spec.py b/src/macaron/slsa_analyzer/specs/package_registry_spec.py index e28d9c6d8..ecd91d2b8 100644 --- a/src/macaron/slsa_analyzer/specs/package_registry_spec.py +++ b/src/macaron/slsa_analyzer/specs/package_registry_spec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. @@ -7,7 +7,6 @@ from dataclasses import dataclass, field from macaron.slsa_analyzer.asset import AssetLocator -from macaron.slsa_analyzer.build_tool import BaseBuildTool from macaron.slsa_analyzer.package_registry import PackageRegistry from macaron.slsa_analyzer.provenance.provenance import DownloadedProvenanceData @@ -17,7 +16,7 @@ class PackageRegistryInfo: """This class contains data for one package registry that is matched against a repository.""" #: The build tool matched against the repository. - build_tool: BaseBuildTool + build_tool_name: str #: The package registry matched against the repository. This is dependent on the build tool detected. package_registry: PackageRegistry #: The provenances matched against the current repo. From cced14bb11de8c7740cfb75ddb797e50f376ec1d Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Thu, 13 Feb 2025 15:24:22 +1000 Subject: [PATCH 8/9] chore: update tests Signed-off-by: Ben Selwyn-Smith --- .../pypi_heuristics/metadata/wheel_absence.py | 2 +- .../malware_analyzer/pypi/test_closer_release_join_date.py | 5 ++++- tests/malware_analyzer/pypi/test_wheel_absence.py | 7 ++++--- .../checks/test_detect_malicious_metadata_check.py | 5 ++--- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py index 2a8217353..3a3033e22 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_absence.py @@ -61,7 +61,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes logger.debug(error_msg) raise HeuristicAnalyzerValueError(error_msg) - version = pypi_package_json.component.version + version = pypi_package_json.component_version if version is None: # check latest release version version = pypi_package_json.get_latest_version() diff --git a/tests/malware_analyzer/pypi/test_closer_release_join_date.py b/tests/malware_analyzer/pypi/test_closer_release_join_date.py index 4ed1a9b24..309574a21 100644 --- a/tests/malware_analyzer/pypi/test_closer_release_join_date.py +++ b/tests/malware_analyzer/pypi/test_closer_release_join_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Tests for closer release join date heuristic.""" @@ -17,6 +17,7 @@ def test_analyze_pass(pypi_package_json: MagicMock) -> None: pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = ["maintainer1", "maintainer2"] pypi_package_json.pypi_registry.get_maintainer_join_date.side_effect = [datetime(2018, 1, 1), datetime(2019, 1, 1)] pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00" + pypi_package_json.component_name = "mock1" # Call the method. result, detail_info = analyzer.analyze(pypi_package_json) @@ -35,6 +36,7 @@ def test_analyze_process(pypi_package_json: MagicMock) -> None: pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = ["maintainer1"] pypi_package_json.pypi_registry.get_maintainer_join_date.side_effect = [datetime(2022, 6, 18)] pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00" + pypi_package_json.component_name = "mock1" # Call the method. result, detail_info = analyzer.analyze(pypi_package_json) @@ -52,6 +54,7 @@ def test_analyze_skip(pypi_package_json: MagicMock) -> None: # Set up mock return values. pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = None pypi_package_json.get_latest_release_upload_time.return_value = "2022-06-20T12:00:00" + pypi_package_json.component_name = "mock1" # Call the method. result, detail_info = analyzer.analyze(pypi_package_json) diff --git a/tests/malware_analyzer/pypi/test_wheel_absence.py b/tests/malware_analyzer/pypi/test_wheel_absence.py index a2eebd554..3cfccfbe7 100644 --- a/tests/malware_analyzer/pypi/test_wheel_absence.py +++ b/tests/malware_analyzer/pypi/test_wheel_absence.py @@ -67,10 +67,11 @@ def test_analyze_tar_present(mock_send_head_http_raw: MagicMock, pypi_package_js pypi_package_json.get_releases.return_value = release pypi_package_json.get_latest_version.return_value = version - pypi_package_json.component.version = None + pypi_package_json.component_version = None pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}} pypi_package_json.pypi_registry.inspector_url_scheme = "https" pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io" + mock_send_head_http_raw.return_value = MagicMock() # assume valid URL for testing purposes expected_detail_info = { @@ -126,7 +127,7 @@ def test_analyze_whl_present(mock_send_head_http_raw: MagicMock, pypi_package_js } pypi_package_json.get_releases.return_value = release - pypi_package_json.component.version = version + pypi_package_json.component_version = version pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}} pypi_package_json.pypi_registry.inspector_url_scheme = "https" pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io" @@ -214,7 +215,7 @@ def test_analyze_both_present(mock_send_head_http_raw: MagicMock, pypi_package_j } pypi_package_json.get_releases.return_value = release - pypi_package_json.component.version = version + pypi_package_json.component_version = version pypi_package_json.package_json = {"info": {"name": "ttttttttest_nester"}} pypi_package_json.pypi_registry.inspector_url_scheme = "https" pypi_package_json.pypi_registry.inspector_url_netloc = "inspector.pypi.io" diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index c6ecb044d..8f15c636a 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -12,7 +12,6 @@ from pytest_httpserver import HTTPServer from macaron.config.defaults import load_defaults -from macaron.slsa_analyzer.build_tool.base_build_tool import BaseBuildTool from macaron.slsa_analyzer.checks.check_result import CheckResultType from macaron.slsa_analyzer.checks.detect_malicious_metadata_check import DetectMaliciousMetadataCheck from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIRegistry @@ -35,7 +34,7 @@ ], ) def test_detect_malicious_metadata( - httpserver: HTTPServer, tmp_path: Path, pip_tool: BaseBuildTool, macaron_path: Path, purl: str, expected: str + httpserver: HTTPServer, tmp_path: Path, macaron_path: Path, purl: str, expected: str ) -> None: """Test that the check handles repositories correctly.""" check = DetectMaliciousMetadataCheck() @@ -43,7 +42,7 @@ def test_detect_malicious_metadata( # Set up the context object with PyPIRegistry instance. ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl) pypi_registry = PyPIRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool, pypi_registry)] + ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", pypi_registry)] # Set up responses of PyPI endpoints using the httpserver plugin. with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page: From 553f0e97de5ca9cb555946feb1576119ad2096b7 Mon Sep 17 00:00:00 2001 From: Ben Selwyn-Smith Date: Thu, 13 Feb 2025 16:04:15 +1000 Subject: [PATCH 9/9] chore: add purl type to build tool in registry info Signed-off-by: Ben Selwyn-Smith --- src/macaron/slsa_analyzer/analyzer.py | 1 + .../checks/detect_malicious_metadata_check.py | 1 + .../checks/infer_artifact_pipeline_check.py | 4 ++-- .../slsa_analyzer/specs/package_registry_spec.py | 4 +++- .../test_detect_malicious_metadata_check.py | 2 +- .../checks/test_repo_verification_check.py | 16 +++++++++++----- 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index 7d9d35312..f992baefc 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -1003,6 +1003,7 @@ def _populate_package_registry_info(self) -> list[PackageRegistryInfo]: package_registries.append( PackageRegistryInfo( build_tool_name=build_tool_name, + build_tool_purl_type=build_tool.purl_type, package_registry=package_registry, ) ) diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 57111888d..ba13c28c5 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -399,6 +399,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # Currently, only PyPI packages are supported. case PackageRegistryInfo( build_tool_name="pip" | "poetry", + build_tool_purl_type="pypi", package_registry=PyPIRegistry() as pypi_registry, ) as pypi_registry_info: diff --git a/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py b/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py index 8902d6ef2..83fba089d 100644 --- a/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py +++ b/src/macaron/slsa_analyzer/checks/infer_artifact_pipeline_check.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2023 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """This module contains the InferArtifactPipelineCheck class to check if an artifact is published from a pipeline automatically.""" @@ -123,7 +123,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # Look for the artifact in the corresponding registry and find the publish timestamp. artifact_published_date = None for registry_info in ctx.dynamic_data["package_registries"]: - if registry_info.build_tool.purl_type == ctx.component.type: + if registry_info.build_tool_purl_type == ctx.component.type: try: artifact_published_date = registry_info.package_registry.find_publish_timestamp(ctx.component.purl) break diff --git a/src/macaron/slsa_analyzer/specs/package_registry_spec.py b/src/macaron/slsa_analyzer/specs/package_registry_spec.py index ecd91d2b8..84b2a69e7 100644 --- a/src/macaron/slsa_analyzer/specs/package_registry_spec.py +++ b/src/macaron/slsa_analyzer/specs/package_registry_spec.py @@ -15,8 +15,10 @@ class PackageRegistryInfo: """This class contains data for one package registry that is matched against a repository.""" - #: The build tool matched against the repository. + #: The name of the build tool matched against the repository. build_tool_name: str + #: The purl type of the build tool matched against the repository. + build_tool_purl_type: str #: The package registry matched against the repository. This is dependent on the build tool detected. package_registry: PackageRegistry #: The provenances matched against the current repo. diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index 8f15c636a..c4251ff66 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -42,7 +42,7 @@ def test_detect_malicious_metadata( # Set up the context object with PyPIRegistry instance. ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl) pypi_registry = PyPIRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", pypi_registry)] + ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", "pypi", pypi_registry)] # Set up responses of PyPI endpoints using the httpserver plugin. with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page: diff --git a/tests/slsa_analyzer/checks/test_repo_verification_check.py b/tests/slsa_analyzer/checks/test_repo_verification_check.py index f0f3dd923..dcc15af43 100644 --- a/tests/slsa_analyzer/checks/test_repo_verification_check.py +++ b/tests/slsa_analyzer/checks/test_repo_verification_check.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Module to test the repository verification check.""" @@ -23,7 +23,9 @@ def test_repo_verification_pass(maven_tool: BaseBuildTool, macaron_path: Path) - ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test") maven_registry = MavenCentralRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)] + ctx.dynamic_data["package_registries"] = [ + PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry) + ] ctx.dynamic_data["repo_verification"] = [ RepositoryVerificationResult( status=RepositoryVerificationStatus.PASSED, @@ -41,7 +43,9 @@ def test_repo_verification_fail(maven_tool: BaseBuildTool, macaron_path: Path) - ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test") maven_registry = MavenCentralRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)] + ctx.dynamic_data["package_registries"] = [ + PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry) + ] ctx.dynamic_data["repo_verification"] = [ RepositoryVerificationResult( status=RepositoryVerificationStatus.FAILED, @@ -59,7 +63,9 @@ def test_check_unknown_for_unknown_repo_verification(maven_tool: BaseBuildTool, ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:maven/test/test") maven_registry = MavenCentralRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(maven_tool, maven_registry)] + ctx.dynamic_data["package_registries"] = [ + PackageRegistryInfo(maven_tool.name, maven_tool.purl_type, maven_registry) + ] ctx.dynamic_data["repo_verification"] = [ RepositoryVerificationResult( status=RepositoryVerificationStatus.UNKNOWN, @@ -77,6 +83,6 @@ def test_check_unknown_for_unsupported_build_tools(pip_tool: BaseBuildTool, maca ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl="pkg:pypi/test/test") pypi_registry = PyPIRegistry() - ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool, pypi_registry)] + ctx.dynamic_data["package_registries"] = [PackageRegistryInfo(pip_tool.name, pip_tool.purl_type, pypi_registry)] assert check.run_check(ctx).result_type == CheckResultType.UNKNOWN