From 57aba12b2dc9fce15069b330a7e67d56c04e134e Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 10 Feb 2025 16:43:20 +1000 Subject: [PATCH] refactor: source code repo heuristic replacing unreachable project links --- .../pypi_heuristics/heuristics.py | 4 +- ...e_project_links.py => source_code_repo.py} | 35 ++---- .../checks/detect_malicious_metadata_check.py | 26 ++--- .../pypi/test_source_code_repo.py | 32 ++++++ ...test_unreachable_project_links_analyzer.py | 105 ------------------ 5 files changed, 58 insertions(+), 144 deletions(-) rename src/macaron/malware_analyzer/pypi_heuristics/metadata/{unreachable_project_links.py => source_code_repo.py} (53%) create mode 100644 tests/malware_analyzer/pypi/test_source_code_repo.py delete mode 100644 tests/malware_analyzer/pypi/test_unreachable_project_links_analyzer.py diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index 1bd724fad..bd829a0f1 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -12,8 +12,8 @@ class Heuristics(str, Enum): #: Indicates that the package does not contain any project links (such as documentation or Git repository pages). EMPTY_PROJECT_LINK = "empty_project_link" - #: Indicates that the package contains project links, but all of them are unreachable. - UNREACHABLE_PROJECT_LINKS = "unreachable_project_links" + #: Indicates that the source code repository for the package was not found. + SOURCE_CODE_REPO = "source_code_repo" #: Indicates that the package contains only one release. 
ONE_RELEASE = "one_release" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/unreachable_project_links.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py similarity index 53% rename from src/macaron/malware_analyzer/pypi_heuristics/metadata/unreachable_project_links.py rename to src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py index 8824c7a25..8d8c9619d 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/unreachable_project_links.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/source_code_repo.py @@ -1,12 +1,10 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. -"""The heuristic analyzer to check the project links.""" +"""The heuristic analyzer to check if a source code repo was found.""" import logging -import requests - from macaron.json_tools import JsonType from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics @@ -15,17 +13,17 @@ logger: logging.Logger = logging.getLogger(__name__) -class UnreachableProjectLinksAnalyzer(BaseHeuristicAnalyzer): +class SourceCodeRepoAnalyzer(BaseHeuristicAnalyzer): """ - Analyze the accessibility of the project links. + Check whether a source code repository was found for the package. - If >= 1 project links are reachable, the analyzer consider the package as benign. + Passes if a repository was found and validated by the repo finder, otherwise fails.
""" def __init__(self) -> None: super().__init__( - name="unreachable_project_links_analyzer", - heuristic=Heuristics.UNREACHABLE_PROJECT_LINKS, + name="source_code_repo_analyzer", + heuristic=Heuristics.SOURCE_CODE_REPO, depends_on=[(Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.PASS)], ) @@ -42,18 +40,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes tuple[HeuristicResult, dict[str, JsonType]]: The result and related information collected during the analysis. """ - project_links: dict | None = pypi_package_json.get_project_links() - - if project_links is None: - return HeuristicResult.SKIP, {} - - for link in project_links.values(): - try: - response = requests.head(link, timeout=3) - if response.status_code < 400: - return HeuristicResult.PASS, {} - except requests.exceptions.RequestException as error: - logger.debug(error) - continue - - return HeuristicResult.FAIL, {} + # If a sourcecode repo exists, then this will have already been validated + if not pypi_package_json.component.repository: + return HeuristicResult.FAIL, {} + return HeuristicResult.PASS, {} diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 0e2fe0039..040daca85 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -20,8 +20,8 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer +from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer -from 
macaron.malware_analyzer.pypi_heuristics.metadata.unreachable_project_links import UnreachableProjectLinksAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer @@ -69,7 +69,7 @@ class MaliciousMetadataFacts(CheckFacts): # When implementing new analyzer, appending the classes to this list ANALYZERS: list = [ EmptyProjectLinkAnalyzer, - UnreachableProjectLinksAnalyzer, + SourceCodeRepoAnalyzer, OneReleaseAnalyzer, HighReleaseFrequencyAnalyzer, UnchangedReleaseAnalyzer, @@ -97,7 +97,7 @@ class MaliciousMetadataFacts(CheckFacts): ] = { ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.FAIL, # One Release HeuristicResult.SKIP, # High Release Frequency HeuristicResult.SKIP, # Unchanged Release @@ -112,7 +112,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.FAIL, # One Release HeuristicResult.SKIP, # High Release Frequency HeuristicResult.SKIP, # Unchanged Release @@ -127,7 +127,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.PASS, # One Release HeuristicResult.FAIL, # High Release Frequency HeuristicResult.FAIL, # Unchanged Release @@ -141,7 +141,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.PASS, # One Release 
HeuristicResult.FAIL, # High Release Frequency HeuristicResult.PASS, # Unchanged Release @@ -155,7 +155,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.PASS, # One Release HeuristicResult.FAIL, # High Release Frequency HeuristicResult.FAIL, # Unchanged Release @@ -169,7 +169,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.MEDIUM, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.PASS, # One Release HeuristicResult.FAIL, # High Release Frequency HeuristicResult.FAIL, # Unchanged Release @@ -183,7 +183,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.MEDIUM, ( HeuristicResult.PASS, # Empty Project - HeuristicResult.FAIL, # Unreachable Project Links + HeuristicResult.FAIL, # Source Code Repo HeuristicResult.PASS, # One Release HeuristicResult.FAIL, # High Release Frequency HeuristicResult.PASS, # Unchanged Release @@ -191,13 +191,13 @@ class MaliciousMetadataFacts(CheckFacts): HeuristicResult.FAIL, # Suspicious Setup HeuristicResult.FAIL, # Wheel Absence HeuristicResult.SKIP, # Anomalous Version - # All project links are unreachable, frequent releases of multiple versions, + # No source code repo, frequent releases of multiple versions, # and the maintainer released it shortly after account registration. # The setup.py file contains suspicious imports and .whl file isn't present. 
): Confidence.HIGH, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.FAIL, # One Release HeuristicResult.SKIP, # High Release Frequency HeuristicResult.SKIP, # Unchanged Release @@ -212,7 +212,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.MEDIUM, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.FAIL, # One Release HeuristicResult.SKIP, # High Release Frequency HeuristicResult.SKIP, # Unchanged Release @@ -227,7 +227,7 @@ class MaliciousMetadataFacts(CheckFacts): ): Confidence.MEDIUM, ( HeuristicResult.FAIL, # Empty Project - HeuristicResult.SKIP, # Unreachable Project Links + HeuristicResult.SKIP, # Source Code Repo HeuristicResult.FAIL, # One Release HeuristicResult.SKIP, # High Release Frequency HeuristicResult.SKIP, # Unchanged Release diff --git a/tests/malware_analyzer/pypi/test_source_code_repo.py b/tests/malware_analyzer/pypi/test_source_code_repo.py new file mode 100644 index 000000000..668c80865 --- /dev/null +++ b/tests/malware_analyzer/pypi/test_source_code_repo.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
+ +"""Tests for heuristic detecting malicious metadata from PyPI""" + +from unittest.mock import MagicMock + +import pytest + +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer + + +@pytest.mark.parametrize( + ("repository", "expected_result"), + [ + pytest.param(None, HeuristicResult.FAIL, id="test_no_repo"), + pytest.param( + MagicMock(), + HeuristicResult.PASS, + id="test_valid_repo", + ), + ], +) +def test_repo_existence( + pypi_package_json: MagicMock, repository: MagicMock | None, expected_result: HeuristicResult +) -> None: + """Test if the source code repo exists.""" + pypi_package_json.component.repository = repository + analyzer = SourceCodeRepoAnalyzer() + result, _ = analyzer.analyze(pypi_package_json) + assert result == expected_result diff --git a/tests/malware_analyzer/pypi/test_unreachable_project_links_analyzer.py b/tests/malware_analyzer/pypi/test_unreachable_project_links_analyzer.py deleted file mode 100644 index 410fe925e..000000000 --- a/tests/malware_analyzer/pypi/test_unreachable_project_links_analyzer.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - -"""Tests for heuristic detecting malicious metadata from PyPI""" - -from unittest.mock import MagicMock, Mock, patch - -from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult -from macaron.malware_analyzer.pypi_heuristics.metadata.unreachable_project_links import UnreachableProjectLinksAnalyzer -from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset - - -@patch("requests.head") -def test_analyze_with_all_reachable_links(mock_head: Mock) -> None: - """Test for all links are reachable""" - # Setup. 
- mock_pypi_package = MagicMock(spec=PyPIPackageJsonAsset) - project_links: dict = { - "Documentation": "https://requests.readthedocs.io", - "Homepage": "https://requests.readthedocs.io", - "Source": "https://github.com/psf/requests", - } - - mock_pypi_package.get_project_links.return_value = project_links - expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.PASS, {}) - - mock_response = MagicMock() - mock_response.status_code = 200 - mock_head.return_value = mock_response - analyzer = UnreachableProjectLinksAnalyzer() - - # Execute. - result: tuple[HeuristicResult, dict] = analyzer.analyze(mock_pypi_package) - - # Verify. - assert result == expected_result - - -@patch("requests.head") -def test_analyze_with_all_unreachable_links(mock_head: Mock) -> None: - """Test for all project links are unreachable""" - # Setup. - project_links: dict = {"Homepage": "https://github.com/jiangfubang/fast_requests"} - mock_pypi_package = MagicMock(spec=PyPIPackageJsonAsset) - mock_pypi_package.get_project_links.return_value = project_links - expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.FAIL, {}) - - analyzer = UnreachableProjectLinksAnalyzer() - mock_response = MagicMock() - mock_response.status_code = 404 - mock_head.return_value = mock_response - - # Execute. - result: tuple[HeuristicResult, dict] = analyzer.analyze(mock_pypi_package) - - # Verify. - assert result == expected_result - - -def test_analyze_with_no_project_links() -> None: - """Test for the metadata missing""" - # TODO Package with missing metadata is not available now - # Setup. - mock_pypi_package = MagicMock(spec=PyPIPackageJsonAsset) - mock_pypi_package.get_project_links.return_value = None - analyzer = UnreachableProjectLinksAnalyzer() - expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.SKIP, {}) - - # Execute. - result: tuple[HeuristicResult, dict] = analyzer.analyze(mock_pypi_package) - - # Verify. 
- assert result == expected_result - - -@patch("requests.head") -def test_analyze_with_mixed_links(mock_head: Mock) -> None: - """Test for the situation when the links are partially accessible""" - # Setup. - project_links: dict = { - "Documentation": "https://requests.readthedocs.io", - "Homepage": "https://requests.readthedocs.io", - "Source": "https://badurl.com", - } - mock_pypi_package = MagicMock(spec=PyPIPackageJsonAsset) - mock_pypi_package.get_project_links.return_value = project_links - expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.PASS, {}) - - # Mock responses for each URL. - def side_effect(url: str, *args: tuple, **kwargs: dict) -> Mock: # pylint: disable=W0613 - mock_response = MagicMock() - if url == "http://badurl.com": - mock_response.status_code = 404 - else: - mock_response.status_code = 200 - return mock_response - - mock_head.side_effect = side_effect - analyzer = UnreachableProjectLinksAnalyzer() - - # Execute. - result: tuple[HeuristicResult, dict] = analyzer.analyze(mock_pypi_package) - - # Verify. - assert result == expected_result