From 4f20e9b6cc7ccab4e29afe5f1f819102afe485d8 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 13 Feb 2025 09:20:04 +1000 Subject: [PATCH] docs: include new heuristics in malware analyzer readme (#987) include details for the anomalous version and wheel absence heuristics in the malware analyzer README, with docstring updates. --- src/macaron/malware_analyzer/README.md | 64 +++++++++---------- .../metadata/anomalous_version.py | 2 +- .../metadata/closer_release_join_date.py | 6 +- .../metadata/empty_project_link.py | 6 +- .../metadata/high_release_frequency.py | 6 +- .../pypi_heuristics/metadata/one_release.py | 6 +- .../sourcecode/suspicious_setup.py | 6 +- 7 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index 740571955..6255fb13b 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -1,60 +1,60 @@ # Implementation of Heuristic Malware Detector -## Check +## PyPI Ecosystem -We schedule the heuristics sequentially: +Malware detection is achieved using a combination of metadata and source code heuristics. Certain combinations of the results of these heuristics are indicators of a malicious package. -1. **Empty Project Link**: If the package contains project links (e.g., documentation, Git Repositories), -the analyzer will further operate the heuristic `Unreachable Project Links` to analyze if all the project links are unreachable. -2. **One Release**: Checks if there is only one release of the package. If the package contains multiple -releases, the checker will further check the release frequency through `High Release Frequency` and -`Unchanged Release` to see if the maintainers release multiple times in a short timeframe (threshold), and -whether the contents of the releases are identical. -3. **Closer Release Join Date**: Considers the date when the maintainer registered their account (if -available). 
The checker will calculate the gap between the latest release date and the maintainer's account -registration date. -4. **Suspicious Setup**: Checks whether the `setup.py` includes suspicious imports, such as `base64` for -encryption and `requests` for data exfiltration. - -## Supported Ecosystem: PyPI - -Define Seven Heuristics: `False` means suspicious and `True` means benign. `SKIP` means some metadata is missing, and the checker will skip the heuristic. +When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator by that heuristic of suspicious behaviour. When a heuristic passes, with `HeuristicResult.PASS`, then that is an indicator of benign behavior. When a heuristic is skipped, returning `HeuristicResult.SKIP`, then this means that heuristic was not applicable to the package, due to either package details or dependencies on other heuristics. When a heuristic encounters a malformed package, a `HeuristicAnalyzerValueError` is raised. The following heuristics are currently run sequentially to gauge package maliciousness. 1. **Empty Project Link** - **Description**: Checks whether the package contains any project links (e.g., documents or Git - Repositories). Many malicious activities do not include any project links. - - **Rule**: Return `FALSE` when there is only one project link; otherwise, return `TRUE`. + Repositories). Many malicious packages do not include any project links. + - **Rule**: Return `HeuristicResult.FAIL` when there are no project links; otherwise, return `HeuristicResult.PASS`. 2. **Unreachable Project Links** - **Description**: Checks the accessibility of the project links. This is considered an auxiliary heuristic since no cases have met this heuristic. - - **Rule**: Return `FALSE` if all project links are unreachable; otherwise, return `TRUE`. + - **Rule**: Return `HeuristicResult.FAIL` if all project links are unreachable; otherwise, return `HeuristicResult.PASS`. 
+ - **Dependency**: Will be run if the Empty Project Link heuristic passes. 3. **One Release** - **Description**: Checks whether the package has only one release. - - **Rule**: Return `FALSE` if the package contains only one release; otherwise, return `TRUE`. + - **Rule**: Return `HeuristicResult.FAIL` if the package contains only one release; otherwise, return `HeuristicResult.PASS`. 4. **High Release Frequency** - **Description**: Checks if the package released multiple versions within a short timeframe. We calculate the release frequency and define a default frequency threshold of 2 days. - - **Rule**: Return `FALSE` if the frequency is higher than the threshold; otherwise, return `TRUE`. + - **Rule**: Return `HeuristicResult.FAIL` if the frequency is higher than the threshold; otherwise, return `HeuristicResult.PASS`. + - **Dependency**: Will be run if the One Release heuristic passes. 5. **Unchanged Release** - - **Description**: Checks if the content of releases remains unchanged. - - **Rule**: Return `FALSE` if the content of releases is identical; otherwise, return `TRUE`. + - **Description**: Checks if the content of releases remains unchanged using the `sha256` digest of the package source. + - **Rule**: Return `HeuristicResult.FAIL` if the content of any two releases is identical; otherwise, return `HeuristicResult.PASS`. + - **Dependency**: Will be run if the High Release Frequency heuristic fails. 6. **Closer Release Join Date** - - **Description**: Checks the gap between the date the maintainer registered their account and the date + - **Description**: Checks the gap between the date the maintainer(s) registered their account and the date of the latest release. A default threshold of 5 days is defined. - - **Rule**: Return `FALSE` if the gap is less than the threshold; otherwise, return `TRUE`. + - **Rule**: Return `HeuristicResult.FAIL` if the gap is less than the threshold for any maintainer; otherwise, return `HeuristicResult.PASS`. 7. 
**Suspicious Setup** - - **Description**: Checks the `setup.py` to see if there are suspicious imported modules, or - `install_requires` packages that are installed during the package installation process. We define two suspicious - keywords as the blacklist. - - **Rule**: Return `FALSE` if the package name contains suspicious keywords; otherwise, return `TRUE`. + - **Description**: Checks `setup.py` to see if there are suspicious imported modules, or + `install_requires` packages that are installed during the package installation process. Current blacklisted packages are `base64` and `requests`. This heuristic is skipped if no `setup.py` file can be found in the package. + - **Rule**: Return `HeuristicResult.FAIL` if the package name contains suspicious keywords; otherwise, return `HeuristicResult.PASS`. + - **Dependency**: Will be run if the Closer Release Join Date heuristic fails. + +8. **Wheel Absence** + - **Description**: Checks for the presence of a wheel (`.whl`) file distributed with the specified package release. + - **Rule**: Return `HeuristicResult.FAIL` if there is no wheel file present with that package release; otherwise, return `HeuristicResult.PASS`. + +9. **Anomalous Version** + - **Description**: Checks if the version number is abnormally high, checking the epoch and major version against threshold values. This does account for common date-based version number (calendar versioning) patterns. + - **Rule**: Return `HeuristicResult.FAIL` if the major or epoch is abnormally high; otherwise, return `HeuristicResult.PASS`. + - **Dependency**: Will be run if the One Release heuristic fails. + +### Confidence Score Motivation -## Heuristics-Based Analyzer: Scanning 1167 Packages from Trusted Organizations +The original seven heuristics which started this work were Empty Project Link, Unreachable Project Links, One Release, High Release Frequency, Unchanged Release, Closer Release Join Date, and Suspicious Setup. 
These heuristics (excluding those with a dependency) were run on 1167 packages from trusted organizations, with the following results: | Heuristic Name | Count | |------------------| ----- | @@ -64,4 +64,4 @@ Define Seven Heuristics: `False` means suspicious and `True` means benign. `SKIP | Frequent Release | 14 | | Suspicious Setup | 5 | -**The result is used as a reference for the confidence score to lower the false positive rate.** +These results were used as a reference for the confidence score provided in each suspicious combination. diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py index f02c4f595..b04588a76 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/anomalous_version.py @@ -23,7 +23,7 @@ class AnomalousVersionAnalyzer(BaseHeuristicAnalyzer): """ Analyze the version number (if there is only a single release) to detect if it is anomalous. - A version number is anomalous if any of its values are greater than the epoch, major, or minor threshold values. + A version number is anomalous if any of its values are greater than the epoch or major threshold values. If the version does not adhere to PyPI standards (PEP 440, as per the 'packaging' module), this heuristic cannot analyze it. diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py index 83333f3c9..bb95a5436 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/closer_release_join_date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Analyzer checks whether the maintainers' join date closer to latest package's release date.""" @@ -14,7 +14,7 @@ class CloserReleaseJoinDateAnalyzer(BaseHeuristicAnalyzer): - """Analyzer checks the heuristic. + """Check whether the maintainers' join date is close to the package's latest release date. If any maintainer's date duration is larger than threshold, we consider it as "PASS". """ @@ -82,7 +82,7 @@ def _get_latest_release_date(self, pypi_package_json: PyPIPackageJsonAsset) -> d return parse_datetime(upload_time, datetime_format) def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: - """Check whether the maintainers' join date closer to package's latest release date. + """Analyze the package. Parameters ---------- diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/empty_project_link.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/empty_project_link.py index b6dd7ac80..6ef2cc132 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/empty_project_link.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/empty_project_link.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""Analyzer checks there is no project link of the package.""" @@ -10,13 +10,13 @@ class EmptyProjectLinkAnalyzer(BaseHeuristicAnalyzer): - """Analyzer checks heuristic.""" + """Check whether the PyPI package has no project links.""" def __init__(self) -> None: super().__init__(name="empty_project_link_analyzer", heuristic=Heuristics.EMPTY_PROJECT_LINK, depends_on=None) def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: - """Check whether the PyPI package has no project link. + """Analyze the package. Parameters ---------- diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/high_release_frequency.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/high_release_frequency.py index e68b28dca..c4bdc1742 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/high_release_frequency.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/high_release_frequency.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """Analyzer checks the frequent release heuristic.""" @@ -17,7 +17,7 @@ class HighReleaseFrequencyAnalyzer(BaseHeuristicAnalyzer): - """Analyzer checks heuristic.""" + """Check whether the release frequency is high.""" def __init__(self) -> None: super().__init__( @@ -36,7 +36,7 @@ def _load_defaults(self) -> int: return 2 def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: - """Check whether the release frequency is high. + """Analyze the package. 
Parameters ---------- diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/one_release.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/one_release.py index 4a12b746a..7d81ec010 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/metadata/one_release.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/one_release.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. @@ -11,13 +11,13 @@ class OneReleaseAnalyzer(BaseHeuristicAnalyzer): - """Analyzer checks heuristic.""" + """Determine if there is only one release of the package.""" def __init__(self) -> None: super().__init__(name="one_release_analyzer", heuristic=Heuristics.ONE_RELEASE, depends_on=None) def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: - """Check the releases' total is one. + """Analyze the package. Parameters ---------- diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py index 69c8e4241..89d1909a3 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
"""This analyzer checks the suspicious pattern within setup.py.""" @@ -23,7 +23,7 @@ class SuspiciousSetupAnalyzer(BaseHeuristicAnalyzer): - """Analyzer checks heuristic.""" + """Check whether suspicious packages are imported in setup.py.""" def __init__(self) -> None: super().__init__( @@ -119,7 +119,7 @@ def _get_setup_source_code(self, pypi_package_json: PyPIPackageJsonAsset) -> str return file.read() def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: - """Analyze suspicious packages are imported in setup.py. + """Analyze the package. Parameters ----------