feat: introduce confidence scores for check facts

Signed-off-by: behnazh-w <behnaz.hassanshahi@oracle.com>
oracle · Jan 31, 2024 · 7f92fe2 · 7f92fe2
1 parent 064ce8f
commit 7f92fe2
Show file tree

Hide file tree

Showing 28 changed files with 531 additions and 434 deletions.
diff --git a/src/macaron/database/table_definitions.py b/src/macaron/database/table_definitions.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """
@@ -10,7 +10,6 @@
 
 For table associated with a check see the check module.
 """
-import hashlib
 import logging
 import os
 import string
@@ -19,14 +18,23 @@
 from typing import Any, Self
 
 from packageurl import PackageURL
-from sqlalchemy import Boolean, Column, Enum, ForeignKey, Integer, String, Table, UniqueConstraint
+from sqlalchemy import (
+    Boolean,
+    CheckConstraint,
+    Column,
+    Enum,
+    Float,
+    ForeignKey,
+    Integer,
+    String,
+    Table,
+    UniqueConstraint,
+)
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 
 from macaron.database.database_manager import ORMBase
 from macaron.database.rfc3339_datetime import RFC3339DateTime
-from macaron.errors import CUEExpectationError, CUERuntimeError, InvalidPURLError
-from macaron.slsa_analyzer.provenance.expectations.cue import cue_validator
-from macaron.slsa_analyzer.provenance.expectations.expectation import Expectation
+from macaron.errors import InvalidPURLError
 from macaron.slsa_analyzer.slsa_req import ReqName
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -415,6 +423,16 @@ class CheckFacts(ORMBase):
     #: The primary key.
     id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)  # noqa: A003
 
+    #: The confidence score to estimate the accuracy of the check fact. This value should be in [0.0, 1.0] with
+    #: a lower value depicting a lower confidence. Because some analyses used in checks may use
+    #: heuristics, the results can be inaccurate in certain cases.
+    #: We use the confidence score to enable the check designer to assign a confidence estimate.
+    #: This confidence is stored in the database to be used by the policy. This confidence score is
+    #: also used to decide which evidence should be shown to the user in the HTML/JSON report.
+    confidence: Mapped[float] = mapped_column(
+        Float, CheckConstraint("confidence>=0.0 AND confidence<=1.0"), nullable=False
+    )
+
     #: The foreign key to the software component.
     component_id: Mapped[int] = mapped_column(Integer, ForeignKey("_component.id"), nullable=False)
 
@@ -430,68 +448,36 @@ class CheckFacts(ORMBase):
     #: A many-to-one relationship with check results.
     checkresult: Mapped["MappedCheckResult"] = relationship(back_populates="checkfacts")
 
-    #: The polymorphic inheritance configuration.
-    __mapper_args__ = {
-        "polymorphic_identity": "CheckFacts",
-        "polymorphic_on": "check_type",
-    }
-
+    def __lt__(self, other: Self) -> bool:
+        """Compare two check facts using their confidence values.
 
-class CUEExpectation(Expectation, CheckFacts):
-    """ORM Class for an expectation."""
+        This comparison function is intended to be used by a heapq, which is a Min-Heap data structure.
+        The root element in a heapq is the minimum element in the queue and each `confidence` value is in [0, 1].
+        Therefore, we need to reverse the comparison function to make sure the fact with the highest confidence is
+        stored in the root element. This implementation compares `1 - confidence` to return True if the confidence
+        of `self` is greater than the confidence of `other`.
 
-    # TODO: provenance content check should store the expectation, its evaluation result,
-    # and which PROVENANCE it was applied to rather than only linking to the repository.
+        .. code-block:: pycon
 
-    __tablename__ = "_expectation"
+            >>> fact_a = CheckFacts()
+            >>> fact_b = CheckFacts()
+            >>> fact_a.confidence = 0.2
+            >>> fact_b.confidence = 0.7
+            >>> fact_b < fact_a
+            True
 
-    #: The primary key, which is also a foreign key to the base check table.
-    id: Mapped[int] = mapped_column(ForeignKey("_check_facts.id"), primary_key=True)  # noqa: A003
+        Returns
+        -------
+        bool
+        """
+        return (1 - self.confidence) < (1 - other.confidence)
 
     #: The polymorphic inheritance configuration.
     __mapper_args__ = {
-        "polymorphic_identity": "_expectation",
+        "polymorphic_identity": "CheckFacts",
+        "polymorphic_on": "check_type",
     }
 
-    @classmethod
-    def make_expectation(cls, expectation_path: str) -> Self | None:
-        """Construct a CUE expectation from a CUE file.
-
-        Note: we require the CUE expectation file to have a "target" field.
-
-        Parameters
-        ----------
-        expectation_path: str
-            The path to the expectation file.
-
-        Returns
-        -------
-        Self
-            The instantiated expectation object.
-        """
-        logger.info("Generating an expectation from file %s", expectation_path)
-        expectation: CUEExpectation = CUEExpectation(
-            description="CUE expectation",
-            path=expectation_path,
-            target="",
-            expectation_type="CUE",
-        )
-
-        try:
-            with open(expectation_path, encoding="utf-8") as expectation_file:
-                expectation.text = expectation_file.read()
-                expectation.sha = str(hashlib.sha256(expectation.text.encode("utf-8")).hexdigest())
-                expectation.target = cue_validator.get_target(expectation.text)
-                expectation._validator = (  # pylint: disable=protected-access
-                    lambda provenance: cue_validator.validate_expectation(expectation.text, provenance)
-                )
-        except (OSError, CUERuntimeError, CUEExpectationError) as error:
-            logger.error("CUE expectation error: %s", error)
-            return None
-
-        # TODO remove type ignore once mypy adds support for Self.
-        return expectation  # type: ignore
-
 
 class Provenance(ORMBase):
     """ORM class for a provenance document."""

diff --git a/src/macaron/policy_engine/souffle_code_generator.py b/src/macaron/policy_engine/souffle_code_generator.py
@@ -1,12 +1,12 @@
-# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """Generate souffle datalog for policy prelude."""
 
 import logging
 import os
 
-from sqlalchemy import Column, MetaData, Table
+from sqlalchemy import Column, Float, MetaData, Table
 from sqlalchemy.sql.sqltypes import Boolean, Integer, String, Text
 
 logger: logging.Logger = logging.getLogger(__name__)
@@ -81,6 +81,8 @@ def column_to_souffle_type(column: Column) -> str:
         souffle_type = "symbol"
     elif isinstance(sql_type, Integer):
         souffle_type = "number"
+    elif isinstance(sql_type, Float):
+        souffle_type = "number"
     elif isinstance(sql_type, Text):
         souffle_type = "symbol"
     elif isinstance(sql_type, Boolean):

diff --git a/src/macaron/slsa_analyzer/analyze_context.py b/src/macaron/slsa_analyzer/analyze_context.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module contains the Analyze Context class.
@@ -38,8 +38,6 @@ class ChecksOutputs(TypedDict):
     """The CI services information for this repository."""
     is_inferred_prov: bool
     """True if we cannot find the provenance and Macaron need to infer the provenance."""
-    # We need to use typing.Protocol for multiple inheritance, however, the Expectation
-    # class uses inlined functions, which is not supported by Protocol.
     expectation: Expectation | None
     """The expectation to verify the provenance for this repository."""
     package_registries: list[PackageRegistryInfo]
@@ -109,7 +107,9 @@ def provenances(self) -> dict[str, list[InTotoV01Statement | InTotoV1Statement]]
             # By default, initialize every key with an empty list.
             result: dict[str, list[InTotoV01Statement | InTotoV1Statement]] = defaultdict(list)
             for ci_info in ci_services:
-                result[ci_info["service"].name].extend(payload.statement for payload in ci_info["provenances"])
+                result[ci_info["service"].name].extend(
+                    prov_asset.payload.statement for prov_asset in ci_info["provenances"]
+                )
             package_registry_entries = self.dynamic_data["package_registries"]
             for package_registry_entry in package_registry_entries:
                 result[package_registry_entry.package_registry.name].extend(

diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py
@@ -28,6 +28,7 @@
 from macaron.repo_finder.commit_finder import find_commit
 from macaron.slsa_analyzer import git_url
 from macaron.slsa_analyzer.analyze_context import AnalyzeContext
+from macaron.slsa_analyzer.asset import VirtualReleaseAsset
 from macaron.slsa_analyzer.build_tool import BUILD_TOOLS
 
 # To load all checks into the registry
@@ -40,6 +41,7 @@
 from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES
 from macaron.slsa_analyzer.provenance.expectations.expectation_registry import ExpectationRegistry
 from macaron.slsa_analyzer.provenance.intoto import InTotoV01Payload
+from macaron.slsa_analyzer.provenance.slsa import SLSAProvenanceData
 from macaron.slsa_analyzer.registry import registry
 from macaron.slsa_analyzer.specs.ci_spec import CIInfo
 from macaron.slsa_analyzer.specs.inferred_provenance import Provenance
@@ -857,7 +859,12 @@ def perform_checks(self, analyze_ctx: AnalyzeContext) -> dict[str, CheckResult]:
                             callgraph=callgraph,
                             provenance_assets=[],
                             latest_release={},
-                            provenances=[InTotoV01Payload(statement=Provenance().payload)],
+                            provenances=[
+                                SLSAProvenanceData(
+                                    payload=InTotoV01Payload(statement=Provenance().payload),
+                                    asset=VirtualReleaseAsset(name="No_ASSET", url="NO_URL", size_in_bytes=0),
+                                )
+                            ],
                         )
                     )
 

diff --git a/src/macaron/slsa_analyzer/asset/__init__.py b/src/macaron/slsa_analyzer/asset/__init__.py
@@ -1,12 +1,12 @@
-# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module defines classes and interfaces related to assets.
 
 Assets are files published from some build.
 """
 
-from typing import Protocol
+from typing import NamedTuple, Protocol
 
 
 class AssetLocator(Protocol):
@@ -38,3 +38,30 @@ def download(self, dest: str) -> bool:
         bool
             ``True`` if the asset is downloaded successfully; ``False`` if not.
         """
+
+
+class VirtualReleaseAsset(NamedTuple):
+    """A dummy asset used when an asset doesn't actually exist."""
+
+    #: The asset name.
+    name: str
+    #: The URL to the asset.
+    url: str
+    #: The size of the asset, in bytes.
+    size_in_bytes: int
+
+    def download(self, dest: str) -> bool:  # pylint: disable=unused-argument
+        """Download the asset.
+
+        Parameters
+        ----------
+        dest : str
+            The local destination where the asset is downloaded to.
+            Note that this must include the file name.
+
+        Returns
+        -------
+        bool
+            ``True`` if the asset is downloaded successfully; ``False`` if not.
+        """
+        return False
diff --git a/src/macaron/slsa_analyzer/checks/base_check.py b/src/macaron/slsa_analyzer/checks/base_check.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 - 2023, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2022 - 2024, Oracle and/or its affiliates. All rights reserved.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
 
 """This module contains the BaseCheck class to be inherited by other concrete Checks."""
@@ -99,9 +99,7 @@ def run(self, target: AnalyzeContext, skipped_info: SkippedInfo | None = None) -
         check_result_data: CheckResultData
 
         if skipped_info:
-            check_result_data = CheckResultData(
-                justification=[skipped_info["suppress_comment"]], result_tables=[], result_type=self.result_on_skip
-            )
+            check_result_data = CheckResultData(result_tables=[], result_type=self.result_on_skip)
             logger.info(
                 "Check %s is skipped on target %s, comment: %s",
                 self.check_info.check_id,
@@ -115,14 +113,14 @@ def run(self, target: AnalyzeContext, skipped_info: SkippedInfo | None = None) -
                 self.check_info.check_id,
                 check_result_data.result_type.value,
                 target.component.purl,
-                check_result_data.justification,
+                check_result_data.justification_report,
             )
 
+        # This justification string will be stored in the feedback column of `SLSARequirement` table.
+        # TODO: Storing the justification as feedback in the `SLSARequirement` table seems redundant and might need
+        # refactoring.
         justification_str = ""
-        for ele in check_result_data.justification:
-            if isinstance(ele, dict):
-                for key, val in ele.items():
-                    justification_str += f"{key}: {val}. "
+        for _, ele in check_result_data.justification_report:
             justification_str += f"{str(ele)}. "
 
         target.bulk_update_req_status(