perf(license): tf-idf based matching (#99)

* refactor(utils): create gimie.utils.uri submodule * chore: add numpy + scipy to deps * feat: add tfidf vectorizer * test: unit tests for tfidf vectorizer * refactor(tfidf): more intuitive func names * fix(tfidf): correct ngrams tokenization for n>1, adjust doctests * ci: change python versions 3.8-3.10 -> 3.9-3.12 * chore: add scipy to deps * refactor(license): use tfidf in LicenseParser * chore: rm scancode from deps * feat: add pre-computed license tf-idf * feat: script to regen. tf-idf for all spdx licenses * test(license): update docstrings for tfidf * test(tfidf): rm test corpus from module, adapt doctest * refactor(license): only include osi-approved licenses in tfidf matrix * refactor(license): set min similarity to 0.9 * perf(tfidf): prune vectors to float16 to reduce memory footprint * chore(license): black fmt * doc(license): mention tfidf in parser docstring * chore: rename test_tfidf.py * docs(tfidf): link to sklearn documentation * refactor(tfidf): reorder methods * refactor: rename utils text module * fix: gimie.utils.text_processing import paths
sdsc-ordes · Nov 24, 2023 · 77a17f5 · 77a17f5
1 parent dc6f149
commit 77a17f5
Show file tree

Hide file tree

Showing 14 changed files with 968 additions and 1,090 deletions.
diff --git a/.github/workflows/poetry-pytest.yml b/.github/workflows/poetry-pytest.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       # https://github.com/actions/checkout
       - uses: actions/checkout@v4

diff --git a/gimie/extractors/__init__.py b/gimie/extractors/__init__.py
@@ -21,7 +21,7 @@
 from gimie.extractors.github import GithubExtractor
 from gimie.extractors.gitlab import GitlabExtractor
 from gimie.extractors.git import GitExtractor
-from gimie.utils import validate_url
+from gimie.utils.uri import validate_url
 
 GIT_PROVIDERS: Dict[str, Type[Extractor]] = {
     "git": GitExtractor,

diff --git a/gimie/parsers/license.py b/gimie/parsers/license.py
diff --git a/gimie/parsers/license/__init__.py b/gimie/parsers/license/__init__.py
@@ -0,0 +1,132 @@
+# Gimie
+# Copyright 2022 - Swiss Data Science Center (SDSC)
+# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
+# Eidgenössische Technische Hochschule Zürich (ETHZ).
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import csv
+from io import BytesIO
+import pkgutil
+import re
+from typing import List, Optional, Set
+
+import numpy as np
+import scipy.sparse as sp
+from rdflib.term import URIRef
+
+from gimie.graph.namespaces import SDO
+from gimie.parsers.abstract import Parser, Property
+from gimie.utils.text_processing import TfidfVectorizer
+
+
+class LicenseParser(Parser):
+    """Parser turning the body of a LICENSE file into a
+    schema:license <spdx-url> property.
+    The license is identified through tf-idf-based matching."""
+
+    def __init__(self):
+        super().__init__()
+
+    def parse(self, data: bytes) -> Set[Property]:
+        """Match the content of a license file against spdx licenses.
+
+        Returns a set holding the single tuple <schema:license> <spdx_url>,
+        or an empty set when no matching URL is found.
+        """
+        spdx_url = match_license(data)
+        # Guard clause: nothing matched, so there is no property to report.
+        if not spdx_url:
+            return set()
+        return {(SDO.license, URIRef(spdx_url))}
+
+
+def match_license(data: bytes, min_similarity: float = 0.9) -> Optional[str]:
+    """Given a license file, returns the url of the most similar spdx license.
+    This is done using TF-IDF on the license text and getting the
+    closest match in the SPDX license corpus based on cosine similarity.
+
+    Parameters
+    ----------
+    data:
+        The license body as bytes.
+    min_similarity:
+        Similarity threshold. If the best match scores below this value,
+        no license is reported.
+
+    Returns
+    -------
+    The spdx url of the closest license, or None if the best match is
+    below `min_similarity`.
+
+    Examples
+    --------
+    >>> match_license(open('LICENSE', 'rb').read())
+    'https://spdx.org/licenses/Apache-2.0.html'
+    """
+    # Decode the bytes into actual text. Calling str() on bytes would
+    # instead produce their repr ("b'...'" with escaped newlines), which
+    # injects spurious characters into the tokens. Undecodable bytes are
+    # replaced rather than raising, so any input can still be matched.
+    text = data.decode("utf-8", errors="replace")
+
+    # Compute tfidf vector for input license
+    vectorizer = load_tfidf_vectorizer()
+    input_vec = vectorizer.transform([text])
+
+    # Load ids and tfidf vectors for spdx licenses
+    spdx_licenses = load_spdx_ids()
+    spdx_vecs = load_tfidf_matrix()
+    # Compute cosine similarity between input_vec and spdx vectors
+    sim: np.ndarray = (input_vec * spdx_vecs.T).todense()
+    # Pick the most similar spdx vector (sim has a single row, so the
+    # flat index returned by argmax is also the column index)
+    closest_idx = np.argmax(sim)
+    # If similarity is below threshold, return None
+    if sim[0, closest_idx] < min_similarity:
+        return None
+    closest_id = spdx_licenses[closest_idx]
+    return f"https://spdx.org/licenses/{closest_id}.html"
+
+
+def load_tfidf_vectorizer() -> TfidfVectorizer:
+    """Load the tfidf vectorizer from the JSON file shipped as package data.
+
+    Raises FileNotFoundError if the file cannot be located."""
+    raw_json = pkgutil.get_data(__name__, "data/tfidf_vectorizer.json")
+    if raw_json is not None:
+        return TfidfVectorizer.model_validate_json(raw_json)
+    raise FileNotFoundError("Could not find tfidf_vectorizer.json")
+
+
+def load_spdx_ids() -> List[str]:
+    """Load spdx license ids from the csv file shipped as package data.
+
+    The first column of every non-empty row is returned, in file order.
+    Raises FileNotFoundError if the file cannot be located."""
+    raw_csv = pkgutil.get_data(__name__, "data/spdx_licenses.csv")
+    if raw_csv is None:
+        raise FileNotFoundError("Could not find spdx_licenses.csv")
+    spdx_ids = []
+    for row in csv.reader(raw_csv.decode().split("\n")):
+        # Blank lines are parsed as empty rows and skipped.
+        if row:
+            spdx_ids.append(row[0])
+    return spdx_ids
+
+
+def load_tfidf_matrix() -> sp.csr_matrix:
+    """Load the pre-computed tfidf matrix of spdx licenses from the
+    npz file shipped as package data.
+    Matrix has dimensions (n_licenses, n_features).
+
+    Raises FileNotFoundError if the file cannot be located."""
+    raw_npz = pkgutil.get_data(__name__, "data/tfidf_matrix.npz")
+    if raw_npz is not None:
+        # load_npz reads from a file-like object, so wrap the raw bytes.
+        buffer = BytesIO(raw_npz)
+        return sp.load_npz(buffer)
+    raise FileNotFoundError("Could not find tfidf_matrix.npz")
+
+
+def is_license_filename(filename: str) -> bool:
+    """Tell whether a filename path looks like it belongs to a license file.
+
+    Names starting with a dot are always rejected. Other names are
+    checked case-insensitively against a pattern of license-related
+    keywords.
+
+    Parameters
+    ----------
+    filename:
+        A filename to check.
+
+    Examples
+    --------
+    >>> is_license_filename('LICENSE-APACHE')
+    True
+    >>> is_license_filename('README.md')
+    False
+    """
+    # Dot-prefixed names are never treated as licenses.
+    if filename.startswith("."):
+        return False
+    license_pattern = r".*(license(s)?.*|lizenz|reus(e|ing).*|copy(ing)?.*)(\.(txt|md|rst))?$"
+    found = re.match(license_pattern, filename, flags=re.IGNORECASE)
+    return found is not None