Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add parsers support #97

Merged
merged 46 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
b036dcb
refactor: gimie.sources -> gimie.extractors
cmdoret Oct 23, 2023
eb30902
feat(parsers): define abstract class interface
cmdoret Oct 23, 2023
27ec229
refactor(license): mv functionality to a parser
cmdoret Oct 23, 2023
1c36043
refactor(extractors): rm Extractor._get_licenses
cmdoret Oct 23, 2023
487fd91
fix: rm calls to Extractor._get_licenses()
cmdoret Oct 23, 2023
e494ab5
docs(parser): include license header
cmdoret Oct 23, 2023
bf72d33
doc: update docstring of extractor interface
cmdoret Oct 23, 2023
9a14490
refactor(project): support parser, clearer variable names, simplify e…
cmdoret Oct 24, 2023
73e739c
fix(parser): recursion error
cmdoret Oct 24, 2023
3c8a074
test(project): adapt to new api
cmdoret Oct 24, 2023
98b4cb0
doc(project): update docstring
cmdoret Oct 24, 2023
e80641a
feat(cli): support parser selection
cmdoret Oct 24, 2023
a4bfcf2
feat(cli): add `gimie parsers` to list parsers
cmdoret Oct 24, 2023
39b7456
refactor: simplify extractor/parser collections
cmdoret Oct 24, 2023
9eba446
refactor(project): better error handling, rm dead code
cmdoret Oct 24, 2023
47bd37e
refactor(project): early return on infer_git_provider
cmdoret Oct 24, 2023
61433ea
doc: add missing license headers
cmdoret Oct 24, 2023
cf6d2df
test(parsers,project): doctests + error handling
cmdoret Oct 24, 2023
a6fb7e7
refactor(cli): better enum name for rdf formats
cmdoret Nov 3, 2023
9db4c1d
refactor(io): replace Resource.name with Resource.path
cmdoret Nov 3, 2023
c47f0a9
refactor(parser): use namedtuple for parser collection
cmdoret Nov 3, 2023
0e51d62
refactor(parser): use bytes -> tuple interface
cmdoret Nov 3, 2023
6c56bb1
refactor(license): use bytes -> tuple interface
cmdoret Nov 3, 2023
a13ef7e
feat(cli): rich output for `gimie parsers`
cmdoret Nov 3, 2023
f1455a9
refactor(project): new parser signature, migrate unrelated funcs
cmdoret Nov 3, 2023
b1eeab5
feat(graph): helper to attach properties to graph
cmdoret Nov 3, 2023
8f1ce60
feat(parser): helper to parse multiple files
cmdoret Nov 3, 2023
e90fd18
refactor(parser): type alias to graph module
cmdoret Nov 3, 2023
2754321
doc: improve parser-related docstrings
cmdoret Nov 3, 2023
91b4778
chore: add license header to gimie.graph
cmdoret Nov 3, 2023
5b712b5
fix(extractors): Parser.name -> Parser.path
cmdoret Nov 3, 2023
b6375a3
doc(extractors): improve docstring
cmdoret Nov 3, 2023
8c6e0cf
fix(graph): py38 compatible typ alias
cmdoret Nov 3, 2023
6a6cdf3
test(license): update signature in doctest
cmdoret Nov 3, 2023
bfccff8
test(project): update function names
cmdoret Nov 3, 2023
a3ab200
fix(license): check if in root dir with Path.parts
cmdoret Nov 3, 2023
959aaaf
doc(parser): property -> predicate
cmdoret Nov 3, 2023
1428d69
doc: property -> predicate
cmdoret Nov 3, 2023
85a1177
chore(graph): rm unused imports
cmdoret Nov 3, 2023
ff70073
fix(parser): rm unused imports
cmdoret Nov 3, 2023
c236353
chore: rebuild poetry.lock
cmdoret Nov 3, 2023
7b2c4fc
refactor(parser): helper funcs instead of dicts
cmdoret Nov 3, 2023
65009c5
refactor(cli): streamline `gimie parsers`
cmdoret Nov 3, 2023
73072f0
refactor(project): use helper functions to select parsers
cmdoret Nov 3, 2023
b2a90a4
test(parser): add tests for helper functions
cmdoret Nov 3, 2023
897a14e
chore: rm unused import
cmdoret Nov 3, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 54 additions & 10 deletions gimie/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,26 @@
# limitations under the License.
"""Command line interface to the gimie package."""
from enum import Enum
from typing import Optional
from gimie import __version__
from typing import List, Optional

import click
import typer

from gimie import __version__
from gimie.parsers import get_parser, list_default_parsers, list_parsers
from gimie.project import Project

app = typer.Typer(add_completion=False)


# Used to autogenerate docs with sphinx-click
@click.group()
def cli():
    """Command line group"""
    # Intentionally empty: the group only acts as a container for commands.


class SerializationFormat(str, Enum):
"""Enumeration of valid RDF serialization formats for project graphs"""

class RDFFormatChoice(str, Enum):
ttl = "ttl"
jsonld = "json-ld"
nt = "nt"
Expand All @@ -50,8 +51,8 @@ def version_callback(value: bool):
@app.command()
def data(
url: str,
format: SerializationFormat = typer.Option(
"ttl",
format: RDFFormatChoice = typer.Option(
RDFFormatChoice.ttl,
"--format",
show_choices=True,
help="Output serialization format for the RDF graph.",
Expand All @@ -61,6 +62,18 @@ def data(
"--base-url",
help="Specify the base URL of the git provider. Inferred by default.",
),
include_parser: Optional[List[str]] = typer.Option(
None,
"--include-parser",
"-I",
help="Only include selected parser. Use 'gimie parsers' to list parsers.",
),
exclude_parser: Optional[List[str]] = typer.Option(
None,
"--exclude-parser",
"-X",
help="Exclude selected parser.",
),
version: Optional[bool] = typer.Option(
None,
"--version",
Expand All @@ -70,10 +83,16 @@ def data(
):
"""Extract linked metadata from a Git repository at the target URL.

The output is sent to stdout, and turtle is used as the default serialization format."""
proj = Project(url, base_url=base_url)
The output is sent to stdout, and turtle is used as the default serialization format.
"""
parser_names = list_default_parsers()
if exclude_parser:
parser_names -= set([parser for parser in exclude_parser])
if include_parser:
parser_names = set([parser for parser in include_parser])
proj = Project(url, base_url=base_url, parser_names=parser_names)
repo_meta = proj.extract()
print(repo_meta.serialize(format=format))
print(repo_meta.serialize(format=format.value))


@app.command()
Expand All @@ -86,9 +105,34 @@ def advice(url: str):
raise typer.Exit()


@app.command()
def parsers(
    verbose: bool = typer.Option(
        False, "--verbose", help="Show parser description."
    )
):
    """List available parsers, specifying which are default.
    If --verbose is used, show parser description."""
    available = list_parsers()
    default_names = list_default_parsers()
    lines = []

    for parser_name in available:
        # Parser names are rendered in bold green.
        styled_name = typer.style(
            parser_name, fg=typer.colors.GREEN, bold=True
        )

        default_tag = ""
        if parser_name in default_names:
            default_tag = " (default)"

        # The parser's docstring is only looked up in verbose mode.
        details = ""
        if verbose:
            details = f" - {get_parser(parser_name).__doc__}"

        lines.append(f"{styled_name}{default_tag}{details}\n")

    # Each entry carries its own newline; echo appends the final one.
    typer.echo("".join(lines))


# Expose the Typer app as a Click command and register it under the
# click group `cli` with the name "cli" (the group is what sphinx-click
# uses to autogenerate the docs).
typer_cli = typer.main.get_command(app)
cli.add_command(typer_cli, "cli")


# This callback is triggered when gimie is called without subcommand
@app.callback()
def callback(
Expand Down
95 changes: 95 additions & 0 deletions gimie/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Git providers from which metadata can be extracted by gimie."""
from typing import Dict, Optional, Type
from gimie.extractors.abstract import Extractor
from gimie.extractors.github import GithubExtractor
from gimie.extractors.gitlab import GitlabExtractor
from gimie.extractors.git import GitExtractor
from gimie.utils import validate_url

# Registry mapping a provider name to its Extractor class.
# get_extractor() looks providers up by key, and infer_git_provider()
# iterates over the keys, treating "git" as the generic fallback.
GIT_PROVIDERS: Dict[str, Type[Extractor]] = {
"git": GitExtractor,
"github": GithubExtractor,
"gitlab": GitlabExtractor,
}


def get_extractor(
    url: str,
    source: str,
    base_url: Optional[str] = None,
    local_path: Optional[str] = None,
) -> Extractor:
    """Instantiate the correct extractor for a given source.

    Parameters
    ----------
    url
        Where the repository metadata is extracted from.
    source
        The source of the repository (git, gitlab, github, ...).
    base_url
        The base URL of the git remote.
    local_path
        If applicable, the path to the directory where the
        repository is located.

    Returns
    -------
    Extractor
        An instance of the extractor class registered for `source`
        in GIT_PROVIDERS.

    Raises
    ------
    ValueError
        If `source` is not a key of GIT_PROVIDERS.

    Examples
    --------
    >>> extractor = get_extractor(
    ...     "https://github.com/SDSC-ORD/gimie",
    ...     "github"
    ... )
    """
    # Only the registry lookup is guarded: a KeyError raised while the
    # extractor itself is being constructed must not be reported as an
    # unknown provider.
    try:
        extractor_cls = GIT_PROVIDERS[source]
    except KeyError as err:
        raise ValueError(
            f"Unknown git provider: {source}.\n"
            f"Supported sources: {', '.join(GIT_PROVIDERS)}"
        ) from err

    return extractor_cls(url, base_url=base_url, local_path=local_path)


def infer_git_provider(url: str) -> str:
    """Given a git repository URL, return the corresponding git provider.
    Local path or unsupported git providers will return "git".

    The provider name is searched in the URL's hostname first, so that a
    provider name appearing in the repository path does not take
    precedence over the actual host. If the hostname does not match any
    provider, the whole URL is searched.

    Examples
    --------
    >>> infer_git_provider("https://gitlab.com/foo/bar")
    'gitlab'
    >>> infer_git_provider("https://gitlab.com/foo/github-tools")
    'gitlab'
    >>> infer_git_provider("/foo/bar")
    'git'
    >>> infer_git_provider("https://codeberg.org/dnkl/foot")
    'git'
    """
    from urllib.parse import urlparse

    # Fall back to git if local path
    if not validate_url(url):
        return "git"

    # "git" is the generic fallback and is never inferred from the URL.
    candidates = [name for name in GIT_PROVIDERS if name != "git"]

    # urlparse raises ValueError on some malformed netlocs; in that case
    # only the whole-URL check below is applied.
    try:
        host = urlparse(url).hostname or ""
    except ValueError:
        host = ""

    # Prefer a match on the hostname over a match anywhere in the URL.
    for name in candidates:
        if name in host:
            return name

    # NOTE: We just check if the provider name is in the URL.
    # We may want to use a more robust check.
    for name in candidates:
        if name in url:
            return name

    # Fall back to git for unsupported providers
    return "git"
25 changes: 6 additions & 19 deletions gimie/sources/abstract.py → gimie/extractors/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract classes for gimie objects."""
"""Abstract for Git repository extractors."""
from abc import ABC, abstractmethod
from typing import List, Optional

from urllib.parse import urlparse

from gimie.io import Resource
from gimie.models import Repository
from gimie.sources.common.license import get_license_url, is_license_path


class Extractor(ABC):
"""Extractor is an Abstract Base Class. It is only meant
to define a standard interface for all extractors.
to define a standard interface for all git repository extractors.

All subclasses must implement extract() and to_graph() methods
they are free to override the default serialize() and jsonld()
Subclasses for different git providers must implement
extract() and list_files() methods.
"""

def __init__(
Expand All @@ -45,9 +44,10 @@ def __init__(

@abstractmethod
def extract(self) -> Repository:
"""Extract metadata"""
"""Extract metadata from the git provider into a Repository object."""
...

@abstractmethod
def list_files(self) -> List[Resource]:
"""List all files in the repository HEAD."""
...
Expand All @@ -66,16 +66,3 @@ def base(self) -> str:
url = urlparse(self.url)
return f"{url.scheme}://{url.netloc}"
return self.base_url

def _get_licenses(self) -> List[str]:
"""Extracts SPDX License URLs from the repository."""
# TODO: Move functionality into a dedicate Parser
license_files = filter(
lambda p: is_license_path(p.name), self.list_files()
)
license_urls = []
for file in license_files:
license_url = get_license_url(file)
if license_url:
license_urls.append(license_url)
return license_urls
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# Gimie
# Copyright 2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
from typing import Any, Dict, List, Union

Expand Down
4 changes: 1 addition & 3 deletions gimie/sources/git.py → gimie/extractors/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

from gimie.io import LocalResource
from gimie.models import Person, Repository
from gimie.sources.abstract import Extractor
from gimie.extractors.abstract import Extractor
from pathlib import Path


Expand Down Expand Up @@ -70,14 +70,12 @@ def extract(self) -> Repository:
date_created=self._get_creation_date(),
date_modified=self._get_modification_date(),
name=self.path,
licenses=self._get_licenses(),
url=self.url,
)

return Repository(**repo_meta) # type: ignore

def list_files(self) -> List[LocalResource]:

self.repository = self._repo_data
file_list = []

Expand Down
13 changes: 7 additions & 6 deletions gimie/sources/github.py → gimie/extractors/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
from urllib.parse import urlparse
from dotenv import load_dotenv

from gimie.sources.abstract import Extractor
from gimie.extractors.abstract import Extractor
from gimie.models import (
Organization,
Person,
Repository,
)

from gimie.io import RemoteResource
from gimie.sources.common.queries import (
from gimie.extractors.common.queries import (
send_rest_query,
send_graphql_query,
)
Expand All @@ -47,7 +47,8 @@ def query_contributors(
) -> List[Dict[str, Any]]:
"""Queries the list of contributors of target repository
using GitHub's REST and GraphQL APIs. Returns a list of GraphQL User nodes.
NOTE: This is a workaround for the lack of a contributors field in the GraphQL API."""
NOTE: This is a workaround for the lack of a contributors field in the GraphQL API.
"""
owner, name = urlparse(url).path.strip("/").split("/")
# Get contributors (available in the REST API but not GraphQL)
data = f"repos/{owner}/{name}/contributors"
Expand Down Expand Up @@ -108,7 +109,7 @@ def list_files(self) -> List[RemoteResource]:

for item in file_dict:
file = RemoteResource(
name=item["name"],
path=item["name"],
url=f'{repo_url}/raw/{defaultbranchref}/{item["path"]}',
headers=self._set_auth(),
)
Expand All @@ -127,7 +128,6 @@ def extract(self) -> Repository:
description=data["description"],
name=self.path,
keywords=self._get_keywords(*data["repositoryTopics"]["nodes"]),
licenses=self._get_licenses(),
url=self.url,
)
if data["parent"]:
Expand Down Expand Up @@ -244,7 +244,8 @@ def _repo_data(self) -> Dict[str, Any]:

def _fetch_contributors(self) -> List[Person]:
"""Queries the GitHub GraphQL API to extract contributors through the commit list.
NOTE: This is a workaround for the lack of a contributors field in the GraphQL API."""
NOTE: This is a workaround for the lack of a contributors field in the GraphQL API.
"""
headers = self._set_auth()
contributors = []
resp = query_contributors(self.url, headers)
Expand Down
Loading