diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml
index d4c95052..d45ea2e8 100644
--- a/.github/workflows/conventional-prs.yml
+++ b/.github/workflows/conventional-prs.yml
@@ -16,4 +16,4 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       with:
-        validateSingleCommit: true
\ No newline at end of file
+        validateSingleCommit: true
diff --git a/.github/workflows/sphinx-docs.yml b/.github/workflows/sphinx-docs.yml
index 0f515149..1fdf5d44 100644
--- a/.github/workflows/sphinx-docs.yml
+++ b/.github/workflows/sphinx-docs.yml
@@ -5,7 +5,7 @@ on:
   pull_request:
     paths:
       - 'docs/**'
-      
+
 permissions:
   contents: write
 jobs:
@@ -15,10 +15,10 @@ jobs:
     steps:
       # https://github.com/actions/checkout
       - uses: actions/checkout@v4
-      
+
       # https://github.com/actions/setup-python
       - uses: actions/setup-python@v4
-      
+
       # https://github.com/snok/install-poetry
       - name: Install Poetry
         uses: snok/install-poetry@v1
@@ -37,10 +37,10 @@ jobs:
     steps:
       # https://github.com/actions/checkout
      - uses: actions/checkout@v4
-      
+
       # https://github.com/actions/setup-python
      - uses: actions/setup-python@v4
-      
+
       # https://github.com/snok/install-poetry
      - name: Install Poetry
        uses: snok/install-poetry@v1
diff --git a/README.md b/README.md
index 6090288a..32d8773d 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ Scientific code repositories contain valuable metadata which can be used to enri
 
 ----------------------------------------------------------------------
 
-Using Gimie: easy peasy, it's a 3 step process. 
+Using Gimie: easy peasy, it's a 3 step process.
 
 ## STEP 1: Installation
 
@@ -37,11 +37,11 @@ docker run -e ACCESS_TOKEN=$ACCESS_TOKEN ghcr.io/sdsc-ord/gimie:latest gimie dat
 
 ## STEP 2 : Set your credentials
 
-In order to access the github api, you need to provide a github token with the `read:org` scope. 
+In order to access the github api, you need to provide a github token with the `read:org` scope.
 
 ### A. Create access tokens
 
-New to access tokens? Or don't know how to get your Github / Gitlab token ? 
+New to access tokens? Or don't know how to get your Github / Gitlab token ?
 Have no fear, see [here for Github tokens](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token)
 and [here for Gitlab tokens](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html).
 
@@ -68,7 +68,7 @@ export GITLAB_TOKEN=
 ```shell
 gimie data https://github.com/numpy/numpy
 ```
-(want a Gitlab project instead? Just replace the URL in the command line) 
+(want a Gitlab project instead? Just replace the URL in the command line)
 
 ### As a python library
 
@@ -132,7 +132,7 @@ run checks:
 ```shell
 make check
 ```
-for an easier use Github/Gitlab APIs, place your access tokens in the `.env` file: (and don't worry, the `.gitignore` will ignore them when you push to GitHub) 
+for an easier use Github/Gitlab APIs, place your access tokens in the `.env` file: (and don't worry, the `.gitignore` will ignore them when you push to GitHub)
 
 ```
 cp .env.dist .env
diff --git a/gimie/cli.py b/gimie/cli.py
index 2a34f6aa..a677e0d9 100644
--- a/gimie/cli.py
+++ b/gimie/cli.py
@@ -72,7 +72,8 @@ def data(
     The output is sent to stdout, and turtle is used as
     the default serialization format."""
     proj = Project(url, base_url=base_url)
-    print(proj.serialize(format=format))
+    repo_meta = proj.extract()
+    print(repo_meta.serialize(format=format))
 
 
 @app.command()
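With this change the `gimie data` command builds a `Project`, calls `extract()` to obtain the metadata graph, and serializes that result instead of serializing the project object itself. A minimal sketch of the equivalent library call, assuming network access and a GitHub token configured as described in the README above:

```python
from gimie.project import Project

# Same flow as the updated CLI command: extract first, then serialize.
proj = Project("https://github.com/numpy/numpy")
repo_meta = proj.extract()
print(repo_meta.serialize(format="ttl"))
```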
diff --git a/gimie/models.py b/gimie/models.py
index 4a2abb65..1dbaded4 100644
--- a/gimie/models.py
+++ b/gimie/models.py
@@ -17,11 +17,13 @@
 """Data models to represent nodes in the graph generated by gimie."""
 from __future__ import annotations
 from dataclasses import dataclass, field
+from datetime import datetime
 import datetime
-from typing import Optional, List
+from typing import List, Optional, Union
 
 from calamus.schema import JsonLDSchema
 from calamus import fields
+from rdflib import Graph
 
 from gimie.graph.namespaces import SDO
 
@@ -42,7 +44,7 @@ class Release:
     """
 
     tag: str = field(compare=False)
-    date: datetime.datetime = field(compare=True)
+    date: datetime = field(compare=True)
     commit_hash: str = field(compare=False)
 
 
@@ -103,3 +105,76 @@ class PersonSchema(JsonLDSchema):
     class Meta:
         rdf_type = SDO.Person
         model = Person
+
+
+@dataclass
+class Repository:
+    """This class represents a git repository.
+    It does not contain any information about the content of the repository.
+    See https://schema.org/SoftwareSourceCode
+    """
+
+    url: str
+    name: str
+
+    authors: Optional[List[Union[Organization, Person]]] = None
+    contributors: Optional[List[Person]] = None
+    date_created: Optional[datetime] = None
+    date_modified: Optional[datetime] = None
+    date_published: Optional[datetime] = None
+    description: Optional[str] = None
+    download_url: Optional[str] = None
+    identifier: Optional[str] = None
+    keywords: Optional[List[str]] = None
+    licenses: Optional[List[str]] = None
+    parent_repository: Optional[str] = None
+    prog_langs: Optional[List[str]] = None
+    version: Optional[str] = None
+
+    @property
+    def _id(self) -> str:
+        """Unique identifier for the repository."""
+        return self.url
+
+    def to_graph(self) -> Graph:
+        """Convert repository to RDF graph."""
+        jd = RepositorySchema().dumps(self)
+        g: Graph = Graph().parse(format="json-ld", data=str(jd))
+        g.bind("schema", SDO)
+        return g
+
+    def serialize(self, format: str = "ttl", **kwargs) -> str:
+        """Serialize the RDF graph representing the instance."""
+        return self.to_graph().serialize(format=format, **kwargs)  # type: ignore
+
+    def jsonld(self) -> str:
+        """Alias for jsonld serialization."""
+        return self.serialize(format="json-ld")
+
+
+class RepositorySchema(JsonLDSchema):
+    """This defines the schema used for json-ld serialization."""
+
+    _id = fields.Id()
+    authors = fields.Nested(
+        SDO.author, [PersonSchema, OrganizationSchema], many=True
+    )
+    contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)
+    date_created = fields.Date(SDO.dateCreated)
+    date_modified = fields.Date(SDO.dateModified)
+    date_published = fields.Date(SDO.datePublished)
+    description = fields.String(SDO.description)
+    download_url = fields.IRI(SDO.downloadUrl)
+    identifier = fields.String(SDO.identifier)
+    keywords = fields.List(SDO.keywords, fields.String)
+    licenses = fields.List(SDO.license, fields.IRI)
+    name = fields.String(SDO.name)
+    parent_repository = fields.IRI(SDO.isBasedOn)
+    prog_langs = fields.List(SDO.programmingLanguage, fields.String)
+    url = fields.IRI(SDO.codeRepository)
+    version = fields.String(SDO.version)
+
+    class Meta:
+        rdf_type = SDO.SoftwareSourceCode
+        model = Repository
+        add_value_types = False
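The new `Repository` dataclass is the single model every extractor now returns; it owns the calamus schema and the RDF serialization helpers that used to live on each extractor. A rough sketch of using it directly — the field values below are made up purely for illustration:

```python
from datetime import datetime

from gimie.models import Repository

repo = Repository(
    url="https://github.com/SDSC-ORD/gimie",
    name="SDSC-ORD/gimie",
    description="Extract linked metadata from git repositories",  # illustrative value
    date_created=datetime(2022, 12, 7),                           # illustrative value
    keywords=["metadata", "linked-data"],                         # illustrative value
)
# All other optional fields default to None.
print(repo.serialize(format="ttl"))
print(repo.jsonld())
```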
diff --git a/gimie/project.py b/gimie/project.py
index 59b3033e..f53601b5 100644
--- a/gimie/project.py
+++ b/gimie/project.py
@@ -25,6 +25,7 @@
 from urllib.parse import urlparse
 
 from gimie.graph.operations import combine_graphs
+from gimie.models import Repository
 from gimie.utils import validate_url
 from gimie.sources import SOURCES
 from gimie.sources.abstract import Extractor
@@ -48,6 +49,7 @@ class Project:
     Examples
     --------
     >>> proj = Project("https://github.com/SDSC-ORD/gimie")
+    >>> assert isinstance(proj.extract(), Graph)
     """
 
     def __init__(
@@ -63,24 +65,16 @@ def __init__(
         self._cloned = False
         if validate_url(path):
             self.url = path
-            # We only need to clone a remote project
-            # if a local extractor is enabled
-            if any(map(is_local_source, sources)):
-                self.project_dir = self.clone(path)
         else:
             self.project_dir = path
 
         self.extractors = self.get_extractors(sources)
-        for ex in self.extractors:
-            ex.extract()
 
-    def clone(self, url: str) -> str:
-        """Clone target url in a new temporary directory"""
-        target_dir = TemporaryDirectory().name
-        cloned = git.Repo.clone_from(url, target_dir)  # type: ignore
-        self._cloned = True
-
-        return str(cloned.working_tree_dir)
+    def extract(self) -> Graph:
+        repos = [ex.extract() for ex in self.extractors]
+        graphs = [repo.to_graph() for repo in repos]
+        graph = combine_graphs(*graphs)
+        return graph
 
     def get_extractors(self, sources: Iterable[str]) -> List[Extractor]:
 
@@ -96,29 +90,6 @@ def get_extractors(self, sources: Iterable[str]) -> List[Extractor]:
 
         return extractors
 
-    def to_graph(self) -> Graph:
-        graphs = map(lambda ex: ex.to_graph(), self.extractors)
-        combined_graph = combine_graphs(*graphs)
-        return combined_graph
-
-    def serialize(self, format: str = "ttl", **kwargs):
-        return self.to_graph().serialize(format=format, **kwargs)
-
-    def cleanup(self):
-        """Recursively delete the project. Only works
-        for remote (i.e. cloned) projects."""
-        try:
-            tempdir = gettempdir()
-            if self.project_dir is not None:
-                in_temp = self.project_dir.startswith(tempdir)
-                if self._cloned and in_temp:
-                    shutil.rmtree(self.project_dir)
-        except AttributeError:
-            pass
-
-    def __del__(self):
-        self.cleanup()
-
 
 def split_git_url(url) -> Tuple[str, str]:
     base_url = urlparse(url).scheme + "://" + urlparse(url).netloc
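`Project` no longer clones repositories or serializes itself; `extract()` asks each configured extractor for a `Repository`, converts each one to an RDF graph and merges them with `combine_graphs`. A small sketch, assuming the `github` and `git` source names used elsewhere in this diff can be enabled together:

```python
from rdflib import Graph

from gimie.project import Project

proj = Project("https://github.com/SDSC-ORD/gimie", sources=["github", "git"])
graph = proj.extract()          # one rdflib Graph combining both extractors
assert isinstance(graph, Graph)
print(f"{len(graph)} triples")
print(graph.serialize(format="ttl"))
```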
diff --git a/gimie/sources/abstract.py b/gimie/sources/abstract.py
index d46d5a00..8945d335 100644
--- a/gimie/sources/abstract.py
+++ b/gimie/sources/abstract.py
@@ -18,11 +18,11 @@
 
 from abc import ABC, abstractmethod
 from typing import List, Optional
-
-from rdflib import Graph
 from urllib.parse import urlparse
-from gimie.sources.common.license import get_license_url, is_license_path
+
 from gimie.io import Resource
+from gimie.models import Repository
+from gimie.sources.common.license import get_license_url, is_license_path
 
 
 class Extractor(ABC):
@@ -44,32 +44,14 @@ def __init__(
         self.local_path = local_path
 
     @abstractmethod
-    def extract(self):
+    def extract(self) -> Repository:
        """Extract metadata"""
        ...
 
-    @abstractmethod
-    def to_graph(self) -> Graph:
-        """Generate an RDF graph from the instance"""
-        return Graph()
-
     def list_files(self) -> List[Resource]:
        """List all files in the repository HEAD."""
        ...
 
-    def serialize(self, format: str = "ttl", **kwargs) -> str:
-        """Serialize the RDF graph representing the instance."""
-        return self.to_graph().serialize(format=format, **kwargs)  # type: ignore
-
-    def jsonld(self) -> str:
-        """Alias for jsonld serialization."""
-        return self.serialize(format="json-ld")
-
-    @property
-    def _id(self) -> str:
-        """Unique identifier for the repository."""
-        return self.url
-
     @property
     def path(self) -> str:
        """Path to the repository without the base URL."""
@@ -87,6 +69,7 @@ def base(self) -> str:
 
     def _get_licenses(self) -> List[str]:
        """Extracts SPDX License URLs from the repository."""
+        # TODO: Move functionality into a dedicated Parser
        license_files = filter(
            lambda p: is_license_path(p.name), self.list_files()
        )
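The `Extractor` base class now defines the contract as `extract() -> Repository` plus `list_files()`; the graph and serialization helpers are gone from it. A hypothetical minimal subclass just to illustrate the new interface — the `DummyExtractor` name and its behaviour are invented here, and it leans on the inherited constructor, `path` property and helpers shown above:

```python
from typing import List

from gimie.io import Resource
from gimie.models import Repository
from gimie.sources.abstract import Extractor


class DummyExtractor(Extractor):
    """Invented example: fills in only the fields every source can provide."""

    def extract(self) -> Repository:
        # url is set by the inherited constructor; path strips the base URL.
        return Repository(url=self.url, name=self.path)

    def list_files(self) -> List[Resource]:
        # A real extractor would return LocalResource/RemoteResource objects here.
        return []
```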
diff --git a/gimie/sources/git.py b/gimie/sources/git.py
index 09d49561..0b15539c 100644
--- a/gimie/sources/git.py
+++ b/gimie/sources/git.py
@@ -17,18 +17,18 @@
 """Extractor which uses a locally available (usually cloned) repository."""
 from dataclasses import dataclass
 from datetime import datetime
+from functools import cached_property
+import os
+import shutil
+import tempfile
 from typing import List, Optional
 import uuid
 
-from calamus import fields
-from calamus.schema import JsonLDSchema
 import git
 import pydriller
-from rdflib import Graph
 
 from gimie.io import LocalResource
-from gimie.graph.namespaces import SDO
-from gimie.models import Person, PersonSchema
+from gimie.models import Person, Repository
 from gimie.sources.abstract import Extractor
 from pathlib import Path
 
@@ -58,43 +58,59 @@ class GitExtractor(Extractor):
     url: str
     base_url: Optional[str] = None
     local_path: Optional[str] = None
+    _cloned: bool = False
 
-    author: Optional[Person] = None
-    contributors: Optional[List[Person]] = None
-    date_created: Optional[datetime] = None
-    date_modified: Optional[datetime] = None
-    license: Optional[List[str]] = None
-
-    def extract(self):
-        if self.local_path is None:
-            raise ValueError("Local path must be provided for extraction.")
-        self.repository = pydriller.Repository(self.local_path)
+    def extract(self) -> Repository:
         # Assuming author is the first person to commit
-        self.author = self._get_creator()
-        self.contributors = self._get_contributors()
-        self.date_created = self._get_creation_date()
-        self.date_modified = self._get_modification_date()
-        self.license = self._get_licenses()
+        self.repository = self._repo_data
+
+        repo_meta = dict(
+            authors=[self._get_creator()],
+            contributors=self._get_contributors(),
+            date_created=self._get_creation_date(),
+            date_modified=self._get_modification_date(),
+            name=self.path,
+            licenses=self._get_licenses(),
+            url=self.url,
+        )
+
+        return Repository(**repo_meta)  # type: ignore
 
     def list_files(self) -> List[LocalResource]:
-        file_list = []
-        if self.local_path is None:
-            return file_list
+        self.repository = self._repo_data
+        file_list = []
 
-        for path in Path(self.local_path).rglob("*"):
+        for path in Path(self.local_path).rglob("*"):  # type: ignore
             if (path.parts[0] == ".git") or not path.is_file():
                 continue
             file_list.append(LocalResource(path))
 
         return file_list
 
-    def to_graph(self) -> Graph:
-        """Generate an RDF graph from the instance"""
-        jd = GitExtractorSchema().dumps(self)
-        g: Graph = Graph().parse(data=str(jd), format="json-ld")
-        g.bind("schema", SDO)
-        return g
+    def __del__(self):
+        """Cleanup the cloned repo if it was cloned and is located in tempdir."""
+        try:
+            # Can't be too careful with temp files
+            tempdir = tempfile.gettempdir()
+            if (
+                self.local_path
+                and self._cloned
+                and self.local_path.startswith(tempdir)
+                and tempdir != os.getcwd()
+            ):
+                shutil.rmtree(self.local_path)
+        except AttributeError:
+            pass
+
+    @cached_property
+    def _repo_data(self) -> pydriller.Repository:
+        """Get the repository data by accessing local data or cloning."""
+        if self.local_path is None:
+            self._cloned = True
+            self.local_path = tempfile.TemporaryDirectory().name
+            git.Repo.clone_from(self.url, self.local_path)  # type: ignore
+        return pydriller.Repository(self.local_path)
 
     def _get_contributors(self) -> List[Person]:
         """Get the authors of the repository."""
@@ -113,12 +129,14 @@ def _get_creation_date(self) -> Optional[datetime]:
 
     def _get_modification_date(self) -> Optional[datetime]:
         """Get the last modification date of the repository."""
+        commit = None
         try:
             for commit in self.repository.traverse_commits():
                 pass
-            return commit.author_date
         except (StopIteration, NameError):
-            return None
+            pass
+        finally:
+            return commit.author_date if commit else None
 
     def _get_creator(self) -> Optional[Person]:
         """Get the creator of the repository."""
@@ -136,24 +154,10 @@ def _dev_to_person(
             uid = str(uuid.uuid4())
         else:
             uid = name.replace(" ", "_").lower()
-        dev_id = f"{self._id}/{uid}"
+        dev_id = f"{self.url}/{uid}"
         return Person(
             _id=dev_id,
             identifier=uid,
             name=name,
             email=email,
         )
-
-
-class GitExtractorSchema(JsonLDSchema):
-    _id = fields.Id()
-    author = fields.Nested(SDO.author, PersonSchema)
-    contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)
-    date_created = fields.Date(SDO.dateCreated)
-    date_modified = fields.Date(SDO.dateModified)
-    license = fields.List(SDO.license, fields.IRI)
-
-    class Meta:
-        rdf_type = SDO.SoftwareSourceCode
-        model = GitExtractor
-        add_value_types = False
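`GitExtractor` now resolves its repository lazily through the cached `_repo_data` property, cloning into a temporary directory only when no `local_path` is given, and `__del__` removes that clone again. A sketch mirroring the updated tests, assuming it is run from a checkout of the gimie repository:

```python
import os

from gimie.sources.git import GitExtractor

# Reuse an existing checkout so nothing needs to be cloned.
extractor = GitExtractor(
    "https://github.com/SDSC-ORD/gimie", local_path=os.getcwd()
)
repo = extractor.extract()
print(repo.authors[0].name, repo.date_created)

# Without local_path, the repository is cloned into a temporary directory on
# first access and deleted again when the extractor is garbage collected.
```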
diff --git a/gimie/sources/github.py b/gimie/sources/github.py
index c5a763a6..7440cb5f 100644
--- a/gimie/sources/github.py
+++ b/gimie/sources/github.py
@@ -16,9 +16,7 @@
 # limitations under the License.
 
 from __future__ import annotations
-import tempfile
 from dataclasses import dataclass
-from datetime import datetime
 from dateutil.parser import isoparse
 from functools import cached_property
 import os
@@ -26,18 +24,13 @@
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import urlparse
 from dotenv import load_dotenv
-from calamus import fields
-from calamus.schema import JsonLDSchema
-from rdflib import Graph
 
 from gimie.sources.abstract import Extractor
 from gimie.models import (
     Organization,
-    OrganizationSchema,
     Person,
-    PersonSchema,
+    Repository,
 )
-from gimie.graph.namespaces import SDO
 from gimie.io import RemoteResource
 
 from gimie.sources.common.queries import (
@@ -105,25 +98,6 @@ class GithubExtractor(Extractor):
     local_path: Optional[str] = None
     token: Optional[str] = None
 
-    author: Optional[Union[Organization, Person]] = None
-    contributors: Optional[List[Person]] = None
-    prog_langs: Optional[List[str]] = None
-    download_url: Optional[str] = None
-    description: Optional[str] = None
-    date_created: Optional[datetime] = None
-    date_modified: Optional[datetime] = None
-    date_published: Optional[datetime] = None
-    parent_repository: Optional[str] = None
-    keywords: Optional[List[str]] = None
-    license: Optional[List[str]] = None
-    software_version: Optional[str] = None
-
-    def to_graph(self) -> Graph:
-        """Convert repository to RDF graph."""
-        jd = GithubExtractorSchema().dumps(self)
-        g: Graph = Graph().parse(format="json-ld", data=str(jd))
-        g.bind("schema", SDO)
-        return g
 
     def list_files(self) -> List[RemoteResource]:
         """takes the root repository folder and returns the list of files present"""
@@ -135,37 +109,45 @@ def list_files(self) -> List[RemoteResource]:
         for item in file_dict:
             file = RemoteResource(
                 name=item["name"],
-                url=f'{repo_url}/blob/{defaultbranchref}/{item["path"]}',
+                url=f'{repo_url}/raw/{defaultbranchref}/{item["path"]}',
                 headers=self._set_auth(),
             )
             file_list.append(file)
         return file_list
 
-    def extract(self):
+    def extract(self) -> Repository:
         """Extract metadata from target GitHub repository."""
         data = self._repo_data
-        self.author = self._get_author(data["owner"])
-        self.contributors = self._fetch_contributors()
-        self.description = data["description"]
-        self.date_created = isoparse(data["createdAt"][:-1])
-        self.date_modified = isoparse(data["updatedAt"][:-1])
+
+        repo_meta = dict(
+            authors=[self._get_author(data["owner"])],
+            contributors=self._fetch_contributors(),
+            date_created=isoparse(data["createdAt"][:-1]),
+            date_modified=isoparse(data["updatedAt"][:-1]),
+            description=data["description"],
+            name=self.path,
+            keywords=self._get_keywords(*data["repositoryTopics"]["nodes"]),
+            licenses=self._get_licenses(),
+            url=self.url,
+        )
 
         if data["parent"]:
-            self.parent_repository = data["parent"]["url"]
+            repo_meta["parent_repository"] = data["parent"]["url"]
+
         if data["latestRelease"]:
-            self.date_published = isoparse(
+            repo_meta["date_published"] = isoparse(
                 data["latestRelease"]["publishedAt"]
             )
-        self.license = self._get_licenses()
 
         if data["primaryLanguage"] is not None:
-            self.prog_langs = [data["primaryLanguage"]["name"]]
-        self.keywords = self._get_keywords(*data["repositoryTopics"]["nodes"])
-        last_release = data["latestRelease"]
-        if last_release is not None:
-            self.version = last_release["name"]
-            self.download_url = (
-                f"{self.url}/archive/refs/tags/{self.version}.tar.gz"
-            )
+            repo_meta["prog_langs"] = [data["primaryLanguage"]["name"]]
+
+        if data["latestRelease"]:
+            version = data["latestRelease"]["name"]
+            download_url = f"{self.url}/archive/refs/tags/{version}.tar.gz"
+            repo_meta["download_url"] = download_url
+            repo_meta["version"] = version
+
+        return Repository(**repo_meta)  # type: ignore
 
     @cached_property
     def _repo_data(self) -> Dict[str, Any]:
@@ -321,29 +303,3 @@ def _get_user(self, node: Dict[str, Any]) -> Person:
             name=node["name"],
             affiliations=orgs,
         )
-
-
-class GithubExtractorSchema(JsonLDSchema):
-    """This defines the schema used for json-ld serialization."""
-
-    _id = fields.Id()
-    path = fields.String(SDO.name)
-    author = fields.Nested(SDO.author, [PersonSchema, OrganizationSchema])
-    contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)
-    prog_langs = fields.List(SDO.programmingLanguage, fields.String)
-    download_url = fields.Raw(SDO.downloadUrl)
-    description = fields.String(SDO.description)
-    date_created = fields.Date(SDO.dateCreated)
-    date_modified = fields.Date(SDO.dateModified)
-    date_published = fields.Date(SDO.datePublished)
-    license = fields.List(SDO.license, fields.IRI)
-    url = fields.IRI(SDO.codeRepository)
-    # NOTE: parent_repository is not available for GitLab's GraphQL API in 2023. Add for GitLab when available
-    parent_repository = fields.IRI(SDO.isBasedOn)
-    keywords = fields.List(SDO.keywords, fields.String)
-    version = fields.String(SDO.version)
-
-    class Meta:
-        rdf_type = SDO.SoftwareSourceCode
-        model = GithubExtractor
-        add_value_types = False
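`GithubExtractor.extract()` now assembles a `Repository` from the GraphQL response instead of mutating extractor attributes, and `list_files()` points at `raw` file URLs. A sketch of direct use matching the updated tests, assuming a GitHub token is available in the environment and that `RemoteResource` exposes the `name` and `url` it is constructed with:

```python
from gimie.sources.github import GithubExtractor

extractor = GithubExtractor("https://github.com/SDSC-ORD/gimie")

meta = extractor.extract()
print(meta.serialize(format="ttl"))

for resource in extractor.list_files():
    # RemoteResource objects now carry .../raw/<branch>/<path> URLs.
    print(resource.name, resource.url)
```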
f"{self.url}/archive/refs/tags/{version}.tar.gz" + repo_meta["download_url"] = download_url + repo_meta["version"] = version + + return Repository(**repo_meta) # type: ignore @cached_property def _repo_data(self) -> Dict[str, Any]: @@ -321,29 +303,3 @@ def _get_user(self, node: Dict[str, Any]) -> Person: name=node["name"], affiliations=orgs, ) - - -class GithubExtractorSchema(JsonLDSchema): - """This defines the schema used for json-ld serialization.""" - - _id = fields.Id() - path = fields.String(SDO.name) - author = fields.Nested(SDO.author, [PersonSchema, OrganizationSchema]) - contributors = fields.Nested(SDO.contributor, PersonSchema, many=True) - prog_langs = fields.List(SDO.programmingLanguage, fields.String) - download_url = fields.Raw(SDO.downloadUrl) - description = fields.String(SDO.description) - date_created = fields.Date(SDO.dateCreated) - date_modified = fields.Date(SDO.dateModified) - date_published = fields.Date(SDO.datePublished) - license = fields.List(SDO.license, fields.IRI) - url = fields.IRI(SDO.codeRepository) - # NOTE: parent_repository is not available for GitLab's GraphQL API in 2023. Add for GitLab when available - parent_repository = fields.IRI(SDO.isBasedOn) - keywords = fields.List(SDO.keywords, fields.String) - version = fields.String(SDO.version) - - class Meta: - rdf_type = SDO.SoftwareSourceCode - model = GithubExtractor - add_value_types = False diff --git a/gimie/sources/gitlab.py b/gimie/sources/gitlab.py index 478a7e34..1d6d212e 100644 --- a/gimie/sources/gitlab.py +++ b/gimie/sources/gitlab.py @@ -8,17 +8,11 @@ from typing import Any, Dict, List, Optional, Union from urllib.parse import urlparse from dotenv import load_dotenv -from calamus import fields -from calamus.schema import JsonLDSchema -from rdflib import Graph -import tempfile -from gimie.graph.namespaces import SDO from gimie.io import RemoteResource from gimie.models import ( Organization, - OrganizationSchema, Person, - PersonSchema, + Repository, ) from gimie.sources.abstract import Extractor from gimie.sources.common.queries import send_graphql_query, send_rest_query @@ -42,27 +36,6 @@ class GitlabExtractor(Extractor): local_path: Optional[str] = None token: Optional[str] = None - name: Optional[str] = None - identifier: Optional[str] = None - author: Optional[List[Union[Organization, Person]]] = None - contributors: Optional[List[Person]] = None - prog_langs: Optional[List[str]] = None - description: Optional[str] = None - date_created: Optional[datetime] = None - date_modified: Optional[datetime] = None - date_published: Optional[datetime] = None - version: Optional[str] = None - keywords: Optional[List[str]] = None - source_organization: Optional[Organization] = None - download_url: Optional[str] = None - license: Optional[List[str]] = None - - def to_graph(self) -> Graph: - """Convert repository to RDF graph.""" - jd = GitlabExtractorSchema().dumps(self) - g: Graph = Graph().parse(format="json-ld", data=str(jd)) - g.bind("schema", SDO) - return g def list_files(self) -> List[RemoteResource]: """takes the root repository folder and returns the list of files present""" @@ -78,47 +51,48 @@ def list_files(self) -> List[RemoteResource]: file_list.append(file) return file_list - def extract(self): + def extract(self) -> Repository: """Extract metadata from target Gitlab repository.""" # fetch metadata data = self._repo_data - # Each Gitlab project has a unique identifier (integer) - self.identifier = urlparse(data["id"]).path.split("/")[2] - # at the moment, Gimie fetches only the group 
diff --git a/tests/test_git.py b/tests/test_git.py
index 75548489..8d487cff 100644
--- a/tests/test_git.py
+++ b/tests/test_git.py
@@ -1,11 +1,14 @@
 """Tests for the Gimie command line interface."""
 import os
-from gimie.sources.git import GitExtractor
-from gimie.project import Project
 import datetime
+
 import pytest
+
 from gimie.graph.namespaces import GIMIE
+from gimie.io import LocalResource
+from gimie.sources.git import GitExtractor
+from gimie.project import Project
 
 LOCAL_REPOSITORY = os.getcwd()
 RENKU_GITHUB = "https://github.com/SwissDataScienceCenter/renku"
@@ -15,16 +18,16 @@
 @pytest.fixture
 def local_meta():
     """Return metadata for a local repository."""
-    meta = GitExtractor(
+    extractor = GitExtractor(
         "https://github.com/SDSC-ORD/gimie", local_path=LOCAL_REPOSITORY
     )
-    meta.extract()
-    return meta
+    return extractor.extract()
 
 
 def test_git_authors(local_meta):
     """Test part of the authors returned by gimie."""
     contribs = [c.name for c in local_meta.contributors]
+    author = local_meta.authors[0]
     names = [
         "cmdoret",
         "Martin Nathan Tristan Fontanet",
@@ -32,7 +35,7 @@ def test_git_authors(local_meta):
         "sabrinaossey",
     ]
     assert all([n in contribs for n in names])
-    assert local_meta.author.name == "Cyril Matthey-Doret"
+    assert author.name == "Cyril Matthey-Doret"
 
 
 def test_git_creation_date(local_meta):
@@ -47,18 +50,26 @@ def test_git_creation_date(local_meta):
 def test_set_uri():
     meta = GitExtractor(
         "https://example.com/test", local_path=LOCAL_REPOSITORY
-    )
-    meta.extract()
+    ).extract()
     assert meta._id == "https://example.com/test"
 
 
 def test_clone_extract_github():
     """Clone Git repository by setting git extractor explicitely
     and extract metadata locally."""
-    meta = Project(RENKU_GITHUB, sources="git")
+    proj = Project(RENKU_GITHUB, sources="git")
+    assert type(proj.extractors[0]) == GitExtractor
+    proj.extract()
 
 
 def test_clone_unsupported():
     """Instantiate Project from unsupported provider
     with git as default provider"""
-    meta = Project(UNSUPPORTED_PROV)
+    proj = Project(UNSUPPORTED_PROV)
+    assert type(proj.extractors[0]) == GitExtractor
+    proj.extract()
+
+
+def test_git_list_files():
+    files = GitExtractor(UNSUPPORTED_PROV).list_files()
+    assert all(isinstance(f, LocalResource) for f in files)
diff --git a/tests/test_github.py b/tests/test_github.py
index f67e533d..053b8c65 100644
--- a/tests/test_github.py
+++ b/tests/test_github.py
@@ -1,4 +1,5 @@
 # Tests fetching metadata from GitHub repositories with different setups.
+from gimie.io import RemoteResource
 from gimie.sources.github import GithubExtractor
 import pytest
 
@@ -12,7 +13,12 @@
 
 @pytest.mark.parametrize("repo", TEST_REPOS)
-def test_github_extractor(repo):
-    meta = GithubExtractor(repo)
-    meta.extract()
+def test_github_extract(repo):
+    meta = GithubExtractor(repo).extract()
     meta.serialize(format="ttl")
+
+
+@pytest.mark.parametrize("repo", TEST_REPOS)
+def test_github_list_files(repo):
+    files = GithubExtractor(repo).list_files()
+    assert all(isinstance(f, RemoteResource) for f in files)
diff --git a/tests/test_gitlab.py b/tests/test_gitlab.py
index 84807a76..8c2b32b8 100644
--- a/tests/test_gitlab.py
+++ b/tests/test_gitlab.py
@@ -1,3 +1,4 @@
+from gimie.io import RemoteResource
 from gimie.sources.gitlab import GitlabExtractor
 import pytest
 
@@ -12,7 +13,13 @@
 
 @pytest.mark.parametrize("repo", TEST_REPOS)
-def test_gitlab_extractor(repo):
-    meta = GitlabExtractor(repo)
-    meta.extract()
+def test_gitlab_extract(repo):
+    extractor = GitlabExtractor(repo)
+    meta = extractor.extract()
     meta.serialize(format="ttl")
+
+
+@pytest.mark.parametrize("repo", TEST_REPOS)
+def test_gitlab_list_files(repo):
+    files = GitlabExtractor(repo).list_files()
+    assert all(isinstance(f, RemoteResource) for f in files)
diff --git a/tests/test_output.py b/tests/test_output.py
index 144949b4..85758bba 100644
--- a/tests/test_output.py
+++ b/tests/test_output.py
@@ -22,9 +22,11 @@
 from gimie.project import Project
 
-OUT_TTL = Project(
-    "https://github.com/SDSC-ORD/gimie", sources=["github"]
-).serialize(format="ttl")
+OUT_TTL = (
+    Project("https://github.com/SDSC-ORD/gimie", sources=["github"])
+    .extract()
+    .serialize(format="ttl")
+)
 
 
 def test_validate_output_is_linked_data():