Skip to content

Commit

Permalink
refactor: simplify extractor (#96)
Browse files Browse the repository at this point in the history
* refactor(models): add Repository model and schema

* refactor(sources): rm RDF methods from Extractor interface

* refactor(github): rdf funcs GithubExtractor->Repository

* refactor(project): Project interface match new Extractor

* refactor(cli): use updated Project interface

* style(models): sort Repository attributes

* refactor(git): use Repository model in GitExtractor

* refactor(gitlab): use Repository model in GitlabExtractor

* style(github): sort Repository args

* refactor(models): allow multiple authors

* test: adapt to new Project interface

* fix(git): clone repo upon calling extract() if no local path is provided

* fix(github): use raw url in list files, not blob

* test: cover list_files methods

* fix(models): repository.download URL as RDF IRI

* fix(git): add cloned repo cleanup in destructor

* fix(git): safe cleanup of cloned dir + rm cleanup from project

* style: trim end of lines/files + black formatting

* fix(git): additional safety check on cleanup
  • Loading branch information
cmdoret authored Oct 24, 2023
1 parent 012475b commit 49b47c5
Show file tree
Hide file tree
Showing 14 changed files with 264 additions and 299 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/conventional-prs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
validateSingleCommit: true
validateSingleCommit: true
10 changes: 5 additions & 5 deletions .github/workflows/sphinx-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
pull_request:
paths:
- 'docs/**'

permissions:
contents: write
jobs:
Expand All @@ -15,10 +15,10 @@ jobs:
steps:
# https://github.com/actions/checkout
- uses: actions/checkout@v4

# https://github.com/actions/setup-python
- uses: actions/setup-python@v4

# https://github.com/snok/install-poetry
- name: Install Poetry
uses: snok/install-poetry@v1
Expand All @@ -37,10 +37,10 @@ jobs:
steps:
# https://github.com/actions/checkout
- uses: actions/checkout@v4

# https://github.com/actions/setup-python
- uses: actions/setup-python@v4

# https://github.com/snok/install-poetry
- name: Install Poetry
uses: snok/install-poetry@v1
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Scientific code repositories contain valuable metadata which can be used to enri

----------------------------------------------------------------------

Using Gimie: easy peasy, it's a 3 step process.
Using Gimie: easy peasy, it's a 3 step process.

## STEP 1: Installation

Expand All @@ -37,11 +37,11 @@ docker run -e ACCESS_TOKEN=$ACCESS_TOKEN ghcr.io/sdsc-ord/gimie:latest gimie dat

## STEP 2 : Set your credentials

In order to access the github api, you need to provide a github token with the `read:org` scope.
In order to access the github api, you need to provide a github token with the `read:org` scope.

### A. Create access tokens

New to access tokens? Or don't know how to get your Github / Gitlab token ?
New to access tokens? Or don't know how to get your Github / Gitlab token ?

Have no fear, see
[here for Github tokens](https://docs.github.com/en/enterprise-server@3.4/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) and [here for Gitlab tokens](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html).
Expand All @@ -68,7 +68,7 @@ export GITLAB_TOKEN=
```shell
gimie data https://github.com/numpy/numpy
```
(want a Gitlab project instead? Just replace the URL in the command line)
(want a Gitlab project instead? Just replace the URL in the command line)

### As a python library

Expand Down Expand Up @@ -132,7 +132,7 @@ run checks:
```shell
make check
```
For easier use of the GitHub/GitLab APIs, place your access tokens in the `.env` file (and don't worry, the `.gitignore` will ignore them when you push to GitHub):
For easier use of the GitHub/GitLab APIs, place your access tokens in the `.env` file (and don't worry, the `.gitignore` will ignore them when you push to GitHub):

```
cp .env.dist .env
Expand Down
3 changes: 2 additions & 1 deletion gimie/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ def data(
The output is sent to stdout, and turtle is used as the default serialization format."""
proj = Project(url, base_url=base_url)
print(proj.serialize(format=format))
repo_meta = proj.extract()
print(repo_meta.serialize(format=format))


@app.command()
Expand Down
79 changes: 77 additions & 2 deletions gimie/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
"""Data models to represent nodes in the graph generated by gimie."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
import datetime
from typing import Optional, List
from typing import List, Optional, Union

from calamus.schema import JsonLDSchema
from calamus import fields
from rdflib import Graph

from gimie.graph.namespaces import SDO

Expand All @@ -42,7 +44,7 @@ class Release:
"""

tag: str = field(compare=False)
date: datetime.datetime = field(compare=True)
date: datetime = field(compare=True)
commit_hash: str = field(compare=False)


Expand Down Expand Up @@ -103,3 +105,76 @@ class PersonSchema(JsonLDSchema):
class Meta:
rdf_type = SDO.Person
model = Person


@dataclass
class Repository:
    """Metadata record describing a single git repository.

    Only repository-level metadata is held here; nothing about the
    contents of the repository is represented.
    See https://schema.org/SoftwareSourceCode
    """

    # Mandatory fields.
    url: str
    name: str

    # Optional metadata; each field is None when not provided.
    authors: Optional[List[Union[Organization, Person]]] = None
    contributors: Optional[List[Person]] = None
    date_created: Optional[datetime] = None
    date_modified: Optional[datetime] = None
    date_published: Optional[datetime] = None
    description: Optional[str] = None
    download_url: Optional[str] = None
    identifier: Optional[str] = None
    keywords: Optional[List[str]] = None
    licenses: Optional[List[str]] = None
    parent_repository: Optional[str] = None
    prog_langs: Optional[List[str]] = None
    version: Optional[str] = None

    @property
    def _id(self) -> str:
        """Return the repository URL, which serves as its unique identifier."""
        return self.url

    def to_graph(self) -> Graph:
        """Build an RDF graph from this repository.

        The instance is dumped to JSON-LD with RepositorySchema, the result
        is parsed into a new Graph, and the "schema" prefix is bound to SDO.
        """
        payload = str(RepositorySchema().dumps(self))
        graph: Graph = Graph().parse(data=payload, format="json-ld")
        graph.bind("schema", SDO)
        return graph

    def serialize(self, format: str = "ttl", **kwargs) -> str:
        """Return the RDF graph of this instance serialized as `format`.

        Extra keyword arguments are forwarded to the graph's serialize().
        """
        graph = self.to_graph()
        return graph.serialize(format=format, **kwargs)  # type: ignore

    def jsonld(self) -> str:
        """Shortcut for serializing the instance as JSON-LD."""
        return self.serialize("json-ld")


class RepositorySchema(JsonLDSchema):
    """JSON-LD schema mapping Repository attributes to schema.org terms."""

    # Node identifier, read from the model's `_id` attribute.
    _id = fields.Id()

    # Nested nodes: authors may be persons or organizations.
    authors = fields.Nested(
        SDO.author,
        [PersonSchema, OrganizationSchema],
        many=True,
    )
    contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)

    # Dates.
    date_created = fields.Date(SDO.dateCreated)
    date_modified = fields.Date(SDO.dateModified)
    date_published = fields.Date(SDO.datePublished)

    # Literal and IRI properties.
    description = fields.String(SDO.description)
    download_url = fields.IRI(SDO.downloadUrl)
    identifier = fields.String(SDO.identifier)
    keywords = fields.List(SDO.keywords, fields.String)
    licenses = fields.List(SDO.license, fields.IRI)
    name = fields.String(SDO.name)
    parent_repository = fields.IRI(SDO.isBasedOn)
    prog_langs = fields.List(SDO.programmingLanguage, fields.String)
    url = fields.IRI(SDO.codeRepository)
    version = fields.String(SDO.version)

    class Meta:
        # Serialized nodes are typed as schema.org SoftwareSourceCode.
        rdf_type = SDO.SoftwareSourceCode
        model = Repository
        add_value_types = False
43 changes: 7 additions & 36 deletions gimie/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from urllib.parse import urlparse

from gimie.graph.operations import combine_graphs
from gimie.models import Repository
from gimie.utils import validate_url
from gimie.sources import SOURCES
from gimie.sources.abstract import Extractor
Expand All @@ -48,6 +49,7 @@ class Project:
Examples
--------
>>> proj = Project("https://github.com/SDSC-ORD/gimie")
>>> assert isinstance(proj.extract(), Graph)
"""

def __init__(
Expand All @@ -63,24 +65,16 @@ def __init__(
self._cloned = False
if validate_url(path):
self.url = path
# We only need to clone a remote project
# if a local extractor is enabled
if any(map(is_local_source, sources)):
self.project_dir = self.clone(path)
else:
self.project_dir = path

self.extractors = self.get_extractors(sources)
for ex in self.extractors:
ex.extract()

def clone(self, url: str) -> str:
"""Clone target url in a new temporary directory"""
target_dir = TemporaryDirectory().name
cloned = git.Repo.clone_from(url, target_dir) # type: ignore
self._cloned = True

return str(cloned.working_tree_dir)
def extract(self) -> Graph:
repos = [ex.extract() for ex in self.extractors]
graphs = [repo.to_graph() for repo in repos]
graph = combine_graphs(*graphs)
return graph

def get_extractors(self, sources: Iterable[str]) -> List[Extractor]:

Expand All @@ -96,29 +90,6 @@ def get_extractors(self, sources: Iterable[str]) -> List[Extractor]:

return extractors

def to_graph(self) -> Graph:
graphs = map(lambda ex: ex.to_graph(), self.extractors)
combined_graph = combine_graphs(*graphs)
return combined_graph

def serialize(self, format: str = "ttl", **kwargs):
return self.to_graph().serialize(format=format, **kwargs)

def cleanup(self):
"""Recursively delete the project. Only works
for remote (i.e. cloned) projects."""
try:
tempdir = gettempdir()
if self.project_dir is not None:
in_temp = self.project_dir.startswith(tempdir)
if self._cloned and in_temp:
shutil.rmtree(self.project_dir)
except AttributeError:
pass

def __del__(self):
self.cleanup()


def split_git_url(url) -> Tuple[str, str]:
base_url = urlparse(url).scheme + "://" + urlparse(url).netloc
Expand Down
27 changes: 5 additions & 22 deletions gimie/sources/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
from abc import ABC, abstractmethod
from typing import List, Optional


from rdflib import Graph
from urllib.parse import urlparse
from gimie.sources.common.license import get_license_url, is_license_path

from gimie.io import Resource
from gimie.models import Repository
from gimie.sources.common.license import get_license_url, is_license_path


class Extractor(ABC):
Expand All @@ -44,32 +44,14 @@ def __init__(
self.local_path = local_path

@abstractmethod
def extract(self):
def extract(self) -> Repository:
"""Extract metadata"""
...

@abstractmethod
def to_graph(self) -> Graph:
"""Generate an RDF graph from the instance"""
return Graph()

def list_files(self) -> List[Resource]:
"""List all files in the repository HEAD."""
...

def serialize(self, format: str = "ttl", **kwargs) -> str:
"""Serialize the RDF graph representing the instance."""
return self.to_graph().serialize(format=format, **kwargs) # type: ignore

def jsonld(self) -> str:
"""Alias for jsonld serialization."""
return self.serialize(format="json-ld")

@property
def _id(self) -> str:
"""Unique identifier for the repository."""
return self.url

@property
def path(self) -> str:
"""Path to the repository without the base URL."""
Expand All @@ -87,6 +69,7 @@ def base(self) -> str:

def _get_licenses(self) -> List[str]:
"""Extracts SPDX License URLs from the repository."""
# TODO: Move functionality into a dedicated Parser
license_files = filter(
lambda p: is_license_path(p.name), self.list_files()
)
Expand Down
Loading

0 comments on commit 49b47c5

Please sign in to comment.