Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: simplify extractor #96

Merged
merged 19 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion gimie/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ def data(

The output is sent to stdout, and turtle is used as the default serialization format."""
proj = Project(url, base_url=base_url)
print(proj.serialize(format=format))
repo_meta = proj.extract()
print(repo_meta.serialize(format=format))


@app.command()
Expand Down
79 changes: 77 additions & 2 deletions gimie/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
"""Data models to represent nodes in the graph generated by gimie."""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import datetime
import datetime
from typing import Optional, List
from typing import List, Optional, Union

from calamus.schema import JsonLDSchema
from calamus import fields
from rdflib import Graph

from gimie.graph.namespaces import SDO

Expand All @@ -42,7 +44,7 @@ class Release:
"""

tag: str = field(compare=False)
date: datetime.datetime = field(compare=True)
date: datetime = field(compare=True)
commit_hash: str = field(compare=False)


Expand Down Expand Up @@ -103,3 +105,76 @@ class PersonSchema(JsonLDSchema):
class Meta:
rdf_type = SDO.Person
model = Person


@dataclass
class Repository:
    """Metadata describing a git repository as a whole.

    Only repository-level facts are stored here; nothing about the files
    contained in the repository is represented.
    Modelled after https://schema.org/SoftwareSourceCode
    """

    # Required: where the repository lives and what it is called.
    url: str
    name: str

    # Optional metadata; None means "not known / not extracted".
    authors: Optional[List[Union[Organization, Person]]] = None
    contributors: Optional[List[Person]] = None
    date_created: Optional[datetime] = None
    date_modified: Optional[datetime] = None
    date_published: Optional[datetime] = None
    description: Optional[str] = None
    download_url: Optional[str] = None
    identifier: Optional[str] = None
    keywords: Optional[List[str]] = None
    licenses: Optional[List[str]] = None
    parent_repository: Optional[str] = None
    prog_langs: Optional[List[str]] = None
    version: Optional[str] = None

    @property
    def _id(self) -> str:
        """Identifier of the repository node: the repository URL."""
        return self.url

    def to_graph(self) -> Graph:
        """Return an RDF graph built from the JSON-LD dump of this instance."""
        payload = str(RepositorySchema().dumps(self))
        graph: Graph = Graph().parse(data=payload, format="json-ld")
        # Register the "schema" prefix for the SDO namespace on the graph.
        graph.bind("schema", SDO)
        return graph

    def serialize(self, format: str = "ttl", **kwargs) -> str:
        """Serialize the graph of this instance in the requested format.

        Extra keyword arguments are forwarded to the graph's ``serialize``.
        """
        graph = self.to_graph()
        return graph.serialize(format=format, **kwargs)  # type: ignore

    def jsonld(self) -> str:
        """Shortcut for ``serialize`` with the "json-ld" format."""
        return self.serialize("json-ld")


class RepositorySchema(JsonLDSchema):
    """This defines the schema used for json-ld serialization.

    Each attribute below is named after a field of `Repository` and is
    associated with a property from the SDO namespace.
    """

    # NOTE(review): presumably populated from `Repository._id` -- confirm
    # against the calamus `fields.Id` documentation.
    _id = fields.Id()
    # Authors may be dumped with either the person or organization schema.
    authors = fields.Nested(
        SDO.author, [PersonSchema, OrganizationSchema], many=True
    )
    contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)
    date_created = fields.Date(SDO.dateCreated)
    date_modified = fields.Date(SDO.dateModified)
    date_published = fields.Date(SDO.datePublished)
    description = fields.String(SDO.description)
    download_url = fields.IRI(SDO.downloadUrl)
    identifier = fields.String(SDO.identifier)
    keywords = fields.List(SDO.keywords, fields.String)
    licenses = fields.List(SDO.license, fields.IRI)
    name = fields.String(SDO.name)
    parent_repository = fields.IRI(SDO.isBasedOn)
    prog_langs = fields.List(SDO.programmingLanguage, fields.String)
    url = fields.IRI(SDO.codeRepository)
    version = fields.String(SDO.version)

    class Meta:
        # Nodes are typed as SDO.SoftwareSourceCode and bound to `Repository`.
        rdf_type = SDO.SoftwareSourceCode
        model = Repository
        # NOTE(review): exact effect of this flag is defined by calamus --
        # confirm against its documentation.
        add_value_types = False
18 changes: 8 additions & 10 deletions gimie/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from urllib.parse import urlparse

from gimie.graph.operations import combine_graphs
from gimie.models import Repository
from gimie.utils import validate_url
from gimie.sources import SOURCES
from gimie.sources.abstract import Extractor
Expand All @@ -48,6 +49,7 @@ class Project:
Examples
--------
>>> proj = Project("https://github.com/SDSC-ORD/gimie")
>>> assert isinstance(proj.extract(), Graph)
"""

def __init__(
Expand All @@ -71,8 +73,12 @@ def __init__(
self.project_dir = path

self.extractors = self.get_extractors(sources)
for ex in self.extractors:
ex.extract()

def extract(self) -> Graph:
    """Run every extractor and merge their metadata into one graph.

    All extractors are run first; each result is then converted to a
    graph and the graphs are merged with ``combine_graphs``.
    """
    extracted = []
    for extractor in self.extractors:
        extracted.append(extractor.extract())
    return combine_graphs(*(meta.to_graph() for meta in extracted))

def clone(self, url: str) -> str:
"""Clone target url in a new temporary directory"""
Expand All @@ -96,14 +102,6 @@ def get_extractors(self, sources: Iterable[str]) -> List[Extractor]:

return extractors

def to_graph(self) -> Graph:
graphs = map(lambda ex: ex.to_graph(), self.extractors)
combined_graph = combine_graphs(*graphs)
return combined_graph

def serialize(self, format: str = "ttl", **kwargs):
return self.to_graph().serialize(format=format, **kwargs)

def cleanup(self):
"""Recursively delete the project. Only works
for remote (i.e. cloned) projects."""
Expand Down
27 changes: 5 additions & 22 deletions gimie/sources/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
from abc import ABC, abstractmethod
from typing import List, Optional


from rdflib import Graph
from urllib.parse import urlparse
from gimie.sources.common.license import get_license_url, is_license_path

from gimie.io import Resource
from gimie.models import Repository
from gimie.sources.common.license import get_license_url, is_license_path


class Extractor(ABC):
Expand All @@ -44,32 +44,14 @@ def __init__(
self.local_path = local_path

@abstractmethod
def extract(self):
def extract(self) -> Repository:
"""Extract metadata"""
...

@abstractmethod
def to_graph(self) -> Graph:
"""Generate an RDF graph from the instance"""
return Graph()

def list_files(self) -> List[Resource]:
"""List all files in the repository HEAD."""
...

def serialize(self, format: str = "ttl", **kwargs) -> str:
"""Serialize the RDF graph representing the instance."""
return self.to_graph().serialize(format=format, **kwargs) # type: ignore

def jsonld(self) -> str:
"""Alias for jsonld serialization."""
return self.serialize(format="json-ld")

@property
def _id(self) -> str:
"""Unique identifier for the repository."""
return self.url

@property
def path(self) -> str:
"""Path to the repository without the base URL."""
Expand All @@ -87,6 +69,7 @@ def base(self) -> str:

def _get_licenses(self) -> List[str]:
"""Extracts SPDX License URLs from the repository."""
# TODO: Move functionality into a dedicated Parser
license_files = filter(
lambda p: is_license_path(p.name), self.list_files()
)
Expand Down
88 changes: 41 additions & 47 deletions gimie/sources/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,17 @@
"""Extractor which uses a locally available (usually cloned) repository."""
from dataclasses import dataclass
from datetime import datetime
from functools import cached_property
import shutil
import tempfile
from typing import List, Optional
import uuid

from calamus import fields
from calamus.schema import JsonLDSchema
import git
import pydriller
from rdflib import Graph

from gimie.io import LocalResource
from gimie.graph.namespaces import SDO
from gimie.models import Person, PersonSchema
from gimie.models import Person, Repository
from gimie.sources.abstract import Extractor
from pathlib import Path

Expand Down Expand Up @@ -59,42 +58,49 @@ class GitExtractor(Extractor):
base_url: Optional[str] = None
local_path: Optional[str] = None

author: Optional[Person] = None
contributors: Optional[List[Person]] = None
date_created: Optional[datetime] = None
date_modified: Optional[datetime] = None
license: Optional[List[str]] = None

def extract(self) -> Repository:
    """Collect repository metadata from the git history.

    Returns:
        A `Repository` populated with the creator (as sole author),
        contributors, creation and modification dates, name, licenses
        and URL of this extractor's repository.
    """
    self.repository = self._repo_data

    # Assuming author is the first person to commit
    creator = self._get_creator()

    return Repository(
        # `_get_creator` is annotated Optional[Person]; avoid producing
        # `[None]`, which is not a valid list of authors.
        authors=[creator] if creator is not None else None,
        contributors=self._get_contributors(),
        date_created=self._get_creation_date(),
        date_modified=self._get_modification_date(),
        name=self.path,
        licenses=self._get_licenses(),
        url=self.url,
    )

def list_files(self) -> List[LocalResource]:
    """List all regular files of the repository, excluding ``.git``.

    Returns:
        One `LocalResource` per file found recursively under
        ``self.local_path``, in the order yielded by ``Path.rglob``.
    """
    # Accessing `_repo_data` also sets `self.local_path` when it was None.
    self.repository = self._repo_data
    root = Path(self.local_path)  # type: ignore

    # The ".git" test must be made on the path relative to the repository
    # root: the first component of the full path is the root itself (or
    # "/"), so comparing it with ".git" would never match.
    return [
        LocalResource(path)
        for path in root.rglob("*")
        if path.is_file() and path.relative_to(root).parts[0] != ".git"
    ]

def to_graph(self) -> Graph:
"""Generate an RDF graph from the instance"""
jd = GitExtractorSchema().dumps(self)
g: Graph = Graph().parse(data=str(jd), format="json-ld")
g.bind("schema", SDO)
return g
def __del__(self):
    """Remove the temporary clone created by this instance, if any.

    Only a directory that `_repo_data` cloned itself is deleted; a
    `local_path` supplied by the caller is never touched.
    """
    # getattr: the attribute is absent when nothing was cloned or when the
    # instance was only partially initialised.
    cloned_path = getattr(self, "_cloned_path", None)
    if cloned_path is not None:
        # Best effort: a destructor cannot propagate errors anyway.
        shutil.rmtree(cloned_path, ignore_errors=True)

@cached_property
def _repo_data(self) -> pydriller.Repository:
    """Get the repository data by accessing local data or cloning.

    When no `local_path` is set, the repository at `self.url` is cloned
    into a fresh temporary directory, which becomes `self.local_path` and
    is recorded in `self._cloned_path` so that `__del__` can remove it.
    """
    if self.local_path is None:
        # mkdtemp (unlike a discarded TemporaryDirectory object) returns a
        # directory that stays in place until it is removed explicitly.
        self._cloned_path = tempfile.mkdtemp()
        self.local_path = self._cloned_path
        git.Repo.clone_from(self.url, self.local_path)  # type: ignore
    return pydriller.Repository(self.local_path)

def _get_contributors(self) -> List[Person]:
"""Get the authors of the repository."""
Expand All @@ -113,12 +119,14 @@ def _get_creation_date(self) -> Optional[datetime]:

def _get_modification_date(self) -> Optional[datetime]:
    """Get the last modification date of the repository.

    Returns:
        The ``author_date`` of the last commit yielded by
        ``self.repository.traverse_commits()``, or None when no commit
        is yielded.
    """
    last_commit = None
    # Exhaust the iterator; only the final commit is of interest.
    for last_commit in self.repository.traverse_commits():
        pass
    # No `return` inside a `finally` clause: errors raised while
    # traversing the history must reach the caller, not be discarded.
    return last_commit.author_date if last_commit is not None else None

def _get_creator(self) -> Optional[Person]:
"""Get the creator of the repository."""
Expand All @@ -136,24 +144,10 @@ def _dev_to_person(
uid = str(uuid.uuid4())
else:
uid = name.replace(" ", "_").lower()
dev_id = f"{self._id}/{uid}"
dev_id = f"{self.url}/{uid}"
return Person(
_id=dev_id,
identifier=uid,
name=name,
email=email,
)


class GitExtractorSchema(JsonLDSchema):
_id = fields.Id()
author = fields.Nested(SDO.author, PersonSchema)
contributors = fields.Nested(SDO.contributor, PersonSchema, many=True)
date_created = fields.Date(SDO.dateCreated)
date_modified = fields.Date(SDO.dateModified)
license = fields.List(SDO.license, fields.IRI)

class Meta:
rdf_type = SDO.SoftwareSourceCode
model = GitExtractor
add_value_types = False
Loading