Update the library for Data Package (v2) (#13)
* Added `sources.version`

* Added sources to DCAT mapper

* Added `contributor.given/familyName`

* Replaced `contributor.role` with `contributor.roles`

* Added `contributor.role` mapper

* Support `resource.url` (v0)

* Support `field.format` (v0)

* Renamed `IDict` to `IData`

* Rebased on `model_validator` for compat

* Updated Table Dialect

* Added `schema.fieldsMatch`

* Moved `field` model to its own folder

* Simplified foreignKey

* Better separate models/types

* Added `schema.uniqueKeys`

* Added `list` field type

* Added `constraints.jsonSchema`

* Support `groupChar` for integers

* Added exclusive constraints

* Support `primaryKey` from v1

* Improved compat code

* Support `foreignKeys` from v1

* Removed profile rules

* Removed metadata profile

* Added changelog

* Simplified dialect defaults

* Updated profiles

* Removed version from compat

* Rebased on `$schema` property

* Removed profile model

* Fixed plugin tests

* Updated actions

* Fixed tests

* Fixed typo

* Updated changelog

* Fixed CI tests

* Fixed CI tests
roll authored Apr 12, 2024
1 parent 8f1a9e9 commit 858a83e
Show file tree
Hide file tree
Showing 59 changed files with 8,028 additions and 670 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/general.yaml
@@ -33,7 +33,7 @@ jobs:
       - name: Prepare variables
         run: cp .env.example .env
       - name: Test software
-        run: hatch run ci:test +py=${{ matrix.py || matrix.python-version }}
+        run: hatch run +py=${{ matrix.py || matrix.python-version }} ci:test
       - name: Report coverage
         uses: codecov/codecov-action@v2

@@ -55,7 +55,7 @@ jobs:
         run: cp .env.example .env
       - name: Test software
         # https://stackoverflow.com/questions/9678408/cant-install-psycopg2-with-pip-in-virtualenv-on-mac-os-x-10-7
-        run: LDFLAGS=`echo $(pg_config --ldflags)` hatch run ci:test
+        run: LDFLAGS=`echo $(pg_config --ldflags)` hatch run +py=3.10 ci:test

   # Test (Windows)

@@ -74,7 +74,7 @@ jobs:
       - name: Prepare variables
         run: cp .env.example .env
       - name: Test software
-        run: hatch run ci:test
+        run: hatch run +py=3.10 ci:test

   # Deploy

2 changes: 1 addition & 1 deletion data/package-custom-profile.json
@@ -1,5 +1,5 @@
 {
-  "profile": "https://raw.githubusercontent.com/frictionlessdata/frictionless-py/main/data/profiles/required.json",
+  "$schema": "https://raw.githubusercontent.com/frictionlessdata/frictionless-py/main/data/profiles/required.json",
   "name": "name",
   "resources": [
     {
11 changes: 11 additions & 0 deletions docs/changelog.md
@@ -0,0 +1,11 @@
+# Changelog
+
+This document covers main `dplib-py` releases:
+
+## v0.7
+
+- Updated to Data Package (v2)
+
+## v0.6
+
+- Initial public release
2 changes: 1 addition & 1 deletion dplib/actions/dialect/check.py
@@ -8,7 +8,7 @@
 from ..metadata.check import check_metadata


-def check_dialect(dialect: Union[str, types.IDict, Dialect]) -> List[MetadataError]:
+def check_dialect(dialect: Union[str, types.IData, Dialect]) -> List[MetadataError]:
     """Check the validity of a Table Dialect descriptor

     This validates the descriptor against the JSON Schema profiles to ensure
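Note: a minimal usage sketch of `check_dialect` (not part of the diff; assumes the library is importable as `dplib`):

    from dplib.actions.dialect.check import check_dialect

    # An invalid descriptor yields MetadataError objects rather than raising
    errors = check_dialect({"delimiter": 1})
    print(errors[0].full_message)  # e.g. [/delimiter] 1 is not of type 'string'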
30 changes: 17 additions & 13 deletions dplib/actions/metadata/check.py
@@ -2,28 +2,32 @@

 from typing import List, Union

-from ... import types
+from ... import settings, types
 from ...errors.metadata import MetadataError
 from ...helpers.data import read_data
-from ...helpers.path import is_remote_path
-from ...helpers.profile import check_metadata_against_jsonschema, read_profile
-from ...models import Profile
+from ...helpers.profile import check_profile


 def check_metadata(
-    metadata: Union[str, types.IDict], *, type: str
+    metadata: Union[str, types.IData], *, type: types.IMetadataType
 ) -> List[MetadataError]:
     if isinstance(metadata, str):
         metadata = read_data(metadata)

-    # Base profile
-    profile = Profile.from_dict(read_profile(metadata_type=type))
-    errors = check_metadata_against_jsonschema(metadata, profile.jsonSchema)
+    # Get default profile
+    if type == "dialect":
+        default_profile = settings.PROFILE_DEFAULT_DIALECT
+    elif type == "package":
+        default_profile = settings.PROFILE_DEFAULT_PACKAGE
+    elif type == "resource":
+        default_profile = settings.PROFILE_DEFAULT_RESOURCE
+    elif type == "schema":
+        default_profile = settings.PROFILE_DEFAULT_SCHEMA
+    else:
+        raise ValueError(f"Invalid metadata type: {type}")

-    # Custom profile
-    custom_profile = metadata.get("profile")
-    if custom_profile and is_remote_path(custom_profile):
-        custom_profile = Profile.from_path(custom_profile)
-        errors += check_metadata_against_jsonschema(metadata, custom_profile.jsonSchema)
+    # Validate metadata
+    profile = metadata.get("$schema", default_profile)
+    errors = check_profile(metadata=metadata, profile=profile)

     return errors
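Note: a minimal sketch of the new profile resolution (not part of the diff; the descriptor is hypothetical). A descriptor may pin its own profile via `$schema`; otherwise the default profile for its metadata type applies:

    from dplib.actions.metadata.check import check_metadata

    descriptor = {"fields": [{"name": "id", "type": "integer"}]}
    errors = check_metadata(descriptor, type="schema")
    print([error.full_message for error in errors])  # [] when the descriptor is valid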
5 changes: 2 additions & 3 deletions dplib/actions/package/__spec__/test_check.py
@@ -23,9 +23,8 @@ def test_check_package_invalid_dereferencing():
     errors = check_package("data/package-invalid-dereferencing.json")
     assert len(errors) == 1
     error = errors[0]
-    assert (
-        error.full_message == "[/resources/0/dialect/delimiter] 1 is not of type 'string'"
-    )
+    # TODO: extend error path so it shows the full path from the package root
+    assert error.full_message == "[/delimiter] 1 is not of type 'string'"


 @pytest.mark.vcr
16 changes: 9 additions & 7 deletions dplib/actions/package/check.py
@@ -10,7 +10,7 @@
 from ..metadata.check import check_metadata


-def check_package(package: Union[str, types.IDict, Package]) -> List[MetadataError]:
+def check_package(package: Union[str, types.IData, Package]) -> List[MetadataError]:
     """Check the validity of a Data Package descriptor

     This validates the descriptor against the JSON Schema profiles to ensure
@@ -30,13 +30,15 @@ def check_package(package: Union[str, types.IData, Package]) -> List[MetadataError]:
         basepath = package.basepath
         package = package.to_dict()

-    # Dereference resources[].dialect/schema
+    # Validate (including nested descriptors)
+    errors = check_metadata(package, type="package")
     resources = package.get("resources", [])
     if isinstance(resources, list):
         for resource in resources:  # type: ignore
-            for name in ["dialect", "schema"]:
-                value = resource.get(name)  # type: ignore
-                if value and isinstance(value, str):
-                    resource[name] = read_data(value, basepath=basepath)
+            for type in ["dialect", "schema"]:
+                value = resource.get(type)  # type: ignore
+                if isinstance(value, str):
+                    metadata = read_data(value, basepath=basepath)
+                    errors.extend(check_metadata(metadata, type=type))  # type: ignore

-    return check_metadata(package, type="package")
+    return errors
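Note: a minimal usage sketch of the new nested validation (not part of the diff; the path is hypothetical). Dereferenced `dialect`/`schema` descriptors are now validated against their own profiles instead of being inlined into the package descriptor:

    from dplib.actions.package.check import check_package

    errors = check_package("data/package.json")
    for error in errors:
        print(error.full_message)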
3 changes: 2 additions & 1 deletion dplib/actions/resource/__spec__/test_check.py
@@ -21,7 +21,8 @@ def test_check_resource_invalid_dereferencing():
     errors = check_resource("data/resource-invalid-dereferencing.json")
     assert len(errors) == 1
     error = errors[0]
-    assert error.full_message == "[/dialect/delimiter] 1 is not of type 'string'"
+    # TODO: extend error path so it shows the full path from the resource root
+    assert error.full_message == "[/delimiter] 1 is not of type 'string'"


 def test_check_resource_from_model():
16 changes: 9 additions & 7 deletions dplib/actions/resource/check.py
@@ -10,7 +10,7 @@
 from ..metadata.check import check_metadata


-def check_resource(resource: Union[str, types.IDict, Resource]) -> List[MetadataError]:
+def check_resource(resource: Union[str, types.IData, Resource]) -> List[MetadataError]:
     """Check the validity of a Data Resource descriptor

     This validates the descriptor against the JSON Schema profiles to ensure
@@ -30,10 +30,12 @@ def check_resource(resource: Union[str, types.IData, Resource]) -> List[MetadataError]:
         basepath = resource.basepath
         resource = resource.to_dict()

-    # Dereference dialect/schema
-    for name in ["dialect", "schema"]:
-        value = resource.get(name)
-        if value and isinstance(value, str):
-            resource[name] = read_data(value, basepath=basepath)
+    # Validate (including nested descriptors)
+    errors = check_metadata(resource, type="resource")
+    for type in ["dialect", "schema"]:
+        value = resource.get(type)
+        if isinstance(value, str):
+            metadata = read_data(value, basepath=basepath)
+            errors.extend(check_metadata(metadata, type=type))  # type: ignore

-    return check_metadata(resource, type="resource")
+    return errors
2 changes: 1 addition & 1 deletion dplib/actions/schema/check.py
@@ -8,7 +8,7 @@
 from ..metadata.check import check_metadata


-def check_schema(schema: Union[str, types.IDict, Schema]) -> List[MetadataError]:
+def check_schema(schema: Union[str, types.IData, Schema]) -> List[MetadataError]:
     """Check the validity of a Table Schema descriptor

     This validates the descriptor against the JSON Schema profiles to ensure
10 changes: 5 additions & 5 deletions dplib/helpers/data.py
@@ -12,22 +12,22 @@

 def read_data(
     path: str, *, format: Optional[str] = None, basepath: Optional[str] = None
-) -> types.IDict:
+) -> types.IData:
     if not format:
         format = infer_format(path, raise_missing=True)
     text = read_file(path, basepath=basepath)
     data = load_data(text, format=format)
     return data


-def write_data(path: str, data: types.IDict, *, format: Optional[str] = None):
+def write_data(path: str, data: types.IData, *, format: Optional[str] = None):
     if not format:
         format = infer_format(path, raise_missing=True)
     text = dump_data(data, format=format)
     write_file(path, text)


-def load_data(text: str, *, format: str) -> types.IDict:
+def load_data(text: str, *, format: str) -> types.IData:
     try:
         if format == "json":
             return json.loads(text)
@@ -39,7 +39,7 @@ def load_data(text: str, *, format: str) -> types.IData:
         raise Error(f"Cannot load data from text with format: {format}")


-def dump_data(data: types.IDict, *, format: str) -> str:
+def dump_data(data: types.IData, *, format: str) -> str:
     try:
         if format == "json":
             return json.dumps(data, indent=2)
@@ -51,7 +51,7 @@ def dump_data(data: types.IData, *, format: str) -> str:
         raise Error(f"Cannot dump data to text with format: {format}")


-def clean_data(data: types.IDict):
+def clean_data(data: types.IData):
     for key, value in list(data.items()):
         if isinstance(value, dict):
             clean_data(value)  # type: ignore
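Note: a minimal round-trip sketch for the renamed `IData`-typed helpers (not part of the diff):

    from dplib.helpers.data import dump_data, load_data

    data = {"name": "example"}
    text = dump_data(data, format="json")
    assert load_data(text, format="json") == data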
69 changes: 42 additions & 27 deletions dplib/helpers/profile.py
@@ -6,44 +6,59 @@

 from jsonschema.validators import validator_for  # type: ignore

-from .. import types
+from .. import settings, types
 from ..error import Error
 from ..errors.metadata import MetadataError
 from .data import load_data
 from .file import read_file

-
-def select_profile(*, metadata_type: types.IMetadataType) -> str:
-    if metadata_type == "package":
-        return "data-package"
-    elif metadata_type == "resource":
-        return "data-resource"
-    elif metadata_type == "dialect":
-        return "table-dialect"
-    elif metadata_type == "schema":
-        return "table-schema"
-    raise Error(f'Invalid metadata type "{metadata_type}"')
+# TODO: implement additional user-side profile caching
+
+
+def check_profile(*, metadata: types.IData, profile: str) -> List[MetadataError]:
+    # Prepare validator
+    jsonSchema = read_profile(profile=profile)
+    Validator = validator_for(jsonSchema)  # type: ignore
+    validator = Validator(jsonSchema)  # type: ignore
+
+    # Validate metadata
+    errors: List[MetadataError] = []
+    for validation_error in validator.iter_errors(metadata):  # type: ignore
+        errors.append(MetadataError(validation_error))  # type: ignore
+
+    return errors


 @lru_cache
-def read_profile(*, metadata_type: types.IMetadataType) -> types.IDict:
-    format = "json"
-    name = select_profile(metadata_type=metadata_type)
-    path = os.path.join(os.path.dirname(__file__), "..", "profiles", f"{name}.{format}")
+def read_profile(*, profile: str) -> types.IData:
+    parts = parse_profile(profile)
+
+    # Replace with builtin copy
+    if parts:
+        version, filename = parts
+        profile = os.path.join(settings.PROFILE_BASEDIR, version, filename)
+
+    # Read jsonSchema
     try:
-        text = read_file(path)
-        data = load_data(text, format=format)
+        text = read_file(profile)
+        data = load_data(text, format="json")
     except Exception:
-        raise Error(f'Cannot read profile "{name}" at "{path}"')
+        raise Error(f'Cannot read profile: "{profile}"')

     return data


-def check_metadata_against_jsonschema(
-    metadata: types.IDict, jsonSchema: types.IDict
-) -> List[MetadataError]:
-    Validator = validator_for(jsonSchema)  # type: ignore
-    validator = Validator(jsonSchema)  # type: ignore
-    errors: List[MetadataError] = []
-    for validation_error in validator.iter_errors(metadata):  # type: ignore
-        errors.append(MetadataError(validation_error))  # type: ignore
-    return errors
+def parse_profile(profile: str):
+    parts = profile.rsplit("/", 2)
+
+    # Ensure builtin copy exists
+    if len(parts) != 3:
+        return None
+    if parts[0] != settings.PROFILE_BASEURL:
+        return None
+    if parts[1] not in os.listdir(settings.PROFILE_BASEDIR):
+        return None
+    if parts[2] not in os.listdir(os.path.join(settings.PROFILE_BASEDIR, parts[1])):
+        return None
+
+    return parts[1], parts[2]
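Note: a minimal sketch of how a registry profile URL resolves to a bundled copy (not part of the diff; the URL assumes `settings.PROFILE_BASEURL` points at the datapackage.org profile registry):

    from dplib.helpers.profile import parse_profile, read_profile

    # A known profile URL splits into (version, filename) and is then read
    # from the local PROFILE_BASEDIR; any other path or URL is read directly
    print(parse_profile("https://datapackage.org/profiles/2.0/datapackage.json"))
    # ('2.0', 'datapackage.json') when a bundled copy exists
    json_schema = read_profile(profile="https://datapackage.org/profiles/2.0/datapackage.json")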
7 changes: 3 additions & 4 deletions dplib/model.py
@@ -2,7 +2,6 @@

 import pprint
 import warnings
-from functools import cached_property
 from typing import Optional

 from pydantic import BaseModel
@@ -22,8 +21,8 @@ def __str__(self) -> str:
     def __repr__(self) -> str:
         return pprint.pformat(self.to_dict(), sort_dicts=False)

-    @cached_property
-    def custom(self) -> types.IDict:
+    @property
+    def custom(self) -> types.IData:
         assert self.model_extra is not None
         return self.model_extra

@@ -63,7 +62,7 @@ def to_dict(self):
         return data

     @classmethod
-    def from_dict(cls, data: types.IDict, *, basepath: Optional[str] = None) -> Self:
+    def from_dict(cls, data: types.IData, *, basepath: Optional[str] = None) -> Self:
         if basepath and cls.model_fields.get("basepath"):
             data["basepath"] = basepath
         return cls(**data)
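Note: a minimal sketch of the `custom` accessor, which exposes any extra, non-standard descriptor properties kept by the model (not part of the diff; `customProp` is a hypothetical key):

    from dplib.models import Dialect

    dialect = Dialect.from_dict({"delimiter": ";", "customProp": 1})
    print(dialect.custom)  # {'customProp': 1}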
4 changes: 2 additions & 2 deletions dplib/models/__init__.py
@@ -1,8 +1,8 @@
 from .contributor import Contributor
 from .dialect import Dialect
+from .field import Constraints, Field
 from .license import License
 from .package import Package
-from .profile import Profile
 from .resource import Resource
-from .schema import Constraints, Field, ForeignKey, ForeignKeyReference, Schema
+from .schema import ForeignKey, ForeignKeyReference, Schema
 from .source import Source
25 changes: 23 additions & 2 deletions dplib/models/contributor.py
@@ -1,11 +1,32 @@
-from typing import Optional
+from typing import List, Optional

+import pydantic
+
+from .. import types
 from ..model import Model


 class Contributor(Model):
     title: Optional[str] = None
+    givenName: Optional[str] = None
+    familyName: Optional[str] = None
     path: Optional[str] = None
     email: Optional[str] = None
-    role: Optional[str] = None
+    roles: List[str] = []
     organization: Optional[str] = None
+
+    # Compat
+
+    @pydantic.model_validator(mode="before")
+    @classmethod
+    def compat(cls, data: types.IData):
+        if not isinstance(data, dict):  # type: ignore
+            return data
+
+        # contributor.role
+        if not data.get("roles"):
+            role = data.pop("role", None)
+            if role:
+                data["roles"] = [role]
+
+        return data
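Note: a minimal sketch of the v1-to-v2 compat mapping (not part of the diff; the contributor values are hypothetical):

    from dplib.models import Contributor

    # A v1 descriptor's singular `role` is upgraded to the v2 `roles` list
    contributor = Contributor.from_dict({"title": "Jane Doe", "role": "author"})
    print(contributor.roles)  # ['author']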