From 5eec5a6de5325145c728c2e5200e79193b2bf935 Mon Sep 17 00:00:00 2001 From: davemfish Date: Tue, 16 Jul 2024 16:20:13 -0400 Subject: [PATCH 01/15] using dataclasses to define metadata schema. --- src/geometamaker/__init__.py | 1 + src/geometamaker/models.py | 195 +++++++++++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 src/geometamaker/models.py diff --git a/src/geometamaker/__init__.py b/src/geometamaker/__init__.py index 9f56a76..3b8ae6c 100644 --- a/src/geometamaker/__init__.py +++ b/src/geometamaker/__init__.py @@ -1 +1,2 @@ from .geometamaker import MetadataControl +from .geometamaker import MCF_SCHEMA diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py new file mode 100644 index 0000000..cddc7a6 --- /dev/null +++ b/src/geometamaker/models.py @@ -0,0 +1,195 @@ +import dataclasses +from dataclasses import dataclass, field +import logging +import os +import pprint + +import frictionless +import fsspec +import yaml + + +LOGGER = logging.getLogger(__name__) + +# https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml +class _NoAliasDumper(yaml.SafeDumper): + """Keep the yaml human-readable by avoiding anchors and aliases.""" + + def ignore_aliases(self, data): + return True + + +@dataclass +class ContactSchema: + """Class for keeping track of contact info.""" + + email: str = '' + organization: str = '' + individualname: str = '' + positionname: str = '' + + +@dataclass +class FieldSchema: + """metadata for a field in a table.""" + + # https://datapackage.org/standard/table-schema/ + name: str = '' + title: str = '' + type: str = '' + format: str = '' + example: any = None + description: str = '' + units: str = '' + + +@dataclass +class TableSchema: + """Class for metadata for tables.""" + + # https://datapackage.org/standard/table-schema/ + fields: list = field(default_factory=FieldSchema) + missingValues: list = field(default_factory=list) + primaryKey: list = field(default_factory=list) + foreignKeys: list = field(default_factory=list) + + # def get_field(): + + +@dataclass +class BandSchema: + """Class for metadata for a raster band.""" + + index: int = 1 + description: str = '' + + +@dataclass +class RasterSchema: + """Class for metadata for raster bands.""" + + bands: list = field(default_factory=BandSchema) + + +@dataclass(kw_only=True) +class Resource: + """Base class for metadata for a resource. + + https://datapackage.org/standard/data-resource/ + This class should be based on Data Package - Resource + specification. But we have some additional properties + that are important to us. + """ + + path: str = '' + type: str = '' + scheme: str = '' + encoding: str = '' + format: str = '' + mediatype: str = '' + bytes: int = 0 + hash: str = '' + name: str = '' + title: str = '' + description: str = '' + sources: list = field(default_factory=list) + # schema: dict = field(init=False) + licenses: list = field(default_factory=list) + contact: ContactSchema = ContactSchema() + + # def __post_init__(self): + # self.schema = + + +@dataclass(kw_only=True) +class TableResource(Resource): + """Class for metadata for a table resource.""" + + # without post-init, schema ends up as a dict, or whatever is passed in. + schema: TableSchema = field(default_factory=TableSchema) + # type: str = 'table' + + def __post_init__(self): + # Allow init of the resource with a schema of type + # TableSchema, or type dict. Mostly because dataclasses.replace + # calls init, but the base object will have already been initialized. 
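+        # For example, a doctest-style sketch of what this coercion and
+        # dataclasses.replace() do (class names as defined above; the
+        # values are hypothetical):
+        #
+        #   >>> import dataclasses
+        #   >>> res = TableResource(schema={'fields': []})  # dict, as from YAML
+        #   >>> isinstance(res.schema, TableSchema)
+        #   True
+        #   >>> res2 = dataclasses.replace(res, title='new')  # re-runs __init__
+        #   >>> res2.title                 # overridden fields refresh
+        #   'new'
+        #   >>> res2.path == res.path      # unspecified fields carry over
+        #   True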
+ if isinstance(self.schema, TableSchema): + return + self.schema = TableSchema(**self.schema) + + +class MetadataControl(object): + + def __init__(self, source_dataset_path=None): + if source_dataset_path is not None: + self.datasource = source_dataset_path + self.data_package_path = f'{self.datasource}.dp.yml' + + # Despite naming, this does not open a resource that must be closed + of = fsspec.open(self.datasource) + if not of.fs.exists(self.datasource): + raise FileNotFoundError(f'{self.datasource} does not exist') + + # TODO: check the filetype here and create the appropriate instance + # this is nice for autodetect of field types, but sometimes + # we will know the table schema (invest MODEL_SPEC). + # Is there any benefit to passing in the known schema? Maybe not + # Can also just overwrite the schema attribute with known data after. + description = frictionless.describe(source_dataset_path).to_dict() + # schema = TableSchema(**description['schema']) + # del description['schema'] + # resource = Resource(resource_dict) + + # Load existing metadata file + try: + with fsspec.open(self.data_package_path, 'r') as file: + yaml_string = file.read() + + # This validates the existing yaml against our dataclasses. + existing_resource = TableResource(**yaml.safe_load(yaml_string)) + # overwrite properties that are intrinsic to the dataset, + # which is everything from `description` other than schema. + del description['schema'] + self.metadata = dataclasses.replace( + existing_resource, **description) + + # Common path: metadata file does not already exist + except FileNotFoundError as err: + self.metadata = TableResource(description) + + def write(self, workspace=None): + """Write datapackage yaml to disk. + + This creates sidecar files with '.yml' + appended to the full filename of the data source. For example, + + - 'myraster.tif' + - 'myraster.tif.yml' + + Args: + workspace (str): if ``None``, files write to the same location + as the source data. If not ``None``, a path to a local directory + to write files. They will still be named to match the source + filename. Use this option if the source data is not on the local + filesystem. + + """ + if workspace is None: + target_path = self.data_package_path + else: + target_path = os.path.join( + workspace, f'{os.path.basename(self.datasource)}.dp.yml') + + with open(target_path, 'w') as file: + file.write(yaml.dump( + dataclasses.asdict(self.metadata), Dumper=_NoAliasDumper)) + + +if __name__ == "__main__": + # from natcap.invest import carbon + # arg_spec = carbon.MODEL_SPEC['args']['carbon_pools_path'] + + filepath = 'C:/Users/dmf/projects/geometamaker/data/carbon_pools.csv' + mc = MetadataControl(filepath) + pprint.pprint(dataclasses.asdict(mc.metadata)) + # mc.write() From 4753341bcb32249f0a425fe2ad4f021c2cd8ab21 Mon Sep 17 00:00:00 2001 From: davemfish Date: Wed, 17 Jul 2024 11:35:20 -0400 Subject: [PATCH 02/15] implemented a VectorResource class --- src/geometamaker/models.py | 115 ++++++++++++++++++++++++++++++++----- 1 file changed, 102 insertions(+), 13 deletions(-) diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index cddc7a6..de2a6ff 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -6,7 +6,9 @@ import frictionless import fsspec +import pygeoprocessing import yaml +from osgeo import gdal LOGGER = logging.getLogger(__name__) @@ -108,7 +110,7 @@ class TableResource(Resource): # without post-init, schema ends up as a dict, or whatever is passed in. 
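+    # A round-trip sketch (doctest-style; hypothetical values). Dumping with
+    # _NoAliasDumper keeps the YAML free of anchors/aliases, and passing the
+    # parsed dict back through __init__ re-validates its keys against these
+    # dataclasses:
+    #
+    #   >>> import dataclasses, yaml
+    #   >>> res = TableResource(title='carbon pools')
+    #   >>> text = yaml.dump(dataclasses.asdict(res), Dumper=_NoAliasDumper)
+    #   >>> loaded = TableResource(**yaml.safe_load(text))
+    #   >>> isinstance(loaded.schema, TableSchema)  # dict coerced back
+    #   True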
schema: TableSchema = field(default_factory=TableSchema) # type: str = 'table' - + def __post_init__(self): # Allow init of the resource with a schema of type # TableSchema, or type dict. Mostly because dataclasses.replace @@ -118,27 +120,110 @@ def __post_init__(self): self.schema = TableSchema(**self.schema) +@dataclass +class BoundingBox(): + + xmin: float + ymin: float + xmax: float + ymax: float + + +@dataclass +class SpatialSchema(): + + bounding_box: BoundingBox + crs: str + + +@dataclass(kw_only=True) +class VectorResource(TableResource): + """Class for metadata for a vector resource.""" + + spatial: SpatialSchema + + +@dataclass(kw_only=True) +class RasterResource(Resource): + """Class for metadata for a raster resource.""" + + spatial: SpatialSchema + + +def get_file_type(filepath): + # GDAL considers CSV a vector, so check against frictionless + # first + filetype = frictionless.describe(filepath).type + if filetype == 'table': + return filetype + gis_type = pygeoprocessing.get_gis_type(filepath) + if gis_type == pygeoprocessing.VECTOR_TYPE: + return 'vector' + if gis_type == pygeoprocessing.RASTER_TYPE: + return 'raster' + raise ValueError() + + +def describe_vector(source_dataset_path): + description = frictionless.describe(source_dataset_path).to_dict() + fields = [] + vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR) + layer = vector.GetLayer() + for fld in layer.schema: + fields.append( + FieldSchema(name=fld.name, type=fld.type)) + vector = layer = None + description['schema'] = TableSchema(fields=fields) + + info = pygeoprocessing.get_vector_info(source_dataset_path) + spatial = { + 'bounding_box': info['bounding_box'], + 'crs': info['projection_wkt'] + } + description['spatial'] = SpatialSchema(**spatial) + description['sources'] = info['file_list'] + return description + + +def describe_raster(source_dataset_path): + pass + + +def describe_table(source_dataset_path): + return frictionless.describe(source_dataset_path).to_dict() + + +DESRCIBE_FUNCS = { + 'table': describe_table, + 'vector': describe_vector, + 'raster': describe_raster +} + +RESOURCE_MODELS = { + 'table': TableResource, + 'vector': VectorResource, + 'raster': RasterResource +} + + class MetadataControl(object): - def __init__(self, source_dataset_path=None): - if source_dataset_path is not None: - self.datasource = source_dataset_path - self.data_package_path = f'{self.datasource}.dp.yml' + def __init__(self, source_dataset_path): + # if source_dataset_path is not None: + self.datasource = source_dataset_path + self.data_package_path = f'{self.datasource}.dp.yml' # Despite naming, this does not open a resource that must be closed of = fsspec.open(self.datasource) if not of.fs.exists(self.datasource): raise FileNotFoundError(f'{self.datasource} does not exist') - # TODO: check the filetype here and create the appropriate instance + resource_type = get_file_type(source_dataset_path) + description = DESRCIBE_FUNCS[resource_type](source_dataset_path) # this is nice for autodetect of field types, but sometimes # we will know the table schema (invest MODEL_SPEC). # Is there any benefit to passing in the known schema? Maybe not # Can also just overwrite the schema attribute with known data after. 
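+        # A dispatch sketch (hypothetical file paths): frictionless claims
+        # CSVs as 'table' before GDAL would call them vectors, and
+        # pygeoprocessing then distinguishes the GIS types:
+        #
+        #   >>> get_file_type('carbon_pools.csv')
+        #   'table'
+        #   >>> get_file_type('watershed.shp')
+        #   'vector'
+        #   >>> get_file_type('dem.tif')
+        #   'raster'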
- description = frictionless.describe(source_dataset_path).to_dict() - # schema = TableSchema(**description['schema']) - # del description['schema'] - # resource = Resource(resource_dict) # Load existing metadata file try: @@ -146,16 +231,19 @@ def __init__(self, source_dataset_path=None): yaml_string = file.read() # This validates the existing yaml against our dataclasses. - existing_resource = TableResource(**yaml.safe_load(yaml_string)) + existing_resource = RESOURCE_MODELS[resource_type]( + **yaml.safe_load(yaml_string)) # overwrite properties that are intrinsic to the dataset, # which is everything from `description` other than schema. + # Some parts of schema are intrinsic, but others are human-input + # so replace the whole thing for now. del description['schema'] self.metadata = dataclasses.replace( existing_resource, **description) # Common path: metadata file does not already exist except FileNotFoundError as err: - self.metadata = TableResource(description) + self.metadata = RESOURCE_MODELS[resource_type](**description) def write(self, workspace=None): """Write datapackage yaml to disk. @@ -189,7 +277,8 @@ def write(self, workspace=None): # from natcap.invest import carbon # arg_spec = carbon.MODEL_SPEC['args']['carbon_pools_path'] - filepath = 'C:/Users/dmf/projects/geometamaker/data/carbon_pools.csv' + # filepath = 'C:/Users/dmf/projects/geometamaker/data/carbon_pools.csv' + filepath = 'C:/Users/dmf/projects/geometamaker/data/watershed_gura.shp' mc = MetadataControl(filepath) pprint.pprint(dataclasses.asdict(mc.metadata)) # mc.write() From 0158a01cb757ba07028de5d206b958ebfad99346 Mon Sep 17 00:00:00 2001 From: davemfish Date: Wed, 17 Jul 2024 12:51:10 -0400 Subject: [PATCH 03/15] implemented a RasterResource --- src/geometamaker/models.py | 81 ++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index de2a6ff..7ed8978 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -21,6 +21,22 @@ def ignore_aliases(self, data): return True +@dataclass +class BoundingBox(): + + xmin: float + ymin: float + xmax: float + ymax: float + + +@dataclass +class SpatialSchema(): + + bounding_box: BoundingBox + crs: str + + @dataclass class ContactSchema: """Class for keeping track of contact info.""" @@ -62,7 +78,10 @@ class TableSchema: class BandSchema: """Class for metadata for a raster band.""" - index: int = 1 + index: int + gdal_type: int + numpy_type: str + nodata: int | float description: str = '' @@ -70,7 +89,9 @@ class BandSchema: class RasterSchema: """Class for metadata for raster bands.""" - bands: list = field(default_factory=BandSchema) + bands: list + pixel_size: list + raster_size: list @dataclass(kw_only=True) @@ -120,22 +141,6 @@ def __post_init__(self): self.schema = TableSchema(**self.schema) -@dataclass -class BoundingBox(): - - xmin: float - ymin: float - xmax: float - ymax: float - - -@dataclass -class SpatialSchema(): - - bounding_box: BoundingBox - crs: str - - @dataclass(kw_only=True) class VectorResource(TableResource): """Class for metadata for a vector resource.""" @@ -147,12 +152,24 @@ class VectorResource(TableResource): class RasterResource(Resource): """Class for metadata for a raster resource.""" + schema: RasterSchema spatial: SpatialSchema + def __post_init__(self): + # Allow init of the resource with a schema of type + # RasterSchema, or type dict. 
Mostly because dataclasses.replace + # calls init, but the base object will have already been initialized. + if isinstance(self.schema, RasterSchema): + return + self.schema = RasterSchema(**self.schema) + def get_file_type(filepath): + # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters, + # we'll want a different data model for multi-dimensional arrays. + # GDAL considers CSV a vector, so check against frictionless - # first + # first. filetype = frictionless.describe(filepath).type if filetype == 'table': return filetype @@ -186,7 +203,28 @@ def describe_vector(source_dataset_path): def describe_raster(source_dataset_path): - pass + description = frictionless.describe(source_dataset_path).to_dict() + + bands = [] + info = pygeoprocessing.get_raster_info(source_dataset_path) + for i in range(info['n_bands']): + b = i + 1 + # band = raster.GetRasterBand(b) + # datatype = 'integer' if band.DataType < 6 else 'number' + bands.append(BandSchema( + index=b, + gdal_type=info['datatype'], + numpy_type=info['numpy_type'], + nodata=info['nodata'][i])) + description['schema'] = RasterSchema( + bands=bands, + pixel_size=info['pixel_size'], + raster_size=info['raster_size']) + description['spatial'] = SpatialSchema( + bounding_box=info['bounding_box'], + crs=info['projection_wkt']) + description['sources'] = info['file_list'] + return description def describe_table(source_dataset_path): @@ -278,7 +316,8 @@ def write(self, workspace=None): # arg_spec = carbon.MODEL_SPEC['args']['carbon_pools_path'] # filepath = 'C:/Users/dmf/projects/geometamaker/data/carbon_pools.csv' - filepath = 'C:/Users/dmf/projects/geometamaker/data/watershed_gura.shp' + # filepath = 'C:/Users/dmf/projects/geometamaker/data/watershed_gura.shp' + filepath = 'C:/Users/dmf/projects/geometamaker/data/DEM_gura.tif' mc = MetadataControl(filepath) pprint.pprint(dataclasses.asdict(mc.metadata)) # mc.write() From 0f37522abc62677a9020fc97022aa808adfc9981 Mon Sep 17 00:00:00 2001 From: davemfish Date: Wed, 17 Jul 2024 16:01:54 -0400 Subject: [PATCH 04/15] use frictionless to get file stats --- src/geometamaker/models.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index 7ed8978..f4cbf27 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -182,7 +182,8 @@ def get_file_type(filepath): def describe_vector(source_dataset_path): - description = frictionless.describe(source_dataset_path).to_dict() + description = frictionless.describe( + source_dataset_path, stats=True).to_dict() fields = [] vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR) layer = vector.GetLayer() @@ -203,7 +204,8 @@ def describe_vector(source_dataset_path): def describe_raster(source_dataset_path): - description = frictionless.describe(source_dataset_path).to_dict() + description = frictionless.describe( + source_dataset_path, stats=True).to_dict() bands = [] info = pygeoprocessing.get_raster_info(source_dataset_path) @@ -228,7 +230,9 @@ def describe_raster(source_dataset_path): def describe_table(source_dataset_path): - return frictionless.describe(source_dataset_path).to_dict() + # frictionless.describe works + return frictionless.describe( + source_dataset_path, stats=True).to_dict() DESRCIBE_FUNCS = { From 242c93153ce687703c86584dd45152bd37cc2792 Mon Sep 17 00:00:00 2001 From: davemfish Date: Wed, 17 Jul 2024 16:57:38 -0400 Subject: [PATCH 05/15] integrating models with existing MetadataControl class --- src/geometamaker/__init__.py | 1 - 
src/geometamaker/geometamaker.py | 584 ++++++++++--------------------- src/geometamaker/models.py | 197 ++--------- 3 files changed, 207 insertions(+), 575 deletions(-) diff --git a/src/geometamaker/__init__.py b/src/geometamaker/__init__.py index 3b8ae6c..9f56a76 100644 --- a/src/geometamaker/__init__.py +++ b/src/geometamaker/__init__.py @@ -1,2 +1 @@ from .geometamaker import MetadataControl -from .geometamaker import MCF_SCHEMA diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index 99b45fa..695a40b 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -1,19 +1,19 @@ +import dataclasses import logging import os import uuid from datetime import datetime +import frictionless import fsspec -import jsonschema -from jsonschema.exceptions import ValidationError -import pygeometa.core -from pygeometa.schemas import load_schema import pygeoprocessing from osgeo import gdal from osgeo import ogr from osgeo import osr import yaml +from . import models + # https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml class _NoAliasDumper(yaml.SafeDumper): @@ -25,151 +25,92 @@ def ignore_aliases(self, data): LOGGER = logging.getLogger(__name__) -MCF_SCHEMA_FILE = os.path.join( - pygeometa.core.SCHEMAS, 'mcf', 'core.yaml') -with open(MCF_SCHEMA_FILE, 'r') as schema_file: - MCF_SCHEMA = pygeometa.core.yaml_load(schema_file) - -# modify the core MCF schema so that our default -# template MCFs have all the properties we expect -# users to use. -MCF_SCHEMA['required'].append('content_info') -MCF_SCHEMA['required'].append('dataquality') -MCF_SCHEMA['properties']['identification']['properties'][ - 'citation'] = { - 'type': 'string', - 'description': 'a biobliographic citation for the dataset' - } -MCF_SCHEMA['properties']['identification']['required'].append('citation') -MCF_SCHEMA['properties']['identification']['properties'][ - 'keywords']['patternProperties']['^.*'][ - 'required'] = ['keywords', 'keywords_type'] +# MCF_SCHEMA['properties']['identification']['properties'][ +# 'keywords']['patternProperties']['^.*'][ +# 'required'] = ['keywords', 'keywords_type'] # to accomodate tables that do not represent spatial content: -NO_GEOM_TYPE = 'none' -MCF_SCHEMA['properties']['spatial']['properties'][ - 'geomtype']['enum'].append(NO_GEOM_TYPE) -TABLE_CONTENT_TYPE = 'table' -MCF_SCHEMA['properties']['content_info']['properties'][ - 'type']['enum'].append(TABLE_CONTENT_TYPE) - -OGR_MCF_ATTR_TYPE_MAP = { - ogr.OFTInteger: 'integer', - ogr.OFTInteger64: 'integer', - ogr.OFTReal: 'number', - ogr.OFTString: 'string' -} - - -def _get_default(item): - """Return a default value for a property. - - Args: - item (dict): a jsonschema definition of a property with no children. - Return: - a value from DEFAULT_VALUES - - Raises: - KeyError if ``item`` does not include an - 'enum', 'type', or '$ref' property. 
- - """ - # TODO: read types from the #/definitions found in MCF_SCHEMA - # instead of hardcoding values here - # TODO: support i18n properly by using objects - # keyed by country codes to contain the array of strings - default_values = { - 'string': str(), - 'int': int(), - 'integer': int(), - 'number': float(), - 'boolean': False, - '#/definitions/date_or_datetime_string': str(), - '#/definitions/i18n_string': str(), - '#/definitions/i18n_array': list(), - '#/definitions/any_type': str(), +def get_file_type(filepath): + # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters, + # we'll want a different data model for multi-dimensional arrays. + + # GDAL considers CSV a vector, so check against frictionless + # first. + filetype = frictionless.describe(filepath).type + if filetype == 'table': + return filetype + gis_type = pygeoprocessing.get_gis_type(filepath) + if gis_type == pygeoprocessing.VECTOR_TYPE: + return 'vector' + if gis_type == pygeoprocessing.RASTER_TYPE: + return 'raster' + raise ValueError() + + +def describe_vector(source_dataset_path): + description = frictionless.describe( + source_dataset_path, stats=True).to_dict() + fields = [] + vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR) + layer = vector.GetLayer() + for fld in layer.schema: + fields.append( + models.FieldSchema(name=fld.name, type=fld.type)) + vector = layer = None + description['schema'] = models.TableSchema(fields=fields) + + info = pygeoprocessing.get_vector_info(source_dataset_path) + spatial = { + 'bounding_box': info['bounding_box'], + 'crs': info['projection_wkt'] } + description['spatial'] = models.SpatialSchema(**spatial) + description['sources'] = info['file_list'] + return description + + +def describe_raster(source_dataset_path): + description = frictionless.describe( + source_dataset_path, stats=True).to_dict() + + bands = [] + info = pygeoprocessing.get_raster_info(source_dataset_path) + for i in range(info['n_bands']): + b = i + 1 + # band = raster.GetRasterBand(b) + # datatype = 'integer' if band.DataType < 6 else 'number' + bands.append(models.BandSchema( + index=b, + gdal_type=info['datatype'], + numpy_type=info['numpy_type'], + nodata=info['nodata'][i])) + description['schema'] = models.RasterSchema( + bands=bands, + pixel_size=info['pixel_size'], + raster_size=info['raster_size']) + description['spatial'] = models.SpatialSchema( + bounding_box=info['bounding_box'], + crs=info['projection_wkt']) + description['sources'] = info['file_list'] + return description + + +def describe_table(source_dataset_path): + return frictionless.describe( + source_dataset_path, stats=True).to_dict() + + +DESRCIBE_FUNCS = { + 'table': describe_table, + 'vector': describe_vector, + 'raster': describe_raster +} - # If there are enumerated values which must be used - try: - fixed_values = item['enum'] - # TODO: find a better way to choose the default - return fixed_values[0] - except KeyError: - pass - - # If no enumerated values, get a default value based on type - try: - t = item['type'] - except KeyError: - # When 'type' is missing, a $ref to another schema is present - try: - t = item['$ref'] - except KeyError: - raise KeyError( - f'schema has no type and no reference to a type definition\n' - f'{item}') - - return default_values[t] - - -def _get_template(schema): - """Create a minimal dictionary that is valid against ``schema``. - - The dict will ontain only the 'required' properties. - - Args: - schema (dict): a jsonschema definition. 
- - Return: - dict that is valid against ``schema`` - - Raises: - KeyError if a penultimate property in a schema branch - does not include an 'enum', 'type', or '$ref' property. - - """ - template = {} - if 'type' in schema and schema['type'] == 'object': - for prop, sch in schema['properties'].items(): - if 'required' in schema and prop not in schema['required']: - continue - if 'patternProperties' in sch: - # this item's properties can have any name matching the pattern. - # assign the name 'default' and overwite the current schema - # with a new one that explicitly includes the 'default' property. - example_sch = { - 'type': 'object', - 'required': ['default'], - 'properties': { - 'default': sch['patternProperties']['^.*'] - } - } - sch = example_sch - - if 'properties' in sch and 'anyOf' in sch['properties']: - # if 'anyOf' is a property, then we effectively want to - # treat the children of 'anyOf' as the properties instead. - template[prop] = { - p: _get_template(s) - for p, s in sch['properties']['anyOf'].items() - } - else: - template[prop] = _get_template(sch) - return template - - elif 'type' in schema and schema['type'] == 'array': - if 'properties' in schema: - # for the weird case where identification.extents.spatial - # is type: array but contains 'properties' instead of 'items' - return [{ - p: _get_template(s) - for p, s in schema['properties'].items() - if p in schema['required'] - }] - return [_get_template(schema['items'])] - else: - return _get_default(schema) +RESOURCE_MODELS = { + 'table': models.TableResource, + 'vector': models.VectorResource, + 'raster': models.RasterResource +} class MetadataControl(object): @@ -200,51 +141,42 @@ def __init__(self, source_dataset_path=None): metadata applies """ - self.mcf = None - if source_dataset_path is not None: - self.datasource = source_dataset_path - self.mcf_path = f'{self.datasource}.yml' - - # Despite naming, this does not open a resource that must be closed - of = fsspec.open(self.datasource) - if not of.fs.exists(self.datasource): - raise FileNotFoundError(f'{self.datasource} does not exist') - - try: - with fsspec.open(self.mcf_path, 'r') as file: - yaml_string = file.read() - - # pygeometa.core.read_mcf can parse nested MCF documents, - # where one MCF refers to another - self.mcf = pygeometa.core.read_mcf(yaml_string) - LOGGER.info(f'loaded existing metadata from {self.mcf_path}') - self.validate() - - # Common path: MCF often does not already exist - except FileNotFoundError as err: - LOGGER.debug(err) - - # Uncommon path: MCF already exists but cannot be used - except (pygeometa.core.MCFReadError, - ValidationError, AttributeError) as err: - # AttributeError in read_mcf not caught by pygeometa - LOGGER.warning(err) - self.mcf = None - - if self.mcf is None: - self.mcf = _get_template(MCF_SCHEMA) - self.mcf['metadata']['identifier'] = str(uuid.uuid4()) - - # fill all values that can be derived from the dataset - LOGGER.debug(f'getting properties from {source_dataset_path}') - self._set_spatial_info() - else: - self.mcf = _get_template(MCF_SCHEMA) + # if source_dataset_path is not None: + self.datasource = source_dataset_path + self.data_package_path = f'{self.datasource}.yml' + + # Despite naming, this does not open a resource that must be closed + of = fsspec.open(self.datasource) + if not of.fs.exists(self.datasource): + raise FileNotFoundError(f'{self.datasource} does not exist') - self.mcf['mcf']['version'] = \ - MCF_SCHEMA['properties']['mcf'][ - 'properties']['version']['const'] + resource_type = 
get_file_type(source_dataset_path) + description = DESRCIBE_FUNCS[resource_type](source_dataset_path) + # this is nice for autodetect of field types, but sometimes + # we will know the table schema (invest MODEL_SPEC). + # Is there any benefit to passing in the known schema? Maybe not + # Can also just overwrite the schema attribute with known data after. + + # Load existing metadata file + try: + with fsspec.open(self.data_package_path, 'r') as file: + yaml_string = file.read() + + # This validates the existing yaml against our dataclasses. + existing_resource = RESOURCE_MODELS[resource_type]( + **yaml.safe_load(yaml_string)) + # overwrite properties that are intrinsic to the dataset, + # which is everything from `description` other than schema. + # Some parts of schema are intrinsic, but others are human-input + # so replace the whole thing for now. + del description['schema'] + self.metadata = dataclasses.replace( + existing_resource, **description) + + # Common path: metadata file does not already exist + except FileNotFoundError as err: + self.metadata = RESOURCE_MODELS[resource_type](**description) def set_title(self, title): """Add a title for the dataset. @@ -253,24 +185,24 @@ def set_title(self, title): title (str) """ - self.mcf['identification']['title'] = title + self.metadata.title = title def get_title(self): """Get the title for the dataset.""" - return self.mcf['identification']['title'] + return self.metadata.title - def set_abstract(self, abstract): - """Add an abstract for the dataset. + def set_description(self, description): + """Add an description for the dataset. Args: - abstract (str) + description (str) """ - self.mcf['identification']['abstract'] = abstract + self.metadata.description = description - def get_abstract(self): - """Get the abstract for the dataset.""" - return self.mcf['identification']['abstract'] + def get_description(self): + """Get the description for the dataset.""" + return self.metadata.description def set_citation(self, citation): """Add a citation string for the dataset. @@ -279,53 +211,41 @@ def set_citation(self, citation): citation (str) """ - self.mcf['identification']['citation'] = citation + self.metadata.citation = citation def get_citation(self): """Get the citation for the dataset.""" - return self.mcf['identification']['citation'] + return self.metadata.citation - def set_contact(self, organization=None, individualname=None, positionname=None, - email=None, section='default', **kwargs): + def set_contact(self, organization=None, individual_name=None, + position_name=None, email=None): """Add a contact section. Args: organization (str): name of the responsible organization - individualname (str): name of the responsible person - positionname (str): role or position of the responsible person - email (str): email address of the responsible organization or individual - section (str): a header for the contact section under which to - apply the other args, since there can be more than one. - kwargs (dict): key-value pairs for any other properties listed in - the contact section of the core MCF schema. 
+ individual_name (str): name of the responsible person + position_name (str): role or position of the responsible person + email (str): address of the responsible organization or individual """ if organization: - self.mcf['contact'][section]['organization'] = organization - if individualname: - self.mcf['contact'][section]['individualname'] = individualname - if positionname: - self.mcf['contact'][section]['positionname'] = positionname + self.metadata.contact.organization = organization + if individual_name: + self.metadata.contact.individualname = individual_name + if position_name: + self.metadata.contact.positionname = position_name if email: - self.mcf['contact'][section]['email'] = email - if kwargs: - for k, v in kwargs.items(): - self.mcf['contact'][section][k] = v - - self.validate() + self.metadata.contact.email = email - def get_contact(self, section='default'): + def get_contact(self): """Get metadata from a contact section. - Args: - section (str): a header for the contact section under which to - apply the other args, since there can be more than one. Returns: - A dict or ``None`` if ``section`` does not exist. + ContactSchema """ - return self.mcf['contact'].get(section) + return self.metadata.contact def set_doi(self, doi): """Add a doi string for the dataset. @@ -334,11 +254,11 @@ def set_doi(self, doi): doi (str) """ - self.mcf['identification']['doi'] = doi + self.metadata.doi = doi def get_doi(self): """Get the doi for the dataset.""" - return self.mcf['identification']['doi'] + return self.metadata.doi def set_edition(self, edition): """Set the edition for the dataset. @@ -347,8 +267,7 @@ def set_edition(self, edition): edition (str): version of the cited resource """ - self.mcf['identification']['edition'] = edition - self.validate() + self.metadata.edition = edition def get_edition(self): """Get the edition of the dataset. @@ -357,7 +276,7 @@ def get_edition(self): str or ``None`` if ``edition`` does not exist. """ - return self.mcf['identification'].get('edition') + return self.metadata.edition def set_keywords(self, keywords, section='default', keywords_type='theme', vocabulary=None): @@ -393,58 +312,56 @@ def set_keywords(self, keywords, section='default', keywords_type='theme', def get_keywords(self, section='default'): return self.mcf['identification']['keywords'][section] - def set_license(self, name=None, url=None): + def set_license(self, title=None, path=None): """Add a license for the dataset. - Either or both name and url are required if there is a license. + Either or both title and path are required if there is a license. Call with no arguments to remove access constraints and license info. Args: - name (str): name of the license of the source dataset - url (str): url for the license + title (str): human-readable title of the license + path (str): url for the license """ - # MCF spec says use 'otherRestrictions' to mean no restrictions - constraints = 'otherRestrictions' - if name or url: - constraints = 'license' - license_dict = {} - license_dict['name'] = name if name else '' - license_dict['url'] = url if url else '' - self.mcf['identification']['license'] = license_dict - self.mcf['identification']['accessconstraints'] = constraints - self.validate() + license_dict['title'] = title if title else '' + license_dict['path'] = path if path else '' + + # TODO: DataPackage/Resource allows for a list of licenses. + # So far we only support one license per resource. 
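+        # Usage sketch, on a MetadataControl instance ``mc`` (hypothetical
+        # values):
+        #
+        #   >>> mc.set_license(
+        #   ...     title='CC-BY-4.0',
+        #   ...     path='https://creativecommons.org/licenses/by/4.0/')
+        #   >>> mc.get_license()
+        #   License(path='https://creativecommons.org/licenses/by/4.0/', title='CC-BY-4.0')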
+ self.licenses = [models.License(**license_dict)] def get_license(self): """Get ``license`` for the dataset. Returns: - dict or ``None`` if ``license`` does not exist. + models.License """ - return self.mcf['identification'].get('license') + # TODO: DataPackage/Resource allows for a list of licenses. + # So far we only support one license per resource. + if self.licenses: + return self.licenses[0] def set_lineage(self, statement): """Set the lineage statement for the dataset. Args: - statement (str): general explanation describing the lineage or provenance - of the dataset + statement (str): general explanation describing the lineage or + provenance of the dataset """ - self.mcf['dataquality']['lineage']['statement'] = statement - self.validate() + self.metadata.lineage = statement def get_lineage(self): """Get the lineage statement of the dataset. Returns: - str or ``None`` if ``lineage`` does not exist. + str """ - return self.mcf['dataquality']['lineage'].get('statement') + return self.metadata.lineage def set_purpose(self, purpose): """Add a purpose for the dataset. @@ -453,21 +370,16 @@ def set_purpose(self, purpose): purpose (str): description of the purpose of the source dataset """ - # 'Purpose' is not supported in the core MCF spec, probably because - # `` was added to ISO-19115 in 2014, and MCF still only - # supports 2015. For now, we can add `purpose` in `identification`. - # Later we can move it elsewhere if it becomes formally supported. - self.mcf['identification']['purpose'] = purpose - self.validate() + self.metadata.purpose = purpose def get_purpose(self): """Get ``purpose`` for the dataset. Returns: - str or ``None`` if ``purpose`` does not exist. + str """ - return self.mcf['identification'].get('purpose') + return self.metadata.purpose def set_url(self, url): """Add a url for the dataset. @@ -476,11 +388,11 @@ def set_url(self, url): url (str) """ - self.mcf['identification']['url'] = url + self.metadata.url = url def get_url(self): """Get the url for the dataset.""" - return self.mcf['identification']['url'] + return self.metadata.url def set_band_description(self, band_number, name=None, title=None, abstract=None, units=None, type=None): @@ -582,19 +494,14 @@ def get_field_description(self, name): idx, attribute = self._get_attr(name) return attribute - def _write_mcf(self, target_path): - with open(target_path, 'w') as file: - file.write(yaml.dump(self.mcf, Dumper=_NoAliasDumper)) - def write(self, workspace=None): - """Write MCF and ISO-19139 XML to disk. + """Write datapackage yaml to disk. - This creates sidecar files with '.yml' and '.xml' extensions + This creates sidecar files with '.yml' appended to the full filename of the data source. 
For example, - 'myraster.tif' - 'myraster.tif.yml' - - 'myraster.tif.xml' Args: workspace (str): if ``None``, files write to the same location @@ -605,141 +512,14 @@ def write(self, workspace=None): """ if workspace is None: - target_mcf_path = self.mcf_path - target_xml_path = f'{self.datasource}.xml' + target_path = self.data_package_path else: - target_mcf_path = os.path.join( + target_path = os.path.join( workspace, f'{os.path.basename(self.datasource)}.yml') - target_xml_path = os.path.join( - workspace, f'{os.path.basename(self.datasource)}.xml') - - self.mcf['metadata']['datestamp'] = datetime.utcnow().strftime( - '%Y-%m-%d') - self._write_mcf(target_mcf_path) - - schema_obj = load_schema('iso19139') - xml_string = schema_obj.write(self.mcf) - with open(target_xml_path, 'w') as xmlfile: - xmlfile.write(xml_string) - - def validate(self): - """Validate MCF against a jsonschema object.""" - # validate against our own schema, which could - # be a superset of the core MCF schema. - # If we wanted to validate against core MCF, - # we could use pygeometa.core.validate_mcf - jsonschema.validate(self.mcf, MCF_SCHEMA) + + with open(target_path, 'w') as file: + file.write(yaml.dump( + dataclasses.asdict(self.metadata), Dumper=_NoAliasDumper)) def to_string(self): pass - - def _set_spatial_info(self): - """Populate the MCF using spatial properties of the dataset.""" - gis_type = pygeoprocessing.get_gis_type(self.datasource) - self.mcf['metadata']['hierarchylevel'] = 'dataset' - - if gis_type == pygeoprocessing.VECTOR_TYPE: - LOGGER.debug('opening as GDAL vector') - self.mcf['content_info']['type'] = 'coverage' - self.mcf['spatial']['datatype'] = 'vector' - open_options = [] - - if os.path.splitext(self.datasource)[1] == '.csv': - self.mcf['spatial']['datatype'] = 'textTable' - open_options.append('AUTODETECT_TYPE=YES') - - vector = gdal.OpenEx(self.datasource, gdal.OF_VECTOR, - open_options=open_options) - layer = vector.GetLayer() - layer_defn = layer.GetLayerDefn() - geomname = ogr.GeometryTypeToName(layer_defn.GetGeomType()) - geomtype = NO_GEOM_TYPE - # https://www.fgdc.gov/nap/metadata/register/codelists.html - if 'Point' in geomname: - geomtype = 'point' - if 'Polygon' in geomname: - geomtype = 'surface' - if 'Line' in geomname: - geomtype = 'curve' - if 'Collection' in geomname: - geomtype = 'complex' - self.mcf['spatial']['geomtype'] = geomtype - - if len(layer.schema) and 'attributes' not in self.mcf['content_info']: - self.mcf['content_info']['attributes'] = [] - - for field in layer.schema: - try: - idx, attribute = self._get_attr(field.name) - except KeyError: - attribute = _get_template( - MCF_SCHEMA['properties']['content_info']['properties'][ - 'attributes'])[0] - attribute['name'] = field.name - self.mcf['content_info']['attributes'].append( - attribute) - - try: - datatype = OGR_MCF_ATTR_TYPE_MAP[field.type] - except KeyError: - LOGGER.warning( - f'{field.type} is missing in the OGR-to-MCF ' - f'attribute type map; attribute type for field ' - f'{field.name} will be "object".') - datatype = 'object' - self.set_field_description(field.name, type=datatype) - - vector = None - layer = None - - gis_info = pygeoprocessing.get_vector_info(self.datasource) - - if gis_type == pygeoprocessing.RASTER_TYPE: - LOGGER.debug('opening as GDAL raster') - self.mcf['spatial']['datatype'] = 'grid' - self.mcf['spatial']['geomtype'] = 'surface' - self.mcf['content_info']['type'] = 'image' - - raster = gdal.OpenEx(self.datasource, gdal.OF_RASTER) - - attr = _get_template( - 
MCF_SCHEMA['properties']['content_info']['properties'][ - 'attributes'])[0] - - if 'attributes' not in self.mcf['content_info']: - self.mcf['content_info']['attributes'] = [attr]*raster.RasterCount - else: - n_attrs = len(self.mcf['content_info']['attributes']) - if n_attrs < raster.RasterCount: - extend_n = raster.RasterCount - n_attrs - self.mcf['content_info']['attributes'].extend( - [attr]*extend_n) - - for i in range(raster.RasterCount): - b = i + 1 - band = raster.GetRasterBand(b) - datatype = 'integer' if band.DataType < 6 else 'number' - self.set_band_description(b, type=datatype) - band = None - raster = None - - gis_info = pygeoprocessing.get_raster_info(self.datasource) - - if gis_info['projection_wkt']: - try: - srs = osr.SpatialReference() - srs.ImportFromWkt(gis_info['projection_wkt']) - epsg = srs.GetAttrValue('AUTHORITY', 1) - except TypeError: - LOGGER.warning( - f'could not import a spatial reference system from ' - f'"projection_wkt" in {gis_info}') - epsg = '' - # for human-readable values after yaml dump, use python types - # instead of numpy types - bbox = [float(x) for x in gis_info['bounding_box']] - spatial_info = [{ - 'bbox': bbox, - 'crs': epsg # MCF does not support WKT here - }] - self.mcf['identification']['extents']['spatial'] = spatial_info diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index f4cbf27..23a0039 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -1,14 +1,8 @@ -import dataclasses from dataclasses import dataclass, field import logging -import os import pprint -import frictionless -import fsspec -import pygeoprocessing import yaml -from osgeo import gdal LOGGER = logging.getLogger(__name__) @@ -43,8 +37,21 @@ class ContactSchema: email: str = '' organization: str = '' - individualname: str = '' - positionname: str = '' + individual_name: str = '' + position_name: str = '' + + +@dataclass +class License: + """Class for storing license info.""" + + # https://datapackage.org/profiles/2.0/dataresource.json + # This profile also includes `name`, described as: + # "MUST be an Open Definition license identifier", + # see http://licenses.opendefinition.org/" + # I don't think that's useful to us yet. + path: str + title: str @dataclass @@ -104,6 +111,10 @@ class Resource: that are important to us. """ + # TODO: DP includes `sources` as list of source files + # with some amount of metadata for each item. For our + # use-case, I think a list of filenames is good enough. + path: str = '' type: str = '' scheme: str = '' @@ -116,8 +127,13 @@ class Resource: title: str = '' description: str = '' sources: list = field(default_factory=list) - # schema: dict = field(init=False) licenses: list = field(default_factory=list) + citation: str = '' + doi: str = '' + url: str = '' + edition: str = '' + lineage: str = '' + purpose: str = '' contact: ContactSchema = ContactSchema() # def __post_init__(self): @@ -162,166 +178,3 @@ def __post_init__(self): if isinstance(self.schema, RasterSchema): return self.schema = RasterSchema(**self.schema) - - -def get_file_type(filepath): - # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters, - # we'll want a different data model for multi-dimensional arrays. - - # GDAL considers CSV a vector, so check against frictionless - # first. 
- filetype = frictionless.describe(filepath).type - if filetype == 'table': - return filetype - gis_type = pygeoprocessing.get_gis_type(filepath) - if gis_type == pygeoprocessing.VECTOR_TYPE: - return 'vector' - if gis_type == pygeoprocessing.RASTER_TYPE: - return 'raster' - raise ValueError() - - -def describe_vector(source_dataset_path): - description = frictionless.describe( - source_dataset_path, stats=True).to_dict() - fields = [] - vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR) - layer = vector.GetLayer() - for fld in layer.schema: - fields.append( - FieldSchema(name=fld.name, type=fld.type)) - vector = layer = None - description['schema'] = TableSchema(fields=fields) - - info = pygeoprocessing.get_vector_info(source_dataset_path) - spatial = { - 'bounding_box': info['bounding_box'], - 'crs': info['projection_wkt'] - } - description['spatial'] = SpatialSchema(**spatial) - description['sources'] = info['file_list'] - return description - - -def describe_raster(source_dataset_path): - description = frictionless.describe( - source_dataset_path, stats=True).to_dict() - - bands = [] - info = pygeoprocessing.get_raster_info(source_dataset_path) - for i in range(info['n_bands']): - b = i + 1 - # band = raster.GetRasterBand(b) - # datatype = 'integer' if band.DataType < 6 else 'number' - bands.append(BandSchema( - index=b, - gdal_type=info['datatype'], - numpy_type=info['numpy_type'], - nodata=info['nodata'][i])) - description['schema'] = RasterSchema( - bands=bands, - pixel_size=info['pixel_size'], - raster_size=info['raster_size']) - description['spatial'] = SpatialSchema( - bounding_box=info['bounding_box'], - crs=info['projection_wkt']) - description['sources'] = info['file_list'] - return description - - -def describe_table(source_dataset_path): - # frictionless.describe works - return frictionless.describe( - source_dataset_path, stats=True).to_dict() - - -DESRCIBE_FUNCS = { - 'table': describe_table, - 'vector': describe_vector, - 'raster': describe_raster -} - -RESOURCE_MODELS = { - 'table': TableResource, - 'vector': VectorResource, - 'raster': RasterResource -} - - -class MetadataControl(object): - - def __init__(self, source_dataset_path): - # if source_dataset_path is not None: - self.datasource = source_dataset_path - self.data_package_path = f'{self.datasource}.dp.yml' - - # Despite naming, this does not open a resource that must be closed - of = fsspec.open(self.datasource) - if not of.fs.exists(self.datasource): - raise FileNotFoundError(f'{self.datasource} does not exist') - - resource_type = get_file_type(source_dataset_path) - description = DESRCIBE_FUNCS[resource_type](source_dataset_path) - # this is nice for autodetect of field types, but sometimes - # we will know the table schema (invest MODEL_SPEC). - # Is there any benefit to passing in the known schema? Maybe not - # Can also just overwrite the schema attribute with known data after. - - # Load existing metadata file - try: - with fsspec.open(self.data_package_path, 'r') as file: - yaml_string = file.read() - - # This validates the existing yaml against our dataclasses. - existing_resource = RESOURCE_MODELS[resource_type]( - **yaml.safe_load(yaml_string)) - # overwrite properties that are intrinsic to the dataset, - # which is everything from `description` other than schema. - # Some parts of schema are intrinsic, but others are human-input - # so replace the whole thing for now. 
- del description['schema'] - self.metadata = dataclasses.replace( - existing_resource, **description) - - # Common path: metadata file does not already exist - except FileNotFoundError as err: - self.metadata = RESOURCE_MODELS[resource_type](**description) - - def write(self, workspace=None): - """Write datapackage yaml to disk. - - This creates sidecar files with '.yml' - appended to the full filename of the data source. For example, - - - 'myraster.tif' - - 'myraster.tif.yml' - - Args: - workspace (str): if ``None``, files write to the same location - as the source data. If not ``None``, a path to a local directory - to write files. They will still be named to match the source - filename. Use this option if the source data is not on the local - filesystem. - - """ - if workspace is None: - target_path = self.data_package_path - else: - target_path = os.path.join( - workspace, f'{os.path.basename(self.datasource)}.dp.yml') - - with open(target_path, 'w') as file: - file.write(yaml.dump( - dataclasses.asdict(self.metadata), Dumper=_NoAliasDumper)) - - -if __name__ == "__main__": - # from natcap.invest import carbon - # arg_spec = carbon.MODEL_SPEC['args']['carbon_pools_path'] - - # filepath = 'C:/Users/dmf/projects/geometamaker/data/carbon_pools.csv' - # filepath = 'C:/Users/dmf/projects/geometamaker/data/watershed_gura.shp' - filepath = 'C:/Users/dmf/projects/geometamaker/data/DEM_gura.tif' - mc = MetadataControl(filepath) - pprint.pprint(dataclasses.asdict(mc.metadata)) - # mc.write() From 539fc5163987fe28ae46938242352f5542bd4cb9 Mon Sep 17 00:00:00 2001 From: davemfish Date: Thu, 18 Jul 2024 14:40:56 -0400 Subject: [PATCH 06/15] more integration, moving methods onto the Resource classes. --- src/geometamaker/geometamaker.py | 352 +------------------------------ src/geometamaker/models.py | 348 ++++++++++++++++++++++++++++-- 2 files changed, 338 insertions(+), 362 deletions(-) diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index 695a40b..e3a6c7b 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -25,12 +25,11 @@ def ignore_aliases(self, data): LOGGER = logging.getLogger(__name__) -# MCF_SCHEMA['properties']['identification']['properties'][ -# 'keywords']['patternProperties']['^.*'][ -# 'required'] = ['keywords', 'keywords_type'] -# to accomodate tables that do not represent spatial content: def get_file_type(filepath): + # TODO: zip, or other archives. Can they be represented as a Resource? + # or do they need to be a Package? + # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters, # we'll want a different data model for multi-dimensional arrays. @@ -178,348 +177,3 @@ def __init__(self, source_dataset_path=None): except FileNotFoundError as err: self.metadata = RESOURCE_MODELS[resource_type](**description) - def set_title(self, title): - """Add a title for the dataset. - - Args: - title (str) - - """ - self.metadata.title = title - - def get_title(self): - """Get the title for the dataset.""" - return self.metadata.title - - def set_description(self, description): - """Add an description for the dataset. - - Args: - description (str) - - """ - self.metadata.description = description - - def get_description(self): - """Get the description for the dataset.""" - return self.metadata.description - - def set_citation(self, citation): - """Add a citation string for the dataset. 
- - Args: - citation (str) - - """ - self.metadata.citation = citation - - def get_citation(self): - """Get the citation for the dataset.""" - return self.metadata.citation - - def set_contact(self, organization=None, individual_name=None, - position_name=None, email=None): - """Add a contact section. - - Args: - organization (str): name of the responsible organization - individual_name (str): name of the responsible person - position_name (str): role or position of the responsible person - email (str): address of the responsible organization or individual - - """ - - if organization: - self.metadata.contact.organization = organization - if individual_name: - self.metadata.contact.individualname = individual_name - if position_name: - self.metadata.contact.positionname = position_name - if email: - self.metadata.contact.email = email - - def get_contact(self): - """Get metadata from a contact section. - - Returns: - ContactSchema - - """ - return self.metadata.contact - - def set_doi(self, doi): - """Add a doi string for the dataset. - - Args: - doi (str) - - """ - self.metadata.doi = doi - - def get_doi(self): - """Get the doi for the dataset.""" - return self.metadata.doi - - def set_edition(self, edition): - """Set the edition for the dataset. - - Args: - edition (str): version of the cited resource - - """ - self.metadata.edition = edition - - def get_edition(self): - """Get the edition of the dataset. - - Returns: - str or ``None`` if ``edition`` does not exist. - - """ - return self.metadata.edition - - def set_keywords(self, keywords, section='default', keywords_type='theme', - vocabulary=None): - """Describe a dataset with a list of keywords. - - Keywords are grouped into sections for the purpose of complying with - pre-exising keyword schema. A section will be overwritten if it - already exists. - - Args: - keywords (list): sequence of strings - section (string): the name of a keywords section - keywords_type (string): subject matter used to group similar - keywords. Must be one of, - ('discipline', 'place', 'stratum', 'temporal', 'theme') - vocabulary (dict): a dictionary with 'name' and 'url' (optional) - keys. Used to describe the source (thesaurus) of keywords - - Raises: - ValidationError - - """ - section_dict = { - 'keywords': keywords, - 'keywords_type': keywords_type - } - - if vocabulary: - section_dict['vocabulary'] = vocabulary - self.mcf['identification']['keywords'][section] = section_dict - self.validate() - - def get_keywords(self, section='default'): - return self.mcf['identification']['keywords'][section] - - def set_license(self, title=None, path=None): - """Add a license for the dataset. - - Either or both title and path are required if there is a license. - Call with no arguments to remove access constraints and license - info. - - Args: - title (str): human-readable title of the license - path (str): url for the license - - """ - license_dict = {} - license_dict['title'] = title if title else '' - license_dict['path'] = path if path else '' - - # TODO: DataPackage/Resource allows for a list of licenses. - # So far we only support one license per resource. - self.licenses = [models.License(**license_dict)] - - def get_license(self): - """Get ``license`` for the dataset. - - Returns: - models.License - - """ - # TODO: DataPackage/Resource allows for a list of licenses. - # So far we only support one license per resource. - if self.licenses: - return self.licenses[0] - - def set_lineage(self, statement): - """Set the lineage statement for the dataset. 
- - Args: - statement (str): general explanation describing the lineage or - provenance of the dataset - - """ - self.metadata.lineage = statement - - def get_lineage(self): - """Get the lineage statement of the dataset. - - Returns: - str - - """ - return self.metadata.lineage - - def set_purpose(self, purpose): - """Add a purpose for the dataset. - - Args: - purpose (str): description of the purpose of the source dataset - - """ - self.metadata.purpose = purpose - - def get_purpose(self): - """Get ``purpose`` for the dataset. - - Returns: - str - - """ - return self.metadata.purpose - - def set_url(self, url): - """Add a url for the dataset. - - Args: - url (str) - - """ - self.metadata.url = url - - def get_url(self): - """Get the url for the dataset.""" - return self.metadata.url - - def set_band_description(self, band_number, name=None, title=None, - abstract=None, units=None, type=None): - """Define metadata for a raster band. - - Args: - band_number (int): a raster band index, starting at 1 - name (str): name for the raster band - title (str): title for the raster band - abstract (str): description of the raster band - units (str): unit of measurement for the band's pixel values - type (str): of the band's values, either 'integer' or 'number' - - """ - idx = band_number - 1 - attribute = self.mcf['content_info']['attributes'][idx] - - if name is not None: - attribute['name'] = name - if title is not None: - attribute['title'] = title - if abstract is not None: - attribute['abstract'] = abstract - if units is not None: - attribute['units'] = units - if type is not None: - attribute['type'] = type - - self.mcf['content_info']['attributes'][idx] = attribute - - def get_band_description(self, band_number): - """Get the attribute metadata for a band. - - Args: - band_number (int): a raster band index, starting at 1 - - Returns: - dict - """ - return self.mcf['content_info']['attributes'][band_number - 1] - - def _get_attr(self, name): - """Get an attribute by its name property. - - Args: - name (string): to match the value of the 'name' key in a dict - - Returns: - tuple of (list index of the matching attribute, the attribute - dict) - - Raises: - KeyError if no attributes exist in the MCF or if the named - attribute does not exist. - - """ - if len(self.mcf['content_info']['attributes']) == 0: - raise KeyError( - f'{self.datasource} MCF has not attributes') - for idx, attr in enumerate(self.mcf['content_info']['attributes']): - if attr['name'] == name: - return idx, attr - raise KeyError( - f'{self.datasource} has no attribute named {name}') - - def set_field_description(self, name, title=None, abstract=None, - units=None, type=None): - """Define metadata for a tabular field. - - Args: - name (str): name and unique identifier of the field - title (str): title for the field - abstract (str): description of the field - units (str): unit of measurement for the field's values - - """ - idx, attribute = self._get_attr(name) - - if title is not None: - attribute['title'] = title - if abstract is not None: - attribute['abstract'] = abstract - if units is not None: - attribute['units'] = units - if type is not None: - attribute['type'] = type - - self.mcf['content_info']['attributes'][idx] = attribute - - def get_field_description(self, name): - """Get the attribute metadata for a field. 
- - Args: - name (str): name and unique identifier of the field - - Returns: - dict - """ - idx, attribute = self._get_attr(name) - return attribute - - def write(self, workspace=None): - """Write datapackage yaml to disk. - - This creates sidecar files with '.yml' - appended to the full filename of the data source. For example, - - - 'myraster.tif' - - 'myraster.tif.yml' - - Args: - workspace (str): if ``None``, files write to the same location - as the source data. If not ``None``, a path to a local directory - to write files. They will still be named to match the source - filename. Use this option if the source data is not on the local - filesystem. - - """ - if workspace is None: - target_path = self.data_package_path - else: - target_path = os.path.join( - workspace, f'{os.path.basename(self.datasource)}.yml') - - with open(target_path, 'w') as file: - file.write(yaml.dump( - dataclasses.asdict(self.metadata), Dumper=_NoAliasDumper)) - - def to_string(self): - pass diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index 23a0039..fde63db 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -1,5 +1,7 @@ -from dataclasses import dataclass, field +import dataclasses +from dataclasses import dataclass import logging +import os import pprint import yaml @@ -73,12 +75,10 @@ class TableSchema: """Class for metadata for tables.""" # https://datapackage.org/standard/table-schema/ - fields: list = field(default_factory=FieldSchema) - missingValues: list = field(default_factory=list) - primaryKey: list = field(default_factory=list) - foreignKeys: list = field(default_factory=list) - - # def get_field(): + fields: list = dataclasses.field(default_factory=FieldSchema) + missingValues: list = dataclasses.field(default_factory=list) + primaryKey: list = dataclasses.field(default_factory=list) + foreignKeys: list = dataclasses.field(default_factory=list) @dataclass @@ -126,8 +126,9 @@ class Resource: name: str = '' title: str = '' description: str = '' - sources: list = field(default_factory=list) - licenses: list = field(default_factory=list) + keywords: list = [] + sources: list = dataclasses.field(default_factory=list) + licenses: list = dataclasses.field(default_factory=list) citation: str = '' doi: str = '' url: str = '' @@ -136,8 +137,229 @@ class Resource: purpose: str = '' contact: ContactSchema = ContactSchema() - # def __post_init__(self): - # self.schema = + def set_title(self, title): + """Add a title for the dataset. + + Args: + title (str) + + """ + self.metadata.title = title + + def get_title(self): + """Get the title for the dataset.""" + return self.metadata.title + + def set_description(self, description): + """Add an description for the dataset. + + Args: + description (str) + + """ + self.metadata.description = description + + def get_description(self): + """Get the description for the dataset.""" + return self.metadata.description + + def set_citation(self, citation): + """Add a citation string for the dataset. + + Args: + citation (str) + + """ + self.metadata.citation = citation + + def get_citation(self): + """Get the citation for the dataset.""" + return self.metadata.citation + + def set_contact(self, organization=None, individual_name=None, + position_name=None, email=None): + """Add a contact section. 
+ + Args: + organization (str): name of the responsible organization + individual_name (str): name of the responsible person + position_name (str): role or position of the responsible person + email (str): address of the responsible organization or individual + + """ + + if organization is not None: + self.metadata.contact.organization = organization + if individual_name is not None: + self.metadata.contact.individualname = individual_name + if position_name is not None: + self.metadata.contact.positionname = position_name + if email is not None: + self.metadata.contact.email = email + + def get_contact(self): + """Get metadata from a contact section. + + Returns: + ContactSchema + + """ + return self.metadata.contact + + def set_doi(self, doi): + """Add a doi string for the dataset. + + Args: + doi (str) + + """ + self.metadata.doi = doi + + def get_doi(self): + """Get the doi for the dataset.""" + return self.metadata.doi + + def set_edition(self, edition): + """Set the edition for the dataset. + + Args: + edition (str): version of the cited resource + + """ + self.metadata.edition = edition + + def get_edition(self): + """Get the edition of the dataset. + + Returns: + str or ``None`` if ``edition`` does not exist. + + """ + return self.metadata.edition + + def set_keywords(self, keywords): + """Describe a dataset with a list of keywords. + + Args: + keywords (list): sequence of strings + + """ + self.metadata.keywords = keywords + + def get_keywords(self): + return self.metadata.keywords + + def set_license(self, title=None, path=None): + """Add a license for the dataset. + + Either or both title and path are required if there is a license. + Call with no arguments to remove access constraints and license + info. + + Args: + title (str): human-readable title of the license + path (str): url for the license + + """ + license_dict = {} + license_dict['title'] = title if title else '' + license_dict['path'] = path if path else '' + + # TODO: DataPackage/Resource allows for a list of licenses. + # So far we only support one license per resource. + self.licenses = [License(**license_dict)] + + def get_license(self): + """Get ``license`` for the dataset. + + Returns: + models.License + + """ + # TODO: DataPackage/Resource allows for a list of licenses. + # So far we only support one license per resource. + if self.licenses: + return self.licenses[0] + + def set_lineage(self, statement): + """Set the lineage statement for the dataset. + + Args: + statement (str): general explanation describing the lineage or + provenance of the dataset + + """ + self.metadata.lineage = statement + + def get_lineage(self): + """Get the lineage statement of the dataset. + + Returns: + str + + """ + return self.metadata.lineage + + def set_purpose(self, purpose): + """Add a purpose for the dataset. + + Args: + purpose (str): description of the purpose of the source dataset + + """ + self.metadata.purpose = purpose + + def get_purpose(self): + """Get ``purpose`` for the dataset. + + Returns: + str + + """ + return self.metadata.purpose + + def set_url(self, url): + """Add a url for the dataset. + + Args: + url (str) + + """ + self.metadata.url = url + + def get_url(self): + """Get the url for the dataset.""" + return self.metadata.url + + def write(self, workspace=None): + """Write datapackage yaml to disk. + + This creates sidecar files with '.yml' + appended to the full filename of the data source. 
For example, + + - 'myraster.tif' + - 'myraster.tif.yml' + + Args: + workspace (str): if ``None``, files write to the same location + as the source data. If not ``None``, a path to a local directory + to write files. They will still be named to match the source + filename. Use this option if the source data is not on the local + filesystem. + + """ + if workspace is None: + target_path = self.data_package_path + else: + target_path = os.path.join( + workspace, f'{os.path.basename(self.datasource)}.yml') + + with open(target_path, 'w') as file: + file.write(yaml.dump( + dataclasses.asdict(self.metadata), Dumper=_NoAliasDumper)) + + def to_string(self): + pass @dataclass(kw_only=True) @@ -145,8 +367,7 @@ class TableResource(Resource): """Class for metadata for a table resource.""" # without post-init, schema ends up as a dict, or whatever is passed in. - schema: TableSchema = field(default_factory=TableSchema) - # type: str = 'table' + schema: TableSchema = dataclasses.field(default_factory=TableSchema) def __post_init__(self): # Allow init of the resource with a schema of type @@ -156,6 +377,71 @@ def __post_init__(self): return self.schema = TableSchema(**self.schema) + def _get_field(self, name): + """Get an attribute by its name property. + + Args: + name (string): to match the value of the 'name' key in a dict + + Returns: + tuple of (list index of the matching attribute, the attribute + dict) + + Raises: + KeyError if no attributes exist in the MCF or if the named + attribute does not exist. + + """ + if len(self.schema.fields) == 0: + raise KeyError( + f'{self.schema} has no fields') + for idx, field in enumerate(self.schema.fields): + if field['name'] == name: + return idx, field + raise KeyError( + f'{self.schema} has no field named {name}') + + def set_field_description(self, name, title=None, description=None, + units=None, type=None, format=None, + example=None): + """Define metadata for a tabular field. + + Args: + name (str): name and unique identifier of the field + title (str): title for the field + abstract (str): description of the field + units (str): unit of measurement for the field's values + + """ + idx, field = self._get_field(name) + + if title is not None: + field.title = title + if description is not None: + field.description = description + if units is not None: + field.units = units + if type is not None: + field.type = type + if format is not None: + field.format = format + if example is not None: + field.example = example + + self.schema.fields[idx] = field + + def get_field_description(self, name): + """Get the attribute metadata for a field. + + Args: + name (str): name and unique identifier of the field + + Returns: + dict + """ + idx, field = self._get_field(name) + return field + @dataclass(kw_only=True) class VectorResource(TableResource): @@ -178,3 +464,39 @@ def __post_init__(self): if isinstance(self.schema, RasterSchema): return self.schema = RasterSchema(**self.schema) + + def set_band_description(self, band_number, title=None, + description=None, units=None): + """Define metadata for a raster band. 
+ + Args: + band_number (int): a raster band index, starting at 1 + name (str): name for the raster band + title (str): title for the raster band + abstract (str): description of the raster band + units (str): unit of measurement for the band's pixel values + type (str): of the band's values, either 'integer' or 'number' + + """ + idx = band_number - 1 + band = self.schema.bands[idx] + + if title is not None: + band.title = title + if description is not None: + band.description = description + if units is not None: + band.units = units + + self.schema.bands[idx] = band + + def get_band_description(self, band_number): + """Get the attribute metadata for a band. + + Args: + band_number (int): a raster band index, starting at 1 + + Returns: + dict + """ + return self.schema.bands[band_number - 1] From 1aa4c103a5214a83b65dcca82d8029f6f78f94e0 Mon Sep 17 00:00:00 2001 From: davemfish Date: Fri, 19 Jul 2024 11:35:37 -0400 Subject: [PATCH 07/15] added support for compressed files; updated some tests --- src/geometamaker/__init__.py | 2 +- src/geometamaker/geometamaker.py | 145 +++++------ src/geometamaker/models.py | 83 ++++--- tests/test_geometamaker.py | 397 ++++++++++++------------------- 4 files changed, 284 insertions(+), 343 deletions(-) diff --git a/src/geometamaker/__init__.py b/src/geometamaker/__init__.py index 9f56a76..af30c26 100644 --- a/src/geometamaker/__init__.py +++ b/src/geometamaker/__init__.py @@ -1 +1 @@ -from .geometamaker import MetadataControl +from .geometamaker import describe diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index e3a6c7b..ae9bb71 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -6,27 +6,20 @@ import frictionless import fsspec -import pygeoprocessing +import numpy from osgeo import gdal from osgeo import ogr from osgeo import osr +import pygeoprocessing import yaml from . import models -# https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml -class _NoAliasDumper(yaml.SafeDumper): - """Keep the yaml human-readable by avoiding anchors and aliases.""" - - def ignore_aliases(self, data): - return True - - LOGGER = logging.getLogger(__name__) -def get_file_type(filepath): +def detect_file_type(filepath): # TODO: zip, or other archives. Can they be represented as a Resource? # or do they need to be a Package? @@ -35,9 +28,11 @@ def get_file_type(filepath): # GDAL considers CSV a vector, so check against frictionless # first. 
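    # For example (an illustrative mapping, per the checks below):
    # a CSV is reported by frictionless as type 'table'; a compressed
    # file that is not itself a readable table reports a compression
    # and is treated as an 'archive'; GDAL-readable datasets map to
    # 'vector' or 'raster'; anything else raises ValueError.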
- filetype = frictionless.describe(filepath).type - if filetype == 'table': - return filetype + desc = frictionless.describe(filepath) + if desc.type == 'table': + return 'table' + if desc.compression: + return 'archive' gis_type = pygeoprocessing.get_gis_type(filepath) if gis_type == pygeoprocessing.VECTOR_TYPE: return 'vector' @@ -46,17 +41,25 @@ def get_file_type(filepath): raise ValueError() +def describe_archive(source_dataset_path): + description = frictionless.describe( + source_dataset_path, stats=True).to_dict() + return description + + def describe_vector(source_dataset_path): description = frictionless.describe( source_dataset_path, stats=True).to_dict() fields = [] vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR) layer = vector.GetLayer() + description['rows'] = layer.GetFeatureCount() for fld in layer.schema: fields.append( models.FieldSchema(name=fld.name, type=fld.type)) vector = layer = None description['schema'] = models.TableSchema(fields=fields) + description['fields'] = len(fields) info = pygeoprocessing.get_vector_info(source_dataset_path) spatial = { @@ -74,38 +77,42 @@ def describe_raster(source_dataset_path): bands = [] info = pygeoprocessing.get_raster_info(source_dataset_path) + # Some values of raster info are numpy types, which the + # yaml dumper doesn't know how to represent. for i in range(info['n_bands']): b = i + 1 - # band = raster.GetRasterBand(b) - # datatype = 'integer' if band.DataType < 6 else 'number' bands.append(models.BandSchema( index=b, gdal_type=info['datatype'], - numpy_type=info['numpy_type'], + numpy_type=numpy.dtype(info['numpy_type']).name, nodata=info['nodata'][i])) description['schema'] = models.RasterSchema( bands=bands, pixel_size=info['pixel_size'], raster_size=info['raster_size']) description['spatial'] = models.SpatialSchema( - bounding_box=info['bounding_box'], + bounding_box=[float(x) for x in info['bounding_box']], crs=info['projection_wkt']) description['sources'] = info['file_list'] return description def describe_table(source_dataset_path): - return frictionless.describe( + description = frictionless.describe( source_dataset_path, stats=True).to_dict() + description['schema'] = models.TableSchema(**description['schema']) + return description DESRCIBE_FUNCS = { + 'archive': describe_archive, 'table': describe_table, 'vector': describe_vector, 'raster': describe_raster } RESOURCE_MODELS = { + 'archive': models.ArchiveResource, 'table': models.TableResource, 'vector': models.VectorResource, 'raster': models.RasterResource @@ -125,55 +132,55 @@ class MetadataControl(object): """ - def __init__(self, source_dataset_path=None): - """Create an MCF instance, populated with properties of the dataset. - - The MCF will be valid according to the pygeometa schema. It has - all required properties. Properties of the dataset are used to - populate as many MCF properties as possible. Default/placeholder - values are used for properties that require user input. - - Instantiating without a ``source_dataset_path`` creates an MCF template. 
- - Args: - source_dataset_path (string): path or URL to dataset to which the - metadata applies - - """ - - # if source_dataset_path is not None: - self.datasource = source_dataset_path - self.data_package_path = f'{self.datasource}.yml' - - # Despite naming, this does not open a resource that must be closed - of = fsspec.open(self.datasource) - if not of.fs.exists(self.datasource): - raise FileNotFoundError(f'{self.datasource} does not exist') - - resource_type = get_file_type(source_dataset_path) - description = DESRCIBE_FUNCS[resource_type](source_dataset_path) - # this is nice for autodetect of field types, but sometimes - # we will know the table schema (invest MODEL_SPEC). - # Is there any benefit to passing in the known schema? Maybe not - # Can also just overwrite the schema attribute with known data after. - - # Load existing metadata file - try: - with fsspec.open(self.data_package_path, 'r') as file: - yaml_string = file.read() - - # This validates the existing yaml against our dataclasses. - existing_resource = RESOURCE_MODELS[resource_type]( - **yaml.safe_load(yaml_string)) - # overwrite properties that are intrinsic to the dataset, - # which is everything from `description` other than schema. - # Some parts of schema are intrinsic, but others are human-input - # so replace the whole thing for now. - del description['schema'] - self.metadata = dataclasses.replace( - existing_resource, **description) - - # Common path: metadata file does not already exist - except FileNotFoundError as err: - self.metadata = RESOURCE_MODELS[resource_type](**description) + +def describe(source_dataset_path): + """Create a metadata resource instance with properties of the dataset. + + Properties of the dataset are used to populate as many metadata + properties as possible. Default/placeholder + values are used for properties that require user input. + + Args: + source_dataset_path (string): path or URL to dataset to which the + metadata applies + + Returns + one of TableResource, VectorResource, RasterResource + """ + + data_package_path = f'{source_dataset_path}.yml' + + # Despite naming, this does not open a resource that must be closed + of = fsspec.open(source_dataset_path) + if not of.fs.exists(source_dataset_path): + raise FileNotFoundError(f'{source_dataset_path} does not exist') + + resource_type = detect_file_type(source_dataset_path) + description = DESRCIBE_FUNCS[resource_type](source_dataset_path) + # this is nice for autodetect of field types, but sometimes + # we will know the table schema (invest MODEL_SPEC). + # Is there any benefit to passing in the known schema? Maybe not + # Can also just overwrite the schema attribute with known data after. + + # Load existing metadata file + try: + with fsspec.open(data_package_path, 'r') as file: + yaml_string = file.read() + + # This validates the existing yaml against our dataclasses. + existing_resource = RESOURCE_MODELS[resource_type]( + **yaml.safe_load(yaml_string)) + # overwrite properties that are intrinsic to the dataset, + # which is everything from `description` other than schema. + # Some parts of schema are intrinsic, but others are human-input + # so replace the whole thing for now. 
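+        # e.g. (an illustrative note on dataclasses.replace semantics):
+        # dataclasses.replace(existing_resource, bytes=123) returns a new
+        # resource with only `bytes` overwritten; all other fields,
+        # including user-edited ones, carry over from existing_resource.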
+ del description['schema'] + resource = dataclasses.replace( + existing_resource, **description) + + # Common path: metadata file does not already exist + except FileNotFoundError as err: + resource = RESOURCE_MODELS[resource_type](**description) + + return resource diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index fde63db..42e0ee2 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -2,13 +2,13 @@ from dataclasses import dataclass import logging import os -import pprint import yaml LOGGER = logging.getLogger(__name__) + # https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml class _NoAliasDumper(yaml.SafeDumper): """Keep the yaml human-readable by avoiding anchors and aliases.""" @@ -80,6 +80,18 @@ class TableSchema: primaryKey: list = dataclasses.field(default_factory=list) foreignKeys: list = dataclasses.field(default_factory=list) + def __post_init__(self): + field_schemas = [] + for field in self.fields: + # Allow init of the resource with a schema of type + # FieldSchema, or type dict. Mostly because dataclasses.replace + # calls init, but the base object will have already been initialized. + if isinstance(field, FieldSchema): + field_schemas.append(field) + else: + field_schemas.append(FieldSchema(**field)) + self.fields = field_schemas + @dataclass class BandSchema: @@ -126,7 +138,7 @@ class Resource: name: str = '' title: str = '' description: str = '' - keywords: list = [] + keywords: list = dataclasses.field(default_factory=list) sources: list = dataclasses.field(default_factory=list) licenses: list = dataclasses.field(default_factory=list) citation: str = '' @@ -137,6 +149,9 @@ class Resource: purpose: str = '' contact: ContactSchema = ContactSchema() + def __post_init__(self): + self.metadata_path = f'{self.path}.yml' + def set_title(self, title): """Add a title for the dataset. @@ -144,11 +159,11 @@ def set_title(self, title): title (str) """ - self.metadata.title = title + self.title = title def get_title(self): """Get the title for the dataset.""" - return self.metadata.title + return self.title def set_description(self, description): """Add an description for the dataset. @@ -157,11 +172,11 @@ def set_description(self, description): description (str) """ - self.metadata.description = description + self.description = description def get_description(self): """Get the description for the dataset.""" - return self.metadata.description + return self.description def set_citation(self, citation): """Add a citation string for the dataset. @@ -170,11 +185,11 @@ def set_citation(self, citation): citation (str) """ - self.metadata.citation = citation + self.citation = citation def get_citation(self): """Get the citation for the dataset.""" - return self.metadata.citation + return self.citation def set_contact(self, organization=None, individual_name=None, position_name=None, email=None): @@ -189,13 +204,13 @@ def set_contact(self, organization=None, individual_name=None, """ if organization is not None: - self.metadata.contact.organization = organization + self.contact.organization = organization if individual_name is not None: - self.metadata.contact.individualname = individual_name + self.contact.individual_name = individual_name if position_name is not None: - self.metadata.contact.positionname = position_name + self.contact.position_name = position_name if email is not None: - self.metadata.contact.email = email + self.contact.email = email def get_contact(self): """Get metadata from a contact section. 
@@ -204,7 +219,7 @@ def get_contact(self): ContactSchema """ - return self.metadata.contact + return self.contact def set_doi(self, doi): """Add a doi string for the dataset. @@ -213,11 +228,11 @@ def set_doi(self, doi): doi (str) """ - self.metadata.doi = doi + self.doi = doi def get_doi(self): """Get the doi for the dataset.""" - return self.metadata.doi + return self.doi def set_edition(self, edition): """Set the edition for the dataset. @@ -226,7 +241,7 @@ def set_edition(self, edition): edition (str): version of the cited resource """ - self.metadata.edition = edition + self.edition = edition def get_edition(self): """Get the edition of the dataset. @@ -235,7 +250,7 @@ def get_edition(self): str or ``None`` if ``edition`` does not exist. """ - return self.metadata.edition + return self.edition def set_keywords(self, keywords): """Describe a dataset with a list of keywords. @@ -244,10 +259,10 @@ def set_keywords(self, keywords): keywords (list): sequence of strings """ - self.metadata.keywords = keywords + self.keywords = keywords def get_keywords(self): - return self.metadata.keywords + return self.keywords def set_license(self, title=None, path=None): """Add a license for the dataset. @@ -289,7 +304,7 @@ def set_lineage(self, statement): provenance of the dataset """ - self.metadata.lineage = statement + self.lineage = statement def get_lineage(self): """Get the lineage statement of the dataset. @@ -298,7 +313,7 @@ def get_lineage(self): str """ - return self.metadata.lineage + return self.lineage def set_purpose(self, purpose): """Add a purpose for the dataset. @@ -307,7 +322,7 @@ def set_purpose(self, purpose): purpose (str): description of the purpose of the source dataset """ - self.metadata.purpose = purpose + self.purpose = purpose def get_purpose(self): """Get ``purpose`` for the dataset. @@ -316,7 +331,7 @@ def get_purpose(self): str """ - return self.metadata.purpose + return self.purpose def set_url(self, url): """Add a url for the dataset. @@ -325,11 +340,11 @@ def set_url(self, url): url (str) """ - self.metadata.url = url + self.url = url def get_url(self): """Get the url for the dataset.""" - return self.metadata.url + return self.url def write(self, workspace=None): """Write datapackage yaml to disk. @@ -349,14 +364,14 @@ def write(self, workspace=None): """ if workspace is None: - target_path = self.data_package_path + target_path = self.metadata_path else: target_path = os.path.join( workspace, f'{os.path.basename(self.datasource)}.yml') with open(target_path, 'w') as file: file.write(yaml.dump( - dataclasses.asdict(self.metadata), Dumper=_NoAliasDumper)) + dataclasses.asdict(self), Dumper=_NoAliasDumper)) def to_string(self): pass @@ -366,10 +381,13 @@ def to_string(self): class TableResource(Resource): """Class for metadata for a table resource.""" + fields: int + rows: int # without post-init, schema ends up as a dict, or whatever is passed in. schema: TableSchema = dataclasses.field(default_factory=TableSchema) def __post_init__(self): + super().__post_init__() # Allow init of the resource with a schema of type # TableSchema, or type dict. Mostly because dataclasses.replace # calls init, but the base object will have already been initialized. 
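(A minimal sketch of the dict-or-dataclass handling that this __post_init__
pattern enables, assuming the dataclasses defined in this patch series; the
`fields` and `rows` counts below are placeholder values:)

    import dataclasses
    from geometamaker import models

    # When loaded from a yaml document, schema arrives as a plain dict
    resource = models.TableResource(
        fields=1, rows=1,
        schema={'fields': [{'name': 'a', 'type': 'integer'}]})
    assert isinstance(resource.schema, models.TableSchema)

    # dataclasses.replace calls __init__ again, this time passing the
    # already-built TableSchema, which __post_init__ passes through unchanged
    updated = dataclasses.replace(resource, title='A title')
    assert isinstance(updated.schema, models.TableSchema)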
@@ -396,7 +414,7 @@ def _get_field(self, name): raise KeyError( f'{self.schema} has no fields') for idx, field in enumerate(self.schema.fields): - if field['name'] == name: + if field.name == name: return idx, field raise KeyError( f'{self.schema} has no field named {name}') @@ -443,6 +461,14 @@ def get_field_description(self, name): return field +@dataclass(kw_only=True) +class ArchiveResource(Resource): + """Class for metadata for an archive resource.""" + + compression: str + innerpath: str + + @dataclass(kw_only=True) class VectorResource(TableResource): """Class for metadata for a vector resource.""" @@ -458,6 +484,7 @@ class RasterResource(Resource): spatial: SpatialSchema def __post_init__(self): + super().__post_init__() # Allow init of the resource with a schema of type # RasterSchema, or type dict. Mostly because dataclasses.replace # calls init, but the base object will have already been initialized. diff --git a/tests/test_geometamaker.py b/tests/test_geometamaker.py index 0a3d455..5670de9 100644 --- a/tests/test_geometamaker.py +++ b/tests/test_geometamaker.py @@ -98,31 +98,31 @@ def tearDown(self): def test_file_does_not_exist(self): """MetadataControl: raises exception if given file does not exist.""" - from geometamaker import MetadataControl + import geometamaker with self.assertRaises(FileNotFoundError): - _ = MetadataControl('foo.tif') + _ = geometamaker.describe('foo.tif') - def test_blank_MetadataControl(self): - """MetadataControl: template has expected properties.""" - from geometamaker import MetadataControl + # def test_blank_geometamaker.describe(self): + # """MetadataControl: template has expected properties.""" + # import geometamaker - target_filepath = os.path.join(self.workspace_dir, 'mcf.yml') + # target_filepath = os.path.join(self.workspace_dir, 'mcf.yml') - mc = MetadataControl() - mc.validate() - mc._write_mcf(target_filepath) + # mc = geometamaker.describe() + # mc.validate() + # mc._write_mcf(target_filepath) - with open(target_filepath, 'r') as file: - actual = yaml.safe_load(file) - with open(os.path.join(REGRESSION_DATA, 'template.yml'), 'r') as file: - expected = yaml.safe_load(file) + # with open(target_filepath, 'r') as file: + # actual = yaml.safe_load(file) + # with open(os.path.join(REGRESSION_DATA, 'template.yml'), 'r') as file: + # expected = yaml.safe_load(file) - self.assertEqual(actual, expected) + # self.assertEqual(actual, expected) - def test_csv_MetadataControl(self): - """MetadataControl: validate basic csv MetadataControl.""" - from geometamaker import MetadataControl + def test_describe_csv(self): + """Test setting properties on csv.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'data.csv') field_names = ['Strings', 'Ints', 'Reals'] @@ -132,49 +132,37 @@ def test_csv_MetadataControl(self): writer.writerow(field_names) writer.writerow(field_values) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') + resource = geometamaker.describe(datasource_path) self.assertEqual( - len(mc.mcf['content_info']['attributes']), + len(resource.schema.fields), len(field_names)) - self.assertEqual(mc.get_field_description('Strings')['type'], 'string') - self.assertEqual(mc.get_field_description('Ints')['type'], 'integer') - self.assertEqual(mc.get_field_description('Reals')['type'], 'number') + self.assertEqual(resource.get_field_description('Strings').type, 'string') + 
self.assertEqual(resource.get_field_description('Ints').type, 'integer') + self.assertEqual(resource.get_field_description('Reals').type, 'number') title = 'title' - abstract = 'some abstract' + description = 'some abstract' units = 'mm' - mc.set_field_description( + resource.set_field_description( field_names[1], title=title, - abstract=abstract) + description=description) # To demonstrate that properties can be added while preserving others - mc.set_field_description( + resource.set_field_description( field_names[1], units=units) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - attr = [attr for attr in mc.mcf['content_info']['attributes'] - if attr['name'] == field_names[1]][0] - self.assertEqual(attr['title'], title) - self.assertEqual(attr['abstract'], abstract) - self.assertEqual(attr['units'], units) + field = [field for field in resource.schema.fields + if field.name == field_names[1]][0] + self.assertEqual(field.title, title) + self.assertEqual(field.description, description) + self.assertEqual(field.units, units) - def test_bad_csv_MetadataControl(self): + def test_describe_bad_csv(self): """MetadataControl: CSV with extra item in row does not fail.""" - from geometamaker import MetadataControl + import geometamaker - datasource_path = os.path.join('data.csv') + datasource_path = os.path.join(self.workspace_dir, 'data.csv') field_names = ['Strings', 'Ints', 'Reals'] field_values = ['foo', 1, 0.9, 'extra'] with open(datasource_path, 'w') as file: @@ -182,24 +170,19 @@ def test_bad_csv_MetadataControl(self): writer.writerow(field_names) writer.writerow(field_values) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() + resource = geometamaker.describe(datasource_path) + + resource.write() self.assertEqual( - len(mc.mcf['content_info']['attributes']), + len(resource.schema.fields), len(field_names)) - self.assertEqual(mc.get_field_description('Strings')['type'], 'string') - self.assertEqual(mc.get_field_description('Ints')['type'], 'integer') - self.assertEqual(mc.get_field_description('Reals')['type'], 'number') + self.assertEqual(resource.get_field_description('Strings').type, 'string') + self.assertEqual(resource.get_field_description('Ints').type, 'integer') + self.assertEqual(resource.get_field_description('Reals').type, 'number') - def test_vector_MetadataControl(self): - """MetadataControl: validate basic vector MetadataControl.""" - from geometamaker import MetadataControl + def test_describe_vector(self): + """Test basic vector.""" + import geometamaker field_map = { f'field_{k}': k @@ -213,217 +196,141 @@ def test_vector_MetadataControl(self): self.workspace_dir, f'vector.{ext}') create_vector(datasource_path, field_map, driver) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() + resource = geometamaker.describe(datasource_path) + self.assertTrue(isinstance( + resource.spatial, geometamaker.models.SpatialSchema)) + + resource.write() self.assertTrue(os.path.exists(f'{datasource_path}.yml')) - def test_vector_no_fields(self): - """MetadataControl: validate MetadataControl for basic vector with no fields.""" - from geometamaker import MetadataControl + def test_describe_vector_no_fields(self): + """Test metadata 
for basic vector with no fields.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') create_vector(datasource_path, None) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() + resource = geometamaker.describe(datasource_path) + self.assertEqual(len(resource.schema.fields), 0) - def test_raster_MetadataControl(self): - """MetadataControl: validate basic raster MetadataControl.""" - from geometamaker import MetadataControl + def test_describe_raster(self): + """Test metadata for basic raster.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() - - def test_vector_attributes(self): - """MetadataControl: validate vector with extra attribute metadata.""" - from geometamaker import MetadataControl + resource = geometamaker.describe(datasource_path) + self.assertTrue(isinstance( + resource.spatial, geometamaker.models.SpatialSchema)) - datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') - field_name = 'foo' - field_map = { - field_name: list(_OGR_TYPES_VALUES_MAP)[0]} - create_vector(datasource_path, field_map) - - mc = MetadataControl(datasource_path) - title = 'title' - abstract = 'some abstract' - units = 'mm' - mc.set_field_description( - field_name, - title=title, - abstract=abstract) - # To demonstrate that properties can be added while preserving others - mc.set_field_description( - field_name, - units=units) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - - self.assertEqual( - len(mc.mcf['content_info']['attributes']), - len(field_map)) - attr = [attr for attr in mc.mcf['content_info']['attributes'] - if attr['name'] == field_name][0] - self.assertEqual(attr['title'], title) - self.assertEqual(attr['abstract'], abstract) - self.assertEqual(attr['units'], units) + resource.write() + self.assertTrue(os.path.exists(f'{datasource_path}.yml')) def test_raster_attributes(self): - """MetadataControl: validate raster with extra attribute metadata.""" - from geometamaker import MetadataControl + """Test adding extra attribute metadata to raster.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) + numpy_type = numpy.int16 + create_raster(numpy_type, datasource_path) band_number = 1 - mc = MetadataControl(datasource_path) - name = 'name' + resource = geometamaker.describe(datasource_path) title = 'title' - abstract = 'some abstract' + description = 'some abstract' units = 'mm' - mc.set_band_description( + resource.set_band_description( band_number, - name=name, title=title, - abstract=abstract) + description=description) # To demonstrate that properties can be added while preserving others - mc.set_band_description( + resource.set_band_description( band_number, units=units) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') + raster_info = pygeoprocessing.get_raster_info(datasource_path) self.assertEqual( - len(mc.mcf['content_info']['attributes']), - 
pygeoprocessing.get_raster_info(datasource_path)['n_bands']) - attr = mc.mcf['content_info']['attributes'][band_number - 1] - self.assertEqual(attr['name'], name) - self.assertEqual(attr['title'], title) - self.assertEqual(attr['abstract'], abstract) - self.assertEqual(attr['units'], units) - - def test_set_abstract(self): - """MetadataControl: set and get an abstract.""" - - from geometamaker import MetadataControl - - abstract = 'foo bar' - mc = MetadataControl() - mc.set_abstract(abstract) - self.assertEqual(mc.get_abstract(), abstract) + len(resource.schema.bands), raster_info['n_bands']) + band_idx = band_number - 1 + band = resource.schema.bands[band_idx] + self.assertEqual(band.title, title) + self.assertEqual(band.description, description) + self.assertEqual(band.gdal_type, raster_info['datatype']) + self.assertEqual(band.numpy_type, numpy.dtype(numpy_type).name) + self.assertEqual(band.nodata, raster_info['nodata'][band_idx]) + self.assertEqual(band.units, units) + + def test_set_description(self): + """Test set and get a description for a resource.""" + + import geometamaker + + description = 'foo bar' + resource = geometamaker.models.Resource() + resource.set_description(description) + self.assertEqual(resource.get_description(), description) def test_set_citation(self): - """MetadataControl: set and get a citation.""" + """Test set and get a citation for resource.""" - from geometamaker import MetadataControl + import geometamaker citation = 'foo bar' - mc = MetadataControl() - mc.set_citation(citation) - self.assertEqual(mc.get_citation(), citation) + resource = geometamaker.models.Resource() + resource.set_citation(citation) + self.assertEqual(resource.get_citation(), citation) def test_set_contact(self): - """MetadataControl: set and get a contact section.""" + """Test set and get a contact section for a resource.""" - from geometamaker import MetadataControl + import geometamaker org = 'natcap' name = 'nat' position = 'boss' email = 'abc@def' - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - mc.set_contact( - organization=org, individualname=name, - positionname=position, email=email) - contact_dict = mc.get_contact() - self.assertEqual(contact_dict['organization'], org) - self.assertEqual(contact_dict['individualname'], name) - self.assertEqual(contact_dict['positionname'], position) - self.assertEqual(contact_dict['email'], email) - - def test_set_contact_from_dict(self): - """MetadataControl: set a contact section from a dict.""" - - from geometamaker import MetadataControl - - contact_dict = { - 'organization': 'natcap', - 'individualname': 'nat', - 'positionname': 'boss', - 'email': 'abc@def', - 'fax': '555-1234', - 'postalcode': '01234' - } - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - mc.set_contact(**contact_dict) - actual = mc.get_contact() - for k, v in contact_dict.items(): - self.assertEqual(actual[k], v) + resource = geometamaker.models.Resource() + resource.set_contact( + organization=org, individual_name=name, + position_name=position, email=email) + contact = resource.get_contact() + self.assertEqual(contact.organization, org) + self.assertEqual(contact.individual_name, name) + self.assertEqual(contact.position_name, position) + self.assertEqual(contact.email, email) def test_set_contact_validates(self): """MetadataControl: invalid type raises 
ValidationError.""" - from geometamaker import MetadataControl + import geometamaker postalcode = 55555 # should be a string datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) with self.assertRaises(ValidationError): mc.set_contact(postalcode=postalcode) def test_set_doi(self): """MetadataControl: set and get a doi.""" - from geometamaker import MetadataControl + import geometamaker doi = '10.foo/bar' - mc = MetadataControl() + mc = geometamaker.describe() mc.set_doi(doi) self.assertEqual(mc.get_doi(), doi) def test_set_get_edition(self): """MetadataControl: set and get dataset edition.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) version = '3.14' mc.set_edition(version) self.assertEqual(mc.get_edition(), version) @@ -431,11 +338,11 @@ def test_set_get_edition(self): def test_set_edition_validates(self): """MetadataControl: test set edition raises ValidationError.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) version = 3.14 # should be a string with self.assertRaises(ValidationError): mc.set_edition(version) @@ -443,11 +350,11 @@ def test_set_edition_validates(self): def test_set_keywords(self): """MetadataControl: set keywords to default section.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_keywords(['foo', 'bar']) self.assertEqual( @@ -457,11 +364,11 @@ def test_set_keywords(self): def test_set_keywords_to_section(self): """MetadataControl: set keywords to named section.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_keywords(['foo', 'bar'], section='first') mc.set_keywords(['baz'], section='second') @@ -475,11 +382,11 @@ def test_set_keywords_to_section(self): def test_overwrite_keywords(self): """MetadataControl: overwrite keywords in existing section.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_keywords(['foo', 'bar']) mc.set_keywords(['baz']) @@ -489,21 +396,21 @@ def test_overwrite_keywords(self): def test_keywords_raises_validation_error(self): """MetadataControl: set keywords validates.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) with self.assertRaises(ValidationError): mc.set_keywords('foo', 'bar') def test_set_and_get_license(self): 
"""MetadataControl: set purpose of dataset.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) name = 'CC-BY-4.0' url = 'https://creativecommons.org/licenses/by/4.0/' @@ -528,11 +435,11 @@ def test_set_and_get_license(self): def test_set_license_validates(self): """MetadataControl: test set license raises ValidationError.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) name = 4.0 # should be a string with self.assertRaises(ValidationError): mc.set_license(name=name) @@ -541,11 +448,11 @@ def test_set_license_validates(self): def test_set_and_get_lineage(self): """MetadataControl: set lineage of dataset.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) statement = 'a lineage statment' mc.set_lineage(statement) @@ -554,22 +461,22 @@ def test_set_and_get_lineage(self): def test_set_lineage_validates(self): """MetadataControl: test set lineage raises ValidationError.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) lineage = ['some statement'] # should be a string with self.assertRaises(ValidationError): mc.set_lineage(lineage) def test_set_and_get_purpose(self): """MetadataControl: set purpose of dataset.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) purpose = 'foo' mc.set_purpose(purpose) self.assertEqual(mc.get_purpose(), purpose) @@ -577,28 +484,28 @@ def test_set_and_get_purpose(self): def test_set_url(self): """MetadataControl: set and get a url.""" - from geometamaker import MetadataControl + import geometamaker url = 'http://foo/bar' - mc = MetadataControl() + mc = geometamaker.describe() mc.set_url(url) self.assertEqual(mc.get_url(), url) def test_preexisting_mc_raster(self): """MetadataControl: test reading and ammending an existing MCF raster.""" - from geometamaker import MetadataControl + import geometamaker title = 'Title' keyword = 'foo' band_name = 'The Band' datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_title(title) mc.set_band_description(1, name=band_name) mc.write() - new_mc = MetadataControl(datasource_path) + new_mc = geometamaker.describe(datasource_path) new_mc.set_keywords([keyword]) self.assertEqual(new_mc.mcf['metadata']['hierarchylevel'], 'dataset') @@ -611,12 +518,12 @@ def test_preexisting_mc_raster(self): def test_preexisting_mc_raster_new_bands(self): """MetadataControl: test existing MCF when the raster has new bands.""" - from geometamaker import 
MetadataControl + import geometamaker band_name = 'The Band' datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path, n_bands=1) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_band_description(1, name=band_name) self.assertEqual(mc.get_band_description(1)['type'], 'integer') mc.write() @@ -625,7 +532,7 @@ def test_preexisting_mc_raster_new_bands(self): # There's an extra band, and the datatype has changed create_raster(numpy.float32, datasource_path, n_bands=2) - new_mc = MetadataControl(datasource_path) + new_mc = geometamaker.describe(datasource_path) band1 = new_mc.get_band_description(1) self.assertEqual(band1['name'], band_name) @@ -636,7 +543,7 @@ def test_preexisting_mc_raster_new_bands(self): def test_preexisting_mc_vector(self): """MetadataControl: test reading and ammending an existing MCF vector.""" - from geometamaker import MetadataControl + import geometamaker title = 'Title' datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') @@ -645,12 +552,12 @@ def test_preexisting_mc_vector(self): field_map = { field_name: list(_OGR_TYPES_VALUES_MAP)[0]} create_vector(datasource_path, field_map) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_title(title) mc.set_field_description(field_name, abstract=description) mc.write() - new_mc = MetadataControl(datasource_path) + new_mc = geometamaker.describe(datasource_path) self.assertEqual(new_mc.mcf['metadata']['hierarchylevel'], 'dataset') self.assertEqual( @@ -660,7 +567,7 @@ def test_preexisting_mc_vector(self): def test_preexisting_mc_vector_new_fields(self): """MetadataControl: test an existing MCF for vector with new fields.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') field1_name = 'foo' @@ -668,7 +575,7 @@ def test_preexisting_mc_vector_new_fields(self): field_map = { field1_name: list(_OGR_TYPES_VALUES_MAP)[0]} create_vector(datasource_path, field_map) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_field_description(field1_name, abstract=description) self.assertEqual( mc.get_field_description(field1_name)['type'], 'integer') @@ -681,7 +588,7 @@ def test_preexisting_mc_vector_new_fields(self): field1_name: list(_OGR_TYPES_VALUES_MAP)[2], field2_name: list(_OGR_TYPES_VALUES_MAP)[3]} create_vector(datasource_path, new_field_map) - new_mc = MetadataControl(datasource_path) + new_mc = geometamaker.describe(datasource_path) field1 = new_mc.get_field_description(field1_name) self.assertEqual(field1['abstract'], description) @@ -691,11 +598,11 @@ def test_preexisting_mc_vector_new_fields(self): def test_invalid_preexisting_mcf(self): """MetadataControl: test overwriting an existing invalid MetadataControl.""" - from geometamaker import MetadataControl + import geometamaker title = 'Title' datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) mc.set_title(title) # delete a required property and ensure invalid MetadataControl @@ -704,7 +611,7 @@ def test_invalid_preexisting_mcf(self): mc.validate() mc.write() # intentionally writing an invalid MetadataControl - new_mc = MetadataControl(datasource_path) + new_mc = geometamaker.describe(datasource_path) # The new MetadataControl should not have values from the 
invalid MetadataControl self.assertEqual( @@ -725,11 +632,11 @@ def test_invalid_preexisting_mcf(self): def test_write_to_local_workspace(self): """MetadataControl: test write metadata to a different location.""" - from geometamaker import MetadataControl + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + mc = geometamaker.describe(datasource_path) temp_dir = tempfile.mkdtemp(dir=self.workspace_dir) mc.write(workspace=temp_dir) From 2e0b00eb50ee7ded46aedacfe4f68ba92ecf8364 Mon Sep 17 00:00:00 2001 From: davemfish Date: Mon, 22 Jul 2024 11:54:23 -0400 Subject: [PATCH 08/15] tests passing --- src/geometamaker/geometamaker.py | 67 +++--- src/geometamaker/models.py | 16 +- tests/test_geometamaker.py | 358 ++++++++++--------------------- 3 files changed, 169 insertions(+), 272 deletions(-) diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index ae9bb71..dfd3324 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -56,7 +56,7 @@ def describe_vector(source_dataset_path): description['rows'] = layer.GetFeatureCount() for fld in layer.schema: fields.append( - models.FieldSchema(name=fld.name, type=fld.type)) + models.FieldSchema(name=fld.name, type=fld.GetTypeName())) vector = layer = None description['schema'] = models.TableSchema(fields=fields) description['fields'] = len(fields) @@ -119,20 +119,6 @@ def describe_table(source_dataset_path): } -class MetadataControl(object): - """Encapsulates the Metadata Control File and methods for populating it. - - A Metadata Control File (MCF) is a YAML file that complies with the - MCF specification defined by pygeometa. - https://github.com/geopython/pygeometa - - Attributes: - datasource (string): path to dataset to which the metadata applies - mcf (dict): dict representation of the Metadata Control File - - """ - - def describe(source_dataset_path): """Create a metadata resource instance with properties of the dataset. @@ -145,36 +131,63 @@ def describe(source_dataset_path): metadata applies Returns - one of TableResource, VectorResource, RasterResource + instance of + ArchiveResource, TableResource, + VectorResource, RasterResource """ data_package_path = f'{source_dataset_path}.yml' - # Despite naming, this does not open a resource that must be closed + # Despite naming, this does not open a file that must be closed of = fsspec.open(source_dataset_path) if not of.fs.exists(source_dataset_path): raise FileNotFoundError(f'{source_dataset_path} does not exist') resource_type = detect_file_type(source_dataset_path) description = DESRCIBE_FUNCS[resource_type](source_dataset_path) - # this is nice for autodetect of field types, but sometimes - # we will know the table schema (invest MODEL_SPEC). - # Is there any benefit to passing in the known schema? Maybe not - # Can also just overwrite the schema attribute with known data after. # Load existing metadata file try: with fsspec.open(data_package_path, 'r') as file: yaml_string = file.read() - # This validates the existing yaml against our dataclasses. existing_resource = RESOURCE_MODELS[resource_type]( **yaml.safe_load(yaml_string)) - # overwrite properties that are intrinsic to the dataset, - # which is everything from `description` other than schema. - # Some parts of schema are intrinsic, but others are human-input - # so replace the whole thing for now. 
-        del description['schema']
+        if 'schema' in description:
+            if isinstance(description['schema'], models.RasterSchema):
+                # If existing band metadata still matches the schema of the
+                # file, carry over metadata from the existing file because it
+                # could include human-defined properties.
+                new_bands = []
+                for band in description['schema'].bands:
+                    try:
+                        eband = existing_resource.get_band_description(band.index)
+                        # TODO: rewrite this as __eq__ of BandSchema?
+                        if (band.numpy_type, band.gdal_type, band.nodata) == (
+                                eband.numpy_type, eband.gdal_type, eband.nodata):
+                            band = dataclasses.replace(band, **eband.__dict__)
+                    except IndexError:
+                        pass
+                    new_bands.append(band)
+                description['schema'].bands = new_bands
+            if isinstance(description['schema'], models.TableSchema):
+                # If existing field metadata still matches the schema of the
+                # file, carry over metadata from the existing file because it
+                # could include human-defined properties.
+                new_fields = []
+                for field in description['schema'].fields:
+                    try:
+                        efield = existing_resource.get_field_description(
+                            field.name)
+                        # TODO: rewrite this as __eq__ of FieldSchema?
+                        if field.type == efield.type:
+                            field = dataclasses.replace(field, **efield.__dict__)
+                    except KeyError:
+                        pass
+                    new_fields.append(field)
+                description['schema'].fields = new_fields
+        # overwrite properties that are intrinsic to the dataset
+        # TODO: any other checks that the resources represent the same data?
         resource = dataclasses.replace(
             existing_resource, **description)
 
     # Common path: metadata file does not already exist
     except FileNotFoundError as err:
         resource = RESOURCE_MODELS[resource_type](**description)
 
     return resource
diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py
index 42e0ee2..c8b52cc 100644
--- a/src/geometamaker/models.py
+++ b/src/geometamaker/models.py
@@ -101,6 +101,7 @@ class BandSchema:
     gdal_type: int
     numpy_type: str
     nodata: int | float
+    title: str = ''
     description: str = ''
 
 
@@ -112,6 +113,19 @@ class RasterSchema:
     bands: list
     pixel_size: list
     raster_size: list
 
+    def __post_init__(self):
+        bands = []
+        for band in self.bands:
+            # When loading an existing document
+            # from serialized data, we need to init a BandSchema for
+            # each band dict. But it's also okay to init a RasterSchema
+            # with bands as list of BandSchema.
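+            # e.g. (illustrative) these two are equivalent after init:
+            #   RasterSchema(bands=[BandSchema(index=1, ...)], ...)
+            #   RasterSchema(bands=[{'index': 1, ...}], ...)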
+ if isinstance(band, BandSchema): + bands.append(band) + else: + bands.append(BandSchema(**band)) + self.bands = bands + @dataclass(kw_only=True) class Resource: @@ -367,7 +381,7 @@ def write(self, workspace=None): target_path = self.metadata_path else: target_path = os.path.join( - workspace, f'{os.path.basename(self.datasource)}.yml') + workspace, os.path.basename(self.metadata_path)) with open(target_path, 'w') as file: file.write(yaml.dump( diff --git a/tests/test_geometamaker.py b/tests/test_geometamaker.py index 5670de9..74405a3 100644 --- a/tests/test_geometamaker.py +++ b/tests/test_geometamaker.py @@ -301,198 +301,94 @@ def test_set_contact(self): self.assertEqual(contact.position_name, position) self.assertEqual(contact.email, email) - def test_set_contact_validates(self): - """MetadataControl: invalid type raises ValidationError.""" - - import geometamaker - - postalcode = 55555 # should be a string - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - with self.assertRaises(ValidationError): - mc.set_contact(postalcode=postalcode) - def test_set_doi(self): - """MetadataControl: set and get a doi.""" + """Test set and get a doi.""" import geometamaker doi = '10.foo/bar' - mc = geometamaker.describe() - mc.set_doi(doi) - self.assertEqual(mc.get_doi(), doi) + resource = geometamaker.models.Resource() + resource.set_doi(doi) + self.assertEqual(resource.get_doi(), doi) def test_set_get_edition(self): - """MetadataControl: set and get dataset edition.""" + """Test set and get dataset edition.""" import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) + resource = geometamaker.models.Resource() version = '3.14' - mc.set_edition(version) - self.assertEqual(mc.get_edition(), version) - - def test_set_edition_validates(self): - """MetadataControl: test set edition raises ValidationError.""" - - import geometamaker - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - version = 3.14 # should be a string - with self.assertRaises(ValidationError): - mc.set_edition(version) + resource.set_edition(version) + self.assertEqual(resource.get_edition(), version) def test_set_keywords(self): - """MetadataControl: set keywords to default section.""" + """Test set and get keywords.""" import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - mc.set_keywords(['foo', 'bar']) + resource = geometamaker.models.Resource() + resource.set_keywords(['foo', 'bar']) self.assertEqual( - mc.mcf['identification']['keywords']['default']['keywords'], + resource.get_keywords(), ['foo', 'bar']) - def test_set_keywords_to_section(self): - """MetadataControl: set keywords to named section.""" - + def test_set_and_get_license(self): + """Test set and get license for resource.""" import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - mc.set_keywords(['foo', 'bar'], section='first') - mc.set_keywords(['baz'], section='second') - - self.assertEqual( - mc.mcf['identification']['keywords']['first']['keywords'], - ['foo', 'bar']) - self.assertEqual( 
- mc.mcf['identification']['keywords']['second']['keywords'], - ['baz']) - - def test_overwrite_keywords(self): - """MetadataControl: overwrite keywords in existing section.""" - - import geometamaker + resource = geometamaker.models.Resource() + title = 'CC-BY-4.0' + path = 'https://creativecommons.org/licenses/by/4.0/' - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - mc.set_keywords(['foo', 'bar']) - mc.set_keywords(['baz']) + resource.set_license(title=title) self.assertEqual( - mc.mcf['identification']['keywords']['default']['keywords'], - ['baz']) - - def test_keywords_raises_validation_error(self): - """MetadataControl: set keywords validates.""" - import geometamaker - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - with self.assertRaises(ValidationError): - mc.set_keywords('foo', 'bar') - - def test_set_and_get_license(self): - """MetadataControl: set purpose of dataset.""" - import geometamaker - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - name = 'CC-BY-4.0' - url = 'https://creativecommons.org/licenses/by/4.0/' + resource.get_license().__dict__, {'title': title, 'path': ''}) - mc.set_license(name=name) + resource.set_license(path=path) self.assertEqual( - mc.mcf['identification']['accessconstraints'], - 'license') - self.assertEqual(mc.get_license(), {'name': name, 'url': ''}) - - mc.set_license(url=url) - self.assertEqual(mc.get_license(), {'name': '', 'url': url}) - - mc.set_license(name=name, url=url) - self.assertEqual(mc.get_license(), {'name': name, 'url': url}) + resource.get_license().__dict__, {'title': '', 'path': path}) - mc.set_license() - self.assertEqual(mc.get_license(), {'name': '', 'url': ''}) + resource.set_license(title=title, path=path) self.assertEqual( - mc.mcf['identification']['accessconstraints'], - 'otherRestrictions') - - def test_set_license_validates(self): - """MetadataControl: test set license raises ValidationError.""" - - import geometamaker + resource.get_license().__dict__, {'title': title, 'path': path}) - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - name = 4.0 # should be a string - with self.assertRaises(ValidationError): - mc.set_license(name=name) - with self.assertRaises(ValidationError): - mc.set_license(url=name) + resource.set_license() + self.assertEqual( + resource.get_license().__dict__, {'title': '', 'path': ''}) def test_set_and_get_lineage(self): - """MetadataControl: set lineage of dataset.""" + """Test set and get lineage of a resource.""" import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) + resource = geometamaker.models.Resource() statement = 'a lineage statment' - mc.set_lineage(statement) - self.assertEqual(mc.get_lineage(), statement) - - def test_set_lineage_validates(self): - """MetadataControl: test set lineage raises ValidationError.""" - - import geometamaker - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - lineage = ['some 
statement']  # should be a string
-        with self.assertRaises(ValidationError):
-            mc.set_lineage(lineage)
+        resource.set_lineage(statement)
+        self.assertEqual(resource.get_lineage(), statement)
 
     def test_set_and_get_purpose(self):
-        """MetadataControl: set purpose of dataset."""
+        """Test set and get purpose of a resource."""
 
         import geometamaker
 
-        datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
-        create_raster(numpy.int16, datasource_path)
-        mc = geometamaker.describe(datasource_path)
+        resource = geometamaker.models.Resource()
         purpose = 'foo'
-        mc.set_purpose(purpose)
-        self.assertEqual(mc.get_purpose(), purpose)
+        resource.set_purpose(purpose)
+        self.assertEqual(resource.get_purpose(), purpose)
 
     def test_set_url(self):
-        """MetadataControl: set and get a url."""
+        """Test set and get a url."""
 
         import geometamaker
 
         url = 'http://foo/bar'
-        mc = geometamaker.describe()
-        mc.set_url(url)
-        self.assertEqual(mc.get_url(), url)
+        resource = geometamaker.models.Resource()
+        resource.set_url(url)
+        self.assertEqual(resource.get_url(), url)
 
-    def test_preexisting_mc_raster(self):
-        """MetadataControl: test reading and ammending an existing MCF raster."""
+    def test_preexisting_metadata_document(self):
+        """Test reading and amending an existing metadata document."""
         import geometamaker
 
         title = 'Title'
@@ -500,73 +396,48 @@ def test_preexisting_mc_raster(self):
         band_name = 'The Band'
         datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
         create_raster(numpy.int16, datasource_path)
-        mc = geometamaker.describe(datasource_path)
-        mc.set_title(title)
-        mc.set_band_description(1, name=band_name)
-        mc.write()
+        resource = geometamaker.describe(datasource_path)
+        resource.set_title(title)
+        resource.set_band_description(1, title=band_name)
+        resource.write()
 
-        new_mc = geometamaker.describe(datasource_path)
-        new_mc.set_keywords([keyword])
+        new_resource = geometamaker.describe(datasource_path)
+        new_resource.set_keywords([keyword])
 
-        self.assertEqual(new_mc.mcf['metadata']['hierarchylevel'], 'dataset')
         self.assertEqual(
-            new_mc.get_title(), title)
+            new_resource.get_title(), title)
         self.assertEqual(
-            new_mc.get_band_description(1)['name'], band_name)
+            new_resource.get_band_description(1).title, band_name)
         self.assertEqual(
-            new_mc.get_keywords()['keywords'], [keyword])
+            new_resource.get_keywords(), [keyword])
 
-    def test_preexisting_mc_raster_new_bands(self):
-        """MetadataControl: test existing MCF when the raster has new bands."""
+    def test_preexisting_doc_new_bands(self):
+        """Test existing metadata document when the raster has new bands."""
         import geometamaker
 
         band_name = 'The Band'
         datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
         create_raster(numpy.int16, datasource_path, n_bands=1)
-        mc = geometamaker.describe(datasource_path)
-        mc.set_band_description(1, name=band_name)
-        self.assertEqual(mc.get_band_description(1)['type'], 'integer')
-        mc.write()
+        resource = geometamaker.describe(datasource_path)
+        resource.set_band_description(1, title=band_name)
+        self.assertEqual(resource.get_band_description(1).numpy_type, 'int16')
+        resource.write()
 
         # The raster is modified after its original metadata was written
         # There's an extra band, and the datatype has changed
         create_raster(numpy.float32, datasource_path, n_bands=2)
-        new_mc = geometamaker.describe(datasource_path)
-
-        band1 = new_mc.get_band_description(1)
-        self.assertEqual(band1['name'], band_name)
-        self.assertEqual(band1['type'], 'number')
-        band2 = new_mc.get_band_description(2)
-        
self.assertEqual(band2['name'], '') - self.assertEqual(band2['type'], 'number') - - def test_preexisting_mc_vector(self): - """MetadataControl: test reading and ammending an existing MCF vector.""" - import geometamaker - - title = 'Title' - datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') - field_name = 'foo' - description = 'description' - field_map = { - field_name: list(_OGR_TYPES_VALUES_MAP)[0]} - create_vector(datasource_path, field_map) - mc = geometamaker.describe(datasource_path) - mc.set_title(title) - mc.set_field_description(field_name, abstract=description) - mc.write() - - new_mc = geometamaker.describe(datasource_path) + new_resource = geometamaker.describe(datasource_path) - self.assertEqual(new_mc.mcf['metadata']['hierarchylevel'], 'dataset') - self.assertEqual( - new_mc.get_title(), title) - self.assertEqual( - new_mc.get_field_description(field_name)['abstract'], description) + band1 = new_resource.get_band_description(1) + self.assertEqual(band1.title, '') + self.assertEqual(band1.numpy_type, 'float32') + band2 = new_resource.get_band_description(2) + self.assertEqual(band2.title, '') + self.assertEqual(band2.numpy_type, 'float32') - def test_preexisting_mc_vector_new_fields(self): - """MetadataControl: test an existing MCF for vector with new fields.""" + def test_preexisting_doc_new_fields(self): + """Test an existing metadata document for vector with new fields.""" import geometamaker datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') @@ -575,11 +446,11 @@ def test_preexisting_mc_vector_new_fields(self): field_map = { field1_name: list(_OGR_TYPES_VALUES_MAP)[0]} create_vector(datasource_path, field_map) - mc = geometamaker.describe(datasource_path) - mc.set_field_description(field1_name, abstract=description) + resource = geometamaker.describe(datasource_path) + resource.set_field_description(field1_name, description=description) self.assertEqual( - mc.get_field_description(field1_name)['type'], 'integer') - mc.write() + resource.get_field_description(field1_name).type, 'Integer') + resource.write() # Modify the dataset by changing the field type of the # existing field. And add a second field. 
@@ -588,62 +459,61 @@ def test_preexisting_mc_vector_new_fields(self): field1_name: list(_OGR_TYPES_VALUES_MAP)[2], field2_name: list(_OGR_TYPES_VALUES_MAP)[3]} create_vector(datasource_path, new_field_map) - new_mc = geometamaker.describe(datasource_path) - - field1 = new_mc.get_field_description(field1_name) - self.assertEqual(field1['abstract'], description) - self.assertEqual(field1['type'], 'number') - field2 = new_mc.get_field_description(field2_name) - self.assertEqual(field2['type'], 'string') - - def test_invalid_preexisting_mcf(self): - """MetadataControl: test overwriting an existing invalid MetadataControl.""" - import geometamaker - title = 'Title' - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) - mc.set_title(title) - - # delete a required property and ensure invalid MetadataControl - del mc.mcf['mcf'] - with self.assertRaises(ValidationError): - mc.validate() - mc.write() # intentionally writing an invalid MetadataControl - - new_mc = geometamaker.describe(datasource_path) - - # The new MetadataControl should not have values from the invalid MetadataControl - self.assertEqual( - new_mc.mcf['identification']['title'], '') - - try: - new_mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - try: - new_mc.write() - except Exception as e: - self.fail( - 'unexpected write error occurred\n' - f'{e}') + new_resource = geometamaker.describe(datasource_path) + + field1 = new_resource.get_field_description(field1_name) + # The field type changed, so the description does not carry over + self.assertEqual(field1.description, '') + self.assertEqual(field1.type, 'Real') + field2 = new_resource.get_field_description(field2_name) + self.assertEqual(field2.type, 'String') + + # TODO: this is important, still need to design for it. 
+ # def test_invalid_preexisting_mcf(self): + # """Test overwriting an existing invalid metadata document.""" + # import geometamaker + # title = 'Title' + # datasource_path = os.path.join(self.workspace_dir, 'raster.tif') + # create_raster(numpy.int16, datasource_path) + # mc = geometamaker.describe(datasource_path) + # mc.set_title(title) + + # # delete a required property and ensure invalid MetadataControl + # del mc.mcf['mcf'] + # with self.assertRaises(ValidationError): + # mc.validate() + # mc.write() # intentionally writing an invalid MetadataControl + + # new_mc = geometamaker.describe(datasource_path) + + # # The new MetadataControl should not have values from the invalid MetadataControl + # self.assertEqual( + # new_mc.mcf['identification']['title'], '') + + # try: + # new_mc.validate() + # except (MCFValidationError, SchemaError) as e: + # self.fail( + # 'unexpected validation error occurred\n' + # f'{e}') + # try: + # new_mc.write() + # except Exception as e: + # self.fail( + # 'unexpected write error occurred\n' + # f'{e}') def test_write_to_local_workspace(self): - """MetadataControl: test write metadata to a different location.""" + """Test write metadata to a different location.""" import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') create_raster(numpy.int16, datasource_path) - mc = geometamaker.describe(datasource_path) + resource = geometamaker.describe(datasource_path) temp_dir = tempfile.mkdtemp(dir=self.workspace_dir) - mc.write(workspace=temp_dir) + resource.write(workspace=temp_dir) self.assertTrue( os.path.exists(os.path.join( temp_dir, f'{os.path.basename(datasource_path)}.yml'))) - self.assertTrue( - os.path.exists(os.path.join( - temp_dir, f'{os.path.basename(datasource_path)}.xml'))) From 323b792e0bf356743ddaa20ef8f3681dc8f3d118 Mon Sep 17 00:00:00 2001 From: davemfish Date: Mon, 22 Jul 2024 12:55:02 -0400 Subject: [PATCH 09/15] cleanup in tests --- tests/test_geometamaker.py | 56 -------------------------------------- 1 file changed, 56 deletions(-) diff --git a/tests/test_geometamaker.py b/tests/test_geometamaker.py index 74405a3..5cb5019 100644 --- a/tests/test_geometamaker.py +++ b/tests/test_geometamaker.py @@ -4,18 +4,14 @@ import tempfile import unittest -from jsonschema.exceptions import SchemaError -from jsonschema.exceptions import ValidationError import numpy from osgeo import gdal from osgeo import gdal_array from osgeo import ogr from osgeo import osr -from pygeometa.core import MCFValidationError import pygeoprocessing from pygeoprocessing.geoprocessing_core import DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS import shapely -import yaml REGRESSION_DATA = os.path.join( os.path.dirname(__file__), 'data') @@ -103,23 +99,6 @@ def test_file_does_not_exist(self): with self.assertRaises(FileNotFoundError): _ = geometamaker.describe('foo.tif') - # def test_blank_geometamaker.describe(self): - # """MetadataControl: template has expected properties.""" - # import geometamaker - - # target_filepath = os.path.join(self.workspace_dir, 'mcf.yml') - - # mc = geometamaker.describe() - # mc.validate() - # mc._write_mcf(target_filepath) - - # with open(target_filepath, 'r') as file: - # actual = yaml.safe_load(file) - # with open(os.path.join(REGRESSION_DATA, 'template.yml'), 'r') as file: - # expected = yaml.safe_load(file) - - # self.assertEqual(actual, expected) - def test_describe_csv(self): """Test setting properties on csv.""" import geometamaker @@ -468,41 +447,6 @@ def test_preexisting_doc_new_fields(self): field2 = 
new_resource.get_field_description(field2_name)
         self.assertEqual(field2.type, 'String')
 
-    # TODO: this is important, still need to design for it.
-    # def test_invalid_preexisting_mcf(self):
-    #     """Test overwriting an existing invalid metadata document."""
-    #     import geometamaker
-    #     title = 'Title'
-    #     datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
-    #     create_raster(numpy.int16, datasource_path)
-    #     mc = geometamaker.describe(datasource_path)
-    #     mc.set_title(title)
-
-    #     # delete a required property and ensure invalid MetadataControl
-    #     del mc.mcf['mcf']
-    #     with self.assertRaises(ValidationError):
-    #         mc.validate()
-    #     mc.write()  # intentionally writing an invalid MetadataControl
-
-    #     new_mc = geometamaker.describe(datasource_path)
-
-    #     # The new MetadataControl should not have values from the invalid MetadataControl
-    #     self.assertEqual(
-    #         new_mc.mcf['identification']['title'], '')
-
-    #     try:
-    #         new_mc.validate()
-    #     except (MCFValidationError, SchemaError) as e:
-    #         self.fail(
-    #             'unexpected validation error occurred\n'
-    #             f'{e}')
-    #     try:
-    #         new_mc.write()
-    #     except Exception as e:
-    #         self.fail(
-    #             'unexpected write error occurred\n'
-    #             f'{e}')
-
     def test_write_to_local_workspace(self):
         """Test write metadata to a different location."""
         import geometamaker

From 3b9d36a5ca500099435d05d10bb537db279a923e Mon Sep 17 00:00:00 2001
From: davemfish
Date: Mon, 22 Jul 2024 13:31:09 -0400
Subject: [PATCH 10/15] cleanup docstrings

---
 src/geometamaker/geometamaker.py | 65 +++++++++++++++++++++------
 src/geometamaker/models.py       | 77 +++++++++++++++++---------------
 2 files changed, 91 insertions(+), 51 deletions(-)

diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py
index dfd3324..22d352c 100644
--- a/src/geometamaker/geometamaker.py
+++ b/src/geometamaker/geometamaker.py
@@ -2,14 +2,11 @@
 import logging
 import os
 import uuid
-from datetime import datetime
 
 import frictionless
 import fsspec
 import numpy
 from osgeo import gdal
-from osgeo import ogr
-from osgeo import osr
 import pygeoprocessing
 import yaml
 
@@ -20,11 +17,17 @@
 
 
 def detect_file_type(filepath):
-    # TODO: zip, or other archives. Can they be represented as a Resource?
-    # or do they need to be a Package?
+    """Detect the type of resource contained in the file.
 
-    # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters,
-    # we'll want a different data model for multi-dimensional arrays.
+    Args:
+        filepath (str): path to a file to be opened by GDAL or frictionless
+
+    Returns:
+        str
+
+    """
+    # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters.
+    # We'll likely want a different data model for multi-dimensional arrays.
 
     # GDAL considers CSV a vector, so check against frictionless
     # first.
@@ -38,16 +41,35 @@ def detect_file_type(filepath):
         return 'vector'
     if gis_type == pygeoprocessing.RASTER_TYPE:
         return 'raster'
-    raise ValueError()
+    raise ValueError(
+        f'{filepath} does not appear to be one of (archive, table, raster, vector)')
 
 
 def describe_archive(source_dataset_path):
+    """Describe file properties of a compressed file.
+
+    Args:
+        source_dataset_path (str): path to a file.
+
+    Returns:
+        dict
+
+    """
     description = frictionless.describe(
         source_dataset_path, stats=True).to_dict()
     return description
 
 
 def describe_vector(source_dataset_path):
+    """Describe properties of a GDAL vector file.
+
+    Args:
+        source_dataset_path (str): path to a GDAL vector. 
+ + Returns: + dict + + """ description = frictionless.describe( source_dataset_path, stats=True).to_dict() fields = [] @@ -72,6 +94,15 @@ def describe_vector(source_dataset_path): def describe_raster(source_dataset_path): + """Describe properties of a GDAL raster file. + + Args: + source_dataset_path (str): path to a GDAL raster. + + Returns: + dict + + """ description = frictionless.describe( source_dataset_path, stats=True).to_dict() @@ -98,6 +129,15 @@ def describe_raster(source_dataset_path): def describe_table(source_dataset_path): + """Describe properties of a tabular dataset. + + Args: + source_dataset_path (str): path to a file representing a table. + + Returns: + dict + + """ description = frictionless.describe( source_dataset_path, stats=True).to_dict() description['schema'] = models.TableSchema(**description['schema']) @@ -131,11 +171,10 @@ def describe(source_dataset_path): metadata applies Returns - instance of - ArchiveResource, TableResource, - VectorResource, RasterResource - """ + instance of ArchiveResource, TableResource, VectorResource, + or RasterResource + """ data_package_path = f'{source_dataset_path}.yml' # Despite naming, this does not open a file that must be closed @@ -187,7 +226,6 @@ def describe(source_dataset_path): new_fields.append(field) description['schema'].fields = new_fields # overwrite properties that are intrinsic to the dataset - # TODO: any other checks that the resources represent the same data? resource = dataclasses.replace( existing_resource, **description) @@ -196,4 +234,3 @@ def describe(source_dataset_path): resource = RESOURCE_MODELS[resource_type](**description) return resource - diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index c8b52cc..93e0584 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -19,6 +19,7 @@ def ignore_aliases(self, data): @dataclass class BoundingBox(): + """Class for a spatial bounding box.""" xmin: float ymin: float @@ -28,6 +29,7 @@ class BoundingBox(): @dataclass class SpatialSchema(): + """Class for keeping track of spatial info.""" bounding_box: BoundingBox crs: str @@ -58,15 +60,13 @@ class License: @dataclass class FieldSchema: - """metadata for a field in a table.""" + """Metadata for a field in a table.""" # https://datapackage.org/standard/table-schema/ - name: str = '' - title: str = '' - type: str = '' - format: str = '' - example: any = None + name: str + type: str description: str = '' + title: str = '' units: str = '' @@ -101,8 +101,8 @@ class BandSchema: gdal_type: int numpy_type: str nodata: int | float - title: str = '' description: str = '' + title: str = '' @dataclass @@ -132,36 +132,44 @@ class Resource: """Base class for metadata for a resource. https://datapackage.org/standard/data-resource/ - This class should be based on Data Package - Resource + This class borrows from the Data Package - Resource specification. But we have some additional properties that are important to us. - """ - # TODO: DP includes `sources` as list of source files - # with some amount of metadata for each item. For our - # use-case, I think a list of filenames is good enough. + All attributes are keyword-only so that we can init + with default values, allowing the user to get a template + with which to complete later. 
- path: str = '' - type: str = '' - scheme: str = '' + """ + + # These are populated by `frictionless.describe()` + bytes: int = 0 encoding: str = '' format: str = '' - mediatype: str = '' - bytes: int = 0 hash: str = '' + mediatype: str = '' name: str = '' - title: str = '' - description: str = '' - keywords: list = dataclasses.field(default_factory=list) + path: str = '' + scheme: str = '' + type: str = '' + + # DataPackage includes `sources` as a list of source files + # with some amount of metadata for each item. For our + # use-case, I think a list of filenames is good enough. sources: list = dataclasses.field(default_factory=list) - licenses: list = dataclasses.field(default_factory=list) + + # These are not populated by geometamaker citation: str = '' + contact: ContactSchema = ContactSchema() + description: str = '' doi: str = '' - url: str = '' edition: str = '' + keywords: list = dataclasses.field(default_factory=list) + licenses: list = dataclasses.field(default_factory=list) lineage: str = '' purpose: str = '' - contact: ContactSchema = ContactSchema() + title: str = '' + url: str = '' def __post_init__(self): self.metadata_path = f'{self.path}.yml' @@ -180,7 +188,7 @@ def get_title(self): return self.title def set_description(self, description): - """Add an description for the dataset. + """Add a description for the dataset. Args: description (str) @@ -420,7 +428,7 @@ def _get_field(self, name): dict) Raises: - KeyError if no attributes exist in the MCF or if the named + KeyError if no attributes exist in the resource or if the named attribute does not exist. """ @@ -434,15 +442,15 @@ def _get_field(self, name): f'{self.schema} has no field named {name}') def set_field_description(self, name, title=None, description=None, - units=None, type=None, format=None, - example=None): + units=None, type=None): """Define metadata for a tabular field. 
Args:
             name (str): name and unique identifier of the field
             title (str): title for the field
-            abstract (str): description of the field
+            description (str): description of the field
             units (str): unit of measurement for the field's values
+            type (str): datatype of values in the field
 
         """
         idx, field = self._get_field(name)
@@ -455,10 +463,6 @@ def set_field_description(self, name, title=None, description=None,
             field.units = units
         if type is not None:
             field.type = type
-        if format is not None:
-            field.format = format
-        if example is not None:
-            field.example = example
 
         self.schema.fields[idx] = field
 
@@ -469,7 +473,7 @@ def get_field_description(self, name):
             name (str): name and unique identifier of the field
 
         Returns:
-            dict
+            FieldSchema
         """
         idx, field = self._get_field(name)
         return field
@@ -512,11 +516,9 @@ def set_band_description(self, band_number, title=None,
 
         Args:
             band_number (int): a raster band index, starting at 1
-            name (str): name for the raster band
             title (str): title for the raster band
-            abstract (str): description of the raster band
+            description (str): description of the raster band
             units (str): unit of measurement for the band's pixel values
-            type (str): of the band's values, either 'integer' or 'number'
 
         """
         idx = band_number - 1
@@ -538,6 +540,7 @@ def get_band_description(self, band_number):
             band_number (int): a raster band index, starting at 1
 
         Returns:
-            dict
+            BandSchema
+
         """
         return self.schema.bands[band_number - 1]

From d62f92a017fdbbe444e50a24d31f3184fca5347f Mon Sep 17 00:00:00 2001
From: davemfish
Date: Mon, 22 Jul 2024 13:40:58 -0400
Subject: [PATCH 11/15] updates to readme, requirements, etc

---
 README.md                        | 49 +++++++++++++++++++------------
 docs/environment-rtd.yml         |  4 +--
 docs/source/conf.py              |  1 -
 requirements.txt                 |  4 +--
 src/geometamaker/geometamaker.py |  2 --
 5 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/README.md b/README.md
index 404db44..8384e47 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,10 @@
-A Python library for creating [Metadata Control Files](https://geopython.github.io/pygeometa/reference/mcf/)
+A Python library for creating human- and machine-readable metadata for geospatial data.
+
+Supported datatypes include:
+* everything supported by GDAL
+* tabular formats supported by `frictionless`
+* compressed formats supported by `frictionless`
+
 
 See `requirements.txt` for dependencies
 
@@ -7,48 +13,51 @@ See `requirements.txt` for dependencies
 #### Creating & adding metadata to file:
 
 ```python
-from geometamaker import MetadataControl
+import geometamaker
 
 data_path = 'data/watershed_gura.shp'
-mc = MetadataControl(data_path)
+resource = geometamaker.describe(data_path)
 
-mc.set_title('My Dataset')
-mc.set_abstract('all about my dataset')
-mc.set_keywords(['hydrology', 'watersheds'])
+resource.set_title('My Dataset')
+resource.set_description('all about my dataset')
+resource.set_keywords(['hydrology', 'watersheds'])
 
 # For a vector:
-mc.set_field_description(
+resource.set_field_description(
     'field_name',  # the name of an actual field in the vector's table
-    abstract='something about the field',
+    description='something about the field',
     units='mm')
 
 # or for a raster:
-mc.set_band_description(
+data_path = 'data/dem.tif'
+resource = geometamaker.describe(data_path)
+resource.set_band_description(
     1,  # a raster band index, starting at 1
-    name='band name',
-    abstract='something about the band',
+    title='band name',
+    description='something about the band',
     units='mm')
 
-mc.validate()
-mc.write()
+resource.write()
 ```
 
 #### Creating metadata for a batch of files:
 ```python
 import os
 
-from geometamaker import MetadataControl
+import geometamaker
 
 data_dir = 'C:/Users/dmf/projects/invest/data/invest-sample-data'
 for path, dirs, files in os.walk(data_dir):
     for file in files:
-        if file.endswith(('.shp', '.gpkg', '.tif')):
-            filepath = os.path.join(path, file)
-            print(filepath)
-            mc = MetadataControl(filepath)
-            mc.validate()
-            mc.write()
+        filepath = os.path.join(path, file)
+        print(filepath)
+        try:
+            resource = geometamaker.describe(filepath)
+        except ValueError as err:
+            print(err)
+            continue
+        resource.write()
 ```
 
 #### For a complete list of methods:
diff --git a/docs/environment-rtd.yml b/docs/environment-rtd.yml
index 897d210..8e2bb54 100644
--- a/docs/environment-rtd.yml
+++ b/docs/environment-rtd.yml
@@ -9,12 +9,10 @@ channels:
   - conda-forge
 dependencies:
   - python=3.8
+  - frictionless
   - fsspec
   - gdal>=3
-  - jsonschema
   - numpy
-  - pygeometa
   - pygeoprocessing>=2.4.2
-  - shapely
   - pyyaml
   - sphinx_rtd_theme
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b260b05..f75e375 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -3,7 +3,6 @@
 # For the full list of built-in configuration values, see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 
-import datetime
 import os
 import sys
 import sphinx.ext.apidoc
diff --git a/requirements.txt b/requirements.txt
index a0a431d..a9efdbd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,10 +5,8 @@
 aiohttp
 fsspec
 GDAL
-jsonschema
+frictionless
 numpy
-pygeometa
 pygeoprocessing>=2.4.3
 pyyaml
 requests
-shapely
\ No newline at end of file
diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py
index 22d352c..84559b4 100644
--- a/src/geometamaker/geometamaker.py
+++ b/src/geometamaker/geometamaker.py
@@ -1,7 +1,5 @@
 import dataclasses
 import logging
-import os
-import uuid
 
 import frictionless
 import fsspec

From 2e37827045baf112b34f211e05a38df1c67aace7 Mon Sep 17 00:00:00 2001
From: davemfish
Date: Mon, 22 Jul 2024 15:02:43 -0400
Subject: [PATCH 12/15] python version compatibility issues

---
 src/geometamaker/models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git 
a/src/geometamaker/models.py b/src/geometamaker/models.py index 93e0584..07a666c 100644 --- a/src/geometamaker/models.py +++ b/src/geometamaker/models.py @@ -1,3 +1,4 @@ +from __future__ import annotations import dataclasses from dataclasses import dataclass import logging @@ -160,7 +161,7 @@ class Resource: # These are not populated by geometamaker citation: str = '' - contact: ContactSchema = ContactSchema() + contact: ContactSchema = dataclasses.field(default_factory=ContactSchema) description: str = '' doi: str = '' edition: str = '' From e2418d85ff457c0af4f00618977c83add0085adc Mon Sep 17 00:00:00 2001 From: davemfish Date: Tue, 23 Jul 2024 16:34:31 -0400 Subject: [PATCH 13/15] fixing package path for rtd --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index f75e375..f46bf8d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -8,7 +8,7 @@ import sphinx.ext.apidoc from pkg_resources import get_distribution -sys.path.insert(0, os.path.abspath('../../src/geometamaker')) +sys.path.insert(0, os.path.abspath('../../src')) # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information From 80fb55c8198c9e554ba827f69d67a1858586ec29 Mon Sep 17 00:00:00 2001 From: davemfish Date: Wed, 24 Jul 2024 08:39:14 -0400 Subject: [PATCH 14/15] added classmethod for loading existing metadata, checking compatibility. --- src/geometamaker/__init__.py | 5 ++++ src/geometamaker/geometamaker.py | 11 ++++----- src/geometamaker/models.py | 39 +++++++++++++++++++++++++++++++- tests/test_geometamaker.py | 28 ++++++++++++++++++++++- 4 files changed, 74 insertions(+), 9 deletions(-) diff --git a/src/geometamaker/__init__.py b/src/geometamaker/__init__.py index af30c26..739417f 100644 --- a/src/geometamaker/__init__.py +++ b/src/geometamaker/__init__.py @@ -1 +1,6 @@ +import importlib.metadata + from .geometamaker import describe + + +__version__ = importlib.metadata.version('geometamaker') diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index 84559b4..e419cb8 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -173,7 +173,7 @@ def describe(source_dataset_path): or RasterResource """ - data_package_path = f'{source_dataset_path}.yml' + metadata_path = f'{source_dataset_path}.yml' # Despite naming, this does not open a file that must be closed of = fsspec.open(source_dataset_path) @@ -185,11 +185,7 @@ def describe(source_dataset_path): # Load existing metadata file try: - with fsspec.open(data_package_path, 'r') as file: - yaml_string = file.read() - - existing_resource = RESOURCE_MODELS[resource_type]( - **yaml.safe_load(yaml_string)) + existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path) if 'schema' in description: if isinstance(description['schema'], models.RasterSchema): # If existing band metadata still matches schema of the file @@ -228,7 +224,8 @@ def describe(source_dataset_path): existing_resource, **description) # Common path: metadata file does not already exist - except FileNotFoundError as err: + # Or less common, ValueError if it exists but is incompatible + except (FileNotFoundError, ValueError) as err: resource = RESOURCE_MODELS[resource_type](**description) return resource diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py index 07a666c..a50cc2f 100644 --- a/src/geometamaker/models.py +++ 
b/src/geometamaker/models.py @@ -4,8 +4,11 @@ import logging import os +import fsspec import yaml +import geometamaker + LOGGER = logging.getLogger(__name__) @@ -128,7 +131,7 @@ def __post_init__(self): self.bands = bands -@dataclass(kw_only=True) +@dataclass() class Resource: """Base class for metadata for a resource. @@ -142,6 +145,8 @@ class Resource: with which to complete later. """ + # A version string we can use to identify geometamaker compliant documents + metadata_version: str = dataclasses.field(init=False) # These are populated by `frictionless.describe()` bytes: int = 0 @@ -174,6 +179,38 @@ class Resource: def __post_init__(self): self.metadata_path = f'{self.path}.yml' + self.metadata_version: str = f'geometamaker.{geometamaker.__version__}' + + @classmethod + def load(cls, filepath): + """Load metadata document from a yaml file. + + Args: + filepath (str): path to yaml file + + Returns: + instance of the class + + Raises: + FileNotFoundError if filepath does not exist + ValueError if the metadata is found to be incompatible with + geometamaker. + + """ + with fsspec.open(filepath, 'r') as file: + yaml_string = file.read() + yaml_dict = yaml.safe_load(yaml_string) + if 'metadata_version' not in yaml_dict \ + or not yaml_dict['metadata_version'].startswith('geometamaker'): + message = (f'{filepath} exists but is not compatible with ' + f'geometamaker. It will be overwritten if write() is ' + f'called for this resource.') + LOGGER.warning(message) + raise ValueError(message) + # delete this property so that geometamaker can initialize it itself + # with the current version info. + del yaml_dict['metadata_version'] + return cls(**yaml_dict) def set_title(self, title): """Add a title for the dataset. diff --git a/tests/test_geometamaker.py b/tests/test_geometamaker.py index 5cb5019..19c9804 100644 --- a/tests/test_geometamaker.py +++ b/tests/test_geometamaker.py @@ -12,6 +12,7 @@ import pygeoprocessing from pygeoprocessing.geoprocessing_core import DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS import shapely +import yaml REGRESSION_DATA = os.path.join( os.path.dirname(__file__), 'data') @@ -93,7 +94,7 @@ def tearDown(self): shutil.rmtree(self.workspace_dir) def test_file_does_not_exist(self): - """MetadataControl: raises exception if given file does not exist.""" + """Raises exception if given file does not exist.""" import geometamaker with self.assertRaises(FileNotFoundError): @@ -447,6 +448,31 @@ def test_preexisting_doc_new_fields(self): field2 = new_resource.get_field_description(field2_name) self.assertEqual(field2.type, 'String') + def test_preexisting_incompatible_doc(self): + """Test when yaml file not created by geometamaker already exists.""" + import geometamaker + + datasource_path = os.path.join(self.workspace_dir, 'raster.tif') + target_path = f'{datasource_path}.yml' + with open(target_path, 'w') as file: + file.write(yaml.dump({'foo': 'bar'})) + create_raster(numpy.int16, datasource_path) + + # Describing a dataset that already has an incompatible yaml + # sidecar file should log a warning. 
+ with self.assertLogs('geometamaker', level='WARNING') as cm: + resource = geometamaker.describe(datasource_path) + expected_message = 'exists but is not compatible with' + self.assertIn(expected_message, ''.join(cm.output)) + + # After writing new doc, check it has expected property + resource.write() + with open(target_path, 'r') as file: + yaml_string = file.read() + yaml_dict = yaml.safe_load(yaml_string) + self.assertIn('metadata_version', yaml_dict) + self.assertIn('geometamaker', yaml_dict['metadata_version']) + def test_write_to_local_workspace(self): """Test write metadata to a different location.""" import geometamaker From d95f0254db1d4b84da4bda9d267088be0d91c13e Mon Sep 17 00:00:00 2001 From: davemfish Date: Wed, 24 Jul 2024 09:13:37 -0400 Subject: [PATCH 15/15] bump python version for RTD build --- docs/environment-rtd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/environment-rtd.yml b/docs/environment-rtd.yml index 8e2bb54..dde4957 100644 --- a/docs/environment-rtd.yml +++ b/docs/environment-rtd.yml @@ -8,7 +8,7 @@ name: env-readthedocs channels: - conda-forge dependencies: - - python=3.8 + - python=3.10 - frictionless - fsspec - gdal>=3
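
Taken together, the last patches in this series establish a round-trip workflow: `describe()` builds a resource dataclass from a dataset, `write()` serializes it to a `.yml` sidecar stamped with `metadata_version`, and the new `load()` classmethod rejects yaml documents that lack that stamp. A minimal sketch of that round trip, assuming a GeoTIFF at the illustrative path `raster.tif` and calling the inherited `load()` directly on `RasterResource` (the patches themselves only exercise `load()` through `describe()`):

```python
import geometamaker
from geometamaker import models

# Build a resource by inspecting the dataset; intrinsic properties
# (format, bytes, hash, band schema) are populated automatically.
resource = geometamaker.describe('raster.tif')

# Add the human-authored properties.
resource.set_title('My Raster')
resource.set_keywords(['foo', 'bar'])

# Write the sidecar document 'raster.tif.yml'. It now carries a
# 'metadata_version' stamp such as 'geometamaker.<package version>'.
resource.write()

# load() checks the stamp and rebuilds the dataclass; a yaml file
# that was not written by geometamaker raises ValueError instead.
reloaded = models.RasterResource.load('raster.tif.yml')
assert reloaded.get_title() == 'My Raster'
assert reloaded.get_keywords() == ['foo', 'bar']
```

The stamp is what lets `describe()` distinguish a document it can safely amend from an unrelated yaml file that happens to share the sidecar naming convention.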