diff --git a/README.md b/README.md
index 404db44..8384e47 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,10 @@
-A Python library for creating [Metadata Control Files](https://geopython.github.io/pygeometa/reference/mcf/)
+A Python library for creating human- and machine-readable metadata for geospatial data.
+
+Supported datatypes include:
+* everything supported by GDAL
+* tabular formats supported by `frictionless`
+* compressed formats supported by `frictionless`
+
 
 See `requirements.txt` for dependencies
 
@@ -7,48 +13,51 @@ See `requirements.txt` for dependencies
 
-#### Creating & adding metadata to file:
+#### Creating & adding metadata to a file:
 
 ```python
-from geometamaker import MetadataControl
+import geometamaker
 
 data_path = 'data/watershed_gura.shp'
-mc = MetadataControl(data_path)
+resource = geometamaker.describe(data_path)
 
-mc.set_title('My Dataset')
-mc.set_abstract('all about my dataset')
-mc.set_keywords(['hydrology', 'watersheds'])
+resource.set_title('My Dataset')
+resource.set_description('all about my dataset')
+resource.set_keywords(['hydrology', 'watersheds'])
 
 # For a vector:
-mc.set_field_description(
+resource.set_field_description(
     'field_name',  # the name of an actual field in the vector's table
-    abstract='something about the field',
+    description='something about the field',
     units='mm')
 
 # or for a raster:
-mc.set_band_description(
+data_path = 'data/dem.tif'
+resource = geometamaker.describe(data_path)
+resource.set_band_description(
     1,  # a raster band index, starting at 1
-    name='band name',
-    abstract='something about the band',
+    title='band name',
+    description='something about the band',
    units='mm')
 
-mc.validate()
-mc.write()
+resource.write()
 ```
 
 #### Creating metadata for a batch of files:
 
 ```python
 import os
 
-from geometamaker import MetadataControl
+import geometamaker
 
 data_dir = 'C:/Users/dmf/projects/invest/data/invest-sample-data'
 for path, dirs, files in os.walk(data_dir):
     for file in files:
-        if file.endswith(('.shp', '.gpkg', '.tif')):
-            filepath = os.path.join(path, file)
-            print(filepath)
-            mc = MetadataControl(filepath)
-            mc.validate()
-            mc.write()
+        filepath = os.path.join(path, file)
+        print(filepath)
+        try:
+            resource = geometamaker.describe(filepath)
+        except ValueError as err:
+            print(err)
+            continue
+        resource.write()
 ```
 
 #### For a complete list of methods:
diff --git a/docs/environment-rtd.yml b/docs/environment-rtd.yml
index 897d210..dde4957 100644
--- a/docs/environment-rtd.yml
+++ b/docs/environment-rtd.yml
@@ -8,13 +8,11 @@ name: env-readthedocs
 channels:
 - conda-forge
 dependencies:
-  - python=3.8
+  - python=3.10
+  - frictionless
   - fsspec
   - gdal>=3
-  - jsonschema
   - numpy
-  - pygeometa
   - pygeoprocessing>=2.4.2
-  - shapely
   - pyyaml
   - sphinx_rtd_theme
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b260b05..f46bf8d 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -3,13 +3,12 @@
 # For the full list of built-in configuration values, see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 
-import datetime
 import os
 import sys
 
 import sphinx.ext.apidoc
 from pkg_resources import get_distribution
 
-sys.path.insert(0, os.path.abspath('../../src/geometamaker'))
+sys.path.insert(0, os.path.abspath('../../src'))
 
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
diff --git a/requirements.txt b/requirements.txt
index a0a431d..a9efdbd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,10 +5,8 @@
 aiohttp
 fsspec
 GDAL
-jsonschema
+frictionless
 numpy
-pygeometa pygeoprocessing>=2.4.3 pyyaml requests -shapely \ No newline at end of file diff --git a/src/geometamaker/__init__.py b/src/geometamaker/__init__.py index 9f56a76..739417f 100644 --- a/src/geometamaker/__init__.py +++ b/src/geometamaker/__init__.py @@ -1 +1,6 @@ -from .geometamaker import MetadataControl +import importlib.metadata + +from .geometamaker import describe + + +__version__ = importlib.metadata.version('geometamaker') diff --git a/src/geometamaker/geometamaker.py b/src/geometamaker/geometamaker.py index 99b45fa..e419cb8 100644 --- a/src/geometamaker/geometamaker.py +++ b/src/geometamaker/geometamaker.py @@ -1,745 +1,231 @@ +import dataclasses import logging -import os -import uuid -from datetime import datetime +import frictionless import fsspec -import jsonschema -from jsonschema.exceptions import ValidationError -import pygeometa.core -from pygeometa.schemas import load_schema -import pygeoprocessing +import numpy from osgeo import gdal -from osgeo import ogr -from osgeo import osr +import pygeoprocessing import yaml - -# https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml -class _NoAliasDumper(yaml.SafeDumper): - """Keep the yaml human-readable by avoiding anchors and aliases.""" - - def ignore_aliases(self, data): - return True +from . import models LOGGER = logging.getLogger(__name__) -MCF_SCHEMA_FILE = os.path.join( - pygeometa.core.SCHEMAS, 'mcf', 'core.yaml') -with open(MCF_SCHEMA_FILE, 'r') as schema_file: - MCF_SCHEMA = pygeometa.core.yaml_load(schema_file) - -# modify the core MCF schema so that our default -# template MCFs have all the properties we expect -# users to use. -MCF_SCHEMA['required'].append('content_info') -MCF_SCHEMA['required'].append('dataquality') -MCF_SCHEMA['properties']['identification']['properties'][ - 'citation'] = { - 'type': 'string', - 'description': 'a biobliographic citation for the dataset' - } -MCF_SCHEMA['properties']['identification']['required'].append('citation') -MCF_SCHEMA['properties']['identification']['properties'][ - 'keywords']['patternProperties']['^.*'][ - 'required'] = ['keywords', 'keywords_type'] -# to accomodate tables that do not represent spatial content: -NO_GEOM_TYPE = 'none' -MCF_SCHEMA['properties']['spatial']['properties'][ - 'geomtype']['enum'].append(NO_GEOM_TYPE) -TABLE_CONTENT_TYPE = 'table' -MCF_SCHEMA['properties']['content_info']['properties'][ - 'type']['enum'].append(TABLE_CONTENT_TYPE) - -OGR_MCF_ATTR_TYPE_MAP = { - ogr.OFTInteger: 'integer', - ogr.OFTInteger64: 'integer', - ogr.OFTReal: 'number', - ogr.OFTString: 'string' -} - -def _get_default(item): - """Return a default value for a property. +def detect_file_type(filepath): + """Detect the type of resource contained in the file. Args: - item (dict): a jsonschema definition of a property with no children. - - Return: - a value from DEFAULT_VALUES + filepath (str): path to a file to be opened by GDAL or frictionless - Raises: - KeyError if ``item`` does not include an - 'enum', 'type', or '$ref' property. 
+ Returns + str """ - # TODO: read types from the #/definitions found in MCF_SCHEMA - # instead of hardcoding values here - # TODO: support i18n properly by using objects - # keyed by country codes to contain the array of strings - default_values = { - 'string': str(), - 'int': int(), - 'integer': int(), - 'number': float(), - 'boolean': False, - '#/definitions/date_or_datetime_string': str(), - '#/definitions/i18n_string': str(), - '#/definitions/i18n_array': list(), - '#/definitions/any_type': str(), - } - - # If there are enumerated values which must be used - try: - fixed_values = item['enum'] - # TODO: find a better way to choose the default - return fixed_values[0] - except KeyError: - pass - - # If no enumerated values, get a default value based on type - try: - t = item['type'] - except KeyError: - # When 'type' is missing, a $ref to another schema is present - try: - t = item['$ref'] - except KeyError: - raise KeyError( - f'schema has no type and no reference to a type definition\n' - f'{item}') - - return default_values[t] - - -def _get_template(schema): - """Create a minimal dictionary that is valid against ``schema``. - - The dict will ontain only the 'required' properties. + # TODO: guard against classifying netCDF, HDF5, etc as GDAL rasters. + # We'll likely want a different data model for multi-dimensional arrays. + + # GDAL considers CSV a vector, so check against frictionless + # first. + desc = frictionless.describe(filepath) + if desc.type == 'table': + return 'table' + if desc.compression: + return 'archive' + gis_type = pygeoprocessing.get_gis_type(filepath) + if gis_type == pygeoprocessing.VECTOR_TYPE: + return 'vector' + if gis_type == pygeoprocessing.RASTER_TYPE: + return 'raster' + raise ValueError( + f'{filepath} does not appear to be one of (archive, table, raster, vector)') + + +def describe_archive(source_dataset_path): + """Describe file properties of a compressed file. Args: - schema (dict): a jsonschema definition. + source_dataset_path (str): path to a file. - Return: - dict that is valid against ``schema`` - - Raises: - KeyError if a penultimate property in a schema branch - does not include an 'enum', 'type', or '$ref' property. + Returns: + dict """ - template = {} - if 'type' in schema and schema['type'] == 'object': - for prop, sch in schema['properties'].items(): - if 'required' in schema and prop not in schema['required']: - continue - if 'patternProperties' in sch: - # this item's properties can have any name matching the pattern. - # assign the name 'default' and overwite the current schema - # with a new one that explicitly includes the 'default' property. - example_sch = { - 'type': 'object', - 'required': ['default'], - 'properties': { - 'default': sch['patternProperties']['^.*'] - } - } - sch = example_sch - - if 'properties' in sch and 'anyOf' in sch['properties']: - # if 'anyOf' is a property, then we effectively want to - # treat the children of 'anyOf' as the properties instead. 
- template[prop] = { - p: _get_template(s) - for p, s in sch['properties']['anyOf'].items() - } - else: - template[prop] = _get_template(sch) - return template - - elif 'type' in schema and schema['type'] == 'array': - if 'properties' in schema: - # for the weird case where identification.extents.spatial - # is type: array but contains 'properties' instead of 'items' - return [{ - p: _get_template(s) - for p, s in schema['properties'].items() - if p in schema['required'] - }] - return [_get_template(schema['items'])] - else: - return _get_default(schema) - - -class MetadataControl(object): - """Encapsulates the Metadata Control File and methods for populating it. - - A Metadata Control File (MCF) is a YAML file that complies with the - MCF specification defined by pygeometa. - https://github.com/geopython/pygeometa - - Attributes: - datasource (string): path to dataset to which the metadata applies - mcf (dict): dict representation of the Metadata Control File - - """ - - def __init__(self, source_dataset_path=None): - """Create an MCF instance, populated with properties of the dataset. - - The MCF will be valid according to the pygeometa schema. It has - all required properties. Properties of the dataset are used to - populate as many MCF properties as possible. Default/placeholder - values are used for properties that require user input. - - Instantiating without a ``source_dataset_path`` creates an MCF template. - - Args: - source_dataset_path (string): path or URL to dataset to which the - metadata applies - - """ - self.mcf = None - if source_dataset_path is not None: - self.datasource = source_dataset_path - self.mcf_path = f'{self.datasource}.yml' - - # Despite naming, this does not open a resource that must be closed - of = fsspec.open(self.datasource) - if not of.fs.exists(self.datasource): - raise FileNotFoundError(f'{self.datasource} does not exist') - - try: - with fsspec.open(self.mcf_path, 'r') as file: - yaml_string = file.read() - - # pygeometa.core.read_mcf can parse nested MCF documents, - # where one MCF refers to another - self.mcf = pygeometa.core.read_mcf(yaml_string) - LOGGER.info(f'loaded existing metadata from {self.mcf_path}') - self.validate() - - # Common path: MCF often does not already exist - except FileNotFoundError as err: - LOGGER.debug(err) - - # Uncommon path: MCF already exists but cannot be used - except (pygeometa.core.MCFReadError, - ValidationError, AttributeError) as err: - # AttributeError in read_mcf not caught by pygeometa - LOGGER.warning(err) - self.mcf = None - - if self.mcf is None: - self.mcf = _get_template(MCF_SCHEMA) - self.mcf['metadata']['identifier'] = str(uuid.uuid4()) - - # fill all values that can be derived from the dataset - LOGGER.debug(f'getting properties from {source_dataset_path}') - self._set_spatial_info() - - else: - self.mcf = _get_template(MCF_SCHEMA) + description = frictionless.describe( + source_dataset_path, stats=True).to_dict() + return description - self.mcf['mcf']['version'] = \ - MCF_SCHEMA['properties']['mcf'][ - 'properties']['version']['const'] - def set_title(self, title): - """Add a title for the dataset. +def describe_vector(source_dataset_path): + """Describe properties of a GDAL vector file. - Args: - title (str) - - """ - self.mcf['identification']['title'] = title - - def get_title(self): - """Get the title for the dataset.""" - return self.mcf['identification']['title'] - - def set_abstract(self, abstract): - """Add an abstract for the dataset. 
- - Args: - abstract (str) - - """ - self.mcf['identification']['abstract'] = abstract - - def get_abstract(self): - """Get the abstract for the dataset.""" - return self.mcf['identification']['abstract'] - - def set_citation(self, citation): - """Add a citation string for the dataset. - - Args: - citation (str) - - """ - self.mcf['identification']['citation'] = citation - - def get_citation(self): - """Get the citation for the dataset.""" - return self.mcf['identification']['citation'] - - def set_contact(self, organization=None, individualname=None, positionname=None, - email=None, section='default', **kwargs): - """Add a contact section. - - Args: - organization (str): name of the responsible organization - individualname (str): name of the responsible person - positionname (str): role or position of the responsible person - email (str): email address of the responsible organization or individual - section (str): a header for the contact section under which to - apply the other args, since there can be more than one. - kwargs (dict): key-value pairs for any other properties listed in - the contact section of the core MCF schema. - - """ - - if organization: - self.mcf['contact'][section]['organization'] = organization - if individualname: - self.mcf['contact'][section]['individualname'] = individualname - if positionname: - self.mcf['contact'][section]['positionname'] = positionname - if email: - self.mcf['contact'][section]['email'] = email - if kwargs: - for k, v in kwargs.items(): - self.mcf['contact'][section][k] = v - - self.validate() - - def get_contact(self, section='default'): - """Get metadata from a contact section. - - Args: - section (str): a header for the contact section under which to - apply the other args, since there can be more than one. - Returns: - A dict or ``None`` if ``section`` does not exist. - - """ - return self.mcf['contact'].get(section) - - def set_doi(self, doi): - """Add a doi string for the dataset. - - Args: - doi (str) - - """ - self.mcf['identification']['doi'] = doi - - def get_doi(self): - """Get the doi for the dataset.""" - return self.mcf['identification']['doi'] - - def set_edition(self, edition): - """Set the edition for the dataset. + Args: + source_dataset_path (str): path to a GDAL vector. - Args: - edition (str): version of the cited resource + Returns: + dict - """ - self.mcf['identification']['edition'] = edition - self.validate() + """ + description = frictionless.describe( + source_dataset_path, stats=True).to_dict() + fields = [] + vector = gdal.OpenEx(source_dataset_path, gdal.OF_VECTOR) + layer = vector.GetLayer() + description['rows'] = layer.GetFeatureCount() + for fld in layer.schema: + fields.append( + models.FieldSchema(name=fld.name, type=fld.GetTypeName())) + vector = layer = None + description['schema'] = models.TableSchema(fields=fields) + description['fields'] = len(fields) + + info = pygeoprocessing.get_vector_info(source_dataset_path) + spatial = { + 'bounding_box': info['bounding_box'], + 'crs': info['projection_wkt'] + } + description['spatial'] = models.SpatialSchema(**spatial) + description['sources'] = info['file_list'] + return description - def get_edition(self): - """Get the edition of the dataset. - Returns: - str or ``None`` if ``edition`` does not exist. +def describe_raster(source_dataset_path): + """Describe properties of a GDAL raster file. - """ - return self.mcf['identification'].get('edition') + Args: + source_dataset_path (str): path to a GDAL raster. 
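+
+    Note: every band in the raster is described; per-band ``title``,
+    ``description``, or ``units`` can be added afterwards with
+    ``RasterResource.set_band_description``.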
-    def set_keywords(self, keywords, section='default', keywords_type='theme',
-                     vocabulary=None):
-        """Describe a dataset with a list of keywords.
+    Returns:
+        dict
 
-        Keywords are grouped into sections for the purpose of complying with
-        pre-exising keyword schema. A section will be overwritten if it
-        already exists.
+    """
+    description = frictionless.describe(
+        source_dataset_path, stats=True).to_dict()
+
+    bands = []
+    info = pygeoprocessing.get_raster_info(source_dataset_path)
+    # Some values of raster info are numpy types, which the
+    # yaml dumper doesn't know how to represent.
+    for i in range(info['n_bands']):
+        b = i + 1
+        bands.append(models.BandSchema(
+            index=b,
+            gdal_type=info['datatype'],
+            numpy_type=numpy.dtype(info['numpy_type']).name,
+            nodata=info['nodata'][i]))
+    description['schema'] = models.RasterSchema(
+        bands=bands,
+        pixel_size=info['pixel_size'],
+        raster_size=info['raster_size'])
+    description['spatial'] = models.SpatialSchema(
+        bounding_box=[float(x) for x in info['bounding_box']],
+        crs=info['projection_wkt'])
+    description['sources'] = info['file_list']
+    return description
+
+
+def describe_table(source_dataset_path):
+    """Describe properties of a tabular dataset.
 
-        Args:
-            keywords (list): sequence of strings
-            section (string): the name of a keywords section
-            keywords_type (string): subject matter used to group similar
-                keywords. Must be one of,
-                ('discipline', 'place', 'stratum', 'temporal', 'theme')
-            vocabulary (dict): a dictionary with 'name' and 'url' (optional)
-                keys. Used to describe the source (thesaurus) of keywords
+    Args:
+        source_dataset_path (str): path to a file representing a table.
 
-        Raises:
-            ValidationError
+    Returns:
+        dict
 
-        """
-        section_dict = {
-            'keywords': keywords,
-            'keywords_type': keywords_type
-        }
+    """
+    description = frictionless.describe(
+        source_dataset_path, stats=True).to_dict()
+    description['schema'] = models.TableSchema(**description['schema'])
+    return description
 
-        if vocabulary:
-            section_dict['vocabulary'] = vocabulary
-        self.mcf['identification']['keywords'][section] = section_dict
-        self.validate()
-
-    def get_keywords(self, section='default'):
-        return self.mcf['identification']['keywords'][section]
-
-    def set_license(self, name=None, url=None):
-        """Add a license for the dataset.
 
-        Either or both name and url are required if there is a license.
-        Call with no arguments to remove access constraints and license
-        info.
+DESCRIBE_FUNCS = {
+    'archive': describe_archive,
+    'table': describe_table,
+    'vector': describe_vector,
+    'raster': describe_raster
+}
 
-        Args:
-            name (str): name of the license of the source dataset
-            url (str): url for the license
-
-        """
-        # MCF spec says use 'otherRestrictions' to mean no restrictions
-        constraints = 'otherRestrictions'
-        if name or url:
-            constraints = 'license'
+RESOURCE_MODELS = {
+    'archive': models.ArchiveResource,
+    'table': models.TableResource,
+    'vector': models.VectorResource,
+    'raster': models.RasterResource
+}
 
-        license_dict = {}
-        license_dict['name'] = name if name else ''
-        license_dict['url'] = url if url else ''
-        self.mcf['identification']['license'] = license_dict
-        self.mcf['identification']['accessconstraints'] = constraints
-        self.validate()
 
-    def get_license(self):
-        """Get ``license`` for the dataset.
+def describe(source_dataset_path):
+    """Create a metadata resource instance with properties of the dataset.
 
-        Returns:
-            dict or ``None`` if ``license`` does not exist.
+    Properties of the dataset are used to populate as many metadata
+    properties as possible. Default/placeholder
+    values are used for properties that require user input.
 
-        """
-        return self.mcf['identification'].get('license')
+    Args:
+        source_dataset_path (string): path or URL to dataset to which the
+            metadata applies
 
-    def set_lineage(self, statement):
-        """Set the lineage statement for the dataset.
+    Returns:
+        instance of ArchiveResource, TableResource, VectorResource,
+        or RasterResource
 
-        Args:
-            statement (str): general explanation describing the lineage or provenance
-                of the dataset
+    """
+    metadata_path = f'{source_dataset_path}.yml'
 
-        """
-        self.mcf['dataquality']['lineage']['statement'] = statement
-        self.validate()
+    # Despite naming, this does not open a file that must be closed
+    of = fsspec.open(source_dataset_path)
+    if not of.fs.exists(source_dataset_path):
+        raise FileNotFoundError(f'{source_dataset_path} does not exist')
 
-    def get_lineage(self):
-        """Get the lineage statement of the dataset.
+    resource_type = detect_file_type(source_dataset_path)
+    description = DESCRIBE_FUNCS[resource_type](source_dataset_path)
 
-        Returns:
-            str or ``None`` if ``lineage`` does not exist.
-
-        """
-        return self.mcf['dataquality']['lineage'].get('statement')
-
-    def set_purpose(self, purpose):
-        """Add a purpose for the dataset.
-
-        Args:
-            purpose (str): description of the purpose of the source dataset
-
-        """
-        # 'Purpose' is not supported in the core MCF spec, probably because
-        # `<purpose>` was added to ISO-19115 in 2014, and MCF still only
-        # supports 2015. For now, we can add `purpose` in `identification`.
-        # Later we can move it elsewhere if it becomes formally supported.
-        self.mcf['identification']['purpose'] = purpose
-        self.validate()
-
-    def get_purpose(self):
-        """Get ``purpose`` for the dataset.
-
-        Returns:
-            str or ``None`` if ``purpose`` does not exist.
-
-        """
-        return self.mcf['identification'].get('purpose')
-
-    def set_url(self, url):
-        """Add a url for the dataset.
-
-        Args:
-            url (str)
-
-        """
-        self.mcf['identification']['url'] = url
-
-    def get_url(self):
-        """Get the url for the dataset."""
-        return self.mcf['identification']['url']
-
-    def set_band_description(self, band_number, name=None, title=None,
-                             abstract=None, units=None, type=None):
-        """Define metadata for a raster band.
-
-        Args:
-            band_number (int): a raster band index, starting at 1
-            name (str): name for the raster band
-            title (str): title for the raster band
-            abstract (str): description of the raster band
-            units (str): unit of measurement for the band's pixel values
-            type (str): of the band's values, either 'integer' or 'number'
-
-        """
-        idx = band_number - 1
-        attribute = self.mcf['content_info']['attributes'][idx]
-
-        if name is not None:
-            attribute['name'] = name
-        if title is not None:
-            attribute['title'] = title
-        if abstract is not None:
-            attribute['abstract'] = abstract
-        if units is not None:
-            attribute['units'] = units
-        if type is not None:
-            attribute['type'] = type
-
-        self.mcf['content_info']['attributes'][idx] = attribute
-
-    def get_band_description(self, band_number):
-        """Get the attribute metadata for a band.
-
-        Args:
-            band_number (int): a raster band index, starting at 1
-
-        Returns:
-            dict
-        """
-        return self.mcf['content_info']['attributes'][band_number - 1]
-
-    def _get_attr(self, name):
-        """Get an attribute by its name property.
- - Args: - name (string): to match the value of the 'name' key in a dict - - Returns: - tuple of (list index of the matching attribute, the attribute - dict) - - Raises: - KeyError if no attributes exist in the MCF or if the named - attribute does not exist. - - """ - if len(self.mcf['content_info']['attributes']) == 0: - raise KeyError( - f'{self.datasource} MCF has not attributes') - for idx, attr in enumerate(self.mcf['content_info']['attributes']): - if attr['name'] == name: - return idx, attr - raise KeyError( - f'{self.datasource} has no attribute named {name}') - - def set_field_description(self, name, title=None, abstract=None, - units=None, type=None): - """Define metadata for a tabular field. - - Args: - name (str): name and unique identifier of the field - title (str): title for the field - abstract (str): description of the field - units (str): unit of measurement for the field's values - - """ - idx, attribute = self._get_attr(name) - - if title is not None: - attribute['title'] = title - if abstract is not None: - attribute['abstract'] = abstract - if units is not None: - attribute['units'] = units - if type is not None: - attribute['type'] = type - - self.mcf['content_info']['attributes'][idx] = attribute - - def get_field_description(self, name): - """Get the attribute metadata for a field. - - Args: - name (str): name and unique identifier of the field - - Returns: - dict - """ - idx, attribute = self._get_attr(name) - return attribute - - def _write_mcf(self, target_path): - with open(target_path, 'w') as file: - file.write(yaml.dump(self.mcf, Dumper=_NoAliasDumper)) - - def write(self, workspace=None): - """Write MCF and ISO-19139 XML to disk. - - This creates sidecar files with '.yml' and '.xml' extensions - appended to the full filename of the data source. For example, - - - 'myraster.tif' - - 'myraster.tif.yml' - - 'myraster.tif.xml' - - Args: - workspace (str): if ``None``, files write to the same location - as the source data. If not ``None``, a path to a local directory - to write files. They will still be named to match the source - filename. Use this option if the source data is not on the local - filesystem. - - """ - if workspace is None: - target_mcf_path = self.mcf_path - target_xml_path = f'{self.datasource}.xml' - else: - target_mcf_path = os.path.join( - workspace, f'{os.path.basename(self.datasource)}.yml') - target_xml_path = os.path.join( - workspace, f'{os.path.basename(self.datasource)}.xml') - - self.mcf['metadata']['datestamp'] = datetime.utcnow().strftime( - '%Y-%m-%d') - self._write_mcf(target_mcf_path) - - schema_obj = load_schema('iso19139') - xml_string = schema_obj.write(self.mcf) - with open(target_xml_path, 'w') as xmlfile: - xmlfile.write(xml_string) - - def validate(self): - """Validate MCF against a jsonschema object.""" - # validate against our own schema, which could - # be a superset of the core MCF schema. 
- # If we wanted to validate against core MCF, - # we could use pygeometa.core.validate_mcf - jsonschema.validate(self.mcf, MCF_SCHEMA) - - def to_string(self): - pass - - def _set_spatial_info(self): - """Populate the MCF using spatial properties of the dataset.""" - gis_type = pygeoprocessing.get_gis_type(self.datasource) - self.mcf['metadata']['hierarchylevel'] = 'dataset' - - if gis_type == pygeoprocessing.VECTOR_TYPE: - LOGGER.debug('opening as GDAL vector') - self.mcf['content_info']['type'] = 'coverage' - self.mcf['spatial']['datatype'] = 'vector' - open_options = [] - - if os.path.splitext(self.datasource)[1] == '.csv': - self.mcf['spatial']['datatype'] = 'textTable' - open_options.append('AUTODETECT_TYPE=YES') - - vector = gdal.OpenEx(self.datasource, gdal.OF_VECTOR, - open_options=open_options) - layer = vector.GetLayer() - layer_defn = layer.GetLayerDefn() - geomname = ogr.GeometryTypeToName(layer_defn.GetGeomType()) - geomtype = NO_GEOM_TYPE - # https://www.fgdc.gov/nap/metadata/register/codelists.html - if 'Point' in geomname: - geomtype = 'point' - if 'Polygon' in geomname: - geomtype = 'surface' - if 'Line' in geomname: - geomtype = 'curve' - if 'Collection' in geomname: - geomtype = 'complex' - self.mcf['spatial']['geomtype'] = geomtype - - if len(layer.schema) and 'attributes' not in self.mcf['content_info']: - self.mcf['content_info']['attributes'] = [] - - for field in layer.schema: - try: - idx, attribute = self._get_attr(field.name) - except KeyError: - attribute = _get_template( - MCF_SCHEMA['properties']['content_info']['properties'][ - 'attributes'])[0] - attribute['name'] = field.name - self.mcf['content_info']['attributes'].append( - attribute) - - try: - datatype = OGR_MCF_ATTR_TYPE_MAP[field.type] - except KeyError: - LOGGER.warning( - f'{field.type} is missing in the OGR-to-MCF ' - f'attribute type map; attribute type for field ' - f'{field.name} will be "object".') - datatype = 'object' - self.set_field_description(field.name, type=datatype) - - vector = None - layer = None - - gis_info = pygeoprocessing.get_vector_info(self.datasource) - - if gis_type == pygeoprocessing.RASTER_TYPE: - LOGGER.debug('opening as GDAL raster') - self.mcf['spatial']['datatype'] = 'grid' - self.mcf['spatial']['geomtype'] = 'surface' - self.mcf['content_info']['type'] = 'image' - - raster = gdal.OpenEx(self.datasource, gdal.OF_RASTER) - - attr = _get_template( - MCF_SCHEMA['properties']['content_info']['properties'][ - 'attributes'])[0] - - if 'attributes' not in self.mcf['content_info']: - self.mcf['content_info']['attributes'] = [attr]*raster.RasterCount - else: - n_attrs = len(self.mcf['content_info']['attributes']) - if n_attrs < raster.RasterCount: - extend_n = raster.RasterCount - n_attrs - self.mcf['content_info']['attributes'].extend( - [attr]*extend_n) - - for i in range(raster.RasterCount): - b = i + 1 - band = raster.GetRasterBand(b) - datatype = 'integer' if band.DataType < 6 else 'number' - self.set_band_description(b, type=datatype) - band = None - raster = None - - gis_info = pygeoprocessing.get_raster_info(self.datasource) - - if gis_info['projection_wkt']: - try: - srs = osr.SpatialReference() - srs.ImportFromWkt(gis_info['projection_wkt']) - epsg = srs.GetAttrValue('AUTHORITY', 1) - except TypeError: - LOGGER.warning( - f'could not import a spatial reference system from ' - f'"projection_wkt" in {gis_info}') - epsg = '' - # for human-readable values after yaml dump, use python types - # instead of numpy types - bbox = [float(x) for x in gis_info['bounding_box']] - 
spatial_info = [{ - 'bbox': bbox, - 'crs': epsg # MCF does not support WKT here - }] - self.mcf['identification']['extents']['spatial'] = spatial_info + # Load existing metadata file + try: + existing_resource = RESOURCE_MODELS[resource_type].load(metadata_path) + if 'schema' in description: + if isinstance(description['schema'], models.RasterSchema): + # If existing band metadata still matches schema of the file + # carry over metadata from the existing file because it could + # include human-defined properties. + new_bands = [] + for band in description['schema'].bands: + try: + eband = existing_resource.get_band_description(band.index) + # TODO: rewrite this as __eq__ of BandSchema? + if (band.numpy_type, band.gdal_type, band.nodata) == ( + eband.numpy_type, eband.gdal_type, eband.nodata): + band = dataclasses.replace(band, **eband.__dict__) + except IndexError: + pass + new_bands.append(band) + description['schema'].bands = new_bands + if isinstance(description['schema'], models.TableSchema): + # If existing field metadata still matches schema of the file + # carry over metadata from the existing file because it could + # include human-defined properties. + new_fields = [] + for field in description['schema'].fields: + try: + efield = existing_resource.get_field_description( + field.name) + # TODO: rewrite this as __eq__ of FieldSchema? + if field.type == efield.type: + field = dataclasses.replace(field, **efield.__dict__) + except KeyError: + pass + new_fields.append(field) + description['schema'].fields = new_fields + # overwrite properties that are intrinsic to the dataset + resource = dataclasses.replace( + existing_resource, **description) + + # Common path: metadata file does not already exist + # Or less common, ValueError if it exists but is incompatible + except (FileNotFoundError, ValueError) as err: + resource = RESOURCE_MODELS[resource_type](**description) + + return resource diff --git a/src/geometamaker/models.py b/src/geometamaker/models.py new file mode 100644 index 0000000..a50cc2f --- /dev/null +++ b/src/geometamaker/models.py @@ -0,0 +1,584 @@ +from __future__ import annotations +import dataclasses +from dataclasses import dataclass +import logging +import os + +import fsspec +import yaml + +import geometamaker + + +LOGGER = logging.getLogger(__name__) + + +# https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml +class _NoAliasDumper(yaml.SafeDumper): + """Keep the yaml human-readable by avoiding anchors and aliases.""" + + def ignore_aliases(self, data): + return True + + +@dataclass +class BoundingBox(): + """Class for a spatial bounding box.""" + + xmin: float + ymin: float + xmax: float + ymax: float + + +@dataclass +class SpatialSchema(): + """Class for keeping track of spatial info.""" + + bounding_box: BoundingBox + crs: str + + +@dataclass +class ContactSchema: + """Class for keeping track of contact info.""" + + email: str = '' + organization: str = '' + individual_name: str = '' + position_name: str = '' + + +@dataclass +class License: + """Class for storing license info.""" + + # https://datapackage.org/profiles/2.0/dataresource.json + # This profile also includes `name`, described as: + # "MUST be an Open Definition license identifier", + # see http://licenses.opendefinition.org/" + # I don't think that's useful to us yet. 
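+    # For example (values as used in the test suite):
+    #   title: 'CC-BY-4.0'
+    #   path: 'https://creativecommons.org/licenses/by/4.0/'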
+    path: str
+    title: str
+
+
+@dataclass
+class FieldSchema:
+    """Metadata for a field in a table."""
+
+    # https://datapackage.org/standard/table-schema/
+    name: str
+    type: str
+    description: str = ''
+    title: str = ''
+    units: str = ''
+
+
+@dataclass
+class TableSchema:
+    """Class for metadata for tables."""
+
+    # https://datapackage.org/standard/table-schema/
+    fields: list = dataclasses.field(default_factory=list)
+    missingValues: list = dataclasses.field(default_factory=list)
+    primaryKey: list = dataclasses.field(default_factory=list)
+    foreignKeys: list = dataclasses.field(default_factory=list)
+
+    def __post_init__(self):
+        field_schemas = []
+        for field in self.fields:
+            # Allow init of the resource with a schema of type
+            # FieldSchema, or type dict. Mostly because dataclasses.replace
+            # calls init, but the base object will have already been initialized.
+            if isinstance(field, FieldSchema):
+                field_schemas.append(field)
+            else:
+                field_schemas.append(FieldSchema(**field))
+        self.fields = field_schemas
+
+
+@dataclass
+class BandSchema:
+    """Class for metadata for a raster band."""
+
+    index: int
+    gdal_type: int
+    numpy_type: str
+    nodata: int | float
+    description: str = ''
+    title: str = ''
+    units: str = ''
+
+
+@dataclass
+class RasterSchema:
+    """Class for metadata for raster bands."""
+
+    bands: list
+    pixel_size: list
+    raster_size: list
+
+    def __post_init__(self):
+        bands = []
+        for band in self.bands:
+            # When loading an existing document
+            # from serialized data we need to init a BandSchema for
+            # each band dict. But it's also okay to init a RasterSchema
+            # with bands as list of BandSchema.
+            if isinstance(band, BandSchema):
+                bands.append(band)
+            else:
+                bands.append(BandSchema(**band))
+        self.bands = bands
+
+
+@dataclass()
+class Resource:
+    """Base class for metadata for a resource.
+
+    https://datapackage.org/standard/data-resource/
+    This class borrows from the Data Package - Resource
+    specification. But we have some additional properties
+    that are important to us.
+
+    All attributes are keyword-only so that we can init
+    with default values, allowing the user to get a template
+    with which to complete later.
+
+    """
+    # A version string we can use to identify geometamaker compliant documents
+    metadata_version: str = dataclasses.field(init=False)
+
+    # These are populated by `frictionless.describe()`
+    bytes: int = 0
+    encoding: str = ''
+    format: str = ''
+    hash: str = ''
+    mediatype: str = ''
+    name: str = ''
+    path: str = ''
+    scheme: str = ''
+    type: str = ''
+
+    # DataPackage includes `sources` as a list of source files
+    # with some amount of metadata for each item. For our
+    # use-case, I think a list of filenames is good enough.
+    sources: list = dataclasses.field(default_factory=list)
+
+    # These are not populated by geometamaker
+    citation: str = ''
+    contact: ContactSchema = dataclasses.field(default_factory=ContactSchema)
+    description: str = ''
+    doi: str = ''
+    edition: str = ''
+    keywords: list = dataclasses.field(default_factory=list)
+    licenses: list = dataclasses.field(default_factory=list)
+    lineage: str = ''
+    purpose: str = ''
+    title: str = ''
+    url: str = ''
+
+    def __post_init__(self):
+        self.metadata_path = f'{self.path}.yml'
+        self.metadata_version: str = f'geometamaker.{geometamaker.__version__}'
+
+    @classmethod
+    def load(cls, filepath):
+        """Load metadata document from a yaml file.
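+
+        For example, to load an existing raster metadata document
+        (the path here is hypothetical)::
+
+            resource = RasterResource.load('raster.tif.yml')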
+
+        Args:
+            filepath (str): path to yaml file
+
+        Returns:
+            instance of the class
+
+        Raises:
+            FileNotFoundError if filepath does not exist
+            ValueError if the metadata is found to be incompatible with
+                geometamaker.
+
+        """
+        with fsspec.open(filepath, 'r') as file:
+            yaml_string = file.read()
+        yaml_dict = yaml.safe_load(yaml_string)
+        if 'metadata_version' not in yaml_dict \
+                or not yaml_dict['metadata_version'].startswith('geometamaker'):
+            message = (f'{filepath} exists but is not compatible with '
+                       f'geometamaker. It will be overwritten if write() is '
+                       f'called for this resource.')
+            LOGGER.warning(message)
+            raise ValueError(message)
+        # delete this property so that geometamaker can initialize it itself
+        # with the current version info.
+        del yaml_dict['metadata_version']
+        return cls(**yaml_dict)
+
+    def set_title(self, title):
+        """Add a title for the dataset.
+
+        Args:
+            title (str)
+
+        """
+        self.title = title
+
+    def get_title(self):
+        """Get the title for the dataset."""
+        return self.title
+
+    def set_description(self, description):
+        """Add a description for the dataset.
+
+        Args:
+            description (str)
+
+        """
+        self.description = description
+
+    def get_description(self):
+        """Get the description for the dataset."""
+        return self.description
+
+    def set_citation(self, citation):
+        """Add a citation string for the dataset.
+
+        Args:
+            citation (str)
+
+        """
+        self.citation = citation
+
+    def get_citation(self):
+        """Get the citation for the dataset."""
+        return self.citation
+
+    def set_contact(self, organization=None, individual_name=None,
+                    position_name=None, email=None):
+        """Add a contact section.
+
+        Args:
+            organization (str): name of the responsible organization
+            individual_name (str): name of the responsible person
+            position_name (str): role or position of the responsible person
+            email (str): address of the responsible organization or individual
+
+        """
+
+        if organization is not None:
+            self.contact.organization = organization
+        if individual_name is not None:
+            self.contact.individual_name = individual_name
+        if position_name is not None:
+            self.contact.position_name = position_name
+        if email is not None:
+            self.contact.email = email
+
+    def get_contact(self):
+        """Get metadata from a contact section.
+
+        Returns:
+            ContactSchema
+
+        """
+        return self.contact
+
+    def set_doi(self, doi):
+        """Add a doi string for the dataset.
+
+        Args:
+            doi (str)
+
+        """
+        self.doi = doi
+
+    def get_doi(self):
+        """Get the doi for the dataset."""
+        return self.doi
+
+    def set_edition(self, edition):
+        """Set the edition for the dataset.
+
+        Args:
+            edition (str): version of the cited resource
+
+        """
+        self.edition = edition
+
+    def get_edition(self):
+        """Get the edition of the dataset.
+
+        Returns:
+            str
+
+        """
+        return self.edition
+
+    def set_keywords(self, keywords):
+        """Describe a dataset with a list of keywords.
+
+        Args:
+            keywords (list): sequence of strings
+
+        """
+        self.keywords = keywords
+
+    def get_keywords(self):
+        return self.keywords
+
+    def set_license(self, title=None, path=None):
+        """Add a license for the dataset.
+
+        Either or both title and path are required if there is a license.
+        Call with no arguments to remove license info.
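+
+        For example::
+
+            resource.set_license(
+                title='CC-BY-4.0',
+                path='https://creativecommons.org/licenses/by/4.0/')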
+ + Args: + title (str): human-readable title of the license + path (str): url for the license + + """ + license_dict = {} + license_dict['title'] = title if title else '' + license_dict['path'] = path if path else '' + + # TODO: DataPackage/Resource allows for a list of licenses. + # So far we only support one license per resource. + self.licenses = [License(**license_dict)] + + def get_license(self): + """Get ``license`` for the dataset. + + Returns: + models.License + + """ + # TODO: DataPackage/Resource allows for a list of licenses. + # So far we only support one license per resource. + if self.licenses: + return self.licenses[0] + + def set_lineage(self, statement): + """Set the lineage statement for the dataset. + + Args: + statement (str): general explanation describing the lineage or + provenance of the dataset + + """ + self.lineage = statement + + def get_lineage(self): + """Get the lineage statement of the dataset. + + Returns: + str + + """ + return self.lineage + + def set_purpose(self, purpose): + """Add a purpose for the dataset. + + Args: + purpose (str): description of the purpose of the source dataset + + """ + self.purpose = purpose + + def get_purpose(self): + """Get ``purpose`` for the dataset. + + Returns: + str + + """ + return self.purpose + + def set_url(self, url): + """Add a url for the dataset. + + Args: + url (str) + + """ + self.url = url + + def get_url(self): + """Get the url for the dataset.""" + return self.url + + def write(self, workspace=None): + """Write datapackage yaml to disk. + + This creates sidecar files with '.yml' + appended to the full filename of the data source. For example, + + - 'myraster.tif' + - 'myraster.tif.yml' + + Args: + workspace (str): if ``None``, files write to the same location + as the source data. If not ``None``, a path to a local directory + to write files. They will still be named to match the source + filename. Use this option if the source data is not on the local + filesystem. + + """ + if workspace is None: + target_path = self.metadata_path + else: + target_path = os.path.join( + workspace, os.path.basename(self.metadata_path)) + + with open(target_path, 'w') as file: + file.write(yaml.dump( + dataclasses.asdict(self), Dumper=_NoAliasDumper)) + + def to_string(self): + pass + + +@dataclass(kw_only=True) +class TableResource(Resource): + """Class for metadata for a table resource.""" + + fields: int + rows: int + # without post-init, schema ends up as a dict, or whatever is passed in. + schema: TableSchema = dataclasses.field(default_factory=TableSchema) + + def __post_init__(self): + super().__post_init__() + # Allow init of the resource with a schema of type + # TableSchema, or type dict. Mostly because dataclasses.replace + # calls init, but the base object will have already been initialized. + if isinstance(self.schema, TableSchema): + return + self.schema = TableSchema(**self.schema) + + def _get_field(self, name): + """Get an attribute by its name property. + + Args: + name (string): to match the value of the 'name' key in a dict + + Returns: + tuple of (list index of the matching attribute, the attribute + dict) + + Raises: + KeyError if no attributes exist in the resource or if the named + attribute does not exist. 
+ + """ + if len(self.schema.fields) == 0: + raise KeyError( + f'{self.schema} has no fields') + for idx, field in enumerate(self.schema.fields): + if field.name == name: + return idx, field + raise KeyError( + f'{self.schema} has no field named {name}') + + def set_field_description(self, name, title=None, description=None, + units=None, type=None): + """Define metadata for a tabular field. + + Args: + name (str): name and unique identifier of the field + title (str): title for the field + description (str): description of the field + units (str): unit of measurement for the field's values + type (str): datatype of values in the field + + """ + idx, field = self._get_field(name) + + if title is not None: + field.title = title + if description is not None: + field.description = description + if units is not None: + field.units = units + if type is not None: + field.type = type + + self.schema.fields[idx] = field + + def get_field_description(self, name): + """Get the attribute metadata for a field. + + Args: + name (str): name and unique identifier of the field + + Returns: + FieldSchema + """ + idx, field = self._get_field(name) + return field + + +@dataclass(kw_only=True) +class ArchiveResource(Resource): + """Class for metadata for an archive resource.""" + + compression: str + innerpath: str + + +@dataclass(kw_only=True) +class VectorResource(TableResource): + """Class for metadata for a vector resource.""" + + spatial: SpatialSchema + + +@dataclass(kw_only=True) +class RasterResource(Resource): + """Class for metadata for a raster resource.""" + + schema: RasterSchema + spatial: SpatialSchema + + def __post_init__(self): + super().__post_init__() + # Allow init of the resource with a schema of type + # RasterSchema, or type dict. Mostly because dataclasses.replace + # calls init, but the base object will have already been initialized. + if isinstance(self.schema, RasterSchema): + return + self.schema = RasterSchema(**self.schema) + + def set_band_description(self, band_number, title=None, + description=None, units=None): + """Define metadata for a raster band. + + Args: + band_number (int): a raster band index, starting at 1 + title (str): title for the raster band + description (str): description of the raster band + units (str): unit of measurement for the band's pixel values + + """ + idx = band_number - 1 + band = self.schema.bands[idx] + + if title is not None: + band.title = title + if description is not None: + band.description = description + if units is not None: + band.units = units + + self.schema.bands[idx] = band + + def get_band_description(self, band_number): + """Get the attribute metadata for a band. 
+ + Args: + band_number (int): a raster band index, starting at 1 + + Returns: + BandSchema + + """ + return self.schema.bands[band_number - 1] diff --git a/tests/test_geometamaker.py b/tests/test_geometamaker.py index 0a3d455..19c9804 100644 --- a/tests/test_geometamaker.py +++ b/tests/test_geometamaker.py @@ -4,14 +4,11 @@ import tempfile import unittest -from jsonschema.exceptions import SchemaError -from jsonschema.exceptions import ValidationError import numpy from osgeo import gdal from osgeo import gdal_array from osgeo import ogr from osgeo import osr -from pygeometa.core import MCFValidationError import pygeoprocessing from pygeoprocessing.geoprocessing_core import DEFAULT_GTIFF_CREATION_TUPLE_OPTIONS import shapely @@ -97,32 +94,15 @@ def tearDown(self): shutil.rmtree(self.workspace_dir) def test_file_does_not_exist(self): - """MetadataControl: raises exception if given file does not exist.""" - from geometamaker import MetadataControl + """Raises exception if given file does not exist.""" + import geometamaker with self.assertRaises(FileNotFoundError): - _ = MetadataControl('foo.tif') + _ = geometamaker.describe('foo.tif') - def test_blank_MetadataControl(self): - """MetadataControl: template has expected properties.""" - from geometamaker import MetadataControl - - target_filepath = os.path.join(self.workspace_dir, 'mcf.yml') - - mc = MetadataControl() - mc.validate() - mc._write_mcf(target_filepath) - - with open(target_filepath, 'r') as file: - actual = yaml.safe_load(file) - with open(os.path.join(REGRESSION_DATA, 'template.yml'), 'r') as file: - expected = yaml.safe_load(file) - - self.assertEqual(actual, expected) - - def test_csv_MetadataControl(self): - """MetadataControl: validate basic csv MetadataControl.""" - from geometamaker import MetadataControl + def test_describe_csv(self): + """Test setting properties on csv.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'data.csv') field_names = ['Strings', 'Ints', 'Reals'] @@ -132,49 +112,37 @@ def test_csv_MetadataControl(self): writer.writerow(field_names) writer.writerow(field_values) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') + resource = geometamaker.describe(datasource_path) self.assertEqual( - len(mc.mcf['content_info']['attributes']), + len(resource.schema.fields), len(field_names)) - self.assertEqual(mc.get_field_description('Strings')['type'], 'string') - self.assertEqual(mc.get_field_description('Ints')['type'], 'integer') - self.assertEqual(mc.get_field_description('Reals')['type'], 'number') + self.assertEqual(resource.get_field_description('Strings').type, 'string') + self.assertEqual(resource.get_field_description('Ints').type, 'integer') + self.assertEqual(resource.get_field_description('Reals').type, 'number') title = 'title' - abstract = 'some abstract' + description = 'some abstract' units = 'mm' - mc.set_field_description( + resource.set_field_description( field_names[1], title=title, - abstract=abstract) + description=description) # To demonstrate that properties can be added while preserving others - mc.set_field_description( + resource.set_field_description( field_names[1], units=units) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - - attr = [attr for attr in mc.mcf['content_info']['attributes'] - if attr['name'] == field_names[1]][0] - 
self.assertEqual(attr['title'], title) - self.assertEqual(attr['abstract'], abstract) - self.assertEqual(attr['units'], units) - - def test_bad_csv_MetadataControl(self): + + field = [field for field in resource.schema.fields + if field.name == field_names[1]][0] + self.assertEqual(field.title, title) + self.assertEqual(field.description, description) + self.assertEqual(field.units, units) + + def test_describe_bad_csv(self): """MetadataControl: CSV with extra item in row does not fail.""" - from geometamaker import MetadataControl + import geometamaker - datasource_path = os.path.join('data.csv') + datasource_path = os.path.join(self.workspace_dir, 'data.csv') field_names = ['Strings', 'Ints', 'Reals'] field_values = ['foo', 1, 0.9, 'extra'] with open(datasource_path, 'w') as file: @@ -182,24 +150,19 @@ def test_bad_csv_MetadataControl(self): writer.writerow(field_names) writer.writerow(field_values) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() + resource = geometamaker.describe(datasource_path) + + resource.write() self.assertEqual( - len(mc.mcf['content_info']['attributes']), + len(resource.schema.fields), len(field_names)) - self.assertEqual(mc.get_field_description('Strings')['type'], 'string') - self.assertEqual(mc.get_field_description('Ints')['type'], 'integer') - self.assertEqual(mc.get_field_description('Reals')['type'], 'number') + self.assertEqual(resource.get_field_description('Strings').type, 'string') + self.assertEqual(resource.get_field_description('Ints').type, 'integer') + self.assertEqual(resource.get_field_description('Reals').type, 'number') - def test_vector_MetadataControl(self): - """MetadataControl: validate basic vector MetadataControl.""" - from geometamaker import MetadataControl + def test_describe_vector(self): + """Test basic vector.""" + import geometamaker field_map = { f'field_{k}': k @@ -213,454 +176,249 @@ def test_vector_MetadataControl(self): self.workspace_dir, f'vector.{ext}') create_vector(datasource_path, field_map, driver) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() + resource = geometamaker.describe(datasource_path) + self.assertTrue(isinstance( + resource.spatial, geometamaker.models.SpatialSchema)) + + resource.write() self.assertTrue(os.path.exists(f'{datasource_path}.yml')) - def test_vector_no_fields(self): - """MetadataControl: validate MetadataControl for basic vector with no fields.""" - from geometamaker import MetadataControl + def test_describe_vector_no_fields(self): + """Test metadata for basic vector with no fields.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') create_vector(datasource_path, None) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() + resource = geometamaker.describe(datasource_path) + self.assertEqual(len(resource.schema.fields), 0) - def test_raster_MetadataControl(self): - """MetadataControl: validate basic raster MetadataControl.""" - from geometamaker import MetadataControl + def test_describe_raster(self): + """Test metadata for basic raster.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') 
create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - mc.write() + resource = geometamaker.describe(datasource_path) + self.assertTrue(isinstance( + resource.spatial, geometamaker.models.SpatialSchema)) - def test_vector_attributes(self): - """MetadataControl: validate vector with extra attribute metadata.""" - from geometamaker import MetadataControl - - datasource_path = os.path.join(self.workspace_dir, 'vector.geojson') - field_name = 'foo' - field_map = { - field_name: list(_OGR_TYPES_VALUES_MAP)[0]} - create_vector(datasource_path, field_map) - - mc = MetadataControl(datasource_path) - title = 'title' - abstract = 'some abstract' - units = 'mm' - mc.set_field_description( - field_name, - title=title, - abstract=abstract) - # To demonstrate that properties can be added while preserving others - mc.set_field_description( - field_name, - units=units) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') - - self.assertEqual( - len(mc.mcf['content_info']['attributes']), - len(field_map)) - attr = [attr for attr in mc.mcf['content_info']['attributes'] - if attr['name'] == field_name][0] - self.assertEqual(attr['title'], title) - self.assertEqual(attr['abstract'], abstract) - self.assertEqual(attr['units'], units) + resource.write() + self.assertTrue(os.path.exists(f'{datasource_path}.yml')) def test_raster_attributes(self): - """MetadataControl: validate raster with extra attribute metadata.""" - from geometamaker import MetadataControl + """Test adding extra attribute metadata to raster.""" + import geometamaker datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) + numpy_type = numpy.int16 + create_raster(numpy_type, datasource_path) band_number = 1 - mc = MetadataControl(datasource_path) - name = 'name' + resource = geometamaker.describe(datasource_path) title = 'title' - abstract = 'some abstract' + description = 'some abstract' units = 'mm' - mc.set_band_description( + resource.set_band_description( band_number, - name=name, title=title, - abstract=abstract) + description=description) # To demonstrate that properties can be added while preserving others - mc.set_band_description( + resource.set_band_description( band_number, units=units) - try: - mc.validate() - except (MCFValidationError, SchemaError) as e: - self.fail( - 'unexpected validation error occurred\n' - f'{e}') + raster_info = pygeoprocessing.get_raster_info(datasource_path) self.assertEqual( - len(mc.mcf['content_info']['attributes']), - pygeoprocessing.get_raster_info(datasource_path)['n_bands']) - attr = mc.mcf['content_info']['attributes'][band_number - 1] - self.assertEqual(attr['name'], name) - self.assertEqual(attr['title'], title) - self.assertEqual(attr['abstract'], abstract) - self.assertEqual(attr['units'], units) - - def test_set_abstract(self): - """MetadataControl: set and get an abstract.""" - - from geometamaker import MetadataControl - - abstract = 'foo bar' - mc = MetadataControl() - mc.set_abstract(abstract) - self.assertEqual(mc.get_abstract(), abstract) + len(resource.schema.bands), raster_info['n_bands']) + band_idx = band_number - 1 + band = resource.schema.bands[band_idx] + self.assertEqual(band.title, title) + self.assertEqual(band.description, description) + self.assertEqual(band.gdal_type, 
raster_info['datatype']) + self.assertEqual(band.numpy_type, numpy.dtype(numpy_type).name) + self.assertEqual(band.nodata, raster_info['nodata'][band_idx]) + self.assertEqual(band.units, units) + + def test_set_description(self): + """Test set and get a description for a resource.""" + + import geometamaker + + description = 'foo bar' + resource = geometamaker.models.Resource() + resource.set_description(description) + self.assertEqual(resource.get_description(), description) def test_set_citation(self): - """MetadataControl: set and get a citation.""" + """Test set and get a citation for resource.""" - from geometamaker import MetadataControl + import geometamaker citation = 'foo bar' - mc = MetadataControl() - mc.set_citation(citation) - self.assertEqual(mc.get_citation(), citation) + resource = geometamaker.models.Resource() + resource.set_citation(citation) + self.assertEqual(resource.get_citation(), citation) def test_set_contact(self): - """MetadataControl: set and get a contact section.""" + """Test set and get a contact section for a resource.""" - from geometamaker import MetadataControl + import geometamaker org = 'natcap' name = 'nat' position = 'boss' email = 'abc@def' - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - mc.set_contact( - organization=org, individualname=name, - positionname=position, email=email) - contact_dict = mc.get_contact() - self.assertEqual(contact_dict['organization'], org) - self.assertEqual(contact_dict['individualname'], name) - self.assertEqual(contact_dict['positionname'], position) - self.assertEqual(contact_dict['email'], email) - - def test_set_contact_from_dict(self): - """MetadataControl: set a contact section from a dict.""" - - from geometamaker import MetadataControl - - contact_dict = { - 'organization': 'natcap', - 'individualname': 'nat', - 'positionname': 'boss', - 'email': 'abc@def', - 'fax': '555-1234', - 'postalcode': '01234' - } - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - mc.set_contact(**contact_dict) - actual = mc.get_contact() - for k, v in contact_dict.items(): - self.assertEqual(actual[k], v) - def test_set_contact_validates(self): - """MetadataControl: invalid type raises ValidationError.""" - - from geometamaker import MetadataControl - - postalcode = 55555 # should be a string - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - with self.assertRaises(ValidationError): - mc.set_contact(postalcode=postalcode) + resource = geometamaker.models.Resource() + resource.set_contact( + organization=org, individual_name=name, + position_name=position, email=email) + contact = resource.get_contact() + self.assertEqual(contact.organization, org) + self.assertEqual(contact.individual_name, name) + self.assertEqual(contact.position_name, position) + self.assertEqual(contact.email, email) def test_set_doi(self): - """MetadataControl: set and get a doi.""" + """Test set and get a doi.""" - from geometamaker import MetadataControl + import geometamaker doi = '10.foo/bar' - mc = MetadataControl() - mc.set_doi(doi) - self.assertEqual(mc.get_doi(), doi) + resource = geometamaker.models.Resource() + resource.set_doi(doi) + self.assertEqual(resource.get_doi(), doi) def test_set_get_edition(self): - """MetadataControl: set and get 
dataset edition.""" + """Test set and get dataset edition.""" - from geometamaker import MetadataControl + import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + resource = geometamaker.models.Resource() version = '3.14' - mc.set_edition(version) - self.assertEqual(mc.get_edition(), version) - - def test_set_edition_validates(self): - """MetadataControl: test set edition raises ValidationError.""" - - from geometamaker import MetadataControl - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - version = 3.14 # should be a string - with self.assertRaises(ValidationError): - mc.set_edition(version) + resource.set_edition(version) + self.assertEqual(resource.get_edition(), version) def test_set_keywords(self): - """MetadataControl: set keywords to default section.""" + """Test set and get keywords.""" - from geometamaker import MetadataControl + import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - mc.set_keywords(['foo', 'bar']) + resource = geometamaker.models.Resource() + resource.set_keywords(['foo', 'bar']) self.assertEqual( - mc.mcf['identification']['keywords']['default']['keywords'], + resource.get_keywords(), ['foo', 'bar']) - def test_set_keywords_to_section(self): - """MetadataControl: set keywords to named section.""" + def test_set_and_get_license(self): + """Test set and get license for resource.""" + import geometamaker - from geometamaker import MetadataControl + resource = geometamaker.models.Resource() + title = 'CC-BY-4.0' + path = 'https://creativecommons.org/licenses/by/4.0/' - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - mc.set_keywords(['foo', 'bar'], section='first') - mc.set_keywords(['baz'], section='second') + resource.set_license(title=title) self.assertEqual( - mc.mcf['identification']['keywords']['first']['keywords'], - ['foo', 'bar']) - self.assertEqual( - mc.mcf['identification']['keywords']['second']['keywords'], - ['baz']) - - def test_overwrite_keywords(self): - """MetadataControl: overwrite keywords in existing section.""" - - from geometamaker import MetadataControl - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - mc.set_keywords(['foo', 'bar']) - mc.set_keywords(['baz']) + resource.get_license().__dict__, {'title': title, 'path': ''}) + resource.set_license(path=path) self.assertEqual( - mc.mcf['identification']['keywords']['default']['keywords'], - ['baz']) - - def test_keywords_raises_validation_error(self): - """MetadataControl: set keywords validates.""" - from geometamaker import MetadataControl - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - with self.assertRaises(ValidationError): - mc.set_keywords('foo', 'bar') - - def test_set_and_get_license(self): - """MetadataControl: set purpose of dataset.""" - from geometamaker import MetadataControl - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = 
MetadataControl(datasource_path) - name = 'CC-BY-4.0' - url = 'https://creativecommons.org/licenses/by/4.0/' + resource.get_license().__dict__, {'title': '', 'path': path}) - mc.set_license(name=name) + resource.set_license(title=title, path=path) self.assertEqual( - mc.mcf['identification']['accessconstraints'], - 'license') - self.assertEqual(mc.get_license(), {'name': name, 'url': ''}) - - mc.set_license(url=url) - self.assertEqual(mc.get_license(), {'name': '', 'url': url}) + resource.get_license().__dict__, {'title': title, 'path': path}) - mc.set_license(name=name, url=url) - self.assertEqual(mc.get_license(), {'name': name, 'url': url}) - - mc.set_license() - self.assertEqual(mc.get_license(), {'name': '', 'url': ''}) + resource.set_license() self.assertEqual( - mc.mcf['identification']['accessconstraints'], - 'otherRestrictions') - - def test_set_license_validates(self): - """MetadataControl: test set license raises ValidationError.""" - - from geometamaker import MetadataControl - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - name = 4.0 # should be a string - with self.assertRaises(ValidationError): - mc.set_license(name=name) - with self.assertRaises(ValidationError): - mc.set_license(url=name) + resource.get_license().__dict__, {'title': '', 'path': ''}) def test_set_and_get_lineage(self): - """MetadataControl: set lineage of dataset.""" - from geometamaker import MetadataControl + """Test set and get lineage of a resource.""" + import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + resource = geometamaker.models.Resource() statement = 'a lineage statment' - mc.set_lineage(statement) - self.assertEqual(mc.get_lineage(), statement) - - def test_set_lineage_validates(self): - """MetadataControl: test set lineage raises ValidationError.""" - - from geometamaker import MetadataControl - - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) - lineage = ['some statement'] # should be a string - with self.assertRaises(ValidationError): - mc.set_lineage(lineage) + resource.set_lineage(statement) + self.assertEqual(resource.get_lineage(), statement) def test_set_and_get_purpose(self): - """MetadataControl: set purpose of dataset.""" - from geometamaker import MetadataControl + """Test set and get purpose of resource.""" + import geometamaker - datasource_path = os.path.join(self.workspace_dir, 'raster.tif') - create_raster(numpy.int16, datasource_path) - mc = MetadataControl(datasource_path) + resource = geometamaker.models.Resource() purpose = 'foo' - mc.set_purpose(purpose) - self.assertEqual(mc.get_purpose(), purpose) + resource.set_purpose(purpose) + self.assertEqual(resource.get_purpose(), purpose) def test_set_url(self): - """MetadataControl: set and get a url.""" + """Test set and get a url.""" - from geometamaker import MetadataControl + import geometamaker url = 'http://foo/bar' - mc = MetadataControl() - mc.set_url(url) - self.assertEqual(mc.get_url(), url) + resource = geometamaker.models.Resource() + resource.set_url(url) + self.assertEqual(resource.get_url(), url) - def test_preexisting_mc_raster(self): - """MetadataControl: test reading and ammending an existing MCF raster.""" - from geometamaker import MetadataControl + def 
test_preexisting_metadata_document(self):
+        """Test reading and amending an existing metadata document."""
+        import geometamaker

         title = 'Title'
         keyword = 'foo'
         band_name = 'The Band'
         datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
         create_raster(numpy.int16, datasource_path)
-        mc = MetadataControl(datasource_path)
-        mc.set_title(title)
-        mc.set_band_description(1, name=band_name)
-        mc.write()
+        resource = geometamaker.describe(datasource_path)
+        resource.set_title(title)
+        resource.set_band_description(1, title=band_name)
+        resource.write()

-        new_mc = MetadataControl(datasource_path)
-        new_mc.set_keywords([keyword])
+        new_resource = geometamaker.describe(datasource_path)
+        new_resource.set_keywords([keyword])

-        self.assertEqual(new_mc.mcf['metadata']['hierarchylevel'], 'dataset')
         self.assertEqual(
-            new_mc.get_title(), title)
+            new_resource.get_title(), title)
         self.assertEqual(
-            new_mc.get_band_description(1)['name'], band_name)
+            new_resource.get_band_description(1).title, band_name)
         self.assertEqual(
-            new_mc.get_keywords()['keywords'], [keyword])
+            new_resource.get_keywords(), [keyword])

-    def test_preexisting_mc_raster_new_bands(self):
-        """MetadataControl: test existing MCF when the raster has new bands."""
-        from geometamaker import MetadataControl
+    def test_preexisting_doc_new_bands(self):
+        """Test existing metadata document when the raster has new bands."""
+        import geometamaker

         band_name = 'The Band'
         datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
         create_raster(numpy.int16, datasource_path, n_bands=1)
-        mc = MetadataControl(datasource_path)
-        mc.set_band_description(1, name=band_name)
-        self.assertEqual(mc.get_band_description(1)['type'], 'integer')
-        mc.write()
+        resource = geometamaker.describe(datasource_path)
+        resource.set_band_description(1, title=band_name)
+        self.assertEqual(resource.get_band_description(1).numpy_type, 'int16')
+        resource.write()

         # The raster is modified after its original metadata was written
         # There's an extra band, and the datatype has changed
         create_raster(numpy.float32, datasource_path, n_bands=2)
-        new_mc = MetadataControl(datasource_path)
-
-        band1 = new_mc.get_band_description(1)
-        self.assertEqual(band1['name'], band_name)
-        self.assertEqual(band1['type'], 'number')
-        band2 = new_mc.get_band_description(2)
-        self.assertEqual(band2['name'], '')
-        self.assertEqual(band2['type'], 'number')
-
-    def test_preexisting_mc_vector(self):
-        """MetadataControl: test reading and ammending an existing MCF vector."""
-        from geometamaker import MetadataControl
-
-        title = 'Title'
-        datasource_path = os.path.join(self.workspace_dir, 'vector.geojson')
-        field_name = 'foo'
-        description = 'description'
-        field_map = {
-            field_name: list(_OGR_TYPES_VALUES_MAP)[0]}
-        create_vector(datasource_path, field_map)
-        mc = MetadataControl(datasource_path)
-        mc.set_title(title)
-        mc.set_field_description(field_name, abstract=description)
-        mc.write()
-
-        new_mc = MetadataControl(datasource_path)
+        new_resource = geometamaker.describe(datasource_path)

-        self.assertEqual(new_mc.mcf['metadata']['hierarchylevel'], 'dataset')
-        self.assertEqual(
-            new_mc.get_title(), title)
-        self.assertEqual(
-            new_mc.get_field_description(field_name)['abstract'], description)
+        # The datatype changed, so band titles do not carry over
+        band1 = new_resource.get_band_description(1)
+        self.assertEqual(band1.title, '')
+        self.assertEqual(band1.numpy_type, 'float32')
+        band2 = new_resource.get_band_description(2)
+        self.assertEqual(band2.title, '')
+        self.assertEqual(band2.numpy_type, 'float32')

-    def 
test_preexisting_mc_vector_new_fields(self):
-        """MetadataControl: test an existing MCF for vector with new fields."""
-        from geometamaker import MetadataControl
+    def test_preexisting_doc_new_fields(self):
+        """Test an existing metadata document for a vector with new fields."""
+        import geometamaker

         datasource_path = os.path.join(self.workspace_dir, 'vector.geojson')
         field1_name = 'foo'
@@ -668,11 +426,11 @@ def test_preexisting_mc_vector_new_fields(self):
         field_map = {
             field1_name: list(_OGR_TYPES_VALUES_MAP)[0]}
         create_vector(datasource_path, field_map)
-        mc = MetadataControl(datasource_path)
-        mc.set_field_description(field1_name, abstract=description)
+        resource = geometamaker.describe(datasource_path)
+        resource.set_field_description(field1_name, description=description)
         self.assertEqual(
-            mc.get_field_description(field1_name)['type'], 'integer')
-        mc.write()
+            resource.get_field_description(field1_name).type, 'Integer')
+        resource.write()

         # Modify the dataset by changing the field type of the
         # existing field. And add a second field.
@@ -681,62 +439,51 @@ def test_preexisting_mc_vector_new_fields(self):
             field1_name: list(_OGR_TYPES_VALUES_MAP)[2],
             field2_name: list(_OGR_TYPES_VALUES_MAP)[3]}
         create_vector(datasource_path, new_field_map)
-        new_mc = MetadataControl(datasource_path)
+        new_resource = geometamaker.describe(datasource_path)

-        field1 = new_mc.get_field_description(field1_name)
-        self.assertEqual(field1['abstract'], description)
-        self.assertEqual(field1['type'], 'number')
-        field2 = new_mc.get_field_description(field2_name)
-        self.assertEqual(field2['type'], 'string')
+        field1 = new_resource.get_field_description(field1_name)
+        # The field type changed, so the description does not carry over
+        self.assertEqual(field1.description, '')
+        self.assertEqual(field1.type, 'Real')
+        field2 = new_resource.get_field_description(field2_name)
+        self.assertEqual(field2.type, 'String')
+
+    def test_preexisting_incompatible_doc(self):
+        """Test when a yaml file not created by geometamaker already exists."""
+        import geometamaker

-    def test_invalid_preexisting_mcf(self):
-        """MetadataControl: test overwriting an existing invalid MetadataControl."""
-        from geometamaker import MetadataControl
-        title = 'Title'
         datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
+        target_path = f'{datasource_path}.yml'
+        with open(target_path, 'w') as file:
+            file.write(yaml.dump({'foo': 'bar'}))
         create_raster(numpy.int16, datasource_path)
-        mc = MetadataControl(datasource_path)
-        mc.set_title(title)
-
-        # delete a required property and ensure invalid MetadataControl
-        del mc.mcf['mcf']
-        with self.assertRaises(ValidationError):
-            mc.validate()
-        mc.write()  # intentionally writing an invalid MetadataControl
-
-        new_mc = MetadataControl(datasource_path)
-        # The new MetadataControl should not have values from the invalid MetadataControl
-        self.assertEqual(
-            new_mc.mcf['identification']['title'], '')
-
-        try:
-            new_mc.validate()
-        except (MCFValidationError, SchemaError) as e:
-            self.fail(
-                'unexpected validation error occurred\n'
-                f'{e}')
-        try:
-            new_mc.write()
-        except Exception as e:
-            self.fail(
-                'unexpected write error occurred\n'
-                f'{e}')
+        # Describing a dataset that already has an incompatible yaml
+        # sidecar file should log a warning.
+        with self.assertLogs('geometamaker', level='WARNING') as cm:
+            resource = geometamaker.describe(datasource_path)
+        expected_message = 'exists but is not compatible with'
+        self.assertIn(expected_message, ''.join(cm.output))
+
+        # After writing the new doc, check that it has the expected property
+        resource.write()
+        with open(target_path, 'r') as file:
+            yaml_string = file.read()
+            yaml_dict = yaml.safe_load(yaml_string)
+        self.assertIn('metadata_version', yaml_dict)
+        self.assertIn('geometamaker', yaml_dict['metadata_version'])

     def test_write_to_local_workspace(self):
-        """MetadataControl: test write metadata to a different location."""
-        from geometamaker import MetadataControl
+        """Test write metadata to a different location."""
+        import geometamaker

         datasource_path = os.path.join(self.workspace_dir, 'raster.tif')
         create_raster(numpy.int16, datasource_path)
-        mc = MetadataControl(datasource_path)
+        resource = geometamaker.describe(datasource_path)
         temp_dir = tempfile.mkdtemp(dir=self.workspace_dir)
-        mc.write(workspace=temp_dir)
+        resource.write(workspace=temp_dir)

         self.assertTrue(
             os.path.exists(os.path.join(
                 temp_dir, f'{os.path.basename(datasource_path)}.yml')))
-        self.assertTrue(
-            os.path.exists(os.path.join(
-                temp_dir, f'{os.path.basename(datasource_path)}.xml')))
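Taken together, these tests pin down a describe, edit, write, then re-describe round trip. The sketch below walks through that same flow outside the test harness. It is illustrative only: the raster setup uses GDAL directly as a stand-in for the suite's `create_raster` helper (which is not shown here), and it assumes only the `geometamaker.describe` and `Resource.write` behavior asserted in the tests above.

```python
import os
import tempfile

import numpy
from osgeo import gdal
from osgeo import osr

import geometamaker

# Build a tiny single-band int16 raster to describe. This is a
# hypothetical stand-in for the test suite's create_raster helper.
workspace = tempfile.mkdtemp()
raster_path = os.path.join(workspace, 'raster.tif')
driver = gdal.GetDriverByName('GTiff')
dataset = driver.Create(raster_path, 2, 2, 1, gdal.GDT_Int16)
srs = osr.SpatialReference()
srs.ImportFromEPSG(4326)
dataset.SetProjection(srs.ExportToWkt())
dataset.SetGeoTransform([0, 1, 0, 0, 0, -1])
dataset.GetRasterBand(1).WriteArray(
    numpy.zeros((2, 2), dtype=numpy.int16))
dataset = None  # close the dataset to flush it to disk

# Describe the raster, add metadata, and write the yaml sidecar file.
resource = geometamaker.describe(raster_path)
resource.set_title('My Raster')
resource.set_band_description(1, title='elevation', units='m')
resource.write()
assert os.path.exists(f'{raster_path}.yml')

# write() also accepts a workspace, placing the document elsewhere.
other_dir = tempfile.mkdtemp(dir=workspace)
resource.write(workspace=other_dir)
assert os.path.exists(os.path.join(
    other_dir, f'{os.path.basename(raster_path)}.yml'))

# Describing the same file again reads the existing document,
# preserving user-supplied values.
reloaded = geometamaker.describe(raster_path)
assert reloaded.get_title() == 'My Raster'
assert reloaded.get_band_description(1).title == 'elevation'
```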