Skip to content

Commit

Permalink
YDA-5801: copy user metadata from research to vault
Browse files Browse the repository at this point in the history
  • Loading branch information
lwesterhof committed Aug 23, 2024
1 parent 8673e05 commit f847f34
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 40 deletions.
67 changes: 39 additions & 28 deletions meta.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
"""JSON metadata handling."""

__copyright__ = 'Copyright (c) 2019-2023, Utrecht University'
__copyright__ = 'Copyright (c) 2019-2024, Utrecht University'
__license__ = 'GPLv3, see LICENSE'

import json
Expand All @@ -26,8 +26,7 @@
'rule_meta_modified_post',
'rule_meta_datamanager_vault_ingest',
'rule_meta_collection_has_cloneable_metadata',
'rule_get_latest_vault_metadata_path',
'rule_copy_user_metadata']
'rule_get_latest_vault_metadata_path']


def metadata_get_links(metadata):
Expand Down Expand Up @@ -326,9 +325,6 @@ def ingest_metadata_research(ctx, path):
log.write(ctx, 'ingest_metadata_research failed: {} is invalid'.format(path))
return

# Remove any remaining legacy XML-style AVUs.
ctx.iiRemoveAVUs(coll, constants.UUUSERMETADATAPREFIX)

# Note: We do not set a $id in research space: this would trigger jsonavu
# validation, which does not respect our wish to ignore required
# properties in the research area.
Expand Down Expand Up @@ -486,9 +482,6 @@ def ingest_metadata_vault(ctx, path):
if config.enable_open_search and group.exists(ctx, coll.split("/")[3].replace("vault-", "deposit-", 1)):
update_index_metadata(ctx, coll + "/index", metadata, creation_time, data_package)

# Remove any remaining legacy XML-style AVUs.
ctx.iiRemoveAVUs(coll, constants.UUUSERMETADATAPREFIX)

# Replace all metadata under this namespace.
avu_json.set_json_to_obj(ctx, coll, '-C',
constants.UUUSERMETADATAROOT,
Expand Down Expand Up @@ -682,31 +675,49 @@ def set_result(msg_short, msg_long):
set_result('Success', '')


@rule.make()
def rule_copy_user_metadata(ctx, source, target):
    """Rule entry point that delegates to copy_user_metadata.

    :param ctx:    Combined type of a callback and rei struct
    :param source: Path of source collection.
    :param target: Path of target collection.
    """
    copy_user_metadata(ctx, source, target)


def copy_user_metadata(ctx, source, target):
    """
    Copy the user metadata (AVUs) of a collection to another collection.

    All AVUs found inside the source collection (recursively, as returned by
    avu.inside_coll) are grouped per entity, system metadata is filtered out,
    and the remaining AVUs are added to the matching entity below
    ``<target>/original`` using one set of atomic operations per entity.

    Errors are logged and never propagated to the caller.

    :param ctx:    Combined type of a callback and rei struct
    :param source: Path of source collection.
    :param target: Path of target collection.
    """
    try:
        # Group AVUs by entity and filter system metadata.
        grouped_user_metadata = {}
        for path, entity_type, attribute, value, unit in avu.inside_coll(ctx, source, recursive=True):
            is_system_metadata = (attribute.startswith(constants.UUORGMETADATAPREFIX)
                                  or unit == constants.UUFLATINDEX
                                  or unit.startswith(constants.UUUSERMETADATAROOT + '_'))
            if is_system_metadata:
                continue

            entity = grouped_user_metadata.setdefault(path, {"type": entity_type, "avus": []})
            entity["avus"].append((attribute, value, unit))

        # Generate and apply the metadata operations, one entity at a time.
        all_copied = True
        for path, item in grouped_user_metadata.items():
            operations = {
                # Only the leading source prefix is rewritten (count=1).
                "entity_name": path.replace(source, target + "/original", 1),
                "entity_type": item["type"],
                "operations": [
                    {
                        "operation": "add",
                        "attribute": attribute,
                        "value": value,
                        "units": unit
                    }
                    for attribute, value, unit in item["avus"]
                ]
            }

            if not avu.apply_atomic_operations(ctx, operations):
                all_copied = False
                log.write(ctx, "copy_user_metadata: failed to copy user metadata for <{}>".format(path))

        # Do not claim overall success when an entity could not be updated.
        if all_copied:
            log.write(ctx, "copy_user_metadata: copied user metadata from <{}> to <{}>".format(source, target))
    except Exception:
        # Best effort: a failing metadata copy is logged, not raised.
        log.write(ctx, "copy_user_metadata: failed to copy user metadata from <{}> to <{}>".format(source, target))
62 changes: 57 additions & 5 deletions util/avu.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,70 @@
Avu.unit = Avu.u


def of_data(ctx, path):
    """Return an iterator over the (a,v,u) triplets of a data object.

    :param ctx:  Combined type of a callback and rei struct
    :param path: Path of the data object; split with pathutil.chop to fill in
                 the collection name and data object name of the query
    """
    condition = "COLL_NAME = '{}' AND DATA_NAME = '{}'".format(*pathutil.chop(path))
    rows = genquery.Query(ctx,
                          "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, META_DATA_ATTR_UNITS",
                          condition)
    return itertools.imap(lambda row: Avu(*row), rows)


def of_coll(ctx, coll):
    """Return an iterator over the (a,v,u) triplets of a collection.

    :param ctx:  Combined type of a callback and rei struct
    :param coll: Path of the collection
    """
    condition = "COLL_NAME = '{}'".format(coll)
    rows = genquery.Query(ctx,
                          "META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE, META_COLL_ATTR_UNITS",
                          condition)
    return itertools.imap(lambda row: Avu(*row), rows)


def of_data(ctx, path):
    """Return an iterator over the (a,v,u) triplets of a data object.

    :param ctx:  Combined type of a callback and rei struct
    :param path: Path of the data object; split with pathutil.chop to fill in
                 the collection name and data object name of the query
    """
    condition = "COLL_NAME = '{}' AND DATA_NAME = '{}'".format(*pathutil.chop(path))
    rows = genquery.Query(ctx,
                          "META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, META_DATA_ATTR_UNITS",
                          condition)
    return itertools.imap(lambda row: Avu(*row), rows)
def inside_coll(ctx, path, recursive=False):
    """Get all AVUs of the collections and data objects inside a collection.

    Each result is a tuple ``(path, type, attribute, value, unit)`` in which
    ``type`` is either "collection" or "data_object" and ``path`` is the
    absolute path of the entity (e.g. '/tempZone/home/x').

    Only entities below ``path`` are queried; AVUs set on the collection
    ``path`` itself are not part of the result.

    Note: the result is a chained iterator rather than a list.
          Use list(...) on the result to get an actual list if necessary.

    :param ctx:       Combined type of a callback and rei struct
    :param path:      Path of collection
    :param recursive: List AVUs recursively

    :returns: Iterator over all AVUs inside a collection with corresponding paths
    """
    coll_columns = "COLL_PARENT_NAME, COLL_NAME, META_COLL_ATTR_NAME, META_COLL_ATTR_VALUE, META_COLL_ATTR_UNITS"
    data_columns = "COLL_NAME, DATA_NAME, META_DATA_ATTR_NAME, META_DATA_ATTR_VALUE, META_DATA_ATTR_UNITS"

    def coll_entry(row):
        # The second column (COLL_NAME) is used as the path unchanged.
        return (row[1], "collection", row[2], row[3], row[4])

    def data_entry(row):
        # coll+name -> path
        return ('{}/{}'.format(row[0], row[1]), "data_object", row[2], row[3], row[4])

    def select(columns, condition, convert):
        rows = genquery.row_iterator(columns, condition, genquery.AS_LIST, ctx)
        return itertools.imap(convert, rows)

    # Entities directly below the given collection.
    sources = [
        select(coll_columns, "COLL_PARENT_NAME = '{}'".format(path), coll_entry),
        select(data_columns, "COLL_NAME = '{}'".format(path), data_entry),
    ]

    # Entities at deeper levels, only when requested.
    if recursive:
        sources.append(select(coll_columns, "COLL_PARENT_NAME like '{}/%'".format(path), coll_entry))
        sources.append(select(data_columns, "COLL_NAME like '{}/%'".format(path), data_entry))

    return itertools.chain(*sources)


def of_group(ctx, group):
Expand Down
8 changes: 2 additions & 6 deletions util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
IIVAULTPREFIX = "vault-"

UUORGMETADATAPREFIX = 'org_'
"""Prefix for organisational metadata."""

UUSYSTEMCOLLECTION = '/yoda'

UUREVISIONCOLLECTION = UUSYSTEMCOLLECTION + '/revisions'
Expand Down Expand Up @@ -60,15 +62,9 @@
"""The maximum file size that can be read into a string in memory, to prevent
DOSing / out of control memory consumption."""

UUUSERMETADATAPREFIX = 'usr_'
"""Prefix of user metadata (applied via legacy XML metadata file changes)."""

UUUSERMETADATAROOT = 'usr'
"""JSONAVU JSON root / namespace of user metadata (applied via JSON metadata file changes)."""

UUORGMETADATAPREFIX = 'org_'
"""Prefix for organisational metadata."""

UUFLATINDEX = 'FlatIndex'
"""Flat unstructured index fields."""

Expand Down
2 changes: 1 addition & 1 deletion vault.py
Original file line number Diff line number Diff line change
Expand Up @@ -1550,7 +1550,7 @@ def get_title(ctx, path):
"""
# The first placeholder is the collection path, the second the JSONAVU user
# metadata root that prefixes the unit (e.g. '<root>_0_s'); the arguments
# must be passed in that order.
iter = genquery.row_iterator(
    "META_COLL_ATTR_VALUE",
    "COLL_NAME = '{}' AND META_COLL_ATTR_NAME = 'Title' AND META_COLL_ATTR_UNITS = '{}_0_s'".format(path, constants.UUUSERMETADATAROOT),
    genquery.AS_LIST, ctx
)

Expand Down

0 comments on commit f847f34

Please sign in to comment.