Skip to content

Commit

Permalink
rules: add custom fields
Browse files Browse the repository at this point in the history
* transform:  move the retrieval of vocabularies
  • Loading branch information
kpsherva committed Sep 30, 2024
1 parent 3ae0784 commit 546f37f
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 86 deletions.
23 changes: 23 additions & 0 deletions cds_migrator_kit/migration_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from invenio_app_rdm.config import CELERY_BEAT_SCHEDULE as APP_RDM_CELERY_BEAT_SCHEDULE
from invenio_app_rdm.config import *
from invenio_i18n import lazy_gettext as _
from invenio_vocabularies.services.custom_fields import VocabularyCF


def _(x): # needed to avoid start time failure with lazy strings
Expand Down Expand Up @@ -351,6 +352,28 @@ def _(x): # needed to avoid start time failure with lazy strings

RDM_PERMISSION_POLICY = CDSRDMRecordPermissionPolicy


# Namespace prefixes used in custom field names ("<prefix>:<field>"),
# each mapped to a URL.
RDM_NAMESPACES = {
    # CERN
    "cern": "https://greybook.cern.ch/",
}

# Custom fields under the "cern" prefix. Each one is a single-valued
# (multiple=False) vocabulary field; the pairs below are
# (custom field name, id of the vocabulary it draws its values from).
RDM_CUSTOM_FIELDS = [
    VocabularyCF(
        name=cf_name,
        vocabulary_id=cf_vocabulary_id,
        dump_options=True,
        multiple=False,
    )
    for cf_name, cf_vocabulary_id in (
        ("cern:experiment", "experiments"),
        ("cern:department", "departments"),
    )
]


base_path = os.path.dirname(os.path.realpath(__file__))
logs_dir = os.path.join(base_path, "tmp/logs/")
CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class CMSSummerStudent(CdsOverdo):
"8564_u", # Files
"8564_x", # Files system field
"8564_y", # Files
"100__9", # #BEARD# tag
"700__9", # #BEARD# tag
"700__m", # Contributors (email)


# TO Implement (to remove from here)
Expand All @@ -69,20 +72,16 @@ class CMSSummerStudent(CdsOverdo):
# "595__z", # SOME RECORDS HAVE UNCL as value, do we keep it?
# "693__a", # accelerator
# "693__b", # value 'H4' in 1 record: 2640381
# "693__e", # experiment
# "693__f", # facility
# "693__p", # project
# "693__s", # study
# "700__0", # Contributors (cds author id) - TBD if we keep
# "700__9", # Contributors (?) - value '#BEARD#' in some records - to ignore
# "700__m", # Contributors (email)
# "700__u", # Contributors (affiliation)
# "700__u", # Contributors (affiliation?)
# "710__g", # Collaboration
"906__p", # name, is it supervisor?
"906__p", # name, is it supervisor? # todo migrate as contributor
# "906__p", # probably supervisor TODO: check
"960__a", # collection id? usually value 12
"963__a", # restriction
"963__a", # restriction # todo assert if any record is restricted -> to implement in collection specific rules
"970__a", # some kind of identifier? "000732636CER"
"980__a", # collection tag
# "980__c",
Expand All @@ -101,9 +100,14 @@ class CMSSummerStudent(CdsOverdo):
# "520__a", # Note (-> description.type = abstract
# "6531_9", # keyword provenance
# "6531_a", # keyword
# "6931_a", # keyword
# "6931_9", # keyword
# "693__e", # custom_fields.cern:experiment # TODO this field is single value, do we have lists?

# "650172", # subject provenance
# "65017a", # subject value
# "700__a", # Contributors (full name)
# "700__u", # Contributors (affiliation)
# "916__n",
# "916__s",
# "916__w",
Expand Down
97 changes: 75 additions & 22 deletions cds_migrator_kit/rdm/migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@

from cds_migrator_kit.rdm.migration.transform.xml_processing.dumper import CDSRecordDump
from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
LossyConversion, RestrictedFileDetected,
LossyConversion, RestrictedFileDetected, UnexpectedValue,
)
from cds_migrator_kit.records.log import RDMJsonLogger
from invenio_access.permissions import system_identity
from invenio_search.engine import dsl
from invenio_records_resources.proxies import current_service_registry

cli_logger = logging.getLogger("migrator")

Expand Down Expand Up @@ -96,23 +99,44 @@ def _communities(self, json_entry):
def _metadata(self, json_entry):
def creators(json):
try:
_creators = json["creators"]
vocab_type = "affiliations"
service = current_service_registry.get(vocab_type)
extra_filter = dsl.Q("term", type__id=vocab_type)
for creator in _creators:
affiliations = creator["affiliations"]
transformed_aff = []
for affiliation_name in affiliations:

title = dsl.Q("match", **{f"title": affiliation_name})
acronym = dsl.Q("match_phrase",
**{f"acronym.keyword": affiliation_name})
title_filter = dsl.query.Bool("should", should=[title, acronym])

vocabulary_result = (service.search(system_identity,
extra_filter=title_filter | extra_filter)
.to_dict())
if vocabulary_result["hits"]["total"]:
transformed_aff.append({
"name": affiliation_name,
"id": vocabulary_result["hits"]["hits"][0]["id"]}
)
else:
raise UnexpectedValue(subfield="u",
value=affiliation_name,
field="author",
message=f"Affiliation {affiliation_name} not found.")
creator["affiliations"] = transformed_aff
return json["creators"]
except KeyError:
return [
{
"person_or_org": {
"given_name": "unknown",
"name": "unknown",
"family_name": "unknown",
"type": "personal",
}
}
]
raise UnexpectedValue(field="creators")

def _resource_type(data):
t = "publication-technicalnote"
st = None
return {"id": f"{t}-{st}"} if st else {"id": t}


return {
"creators": creators(json_entry),
"title": json_entry["title"],
Expand All @@ -121,32 +145,61 @@ def _resource_type(data):
"publication_date": json_entry.get("publication_date"),
}

def _custom_fields(self, json_entry):
    """Build the ``custom_fields`` section of the transformed record.

    Reads ``custom_fields["cern:experiment"]`` from ``json_entry`` and, when
    it is set, looks it up in the ``experiments`` vocabulary through the
    ``vocabularies`` service, replacing the raw value with the id of the
    first hit.

    :param json_entry: dict holding the intermediate record; its optional
        ``custom_fields`` key carries the raw experiment value.
    :returns: ``{}`` when no experiment is set, otherwise
        ``{"cern:experiment": {"id": <vocabulary id>}}``.
    :raises UnexpectedValue: when the search returns no hit for the
        experiment.
    """
    experiment = json_entry.get("custom_fields", {}).get("cern:experiment")
    if not experiment:
        # Nothing to resolve: the caller only adds "custom_fields" to the
        # output when this dict is non-empty.
        return {}

    service = current_service_registry.get("vocabularies")
    vocabulary_result = service.search(
        system_identity, type="experiments", q=f"{experiment}"
    ).to_dict()
    hits = vocabulary_result["hits"]

    if not hits["total"]:
        # The experiment is read from MARC 693 subfield "e", so that is the
        # subfield to report (the original reported "a").
        raise UnexpectedValue(
            subfield="e",
            value=experiment,
            field="experiment",
            message=f"Experiment {experiment} not found.",
        )

    # NOTE(review): ``q`` is a free-text query and the first hit is taken
    # as-is - presumably the best match; confirm it cannot resolve to a
    # different experiment with a similar name.
    return {"cern:experiment": {"id": hits["hits"][0]["id"]}}
def transform(self, entry):
"""Transform a record single entry."""
record_dump = CDSRecordDump(
entry,
)
migration_logger = RDMJsonLogger()
migration_logger.add_recid_to_stats(entry["recid"])
try:
migration_logger = RDMJsonLogger()
migration_logger.add_recid_to_stats(entry["recid"])

record_dump.prepare_revisions()
timestamp, json_data = record_dump.revisions[-1]
migration_logger.add_record(json_data)
json_output = {
"created": self._created(json_data),
"updated": self._updated(record_dump),
"pids": self._pids(json_data),
"files": self._files(record_dump),
"metadata": self._metadata(json_data),
"access": self._access(json_data, record_dump),
}
custom_fields = self._custom_fields(json_data)
if custom_fields:
json_output.update({"custom_fields": custom_fields})
return {
"created": self._created(json_data),
"updated": self._updated(record_dump),
"version_id": self._version_id(record_dump),
"index": self._index(record_dump),
"recid": self._recid(record_dump),
# "communities": self._communities(json_data),
"json": {
"created": self._created(json_data),
"updated": self._updated(record_dump),
"pids": self._pids(json_data),
"files": self._files(record_dump),
"metadata": self._metadata(json_data),
"access": self._access(json_data, record_dump),
},
"json": json_output
}
except LossyConversion as e:
cli_logger.error("[DATA ERROR]: {0}".format(e.message))
Expand Down Expand Up @@ -242,7 +295,7 @@ def _draft_files(self, entry):
{
file["full_name"]: {
"eos_tmp_path": tmp_eos_root
/ full_path.relative_to(legacy_path_root),
/ full_path.relative_to(legacy_path_root),
"key": file["full_name"],
"metadata": {},
"mimetype": file["mime"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,15 @@
import re

from dojson.utils import force_list
from invenio_vocabularies.records.models import VocabularyType

from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
UnexpectedValue,
)
from invenio_records_resources.proxies import current_service_registry
from invenio_access.permissions import system_identity

from cds_migrator_kit.rdm.migration.transform.xml_processing.quality.parsers import \
StringValue
from cds_migrator_kit.rdm.migration.transform.xml_processing.quality.regex import \
ALPHANUMERIC_ONLY
from invenio_search.engine import dsl


# "contributors": {
Expand Down Expand Up @@ -139,36 +135,12 @@ def get_contributor_affiliations(info):
u = info.get("u", "")
if not u:
return
if "CERN" not in u:
print(u, "-------------------------------")
affiliations = force_list(u)
vocab_type = "affiliations"
service = current_service_registry.get(vocab_type)

parsed_affiliations = [StringValue(aff, str).parse().filter_regex(ALPHANUMERIC_ONLY)
for
aff in affiliations]
vocabulary_type = VocabularyType.query.filter_by(id=vocab_type).one()
extra_filter = dsl.Q("term", type__id=vocabulary_type.id)
for affiliation_name in parsed_affiliations:

title = dsl.Q("match", **{f"title": affiliation_name})
acronym = dsl.Q("match_phrase", **{f"acronym.keyword": affiliation_name})
title_filter = dsl.query.Bool("should", should=[title, acronym])

vocabulary_result = (service.search(system_identity,
extra_filter=title_filter | extra_filter)
.to_dict())
if vocabulary_result["hits"]["total"]:
aff_results.append({"name": affiliation_name,
"id": vocabulary_result["hits"]["hits"][0]["id"]})
else:
raise UnexpectedValue(subfield="u",
value=affiliation_name,
field="author",
message=f"Affiliation {affiliation_name} not found.")

return aff_results

parsed_affiliations = \
[StringValue(aff, str).parse().filter_regex(ALPHANUMERIC_ONLY) for
aff in affiliations]
return parsed_affiliations


def extract_json_contributor_ids(info):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,39 +206,28 @@ def subjects(self, key, value):
"""Translates subjects fields."""
_subjects = self.get("subjects", [])
subject_value = value.get("a")
subject_scheme = value.get("2")

if key == "6931_":
subject_e = value.get("e")

if subject_value:
obj = {"subject": subject_value}
if obj not in _subjects:
_subjects.append(obj)
if subject_e:
obj = {"subject": subject_e}
if obj not in _subjects:
_subjects.append(obj)
subject_scheme = value.get("2") or value.get("9")

if subject_scheme and subject_scheme.lower() != "szgecern":
raise UnexpectedValue(field=key, subfield="2")
if key == "65017":
if key == "65017" or key == "6531_":
if subject_value:
subject = {
"id": subject_value,
"subject": subject_value,
"scheme": "CERN",
}
_subjects.append(subject)
return _subjects

if key == "6531_":
subject_scheme = value.get("9")
if subject_value:
subject = {
"id": subject_value,
"subject": subject_value,
"scheme": subject_scheme,
}
_subjects.append(subject)

return _subjects
@model.over("custom_fields", "(^693__)")
def custom_fields(self, key, value):
    """Translate MARC field 693 into the record's ``custom_fields`` dict.

    Subfield ``e`` of a ``693__`` field is stored under the
    ``cern:experiment`` key of the ``custom_fields`` dict already present on
    the record (a new dict is started when there is none yet).

    :param key: the MARC tag plus indicators that matched this rule.
    :param value: the subfields of the matched field.
    :returns: the ``custom_fields`` dict, updated when an experiment was
        found.
    """
    _custom_fields = self.get("custom_fields", {})

    if key == "693__":
        experiment = value.get("e")
        # Store real values only: a 693 field without subfield "e" must not
        # write None, nor wipe an experiment that is already on the record.
        if experiment:
            _custom_fields["cern:experiment"] = experiment
    return _custom_fields
2 changes: 1 addition & 1 deletion cds_migrator_kit/records/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def resolve_error_type(self, exc, output, key, value):
elif isinstance(exc, UnexpectedValue):
rec_stats["unexpected_value"].append(
dict(
key=exc.__name__,
key=exc.__class__.__name__,
value=value,
subfield=exc.subfield,
message=str(exc.message),
Expand Down

0 comments on commit 546f37f

Please sign in to comment.