rules: add custom fields

* transform: move the retrieval of vocabularies
CERNDocumentServer · Sep 30, 2024 · 546f37f · 546f37f
1 parent 3ae0784
commit 546f37f
Show file tree

Hide file tree

Showing 6 changed files with 127 additions and 86 deletions.
diff --git a/cds_migrator_kit/migration_config.py b/cds_migrator_kit/migration_config.py
@@ -18,6 +18,7 @@
 from invenio_app_rdm.config import CELERY_BEAT_SCHEDULE as APP_RDM_CELERY_BEAT_SCHEDULE
 from invenio_app_rdm.config import *
 from invenio_i18n import lazy_gettext as _
+from invenio_vocabularies.services.custom_fields import VocabularyCF
 
 
 def _(x):  # needed to avoid start time failure with lazy strings
@@ -351,6 +352,28 @@ def _(x):  # needed to avoid start time failure with lazy strings
 
 RDM_PERMISSION_POLICY = CDSRDMRecordPermissionPolicy
 
+
+RDM_NAMESPACES = {
+    # CERN: prefix used by the "cern:*" custom field names declared below
+    "cern": "https://greybook.cern.ch/",
+}
+# Vocabulary-backed custom fields; both are declared single-valued (multiple=False).
+RDM_CUSTOM_FIELDS = [
+    VocabularyCF(
+        name="cern:experiment",
+        vocabulary_id="experiments",  # id of the vocabulary the value must come from
+        dump_options=True,
+        multiple=False,
+    ),
+    VocabularyCF(
+        name="cern:department",
+        vocabulary_id="departments",  # id of the vocabulary the value must come from
+        dump_options=True,
+        multiple=False,
+    ),
+]
+
+
 base_path = os.path.dirname(os.path.realpath(__file__))
 logs_dir = os.path.join(base_path, "tmp/logs/")
 CDS_MIGRATOR_KIT_LOGS_PATH = logs_dir

diff --git a/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py b/cds_migrator_kit/rdm/migration/transform/models/summer_student_report.py
@@ -45,6 +45,9 @@ class CMSSummerStudent(CdsOverdo):
         "8564_u",  # Files
         "8564_x",  # Files system field
         "8564_y",  # Files
+        "100__9",  # #BEARD# tag
+        "700__9",  # #BEARD# tag
+        "700__m",  # Contributors (email)
 
 
         # TO Implement (to remove from here)
@@ -69,20 +72,16 @@ class CMSSummerStudent(CdsOverdo):
         # "595__z",  # SOME RECORD HAVE UNCL as value, do we keep it?
         # "693__a", # accelerator
         # "693__b", # value 'H4' in 1 record: 2640381
-        # "693__e", # experiment
         # "693__f", # facility
         # "693__p",  # project
         # "693__s",  # study
         # "700__0",  # Contributors (cds author id) - TBD if we keep
-        # "700__9",  # Contributors (?) - value '#BEARD#' in some records - to ignore
-        # "700__m",  # Contributors (email)
         # "700__u",  # Contributors (affiliation)
-        # "700__u",  # Contributors (affiliation?)
         # "710__g",  # Collaboration
-        "906__p",  # name, is it supervisor?
+        "906__p",  # name, is it supervisor? TODO: migrate as contributor
         # "906__p", # probably supervisor TODO: check
         "960__a",  # collection id? usually value 12
-        "963__a",  # restriction
+        "963__a",  # restriction; TODO: assert whether any record is restricted -> implement in collection-specific rules
         "970__a",  # some kind of identifier? "000732636CER"
         "980__a",  # collection tag
         # "980__c",
@@ -101,9 +100,14 @@ class CMSSummerStudent(CdsOverdo):
         # "520__a",  # Note (-> description.type = abstract
         # "6531_9",  # keyword provenance
         # "6531_a",  # keyword
+        # "6931_a",  # keyword
+        # "6931_9",  # keyword
+        # "693__e",  # custom_fields.cern:experiment  # TODO this field is single value, do we have lists?
+
         # "650172",  # subject provenance
         # "65017a",  # subject value
         # "700__a",  # Contributors (full name)
+        # "700__u",  # Contributors (affiliation)
         # "916__n",
         # "916__s",
         # "916__w",

diff --git a/cds_migrator_kit/rdm/migration/transform/transform.py b/cds_migrator_kit/rdm/migration/transform/transform.py
@@ -20,9 +20,12 @@
 
 from cds_migrator_kit.rdm.migration.transform.xml_processing.dumper import CDSRecordDump
 from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
-    LossyConversion, RestrictedFileDetected,
+    LossyConversion, RestrictedFileDetected, UnexpectedValue,
 )
 from cds_migrator_kit.records.log import RDMJsonLogger
+from invenio_access.permissions import system_identity
+from invenio_search.engine import dsl
+from invenio_records_resources.proxies import current_service_registry
 
 cli_logger = logging.getLogger("migrator")
 
@@ -96,23 +99,44 @@ def _communities(self, json_entry):
     def _metadata(self, json_entry):
         def creators(json):
             try:
+                _creators = json["creators"]  # a missing key is handled by the except below
+                vocab_type = "affiliations"
+                service = current_service_registry.get(vocab_type)
+                extra_filter = dsl.Q("term", type__id=vocab_type)
+                for creator in _creators:
+                    affiliations = creator["affiliations"]  # NOTE(review): a creator without this key also lands in the KeyError handler
+                    transformed_aff = []
+                    for affiliation_name in affiliations:
+                        # look the legacy affiliation name up by title or by acronym
+                        title = dsl.Q("match", **{f"title": affiliation_name})
+                        acronym = dsl.Q("match_phrase",
+                                        **{f"acronym.keyword": affiliation_name})
+                        title_filter = dsl.query.Bool("should", should=[title, acronym])
+                        # NOTE(review): "|" ORs the name query with the type filter - confirm "&" is not intended
+                        vocabulary_result = (service.search(system_identity,
+                                                            extra_filter=title_filter | extra_filter)
+                                             .to_dict())
+                        if vocabulary_result["hits"]["total"]:
+                            transformed_aff.append({
+                                "name": affiliation_name,
+                                "id": vocabulary_result["hits"]["hits"][0]["id"]}  # first hit is used
+                            )
+                        else:
+                            raise UnexpectedValue(subfield="u",
+                                                  value=affiliation_name,
+                                                  field="author",
+                                                  message=f"Affiliation {affiliation_name} not found.")
+                    creator["affiliations"] = transformed_aff  # replaces the names with {name, id} dicts in place
                 return json["creators"]
             except KeyError:
-                return [
-                    {
-                        "person_or_org": {
-                            "given_name": "unknown",
-                            "name": "unknown",
-                            "family_name": "unknown",
-                            "type": "personal",
-                        }
-                    }
-                ]
+                raise UnexpectedValue(field="creators")  # any KeyError raised in the try body is reported as a creators problem
 
         def _resource_type(data):
             t = "publication-technicalnote"
             st = None
             return {"id": f"{t}-{st}"} if st else {"id": t}
+
+
         return {
             "creators": creators(json_entry),
             "title": json_entry["title"],
@@ -121,32 +145,61 @@ def _resource_type(data):
             "publication_date": json_entry.get("publication_date"),
         }
 
+    def _custom_fields(self, json_entry):
+        """Resolve the record's custom fields against their vocabularies.
+
+        Returns a dict (empty when no experiment is set) and raises
+        UnexpectedValue when the experiment is not in the vocabulary.
+        """
+        experiment = json_entry.get("custom_fields", {}).get("cern:experiment")
+        if not experiment:
+            # Always hand back a dict, never an implicit None.
+            return {}
+
+        service = current_service_registry.get("vocabularies")
+        vocabulary_result = service.search(
+            system_identity, type="experiments", q=f"{experiment}"
+        ).to_dict()
+        if not vocabulary_result["hits"]["total"]:
+            # The experiment is read from MARC 693__e, so report subfield "e".
+            raise UnexpectedValue(subfield="e",
+                                  value=experiment,
+                                  field="experiment",
+                                  message=f"Experiment {experiment} not found.")
+        # The first search hit is used as the match for the legacy value.
+        return {"cern:experiment": {"id": vocabulary_result["hits"]["hits"][0]["id"]}}
+
     def transform(self, entry):
         """Transform a record single entry."""
         record_dump = CDSRecordDump(
             entry,
         )
+        migration_logger = RDMJsonLogger()
+        migration_logger.add_recid_to_stats(entry["recid"])
         try:
-            migration_logger = RDMJsonLogger()
-            migration_logger.add_recid_to_stats(entry["recid"])
+
             record_dump.prepare_revisions()
             timestamp, json_data = record_dump.revisions[-1]
             migration_logger.add_record(json_data)
+            json_output = {
+                "created": self._created(json_data),
+                "updated": self._updated(record_dump),
+                "pids": self._pids(json_data),
+                "files": self._files(record_dump),
+                "metadata": self._metadata(json_data),
+                "access": self._access(json_data, record_dump),
+            }
+            custom_fields = self._custom_fields(json_data)
+            if custom_fields:
+                json_output.update({"custom_fields": custom_fields})
             return {
                 "created": self._created(json_data),
                 "updated": self._updated(record_dump),
                 "version_id": self._version_id(record_dump),
                 "index": self._index(record_dump),
                 "recid": self._recid(record_dump),
                 # "communities": self._communities(json_data),
-                "json": {
-                    "created": self._created(json_data),
-                    "updated": self._updated(record_dump),
-                    "pids": self._pids(json_data),
-                    "files": self._files(record_dump),
-                    "metadata": self._metadata(json_data),
-                    "access": self._access(json_data, record_dump),
-                },
+                "json": json_output
             }
         except LossyConversion as e:
             cli_logger.error("[DATA ERROR]: {0}".format(e.message))
@@ -242,7 +295,7 @@ def _draft_files(self, entry):
                 {
                     file["full_name"]: {
                         "eos_tmp_path": tmp_eos_root
-                        / full_path.relative_to(legacy_path_root),
+                                        / full_path.relative_to(legacy_path_root),
                         "key": file["full_name"],
                         "metadata": {},
                         "mimetype": file["mime"],

diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/quality/contributors.py
@@ -10,19 +10,15 @@
 import re
 
 from dojson.utils import force_list
-from invenio_vocabularies.records.models import VocabularyType
 
 from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
     UnexpectedValue,
 )
-from invenio_records_resources.proxies import current_service_registry
-from invenio_access.permissions import system_identity
 
 from cds_migrator_kit.rdm.migration.transform.xml_processing.quality.parsers import \
     StringValue
 from cds_migrator_kit.rdm.migration.transform.xml_processing.quality.regex import \
     ALPHANUMERIC_ONLY
-from invenio_search.engine import dsl
 
 
 # "contributors": {
@@ -139,36 +135,12 @@ def get_contributor_affiliations(info):
     u = info.get("u", "")
     if not u:
         return
-    if "CERN" not in u:
-        print(u, "-------------------------------")
     affiliations = force_list(u)
-    vocab_type = "affiliations"
-    service = current_service_registry.get(vocab_type)
-
-    parsed_affiliations = [StringValue(aff, str).parse().filter_regex(ALPHANUMERIC_ONLY)
-                           for
-                           aff in affiliations]
-    vocabulary_type = VocabularyType.query.filter_by(id=vocab_type).one()
-    extra_filter = dsl.Q("term", type__id=vocabulary_type.id)
-    for affiliation_name in parsed_affiliations:
-
-        title = dsl.Q("match", **{f"title": affiliation_name})
-        acronym = dsl.Q("match_phrase", **{f"acronym.keyword": affiliation_name})
-        title_filter = dsl.query.Bool("should", should=[title, acronym])
-
-        vocabulary_result = (service.search(system_identity,
-                                            extra_filter=title_filter | extra_filter)
-                             .to_dict())
-        if vocabulary_result["hits"]["total"]:
-            aff_results.append({"name": affiliation_name,
-                                "id": vocabulary_result["hits"]["hits"][0]["id"]})
-        else:
-            raise UnexpectedValue(subfield="u",
-                                  value=affiliation_name,
-                                  field="author",
-                                  message=f"Affiliation {affiliation_name} not found.")
-
-    return aff_results
+
+    parsed_affiliations = \
+        [StringValue(aff, str).parse().filter_regex(ALPHANUMERIC_ONLY) for
+         aff in affiliations]
+    return parsed_affiliations
 
 
 def extract_json_contributor_ids(info):

diff --git a/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/migration/transform/xml_processing/rules/base.py
@@ -206,39 +206,28 @@ def subjects(self, key, value):
     """Translates subjects fields."""
     _subjects = self.get("subjects", [])
     subject_value = value.get("a")
-    subject_scheme = value.get("2")
-
-    if key == "6931_":
-        subject_e = value.get("e")
-
-        if subject_value:
-            obj = {"subject": subject_value}
-            if obj not in _subjects:
-                _subjects.append(obj)
-        if subject_e:
-            obj = {"subject": subject_e}
-            if obj not in _subjects:
-                _subjects.append(obj)
+    subject_scheme = value.get("2") or value.get("9")
 
     if subject_scheme and subject_scheme.lower() != "szgecern":
         raise UnexpectedValue(field=key, subfield="2")
-    if key == "65017":
+    if key == "65017" or key == "6531_":
         if subject_value:
             subject = {
                 "id": subject_value,
                 "subject": subject_value,
                 "scheme": "CERN",
             }
             _subjects.append(subject)
+    return _subjects
 
-    if key == "6531_":
-        subject_scheme = value.get("9")
-        if subject_value:
-            subject = {
-                "id": subject_value,
-                "subject": subject_value,
-                "scheme": subject_scheme,
-            }
-            _subjects.append(subject)
 
-    return _subjects
+@model.over("custom_fields", "(^693__)")
+def custom_fields(self, key, value):
+    """Translate MARC 693 into custom fields (693__e -> cern:experiment)."""
+    _custom_fields = self.get("custom_fields", {})
+
+    if key == "693__":
+        experiment = value.get("e")
+        if experiment:  # keep an earlier value when this 693 has no $e
+            _custom_fields["cern:experiment"] = experiment
+    return _custom_fields
diff --git a/cds_migrator_kit/records/log.py b/cds_migrator_kit/records/log.py
@@ -145,7 +145,7 @@ def resolve_error_type(self, exc, output, key, value):
         elif isinstance(exc, UnexpectedValue):
             rec_stats["unexpected_value"].append(
                 dict(
-                    key=exc.__name__,
+                    key=exc.__class__.__name__,
                     value=value,
                     subfield=exc.subfield,
                     message=str(exc.message),