Skip to content

Commit

Permalink
errors: log load errors
Browse files Browse the repository at this point in the history
  • Loading branch information
kpsherva committed Jul 26, 2024
1 parent daaaac0 commit 71ab28e
Show file tree
Hide file tree
Showing 9 changed files with 209 additions and 97 deletions.
1 change: 0 additions & 1 deletion cds_migrator_kit/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from invenio_app.config import APP_DEFAULT_SECURE_HEADERS


def _(x):
"""Identity function used to trigger string extraction."""
return x
Expand Down
5 changes: 4 additions & 1 deletion cds_migrator_kit/rdm/migration/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM command line module."""

import logging
from pathlib import Path

import click
Expand All @@ -15,7 +15,10 @@

from cds_migrator_kit.rdm.migration.runner import Runner
from cds_migrator_kit.rdm.migration.streams import RecordStreamDefinition
from cds_migrator_kit.records.log import RDMJsonLogger

cli_logger = logging.getLogger("migrator")
migration_logger = RDMJsonLogger()

@click.group()
def migration():
Expand Down
66 changes: 42 additions & 24 deletions cds_migrator_kit/rdm/migration/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,19 @@
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration load module."""

import logging
import os

from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_rdm_migrator.load.base import Load
from invenio_rdm_records.proxies import current_rdm_records_service

from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import \
ManualImportRequired

cli_logger = logging.getLogger("migrator")


def import_legacy_files(filepath):
"""Download file from legacy."""
Expand All @@ -37,34 +43,46 @@ def _prepare(self, entry):

def _load(self, entry):
    """Use the services to load one transformed entry into RDM.

    Creates a draft from ``entry["record"]["json"]``, copies the parent
    access settings, uploads every draft file, and finally publishes the
    draft. File-level failures are logged as ``ManualImportRequired`` and
    do not abort the record.
    """
    # Imported lazily to avoid a circular import with the CLI module.
    from cds_migrator_kit.rdm.migration.cli import migration_logger

    recid = entry.get("record", {}).get("json", {}).get("id")
    migration_logger.add_recid_to_stats(recid)
    identity = system_identity  # Should we create an identity for the migration?
    draft = current_rdm_records_service.create(identity, entry["record"]["json"])

    # Propagate the legacy access settings onto the parent record.
    parent = draft._record.parent
    access = entry["parent"]["json"]["access"]
    parent.access = access
    parent.commit()
    db.session.commit()

    draft_files = entry["draft_files"]
    for file_entry in draft_files:
        try:
            current_rdm_records_service.draft_files.init_files(
                identity,
                draft.id,
                data=[
                    {
                        "key": file_entry["key"],
                        "metadata": file_entry["metadata"],
                        "access": {"hidden": False},
                    }
                ],
            )
            current_rdm_records_service.draft_files.set_file_content(
                identity,
                draft.id,
                file_entry["key"],
                import_legacy_files(file_entry["eos_tmp_path"]),
            )
            result = current_rdm_records_service.draft_files.commit_file(
                identity, draft.id, file_entry["key"]
            )
            # Verify the upload by comparing the legacy md5 with the one
            # computed by the files service. Raise explicitly instead of
            # using `assert`, which is stripped when Python runs with -O.
            legacy_checksum = f"md5:{file_entry['checksum']}"
            new_checksum = result.to_dict()["checksum"]
            if legacy_checksum != new_checksum:
                raise ManualImportRequired(
                    message=f"Checksum mismatch for {file_entry['key']}: "
                    f"legacy {legacy_checksum} != new {new_checksum}"
                )
        except ManualImportRequired as exc:
            # Already carries a meaningful message; log as-is.
            migration_logger.add_log(exc, output=entry)
        except Exception as e:
            # Wrap any other failure so it lands in the manual-import log.
            exc = ManualImportRequired(message=str(e))
            migration_logger.add_log(exc, output=entry)
    # NOTE(review): the record is published even when some files failed to
    # load above — confirm this partial-publish behaviour is intended.
    current_rdm_records_service.publish(system_identity, draft["id"])

def _cleanup(self, *args, **kwargs):
Expand Down
5 changes: 3 additions & 2 deletions cds_migrator_kit/rdm/migration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,10 @@ def run(self):
# on successful stream run, persist state
# STATE.flush_cache()
# self.state.save(filename=f"{stream.name}.db")
from cds_migrator_kit.rdm.migration.transform.transform import logger
from cds_migrator_kit.rdm.migration.cli import migration_logger, \
cli_logger

logger.save()
migration_logger.save()
except Exception:
Logger.get_logger().exception(
f"Stream {stream.name} failed.", exc_info=1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,66 +32,58 @@ class CMSSummerStudent(CdsOverdo):
__query__ = "980__:NOTE 037__:CERN-STUDENTS-Note-*"

__ignore_keys__ = {
"65017a",
"937__c",
"960__a",
"269__b",
"6531_a",
"906__p",
"269__a",
#"0247_2", # DOI, summer student notes do not have it
#"0247_a", # DOI
"0248_a", # oai identifier, not needed to migrate, TBD
"0248_p", # oai identifier, not needed to migrate, TBD
# "0248_q", does appear
"035__9", # Inspire schema
"035__a", # Inspire id value
"246__a", # explanation of abbreviations, TODO: shall we keep it in notes?
"246__i", # abbreviation tag, applies to value of 246__A
"088__a", # RN (manual introduced?)
"100__0",
"100__u", # Author affiliation
"100__9", # Author to check
"970__a", # TODO: check it
"980__a",
"980__c", # TODO: remove this one, it should not appear
# '260__c',
"8564_s", # Files
"8564_y", # Files
"8564_x", # Files
"8564_8", # Files
"500__a", # Note
"246__i", # Abbreviation
"100__a",
"100__m", # author's email <-- decided not to keep in RDM,
"100__m", # author's email <-- decided not to keep in RDM,
"100__u", # Author affiliation
"246__a", # Abbreviation
"595__z", # TODO: check it
"0248_q", # TODO: check it
"700__9", # Contributors (?)
"700__0", # Contributors (cds author)
"700__u", # Contributors (affiliation?)
"700__m", # Contributors (email)
"693__s", # study
"693__p", # project
"693__b", # TODO: check it
"088__a", # RN (manual introduced?)
"0248_a",
"0248_p",
"0247_a", # DOI
"0247_2", # DOI
"710__g", # Collaboration
"981__a", # TODO: check it
"562__c", # TODO: check it
"970__d", # TODO: check it
"270__p", # TODO: check it
"270__m", # TODO: check it
"035__a", # Inspire ref
"035__9", # Inspire ref
"693__a",
"710__5",
"916__w",
"690C_a",
"595__a",
"693__e",
"650172",
"916__s",
"041__a",
"937__s",
"859__f",
"246__i", # Abbreviation
# "270__m",
# "270__p",
# "500__a", # Note
# "562__c", # note
"595__a", # always value CERN EDS, not displayed, TODO: do we keep?
"595__z", # SOME RECORD HAVE UNCL as value, do we keep it?
"650172", # TODO TBD
"65017a", # TODO TBD
"6531_9",
"520__a",
"963__a",
"710__a",
"100__a",
"100__m",
"6531_a",
# "693__a",
# "693__b", # TODO: check it
# "693__e",
# "693__p", # project
# "693__s", # study
# "700__0", # Contributors (cds author)
# "700__9", # Contributors (?)
# "700__m", # Contributors (email)
# "700__u", # Contributors (affiliation?)
# "710__5",
# "710__a",
# "710__g", # Collaboration
"8564_8", # Files system field
"8564_s", # Files system field
"8564_x", # Files system field
"8564_y", # Files
"859__f", # creator's email, to be used to determine the owner ???
# "906__p", # probably supervisor TODO: check
# "916__s", # creation date
# "916__w", # creation date
"960__a", # collection id? usually value 12
"963__a", # restriction
"980__a",
"980__c", # TODO: remove this one, it should not appear
}
_default_fields = None

Expand Down
22 changes: 13 additions & 9 deletions cds_migrator_kit/rdm/migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@
RDMRecordTransform,
)


from cds_migrator_kit.rdm.migration.transform.xml_processing.dumper import CDSRecordDump
from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
LossyConversion,
)
from cds_migrator_kit.records.log import RDMJsonLogger

cli_logger = logging.getLogger("migrator")
logger = RDMJsonLogger()

CDS_DATACITE_PREFIXES = [
"10.17181"
]


class CDSToRDMRecordEntry(RDMRecordEntry):
Expand Down Expand Up @@ -81,7 +83,8 @@ def _media_files(self, entry):
return {}

def _pids(self, json_entry):
return {}
return []


def _files(self, record_dump):
"""Transform the files of a record."""
Expand Down Expand Up @@ -126,14 +129,15 @@ def _resource_type(data):

def transform(self, entry):
"""Transform a record single entry."""
from cds_migrator_kit.rdm.migration.cli import migration_logger, cli_logger
record_dump = CDSRecordDump(
entry,
)
try:
logger.add_recid_to_stats(entry["recid"])
migration_logger.add_recid_to_stats(entry["recid"])
record_dump.prepare_revisions()
timestamp, json_data = record_dump.revisions[-1]
logger.add_record(json_data)
migration_logger.add_record(json_data)
return {
"created": self._created(json_data),
"updated": self._updated(record_dump),
Expand All @@ -150,9 +154,9 @@ def transform(self, entry):
}
except LossyConversion as e:
cli_logger.error("[DATA ERROR]: {0}".format(e.message))
logger.add_log(e, output=entry)
migration_logger.add_log(e, output=entry)
except Exception as e:
logger.add_log(e, output=entry)
migration_logger.add_log(e, output=entry)
raise e
# TODO take only the last

Expand Down Expand Up @@ -183,7 +187,7 @@ def _parent(self, entry, record):
# loader is responsible for creating/updating if the PID exists.
"id": f'{record["json"]["id"]}-parent',
"access": {
# "owned_by": [{"user": o} for o in entry["json"].get("owners", [])]
"owned_by": {"user": "1"},
},
# "communities": self._community_id(entry, record),
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration data cleaning module."""
from dojson.utils import force_list

from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
UnexpectedValue,
UnexpectedValue, MissingRequiredField,
)


Expand All @@ -18,3 +19,68 @@ def clean_str(to_clean):
return to_clean.strip()
except AttributeError:
raise UnexpectedValue


def clean_val(
    subfield,
    value,
    var_type,
    req=False,
    regex_format=None,
    default=None,
    manual=False,
    transform=None,
    multiple_values=False,
):
    """
    Tests values using common rules.

    :param subfield: marcxml subfield indicator
    :param value: marcxml value
    :param var_type: expected type for value to be cleaned
    :param req: specifies if the value is required in the end schema
    :param regex_format: specifies if the value should have a pattern
    :param default: if value is missing and required it outputs default
    :param manual: if the value should be cleaned manually during the migration
    :param transform: string transform function (or callable)
    :param multiple_values: allow multiple values in subfield
    :return: cleaned output value
    :raises UnexpectedValue: wrong type/pattern, or a tuple without
        ``multiple_values``
    :raises MissingRequiredField: value missing while ``req`` and no default
    """
    # Local import keeps this fix self-contained within the function.
    import re

    # NOTE(review): `manual` is accepted for API compatibility but is not
    # acted upon yet — confirm intended handling (manual migration flag).

    def _clean_str_value(value_to_clean):
        # Validate the raw value against the expected pattern first.
        if regex_format and not re.match(regex_format, value_to_clean):
            raise UnexpectedValue(subfield=subfield)
        # `clean_str` (defined above) only takes the value: it strips
        # whitespace and rejects non-strings. The previous code passed it
        # four arguments, which raised TypeError for every string value.
        cleaned = clean_str(value_to_clean)
        if transform:
            # `transform` may be a callable or the name of a `str` method.
            if callable(transform):
                cleaned = transform(cleaned)
            elif isinstance(transform, str) and hasattr(cleaned, transform):
                cleaned = getattr(cleaned, transform)()
        return cleaned

    def _clean(value_to_clean):
        if value_to_clean is None:
            return None
        try:
            if var_type is str:
                return _clean_str_value(value_to_clean)
            elif var_type is bool:
                return bool(value_to_clean)
            elif var_type is int:
                return int(value_to_clean)
            else:
                # Unsupported target type — programming error, propagate.
                raise NotImplementedError
        except (ValueError, TypeError):
            raise UnexpectedValue(subfield=subfield)
        except (UnexpectedValue, MissingRequiredField) as e:
            # Attach context about where in the MARC record the problem is.
            e.subfield = subfield
            e.message = getattr(e, "message", "") + str(force_list(value))
            raise e

    to_clean = value.get(subfield)

    # Documented contract for required-but-missing values: fall back to
    # `default` when given, otherwise fail loudly.
    if to_clean is None and req:
        if default is not None:
            return default
        raise MissingRequiredField(subfield=subfield)

    is_tuple = isinstance(to_clean, tuple)
    if is_tuple and not multiple_values:
        raise UnexpectedValue(subfield=subfield)

    if multiple_values:
        if is_tuple:
            return [_clean(v) for v in to_clean]
        # always return a list when `multiple_values` is True
        return [_clean(to_clean)]
    return _clean(to_clean)
Loading

0 comments on commit 71ab28e

Please sign in to comment.