Skip to content

Commit

Permalink
errors: log load errors
Browse files Browse the repository at this point in the history
  • Loading branch information
kpsherva committed Jul 26, 2024
1 parent daaaac0 commit 71ab28e
Show file tree
Hide file tree
Showing 9 changed files with 209 additions and 97 deletions.
1 change: 0 additions & 1 deletion cds_migrator_kit/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from invenio_app.config import APP_DEFAULT_SECURE_HEADERS


def _(x):
"""Identity function used to trigger string extraction."""
return x
Expand Down
5 changes: 4 additions & 1 deletion cds_migrator_kit/rdm/migration/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM command line module."""

import logging
from pathlib import Path

import click
Expand All @@ -15,7 +15,10 @@

from cds_migrator_kit.rdm.migration.runner import Runner
from cds_migrator_kit.rdm.migration.streams import RecordStreamDefinition
from cds_migrator_kit.records.log import RDMJsonLogger

cli_logger = logging.getLogger("migrator")
migration_logger = RDMJsonLogger()

@click.group()
def migration():
Expand Down
66 changes: 42 additions & 24 deletions cds_migrator_kit/rdm/migration/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,19 @@
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration load module."""

import logging
import os

from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_rdm_migrator.load.base import Load
from invenio_rdm_records.proxies import current_rdm_records_service

from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import \
ManualImportRequired

cli_logger = logging.getLogger("migrator")


def import_legacy_files(filepath):
"""Download file from legacy."""
Expand All @@ -37,34 +43,46 @@ def _prepare(self, entry):

def _load(self, entry):
    """Use the services to load one transformed entry into RDM.

    Creates a draft from ``entry["record"]["json"]``, copies the parent
    access settings, uploads every draft file, and finally publishes the
    draft. File-level failures are logged as ``ManualImportRequired`` and
    do not abort the record.
    """
    # Imported lazily to avoid a circular import with the CLI module.
    from cds_migrator_kit.rdm.migration.cli import migration_logger

    recid = entry.get("record", {}).get("json", {}).get("id")
    migration_logger.add_recid_to_stats(recid)
    identity = system_identity  # Should we create an identity for the migration?
    draft = current_rdm_records_service.create(identity, entry["record"]["json"])

    # Propagate the legacy access settings onto the parent record.
    parent = draft._record.parent
    access = entry["parent"]["json"]["access"]
    parent.access = access
    parent.commit()
    db.session.commit()

    draft_files = entry["draft_files"]
    for file_entry in draft_files:
        try:
            current_rdm_records_service.draft_files.init_files(
                identity,
                draft.id,
                data=[
                    {
                        "key": file_entry["key"],
                        "metadata": file_entry["metadata"],
                        "access": {"hidden": False},
                    }
                ],
            )
            current_rdm_records_service.draft_files.set_file_content(
                identity,
                draft.id,
                file_entry["key"],
                import_legacy_files(file_entry["eos_tmp_path"]),
            )
            result = current_rdm_records_service.draft_files.commit_file(
                identity, draft.id, file_entry["key"]
            )
            # Verify the upload by comparing the legacy md5 with the one
            # computed by the files service. Raise explicitly instead of
            # using `assert`, which is stripped when Python runs with -O.
            legacy_checksum = f"md5:{file_entry['checksum']}"
            new_checksum = result.to_dict()["checksum"]
            if legacy_checksum != new_checksum:
                raise ManualImportRequired(
                    message=f"Checksum mismatch for {file_entry['key']}: "
                    f"legacy {legacy_checksum} != new {new_checksum}"
                )
        except ManualImportRequired as exc:
            # Already carries a meaningful message; log as-is.
            migration_logger.add_log(exc, output=entry)
        except Exception as e:
            # Wrap any other failure so it lands in the manual-import log.
            exc = ManualImportRequired(message=str(e))
            migration_logger.add_log(exc, output=entry)
    # NOTE(review): the record is published even when some files failed to
    # load above — confirm this partial-publish behaviour is intended.
    current_rdm_records_service.publish(system_identity, draft["id"])

def _cleanup(self, *args, **kwargs):
Expand Down
5 changes: 3 additions & 2 deletions cds_migrator_kit/rdm/migration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,10 @@ def run(self):
# on successful stream run, persist state
# STATE.flush_cache()
# self.state.save(filename=f"{stream.name}.db")
from cds_migrator_kit.rdm.migration.transform.transform import logger
from cds_migrator_kit.rdm.migration.cli import migration_logger, \
cli_logger

logger.save()
migration_logger.save()
except Exception:
Logger.get_logger().exception(
f"Stream {stream.name} failed.", exc_info=1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,66 +32,58 @@ class CMSSummerStudent(CdsOverdo):
__query__ = "980__:NOTE 037__:CERN-STUDENTS-Note-*"

__ignore_keys__ = {
"65017a",
"937__c",
"960__a",
"269__b",
"6531_a",
"906__p",
"269__a",
#"0247_2", # DOI, summer student notes do not have it
#"0247_a", # DOI
"0248_a", # oai identifier, not needed to migrate, TBD
"0248_p", # oai identifier, not needed to migrate, TBD
# "0248_q", does appear
"035__9", # Inspire schema
"035__a", # Inspire id value
"246__a", # explanation of abbreviations, TODO: shall we keep it in notes?
"246__i", # abbreviation tag, applies to value of 246__A
"088__a", # RN (manual introduced?)
"100__0",
"100__u", # Author affiliation
"100__9", # Author to check
"970__a", # TODO: check it
"980__a",
"980__c", # TODO: remove this one, it should not appear
# '260__c',
"8564_s", # Files
"8564_y", # Files
"8564_x", # Files
"8564_8", # Files
"500__a", # Note
"246__i", # Abbreviation
"100__a",
"100__m", # author's email <-- decided not to keep in RDM,
"100__m", # author's email <-- decided not to keep in RDM,
"100__u", # Author affiliation
"246__a", # Abbreviation
"595__z", # TODO: check it
"0248_q", # TODO: check it
"700__9", # Contributors (?)
"700__0", # Contributors (cds author)
"700__u", # Contributors (affiliation?)
"700__m", # Contributors (email)
"693__s", # study
"693__p", # project
"693__b", # TODO: check it
"088__a", # RN (manual introduced?)
"0248_a",
"0248_p",
"0247_a", # DOI
"0247_2", # DOI
"710__g", # Collaboration
"981__a", # TODO: check it
"562__c", # TODO: check it
"970__d", # TODO: check it
"270__p", # TODO: check it
"270__m", # TODO: check it
"035__a", # Inspire ref
"035__9", # Inspire ref
"693__a",
"710__5",
"916__w",
"690C_a",
"595__a",
"693__e",
"650172",
"916__s",
"041__a",
"937__s",
"859__f",
"246__i", # Abbreviation
# "270__m",
# "270__p",
# "500__a", # Note
# "562__c", # note
"595__a", # always value CERN EDS, not displayed, TODO: do we keep?
"595__z", # SOME RECORD HAVE UNCL as value, do we keep it?
"650172", # TODO TBD
"65017a", # TODO TBD
"6531_9",
"520__a",
"963__a",
"710__a",
"100__a",
"100__m",
"6531_a",
# "693__a",
# "693__b", # TODO: check it
# "693__e",
# "693__p", # project
# "693__s", # study
# "700__0", # Contributors (cds author)
# "700__9", # Contributors (?)
# "700__m", # Contributors (email)
# "700__u", # Contributors (affiliation?)
# "710__5",
# "710__a",
# "710__g", # Collaboration
"8564_8", # Files system field
"8564_s", # Files system field
"8564_x", # Files system field
"8564_y", # Files
"859__f", # creator's email, to be used to determine the owner ???
# "906__p", # probably supervisor TODO: check
# "916__s", # creation date
# "916__w", # creation date
"960__a", # collection id? usually value 12
"963__a", # restriction
"980__a",
"980__c", # TODO: remove this one, it should not appear
}
_default_fields = None

Expand Down
22 changes: 13 additions & 9 deletions cds_migrator_kit/rdm/migration/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,16 @@
RDMRecordTransform,
)


from cds_migrator_kit.rdm.migration.transform.xml_processing.dumper import CDSRecordDump
from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
LossyConversion,
)
from cds_migrator_kit.records.log import RDMJsonLogger

cli_logger = logging.getLogger("migrator")
logger = RDMJsonLogger()

CDS_DATACITE_PREFIXES = [
"10.17181"
]


class CDSToRDMRecordEntry(RDMRecordEntry):
Expand Down Expand Up @@ -81,7 +83,8 @@ def _media_files(self, entry):
return {}

def _pids(self, json_entry):
return {}
return []


def _files(self, record_dump):
"""Transform the files of a record."""
Expand Down Expand Up @@ -126,14 +129,15 @@ def _resource_type(data):

def transform(self, entry):
"""Transform a record single entry."""
from cds_migrator_kit.rdm.migration.cli import migration_logger, cli_logger
record_dump = CDSRecordDump(
entry,
)
try:
logger.add_recid_to_stats(entry["recid"])
migration_logger.add_recid_to_stats(entry["recid"])
record_dump.prepare_revisions()
timestamp, json_data = record_dump.revisions[-1]
logger.add_record(json_data)
migration_logger.add_record(json_data)
return {
"created": self._created(json_data),
"updated": self._updated(record_dump),
Expand All @@ -150,9 +154,9 @@ def transform(self, entry):
}
except LossyConversion as e:
cli_logger.error("[DATA ERROR]: {0}".format(e.message))
logger.add_log(e, output=entry)
migration_logger.add_log(e, output=entry)
except Exception as e:
logger.add_log(e, output=entry)
migration_logger.add_log(e, output=entry)
raise e
# TODO take only the last

Expand Down Expand Up @@ -183,7 +187,7 @@ def _parent(self, entry, record):
# loader is responsible for creating/updating if the PID exists.
"id": f'{record["json"]["id"]}-parent',
"access": {
# "owned_by": [{"user": o} for o in entry["json"].get("owners", [])]
"owned_by": {"user": "1"},
},
# "communities": self._community_id(entry, record),
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration data cleaning module."""
from dojson.utils import force_list

from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
UnexpectedValue,
UnexpectedValue, MissingRequiredField,
)


Expand All @@ -18,3 +19,68 @@ def clean_str(to_clean):
return to_clean.strip()
except AttributeError:
raise UnexpectedValue


def clean_val(
    subfield,
    value,
    var_type,
    req=False,
    regex_format=None,
    default=None,
    manual=False,
    transform=None,
    multiple_values=False,
):
    """
    Tests values using common rules.

    :param subfield: marcxml subfield indicator
    :param value: marcxml value
    :param var_type: expected type for value to be cleaned
    :param req: specifies if the value is required in the end schema
    :param regex_format: specifies if the value should have a pattern
    :param default: if value is missing and required it outputs default
    :param manual: if the value should be cleaned manually during the migration
    :param transform: string transform function (or callable)
    :param multiple_values: allow multiple values in subfield
    :return: cleaned output value
    :raises UnexpectedValue: wrong type/pattern, or a tuple without
        ``multiple_values``
    :raises MissingRequiredField: value missing while ``req`` and no default
    """
    # Local import keeps this fix self-contained within the function.
    import re

    # NOTE(review): `manual` is accepted for API compatibility but is not
    # acted upon yet — confirm intended handling (manual migration flag).

    def _clean_str_value(value_to_clean):
        # Validate the raw value against the expected pattern first.
        if regex_format and not re.match(regex_format, value_to_clean):
            raise UnexpectedValue(subfield=subfield)
        # `clean_str` (defined above) only takes the value: it strips
        # whitespace and rejects non-strings. The previous code passed it
        # four arguments, which raised TypeError for every string value.
        cleaned = clean_str(value_to_clean)
        if transform:
            # `transform` may be a callable or the name of a `str` method.
            if callable(transform):
                cleaned = transform(cleaned)
            elif isinstance(transform, str) and hasattr(cleaned, transform):
                cleaned = getattr(cleaned, transform)()
        return cleaned

    def _clean(value_to_clean):
        if value_to_clean is None:
            return None
        try:
            if var_type is str:
                return _clean_str_value(value_to_clean)
            elif var_type is bool:
                return bool(value_to_clean)
            elif var_type is int:
                return int(value_to_clean)
            else:
                # Unsupported target type — programming error, propagate.
                raise NotImplementedError
        except (ValueError, TypeError):
            raise UnexpectedValue(subfield=subfield)
        except (UnexpectedValue, MissingRequiredField) as e:
            # Attach context about where in the MARC record the problem is.
            e.subfield = subfield
            e.message = getattr(e, "message", "") + str(force_list(value))
            raise e

    to_clean = value.get(subfield)

    # Documented contract for required-but-missing values: fall back to
    # `default` when given, otherwise fail loudly.
    if to_clean is None and req:
        if default is not None:
            return default
        raise MissingRequiredField(subfield=subfield)

    is_tuple = isinstance(to_clean, tuple)
    if is_tuple and not multiple_values:
        raise UnexpectedValue(subfield=subfield)

    if multiple_values:
        if is_tuple:
            return [_clean(v) for v in to_clean]
        # always return a list when `multiple_values` is True
        return [_clean(to_clean)]
    return _clean(to_clean)
Loading

0 comments on commit 71ab28e

Please sign in to comment.