migration: migrate record statistics

zzacharo committed Oct 8, 2024
1 parent 61fbc1d commit 52bf6c8
Showing 14 changed files with 720 additions and 27 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -65,3 +65,9 @@ target/

# vscode
.history/

# migration data folder
**/migration/data/**
**/migration/log/**

**.DS_Store
41 changes: 40 additions & 1 deletion cds_migrator_kit/rdm/migration/README.md
@@ -36,7 +36,7 @@ model = CMSNote(bases=(),

**query** - defines the MARC fields that a specific record should match. Attention: it does not recognise the regexes used to specify the collection query in the admin interface of legacy CDS.

**__model_ignore_keys__** - set of keys to be ignored for this data model - fields will not be migrated
\***\*model_ignore_keys\*\*** - set of keys to be ignored for this data model - fields will not be migrated

**bases** - by defining the bases of a model you can specify a parent model that fits all the subsets of records (e.g. the 245 title field MARC-to-JSON translation could be the same for all the models)

@@ -134,6 +134,45 @@ records = db.session.query(model_cls.id).filter(
current_rdm_records_service.indexer.bulk_index((rec.id for rec in records))
```

### Migrate the statistics for the successfully migrated records

When the `invenio migration run` command finishes, it produces a `records_state.json` file that links the migrated records to the corresponding records of the old system. The format will be similar to the following:

```json
{
"legacy_recid": "2884810",
"parent_recid": "zts3q-6ef46",
"latest_version": "1mae4-skq89",
"versions": [
{
"new_recid": "1mae4-skq89",
"version": 2,
"files": [
{
"legacy_file_id": 1568736,
"bucket_id": "155be22f-3038-49e0-9f17-9518eaac783a",
"file_key": "Summer student program report.pdf",
"file_id": "06cdb9d2-635f-4dbe-89fe-4b27afddeaa2",
"size": "1690854"
}
]
}
]
}
```
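
As a quick sanity check of the produced file, a minimal sketch like the one below could be used to summarise what was migrated (it assumes the file holds a JSON array of such record-state objects; adjust the loading if your run writes one object per line instead):

```python
import json

# Assumption: records_state.json is a JSON array of record-state objects
# shaped like the example above; adjust if the file uses JSON Lines instead.
with open("records_state.json") as f:
    record_states = json.load(f)

for state in record_states:
    n_files = sum(len(version["files"]) for version in state["versions"])
    print(
        f"legacy recid {state['legacy_recid']} -> parent {state['parent_recid']} "
        f"({len(state['versions'])} version(s), {n_files} file(s))"
    )
```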

- Open `cds_migrator_kit/rdm/migration/stats/config.py` and

  - set `RECID_LIST_FILE` to the path of the produced `records_state.json` file (see the illustrative sketch after the commands below)
  - set the credentials of the legacy production cluster so it can be queried, i.e. `SRC_SEARCH_AUTH`

- Open a Python shell and run the following commands:

```python
from cds_migrator_kit.rdm.migration.stats.run import run

run(dry_run=False)
```
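
For illustration only, the relevant part of the edited `config.py` could then look roughly like this (the path and credentials below are placeholders, not real values):

```python
# cds_migrator_kit/rdm/migration/stats/config.py -- illustrative placeholders only

# Path of the records_state.json produced by `invenio migration run`
RECID_LIST_FILE = "/path/to/migration/data/records_state.json"

# Credentials of the legacy production cluster used to query the old statistics
SRC_SEARCH_AUTH = ("<username>", "<password>")
```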

### To visualise the errors (locally):

2 changes: 1 addition & 1 deletion cds_migrator_kit/rdm/migration/cli.py
@@ -39,6 +39,6 @@ def run(dry_run=False):
stream_definitions=[RecordStreamDefinition],
# stream_definitions=[UserStreamDefinition],
config_filepath=Path(stream_config).absolute(),
dry_run=dry_run
dry_run=dry_run,
)
runner.run()
119 changes: 109 additions & 10 deletions cds_migrator_kit/rdm/migration/load/load.py
@@ -7,6 +7,8 @@

"""CDS-RDM migration load module."""
import logging
import os
import json

import arrow
from invenio_access.permissions import system_identity
@@ -17,7 +19,9 @@

from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
ManualImportRequired,
RecordStatsNotImported,
)

from cds_migrator_kit.records.log import RDMJsonLogger
from cds_rdm.minters import legacy_recid_minter

Expand All @@ -33,8 +37,15 @@ def import_legacy_files(filepath):
class CDSRecordServiceLoad(Load):
"""CDSRecordServiceLoad."""

def __init__(self, db_uri, data_dir, tmp_dir, existing_data=False, entries=None,
dry_run=False):
def __init__(
self,
db_uri,
data_dir,
tmp_dir,
existing_data=False,
entries=None,
dry_run=False,
):
"""Constructor."""
self.db_uri = db_uri
self.data_dir = data_dir
@@ -66,7 +77,10 @@ def _load_files(self, draft, entry, version_files):
data=[
{
"key": file_data["key"],
"metadata": file_data["metadata"],
"metadata": {
**file_data["metadata"],
"legacy_file_id": file_data["id_bibdoc"],
},
"access": {"hidden": False},
}
],
@@ -105,8 +119,9 @@ def _load_files(self, draft, entry, version_files):
exc = ManualImportRequired(
message=str(e), field="filename", value=file_data["key"]
)
migration_logger.add_log(exc, output=entry, key="filename",
value=file_data["key"])
migration_logger.add_log(
exc, output=entry, key="filename", value=file_data["key"]
)

def _load_access(self, draft, entry):
"""Load access rights."""
@@ -131,12 +146,15 @@ def publish_and_mint_recid(draft, version):
# mint legacy ids for redirections
if version == 1:
record._record.model.created = arrow.get(
entry["record"]["created"]).datetime
entry["record"]["created"]
).datetime

# it seems more intuitive if we mint the lrecid for parent
# but then we get a double redirection
legacy_recid_minter(entry["record"]["recid"],
record._record.parent.model.id)
legacy_recid_minter(
entry["record"]["recid"], record._record.parent.model.id
)
return record

if not draft_files:
# when missing files, just publish
@@ -173,7 +191,9 @@ def publish_and_mint_recid(draft, version):
system_identity, draft["id"], data=missing_data
)

publish_and_mint_recid(draft, version)
record = publish_and_mint_recid(draft, version)
records.append(record._record)
return records

def _load_model_fields(self, draft, entry):
"""Load model fields of the record."""
@@ -184,13 +204,83 @@ def _load_model_fields(self, draft, entry):
self._load_communities(draft, entry)
db.session.commit()

def _load_record_state(self, legacy_recid, records):
"""Compute state for legacy recid.
Returns
{
"legacy_recid": "2884810",
"parent_recid": "zts3q-6ef46",
"latest_version": "1mae4-skq89"
"versions": [
{
"new_recid": "1mae4-skq89",
"version": 2,
"files": [
{
"legacy_file_id": 1568736,
"bucket_id": "155be22f-3038-49e0-9f17-9518eaac783a",
"file_key": "Summer student program report.pdf",
"file_id": "06cdb9d2-635f-4dbe-89fe-4b27afddeaa2",
"size": "1690854"
}
]
}
]
}
"""

def convert_file_format(file_entries, bucket_id):
"""Convert the file metadata into the required format."""
return [
{
"legacy_file_id": entry["metadata"]["legacy_file_id"],
"bucket_id": bucket_id,
"file_key": entry["key"],
"file_id": entry["file_id"],
"size": str(entry["size"]),
}
for entry in file_entries.values()
]

def extract_record_version(record):
"""Extract relevant details from a single record."""
bucket_id = str(record.files.bucket_id)
files = record.__class__.files.dump(
record, record.files, include_entries=True
).get("entries", {})
return {
"new_recid": record.pid.pid_value,
"version": record.versions.index,
"files": convert_file_format(files, bucket_id),
}

recid_state = {"legacy_recid": legacy_recid, "versions": []}
parent_recid = None

for record in records:
if parent_recid is None:
parent_recid = record.parent.pid.pid_value
recid_state["parent_recid"] = parent_recid

recid_version = extract_record_version(record)
# Save the record versions for legacy recid
recid_state["versions"].append(recid_version)

if "latest_version" not in recid_state:
recid_state["latest_version"] = record.get_latest_by_parent(
record.parent
)["id"]
return recid_state

def _load(self, entry):
"""Use the services to load the entries."""
if entry:
recid = entry.get("record", {}).get("recid", {})
migration_logger = RDMJsonLogger()
migration_logger.add_recid_to_stats(recid)
identity = system_identity # TODO: load users instead
exc = None
if self.dry_run:
try:

@@ -212,7 +302,16 @@ def _load(self, entry):
try:
self._load_model_fields(draft, entry)

self._load_versions(draft, entry)
records = self._load_versions(draft, entry)

if records:
legacy_recid = entry["record"]["recid"]
record_state_context = self._load_record_state(
legacy_recid, records
)
# Dump the computed record state; it is used later to migrate the record stats
if record_state_context:
migration_logger.add_record_state(record_state_context)
except PIDAlreadyExists:
pass
except Exception as e:
8 changes: 8 additions & 0 deletions cds_migrator_kit/rdm/migration/stats/__init__.py
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration stats module."""
76 changes: 76 additions & 0 deletions cds_migrator_kit/rdm/migration/stats/config.py
@@ -0,0 +1,76 @@
import os


# Recid list file (the records_state.json produced by the migration run)
RECID_LIST_FILE = os.path.join(
"<path>",
"ssn_recids.json",
)


######## Logging ###########
ROOT_PATH = os.path.join(
"<path>",
"stats",
)

####### Search ##############
SRC_SEARCH_URL = "https://os-cds-legacy.cern.ch:443/os"
SRC_SEARCH_AUTH = ("", "")

SRC_SEARCH_SIZE = 5000
SRC_SEARCH_SCROLL = "1h"
DEST_SEARCH_INDEX_PREFIX = "cds-rdm-sandbox-events-stats"
DEST_SEARCH_URL = "http://127.0.0.1:9200/"
DEST_SEARCH_AUTH = ("", "")


######## Statistics ###########

_QUERY_VIEWS = {
"query": {
"bool": {
"must": [
{"match": {"id_bibrec": "<recid>"}},
{"match": {"event_type": "<type>"}},
]
}
}
}
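# Note: "<recid>" and "<type>" above are placeholders; the stats migration is
# expected to substitute the legacy record id and the legacy event type for
# each record before running this query against the legacy cluster.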

LEGACY_TO_RDM_EVENTS_MAP = {
"events.pageviews": {
"type": "record-view",
"query": _QUERY_VIEWS,
},
"events.downloads": {
"type": "file-downloads",
"query": _QUERY_VIEWS,
},
}

EVENT_TYPES = ["events.pageviews", "events.downloads"]

LEGACY_INDICES = [
"cds-2004",
"cds-2005",
"cds-2006",
"cds-2007",
"cds-2008",
"cds-2009",
"cds-2010",
"cds-2011",
"cds-2012",
"cds-2013",
"cds-2014",
"cds-2015",
"cds-2016",
"cds-2017",
"cds-2018",
"cds-2019",
"cds-2020",
"cds-2021",
"cds-2022",
"cds-2023",
"cds-2024",
]