migration: migrate record statistics

zzacharo committed Oct 8, 2024
1 parent 61fbc1d commit 52bf6c8
Showing 14 changed files with 720 additions and 27 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -65,3 +65,9 @@ target/

# vscode
.history/

# migration data folder
**/migration/data/**
**/migration/log/**

**.DS_Store
41 changes: 40 additions & 1 deletion cds_migrator_kit/rdm/migration/README.md
@@ -36,7 +36,7 @@ model = CMSNote(bases=(),

**query** - defines the MARC fields that a specific record should match. Attention: it does not recognise the regexes used to specify the collection query in the admin interface of legacy CDS.

**__model_ignore_keys__** - set of keys to be ignored for this data model - fields will not be migrated
\***\*model_ignore_keys\*\*** - set of keys to be ignored for this data model - fields will not be migrated

**bases** - by defining the bases of a model you can specify a parent model that fits all the subsets of records (e.g. the 245 title field MARC-to-JSON translation could be the same for all the models)

@@ -134,6 +134,45 @@ records = db.session.query(model_cls.id).filter(
current_rdm_records_service.indexer.bulk_index((rec.id for rec in records))
```

### Migrate the statistics for the successfully migrated records

When the `invenio migration run` command finishes, it produces a `records_state.json` file that links the migrated records to the corresponding records of the old system. The format will be similar to the following:

```json
{
"legacy_recid": "2884810",
"parent_recid": "zts3q-6ef46",
"latest_version": "1mae4-skq89",
"versions": [
{
"new_recid": "1mae4-skq89",
"version": 2,
"files": [
{
"legacy_file_id": 1568736,
"bucket_id": "155be22f-3038-49e0-9f17-9518eaac783a",
"file_key": "Summer student program report.pdf",
"file_id": "06cdb9d2-635f-4dbe-89fe-4b27afddeaa2",
"size": "1690854"
}
]
}
]
}
```
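
As a quick sanity check of the produced file, a minimal sketch like the one below could be used to summarise what was migrated (it assumes the file holds a JSON array of such record-state objects; adjust the loading if your run writes one object per line instead):

```python
import json

# Assumption: records_state.json is a JSON array of record-state objects
# shaped like the example above; adjust if the file uses JSON Lines instead.
with open("records_state.json") as f:
    record_states = json.load(f)

for state in record_states:
    n_files = sum(len(version["files"]) for version in state["versions"])
    print(
        f"legacy recid {state['legacy_recid']} -> parent {state['parent_recid']} "
        f"({len(state['versions'])} version(s), {n_files} file(s))"
    )
```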

- Open `cds_migrator_kit/rdm/migration/stats/config.py` and

  - set `RECID_LIST_FILE` to the path of the produced `records_state.json` file (see the illustrative sketch after the commands below)
  - set the credentials of the legacy production cluster so it can be queried, i.e. `SRC_SEARCH_AUTH`

- Open a Python shell and run the following commands:

```python
from cds_migrator_kit.rdm.migration.stats.run import run

run(dry_run=False)
```
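
For illustration only, the relevant part of the edited `config.py` could then look roughly like this (the path and credentials below are placeholders, not real values):

```python
# cds_migrator_kit/rdm/migration/stats/config.py -- illustrative placeholders only

# Path of the records_state.json produced by `invenio migration run`
RECID_LIST_FILE = "/path/to/migration/data/records_state.json"

# Credentials of the legacy production cluster used to query the old statistics
SRC_SEARCH_AUTH = ("<username>", "<password>")
```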

### To visualise the errors (locally):

2 changes: 1 addition & 1 deletion cds_migrator_kit/rdm/migration/cli.py
@@ -39,6 +39,6 @@ def run(dry_run=False):
stream_definitions=[RecordStreamDefinition],
# stream_definitions=[UserStreamDefinition],
config_filepath=Path(stream_config).absolute(),
dry_run=dry_run
dry_run=dry_run,
)
runner.run()
119 changes: 109 additions & 10 deletions cds_migrator_kit/rdm/migration/load/load.py
@@ -7,6 +7,8 @@

"""CDS-RDM migration load module."""
import logging
import os
import json

import arrow
from invenio_access.permissions import system_identity
@@ -17,7 +19,9 @@

from cds_migrator_kit.rdm.migration.transform.xml_processing.errors import (
ManualImportRequired,
RecordStatsNotImported,
)

from cds_migrator_kit.records.log import RDMJsonLogger
from cds_rdm.minters import legacy_recid_minter

Expand All @@ -33,8 +37,15 @@ def import_legacy_files(filepath):
class CDSRecordServiceLoad(Load):
"""CDSRecordServiceLoad."""

def __init__(self, db_uri, data_dir, tmp_dir, existing_data=False, entries=None,
dry_run=False):
def __init__(
self,
db_uri,
data_dir,
tmp_dir,
existing_data=False,
entries=None,
dry_run=False,
):
"""Constructor."""
self.db_uri = db_uri
self.data_dir = data_dir
@@ -66,7 +77,10 @@ def _load_files(self, draft, entry, version_files):
data=[
{
"key": file_data["key"],
"metadata": file_data["metadata"],
"metadata": {
**file_data["metadata"],
"legacy_file_id": file_data["id_bibdoc"],
},
"access": {"hidden": False},
}
],
@@ -105,8 +119,9 @@ def _load_files(self, draft, entry, version_files):
exc = ManualImportRequired(
message=str(e), field="filename", value=file_data["key"]
)
migration_logger.add_log(exc, output=entry, key="filename",
value=file_data["key"])
migration_logger.add_log(
exc, output=entry, key="filename", value=file_data["key"]
)

def _load_access(self, draft, entry):
"""Load access rights."""
@@ -131,12 +146,15 @@ def publish_and_mint_recid(draft, version):
# mint legacy ids for redirections
if version == 1:
record._record.model.created = arrow.get(
entry["record"]["created"]).datetime
entry["record"]["created"]
).datetime

# it seems more intuitive if we mint the lrecid for parent
# but then we get a double redirection
legacy_recid_minter(entry["record"]["recid"],
record._record.parent.model.id)
legacy_recid_minter(
entry["record"]["recid"], record._record.parent.model.id
)
return record

if not draft_files:
# when missing files, just publish
@@ -173,7 +191,9 @@ def publish_and_mint_recid(draft, version):
system_identity, draft["id"], data=missing_data
)

publish_and_mint_recid(draft, version)
record = publish_and_mint_recid(draft, version)
records.append(record._record)
return records

def _load_model_fields(self, draft, entry):
"""Load model fields of the record."""
@@ -184,13 +204,83 @@ def _load_model_fields(self, draft, entry):
self._load_communities(draft, entry)
db.session.commit()

def _load_record_state(self, legacy_recid, records):
"""Compute state for legacy recid.
Returns
{
"legacy_recid": "2884810",
"parent_recid": "zts3q-6ef46",
"latest_version": "1mae4-skq89"
"versions": [
{
"new_recid": "1mae4-skq89",
"version": 2,
"files": [
{
"legacy_file_id": 1568736,
"bucket_id": "155be22f-3038-49e0-9f17-9518eaac783a",
"file_key": "Summer student program report.pdf",
"file_id": "06cdb9d2-635f-4dbe-89fe-4b27afddeaa2",
"size": "1690854"
}
]
}
]
}
"""

def convert_file_format(file_entries, bucket_id):
"""Convert the file metadata into the required format."""
return [
{
"legacy_file_id": entry["metadata"]["legacy_file_id"],
"bucket_id": bucket_id,
"file_key": entry["key"],
"file_id": entry["file_id"],
"size": str(entry["size"]),
}
for entry in file_entries.values()
]

def extract_record_version(record):
"""Extract relevant details from a single record."""
bucket_id = str(record.files.bucket_id)
files = record.__class__.files.dump(
record, record.files, include_entries=True
).get("entries", {})
return {
"new_recid": record.pid.pid_value,
"version": record.versions.index,
"files": convert_file_format(files, bucket_id),
}

recid_state = {"legacy_recid": legacy_recid, "versions": []}
parent_recid = None

for record in records:
if parent_recid is None:
parent_recid = record.parent.pid.pid_value
recid_state["parent_recid"] = parent_recid

recid_version = extract_record_version(record)
# Save the record versions for legacy recid
recid_state["versions"].append(recid_version)

if "latest_version" not in recid_state:
recid_state["latest_version"] = record.get_latest_by_parent(
record.parent
)["id"]
return recid_state

def _load(self, entry):
"""Use the services to load the entries."""
if entry:
recid = entry.get("record", {}).get("recid", {})
migration_logger = RDMJsonLogger()
migration_logger.add_recid_to_stats(recid)
identity = system_identity # TODO: load users instead
exc = None
if self.dry_run:
try:

@@ -212,7 +302,16 @@ def _load(self, entry):
try:
self._load_model_fields(draft, entry)

self._load_versions(draft, entry)
records = self._load_versions(draft, entry)

if records:
legacy_recid = entry["record"]["recid"]
record_state_context = self._load_record_state(
legacy_recid, records
)
# Dump the computed record state; it is used later to migrate the record stats
if record_state_context:
migration_logger.add_record_state(record_state_context)
except PIDAlreadyExists:
pass
except Exception as e:
8 changes: 8 additions & 0 deletions cds_migrator_kit/rdm/migration/stats/__init__.py
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration stats module."""
76 changes: 76 additions & 0 deletions cds_migrator_kit/rdm/migration/stats/config.py
@@ -0,0 +1,76 @@
import os


# Recid list file (the records_state.json produced by the migration run)
RECID_LIST_FILE = os.path.join(
"<path>",
"ssn_recids.json",
)


######## Logging ###########
ROOT_PATH = os.path.join(
"<path>",
"stats",
)

####### Search ##############
SRC_SEARCH_URL = "https://os-cds-legacy.cern.ch:443/os"
SRC_SEARCH_AUTH = ("", "")

SRC_SEARCH_SIZE = 5000
SRC_SEARCH_SCROLL = "1h"
DEST_SEARCH_INDEX_PREFIX = "cds-rdm-sandbox-events-stats"
DEST_SEARCH_URL = "http://127.0.0.1:9200/"
DEST_SEARCH_AUTH = ("", "")


######## Statistics ###########

_QUERY_VIEWS = {
"query": {
"bool": {
"must": [
{"match": {"id_bibrec": "<recid>"}},
{"match": {"event_type": "<type>"}},
]
}
}
}
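# Note: "<recid>" and "<type>" above are placeholders; the stats migration is
# expected to substitute the legacy record id and the legacy event type for
# each record before running this query against the legacy cluster.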

LEGACY_TO_RDM_EVENTS_MAP = {
"events.pageviews": {
"type": "record-view",
"query": _QUERY_VIEWS,
},
"events.downloads": {
"type": "file-downloads",
"query": _QUERY_VIEWS,
},
}

EVENT_TYPES = ["events.pageviews", "events.downloads"]

LEGACY_INDICES = [
"cds-2004",
"cds-2005",
"cds-2006",
"cds-2007",
"cds-2008",
"cds-2009",
"cds-2010",
"cds-2011",
"cds-2012",
"cds-2013",
"cds-2014",
"cds-2015",
"cds-2016",
"cds-2017",
"cds-2018",
"cds-2019",
"cds-2020",
"cds-2021",
"cds-2022",
"cds-2023",
"cds-2024",
]