New: Indexing work nodes

Refs: rism-digital/rism-online-issues#309
rism-digital · Nov 27, 2024 · 747c196 · 747c196
1 parent 3480411
commit 747c196
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 0 deletions.
diff --git a/indexer/index_sources.py b/indexer/index_sources.py
@@ -38,6 +38,7 @@ def _get_sources(cfg: dict) -> Generator[dict, None, None]:
         (SELECT GROUP_CONCAT(DISTINCT CONCAT_WS('|:|', stos.relator_code, sours.marc_source) SEPARATOR '|~|') FROM {dbname}.sources_to_sources AS stos LEFT JOIN {dbname}.sources AS sours ON stos.source_b_id = sours.id WHERE marc_tag = '787' AND source_a_id = child.id) AS related_sources,
         (SELECT GROUP_CONCAT(DISTINCT do.digital_object_id SEPARATOR ',') FROM {dbname}.digital_object_links AS do WHERE do.object_link_type = 'Source' AND do.object_link_id = child.id) AS digital_objects,
         (SELECT GROUP_CONCAT(DISTINCT sw.work_id SEPARATOR '\n') FROM {dbname}.sources_to_works AS sw WHERE sw.source_id = child.id) AS work_ids,
+        (SELECT GROUP_CONCAT(DISTINCT CONCAT_WS('|:|', wn.id, wn.marc_source) SEPARATOR '|~|') FROM {dbname}.sources_to_work_nodes AS swn LEFT JOIN {dbname}.work_nodes AS wn ON swn.work_node_id = wn.id WHERE swn.source_id = child.id) AS work_nodes,
         GROUP_CONCAT(DISTINCT h.marc_source SEPARATOR '\n') AS holdings_marc,
         GROUP_CONCAT(DISTINCT hp.marc_source SEPARATOR '\n') as parent_holdings_marc,
         GROUP_CONCAT(DISTINCT h.lib_siglum SEPARATOR '\n') AS holdings_org,

diff --git a/indexer/records/source.py b/indexer/records/source.py
@@ -256,6 +256,21 @@ def create_source_index_documents(record: dict, cfg: dict) -> list:
 
     has_digitization: bool = _get_has_digitization(all_marc_records)
 
+    all_work_node_things = []
+    all_work_node_ids = []
+    if w := record.get("work_nodes"):
+        work_nodes: list[dict] = _get_work_nodes(w, source_id)
+        for wn in work_nodes:
+            all_work_node_ids.append(wn["external_id"])
+
+        all_work_node_things.extend(work_nodes)
+
+    work_nodes_json = (
+        orjson.dumps(all_work_node_things).decode("utf-8")
+        if all_work_node_things
+        else None
+    )
+
     # add some core fields to the source. These are fields that may not be easily
     # derived directly from the MARC record, or that include data from the database.
     source_core: dict = {
@@ -308,6 +323,8 @@ def create_source_index_documents(record: dict, cfg: dict) -> list:
         "related_sources_json": related_sources_json,
         "works_catalogue_json": works_catalogue_json,
         "related_institution_sigla_sm": related_institution_sigla,
+        "work_nodes_json": work_nodes_json,
+        "work_node_ids": all_work_node_ids,
         # purposefully left empty so we can fill this up later.
         "external_records_jsonm": [],
         "created": record["created"].strftime("%Y-%m-%dT%H:%M:%SZ"),
@@ -633,3 +650,62 @@ def _get_holding_people_ids(records: list[pymarc.Record]) -> set[str]:
             ids.update(p_ids)
 
     return ids
+
+
+def _get_work_nodes(wns: str, source_id: str) -> Optional[list[dict]]:
+    all_records = wns.split("|~|")
+    work_nodes: list[(str, dict)] = []
+
+    for record in all_records:
+        wnid, wn_marc_source = record.split("|:|")
+        wn_marc_record: Optional[pymarc.Record] = (
+            create_marc(wn_marc_source) if wn_marc_source else None
+        )
+        if not wn_marc_record:
+            log.error("Could not load Work Node MARC record: %s", wnid)
+            continue
+
+        work_node_id: str = f"work_node_{wnid}"
+        link_field: pymarc.Field = wn_marc_record.get("024")
+        if not link_field:
+            log.error("Work Node without an 024. Skipping: %s", work_node_id)
+            continue
+
+        if link_field and "2" in link_field and "a" in link_field:
+            ident: str = f"{link_field['2'].lower()}:{link_field['a']}"
+        else:
+            log.error(
+                "Work Node with 024 but without $2 or $a. Skipping: %s", work_node_id
+            )
+            continue
+
+        creator: pymarc.Field = wn_marc_record.get("100")
+        composer_name: Optional[str] = None
+        composer_id: Optional[str] = None
+        work_title: Optional[str] = None
+        if creator and "a" in creator:
+            name: str = creator["a"].strip()
+            dates: str = f" ({d})" if (d := creator.get("d")) else ""
+
+            composer_name = f"{name}{dates}"
+            composer_id = f"person_{creator["0"]}"
+
+            work_title_subf: str = creator["t"]
+            partial_title_subf: str = creator.get("p", "")
+
+            work_title = f"{work_title_subf}{partial_title_subf}"
+
+        d: dict = {
+            "id": work_node_id,
+            "type": "work_node",
+            "external_id": ident,
+            "composer_name": composer_name,
+            "composer_id": composer_id,
+            "work_title": work_title,
+            "this_id": source_id,
+            "this_type": "source",
+        }
+
+        work_nodes.append({k: v for k, v in d.items() if v})
+
+    return work_nodes