From 2edab996b112107cde93f9cbdebc20f0ec3a4b13 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 23 Jan 2025 16:32:07 -0800 Subject: [PATCH] fixed up to index by any annotation value --- scripts/gen-model-meta.py | 126 +++++++++++++++----------------------- scripts/pyproject.toml | 5 +- 2 files changed, 55 insertions(+), 76 deletions(-) diff --git a/scripts/gen-model-meta.py b/scripts/gen-model-meta.py index 2feff3bf..dd6aa3f7 100644 --- a/scripts/gen-model-meta.py +++ b/scripts/gen-model-meta.py @@ -1,51 +1,74 @@ +""" +Script to generate model indicies. to run: + +% cd scripts +% poetry install +% poetry run python gen-model-meta.py --keys-to-index contributor,term_id,reference_id --output-dir /tmp/output + +""" import os -import glob import json import sys import logging from pathlib import Path -import click +from pprint import pprint +import click # Logger basic setup logging.basicConfig(level=logging.INFO) -LOG = logging.getLogger('gen-model-meta') +LOG = logging.getLogger("gen-model-meta") LOG.setLevel(logging.WARNING) +json_path = Path("/tmp/gocams/") def die_screaming(instr): """Exit in a way that will get attention.""" LOG.error(instr) sys.exit(1) - -def create_indices(read_data, model_id, keys_to_index): +def process_json_files(keys_to_index, output_dir, path_to_json=json_path): """ - Process the JSON data structure once and generate indices for the specified keys. - - :param read_data: The JSON data structure - :param model_id: The model ID to associate with indexed values - :param keys_to_index: List of keys to index - - :return: A dictionary of indices for the specified keys + Process all JSON files, generate indices for the specified keys, and save them. """ + # Get the list of files from path_to_json + json_files = [f for f in os.listdir(path_to_json) if f.endswith(".json")] indices = {key: {} for key in keys_to_index} - individuals = read_data.get("individuals", []) - - for individual in individuals: - annotations = individual.get("annotations", []) - for annotation in annotations: - key = annotation.get("key") - value = annotation.get("value") - if key in keys_to_index: - if value not in indices[key]: - indices[key][value] = [] - if model_id not in indices[key][value]: - indices[key][value].append(model_id) + # Ensure the output directory exists + os.makedirs(output_dir, exist_ok=True) - return indices + # Process each JSON file + for file_name in json_files: + model_id = Path(file_name).stem + # Open, parse, and save the JSON in a dictionary file + file_url = os.path.join(path_to_json, file_name) + with open(file_url, "r") as f: + print(model_id) + read_data = json.load(f) + if not read_data: + die_screaming(f"ERROR: No data in file: {file_url}") + + # Create indices for the current file + individuals = read_data.get("individuals", []) + for individual in individuals: + annotations = individual.get("annotations", []) + for annotation in annotations: + key = annotation.get("key") + value = annotation.get("value") + if key in keys_to_index: + if value not in indices[key]: + indices[key][value] = [] + if model_id not in indices[key][value]: + indices[key][value].append(model_id) + # for each top level key in the indicies dictionary, write out the JSON to a file + for key, value in indices.items(): + with open(os.path.join(output_dir, f"{key}.json"), "w") as f: + json.dump(value, f, indent=4) + + +# CLI using Click @click.command() @click.option( "--keys-to-index", @@ -54,13 +77,6 @@ def create_indices(read_data, model_id, keys_to_index): required=True, help="List of keys to index (e.g., contributor, term_id, reference_id).", ) -@click.option( - "--input-dir", - "-i", - type=click.Path(file_okay=False, dir_okay=True, readable=True, path_type=str), - required=True, - help="Directory containing the JSON input files.", -) @click.option( "--output-dir", "-o", @@ -68,49 +84,9 @@ def create_indices(read_data, model_id, keys_to_index): required=True, help="Directory where the JSON index files will be saved.", ) -def main(keys_to_index, input_dir, output_dir): - """ - Process all JSON files in the input directory, generate indices for the specified keys, and save them. - """ - # Ensure the output directory exists - os.makedirs(output_dir, exist_ok=True) - - # Process each JSON file in the input directory - for file_path in glob.glob(f"{input_dir}/*.json"): - model_id = Path(file_path).stem - - with open(file_path) as fhandle: - try: - read_data = json.load(fhandle) - except json.JSONDecodeError: - die_screaming(f"ERROR decoding JSON in file: {file_path}") - - if not read_data: - die_screaming(f"ERROR: No data in file: {file_path}") - - # Create indices for the current file - indices = create_indices(read_data, model_id, keys_to_index) - - # Save each index as a JSON file - for key, index in indices.items(): - output_file = os.path.join(output_dir, f"{key}_index.json") - # Append to the existing file if it exists - if os.path.exists(output_file): - with open(output_file, "r") as f: - existing_data = json.load(f) - for k, v in index.items(): - if k in existing_data: - existing_data[k].extend(v) - existing_data[k] = list(set(existing_data[k])) # Ensure unique entries - else: - existing_data[k] = v - index = existing_data - - with open(output_file, "w") as f: - json.dump(index, f, indent=4) - - click.echo(f"Saved {key} index to {output_file}") - +def main(keys_to_index, output_dir): + process_json_files(keys_to_index, output_dir) +# Ensure the script runs only when executed directly if __name__ == "__main__": main() diff --git a/scripts/pyproject.toml b/scripts/pyproject.toml index 596a6e4a..35c21a4f 100644 --- a/scripts/pyproject.toml +++ b/scripts/pyproject.toml @@ -16,4 +16,7 @@ docutils = "0.21.2" [build-system] requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" \ No newline at end of file +build-backend = "poetry.core.masonry.api" + +[scripts] +gen_model_meta = "gen_model_meta:main" \ No newline at end of file