refactor: use catalog and catalog-build to hold the data and build files (#79) (#80)
human-pangenomics · Nov 20, 2024 · fc082da · fc082da
1 parent 70b0949
commit fc082da
Show file tree

Hide file tree

Showing 27 changed files with 151,721 additions and 91,755 deletions.
diff --git a/.prettierignore b/.prettierignore
@@ -8,6 +8,7 @@ coverage
 # next.js
 .next
 out
+catalog
 
 # production
 build

diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@ This app requires Node `20.10.0`. If you do not have Node installed, download it
 [n](https://www.npmjs.com/package/n/v/5.0.1) to select that version.
 
 Once you have Node installed with the correct version, clone the repository using:
+
 ```shell
 git clone git@github.com:human-pangenomics/hprc-data-explorer.git
 ```
@@ -14,14 +15,7 @@ Then, install the required packages using:
 npm ci
 ```
 
-Check that the file `files/out/raw-sequencing-data.json` is present.
-If it is not, build it by running:
-
-```shell
-npm run build-hprc-db
-```
-
-You're now ready to go! With that file made, you can run the development server:
+Run the development server:
 
 ```shell
 npm run dev
@@ -31,19 +25,6 @@ yarn dev
 
 Once the server is running, visit [localhost:3000](localhost:3000) to view the Explorer!
 
-### Building the data source files
-The raw seqencing, assemblies, and annotations data are generated through a Python script. To create a virtual environment
-and install the required libraries, use the following commands from the root project directory:
-```shell
-python3 -m venv ./venv
-source ./venv/bin/activate
-pip install -r files/requirements.txt
-```
-Then run the scripts with:
-```shell
-python3 files/build-raw-sequencing-files.py
-python3 files/build-assemblies-data.py
-python3 files/build-annotations-data.py
-```
-The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate`
-again.
+## Building the Catalog Files
+
+To rebuild the catalog files see [catalog-build/README.md](catalog-build/README.md)
diff --git a/catalog-build/.gitignore b/catalog-build/.gitignore
@@ -0,0 +1,4 @@
+
+# Catalog input files
+unprocessed_files/*
+!unprocessed_files/.gitkeep
diff --git a/catalog-build/README.md b/catalog-build/README.md
@@ -0,0 +1,38 @@
+
+
+## Building the catalog-build/source Files
+
+The raw sequencing, assemblies, and annotations source files are generated through Python scripts.
+
+To create a virtual environment and install the required libraries, use the following commands from the root project directory:
+
+```shell
+python3 -m venv ./venv
+source ./venv/bin/activate
+pip install -r catalog-build/requirements.txt
+```
+
+Then run the scripts from this directory with:
+```shell
+python3 build-sequencing-data.py
+python3 build-assemblies.py
+python3 build-annotations.py
+```
+
+
+This can also be accomplished by running:
+```shell
+npm run build-catalog-source
+```
+
+The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate`
+again.
+
+## Building the Catalog Files
+
+Once the source files are generated, you can build the /catalog files with:
+
+```shell
+npm run build-catalog
+```
+
diff --git a/files/build-annotations-data.py → catalog-build/build-annotations.py b/files/build-annotations-data.py → catalog-build/build-annotations.py
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 import numpy as np
 from buildHelp import downloadFile
@@ -13,8 +14,13 @@
     "TRF": "https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/main/annotation_index/Year1_assemblies_v2_genbank_TRF.index",
 }
 
-DOWNLOADS_FOLDER_PATH = "./files/unprocessed_files"
-OUTPUT_FILE_PATH = "./files/source/annotations.csv"
+# Base directory of the script
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Define paths relative to the script's directory
+DOWNLOADS_FOLDER_PATH = os.path.join(BASE_DIR, "unprocessed_files")
+OUTPUT_FILE_PATH = os.path.join(BASE_DIR, "source/annotations.csv")
+
 
 CHM13 = "chm13"
 HG38 = "hg38"

diff --git a/files/build-assemblies-data.py → catalog-build/build-assemblies.py b/files/build-assemblies-data.py → catalog-build/build-assemblies.py
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 import numpy as np
 from buildHelp import downloadFile
@@ -7,8 +8,11 @@
 BIOSAMPLE_TABLE_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/production/hprc-production-biosample-table.tsv"
 EXCLUDED_SAMPLE_IDS = ["CHM13_v1.1", "GRCh38_no_alt_analysis_set"]
 
-DOWNLOADS_FOLDER_PATH = "./files/unprocessed_files"
-OUTPUT_FILE_PATH = "./files/source/assemblies.csv"
+
+# Determine the base directory of the script
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+DOWNLOADS_FOLDER_PATH = os.path.join(BASE_DIR, "unprocessed_files")
+OUTPUT_FILE_PATH = os.path.join(BASE_DIR, "source/assemblies.csv")
 
 HAPLOTYPES = ["maternal", "paternal"]
 MATERNAL_HAPLOTYPE_ID = 0

diff --git a/files/build-catalog.ts → catalog-build/build-catalog.ts b/files/build-catalog.ts → catalog-build/build-catalog.ts
@@ -19,30 +19,34 @@ import {
   SourceRawSequencingData,
 } from "./entities";
 
-const SOURCE_PATH_RAW_SEQUENCING_DATA = "files/source/raw-sequencing-data.csv";
-const SOURCE_PATH_ASSEMBLIES = "files/source/assemblies.csv";
-const SOURCE_PATH_ANNOTATIONS = "files/source/annotations.csv";
-const SOURCE_PATH_ALIGNMENTS = "files/source/alignments.csv";
+const CATALOG_DIR = "catalog";
+
+const SOURCE_PATH_RAW_SEQUENCING_DATA =
+  "catalog-build/source/sequencing-data.csv";
+const SOURCE_PATH_ASSEMBLIES = "catalog-build/source/assemblies.csv";
+const SOURCE_PATH_ANNOTATIONS = "catalog-build/source/annotations.csv";
+const SOURCE_PATH_ALIGNMENTS = "catalog-build/source/alignments.csv";
 
 buildCatalog();
 
 async function buildCatalog(): Promise<void> {
+  console.log("Building catalog...");
   const rawSequencingData = await buildRawSequencingData();
   const assemblies = await buildAssemblies();
   const annotations = await buildAnnotations();
   const alignments = await buildAlignments();
 
-  console.log("Raw sequencing data:", rawSequencingData.length);
-  await saveJson("files/out/raw-sequencing-data.json", rawSequencingData);
+  console.log("Sequencing data:", rawSequencingData.length);
+  await saveJson(`${CATALOG_DIR}/sequencing-data.json`, rawSequencingData);
 
   console.log("Assemblies:", assemblies.length);
-  await saveJson("files/out/assemblies.json", assemblies);
+  await saveJson(`${CATALOG_DIR}/assemblies.json`, assemblies);
 
   console.log("Annotations:", annotations.length);
-  await saveJson("files/out/annotations.json", annotations);
+  await saveJson(`${CATALOG_DIR}/annotations.json`, annotations);
 
   console.log("Alignments:", alignments.length);
-  await saveJson("files/out/alignments.json", alignments);
+  await saveJson(`${CATALOG_DIR}/alignments.json`, alignments);
 
   console.log("Done");
 }
@@ -55,7 +59,7 @@ async function buildRawSequencingData(): Promise<
   );
   const mappedRows = sourceRows.map(
     (row): HPRCDataExplorerRawSequencingData => ({
-      Gb: parseNumberOrNA(row.Gb).toString(),
+      Gb: LABEL.NA,
       accession: parseStringOrNull(row.Accession),
       assembly: parseStringOrNull(row.assembly),
       basecaller: row.basecaller,
@@ -101,7 +105,7 @@ async function buildRawSequencingData(): Promise<
       quartile25: parseNumberOrNA(row.quartile_25).toString(),
       quartile50: parseNumberOrNA(row.quartile_50).toString(),
       quartile75: parseNumberOrNA(row.quartile_75).toString(),
-      readN50: parseNumberOrNA(row.read_N50).toString(),
+      readN50: LABEL.NA,
       result: row.result,
       sampleId: row.sample_ID,
       seqKit: row.seq_kit,

diff --git a/files/build-raw-sequencing-files.py → catalog-build/build-sequencing-data.py b/files/build-raw-sequencing-files.py → catalog-build/build-sequencing-data.py
@@ -1,9 +1,15 @@
+import os
 import pandas as pd
 import numpy as np
 from buildHelp import downloadFile
 
-STORAGE_FOLDER_PATH = "./files/unprocessed_files/"
-OUTPUT_PATH = "./files/source/raw-sequencing-data.csv"
+# Determine the base directory of the script
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Define paths relative to the script's directory
+STORAGE_FOLDER_PATH = os.path.join(BASE_DIR, "unprocessed_files/")
+OUTPUT_PATH = os.path.join(BASE_DIR, "source/sequencing-data.csv")
+
 HIC_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_HiC.tsv"
 ONT_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_ONT.tsv"
 PACBIO_HIFI_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.tsv"

diff --git a/files/buildHelp.py → catalog-build/buildHelp.py b/files/buildHelp.py → catalog-build/buildHelp.py
diff --git a/files/entities.ts → catalog-build/entities.ts b/files/entities.ts → catalog-build/entities.ts
diff --git a/catalog-build/requirements.txt b/catalog-build/requirements.txt
@@ -0,0 +1,4 @@
+numpy==2.0.1
+pandas==2.2.2
+requests==2.32.3
+
diff --git a/files/source/alignments.csv → catalog-build/source/alignments.csv b/files/source/alignments.csv → catalog-build/source/alignments.csv
diff --git a/files/source/annotations.csv → catalog-build/source/annotations.csv b/files/source/annotations.csv → catalog-build/source/annotations.csv
diff --git a/files/source/assemblies.csv → catalog-build/source/assemblies.csv b/files/source/assemblies.csv → catalog-build/source/assemblies.csv
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,6 +8,7 @@ coverage @@
     # next.js
     .next
     out
+    catalog
     # production
     build
@@ Expand Down @@