Skip to content

Commit

Permalink
refactor: use catalog and catalog-build to hold the data and build fi…
Browse files Browse the repository at this point in the history
…les (#79) (#80)
  • Loading branch information
NoopDog authored Nov 20, 2024
1 parent 70b0949 commit fc082da
Show file tree
Hide file tree
Showing 27 changed files with 151,721 additions and 91,755 deletions.
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ coverage
# next.js
.next
out
catalog

# production
build
Expand Down
29 changes: 5 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ This app requires Node `20.10.0`. If you do not have Node installed, download it
[n](https://www.npmjs.com/package/n/v/5.0.1) to select that version.

Once you have Node installed with the correct version, clone the repository using:

```shell
git clone git@github.com:human-pangenomics/hprc-data-explorer.git
```
Expand All @@ -14,14 +15,7 @@ Then, install the required packages using:
npm ci
```

Check that the file `files/out/raw-sequencing-data.json` is present.
If it is not, build it by running:

```shell
npm run build-hprc-db
```

You're now ready to go! With that file made, you can run the development server:
Run the development server:

```shell
npm run dev
Expand All @@ -31,19 +25,6 @@ yarn dev

Once the server is running, visit [localhost:3000](http://localhost:3000) to view the Explorer!

### Building the data source files
The raw sequencing, assemblies, and annotations data are generated through a Python script. To create a virtual environment
and install the required libraries, use the following commands from the root project directory:
```shell
python3 -m venv ./venv
source ./venv/bin/activate
pip install -r files/requirements.txt
```
Then run the scripts with:
```shell
python3 files/build-raw-sequencing-files.py
python3 files/build-assemblies-data.py
python3 files/build-annotations-data.py
```
The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate`
again.
## Building the Catalog Files

To rebuild the catalog files see [catalog-build/README.md](catalog-build/README.md)
4 changes: 4 additions & 0 deletions catalog-build/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

# Catalog input files
unprocessed_files/*
!unprocessed_files/.gitkeep
38 changes: 38 additions & 0 deletions catalog-build/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@


## Building the catalog-build/source Files

The raw sequencing, assemblies, and annotations source files are generated through Python scripts.

To create a virtual environment and install the required libraries, use the following commands from the root project directory:

```shell
python3 -m venv ./venv
source ./venv/bin/activate
pip install -r catalog-build/requirements.txt
```

Then run the scripts from this directory with:
```shell
python3 build-sequencing-data.py
python3 build-assemblies.py
python3 build-annotations.py
```


This can also be accomplished by running:
```shell
npm run build-catalog-source
```

The environment can be deactivated by running `deactivate`, and re-activated by running `source ./venv/bin/activate`
again.

## Building the Catalog Files

Once the source files are generated, you can build the /catalog files with:

```shell
npm run build-catalog
```

Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import pandas as pd
import numpy as np
from buildHelp import downloadFile
Expand All @@ -13,8 +14,13 @@
"TRF": "https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/main/annotation_index/Year1_assemblies_v2_genbank_TRF.index",
}

DOWNLOADS_FOLDER_PATH = "./files/unprocessed_files"
OUTPUT_FILE_PATH = "./files/source/annotations.csv"
# Base directory of the script
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Define paths relative to the script's directory
DOWNLOADS_FOLDER_PATH = os.path.join(BASE_DIR, "unprocessed_files")
OUTPUT_FILE_PATH = os.path.join(BASE_DIR, "source/annotations.csv")


CHM13 = "chm13"
HG38 = "hg38"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import pandas as pd
import numpy as np
from buildHelp import downloadFile
Expand All @@ -7,8 +8,11 @@
BIOSAMPLE_TABLE_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/production/hprc-production-biosample-table.tsv"
EXCLUDED_SAMPLE_IDS = ["CHM13_v1.1", "GRCh38_no_alt_analysis_set"]

DOWNLOADS_FOLDER_PATH = "./files/unprocessed_files"
OUTPUT_FILE_PATH = "./files/source/assemblies.csv"

# Determine the base directory of the script
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DOWNLOADS_FOLDER_PATH = os.path.join(BASE_DIR, "unprocessed_files")
OUTPUT_FILE_PATH = os.path.join(BASE_DIR, "source/assemblies.csv")

HAPLOTYPES = ["maternal", "paternal"]
MATERNAL_HAPLOTYPE_ID = 0
Expand Down
26 changes: 15 additions & 11 deletions files/build-catalog.ts → catalog-build/build-catalog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,30 +19,34 @@ import {
SourceRawSequencingData,
} from "./entities";

const SOURCE_PATH_RAW_SEQUENCING_DATA = "files/source/raw-sequencing-data.csv";
const SOURCE_PATH_ASSEMBLIES = "files/source/assemblies.csv";
const SOURCE_PATH_ANNOTATIONS = "files/source/annotations.csv";
const SOURCE_PATH_ALIGNMENTS = "files/source/alignments.csv";
const CATALOG_DIR = "catalog";

const SOURCE_PATH_RAW_SEQUENCING_DATA =
"catalog-build/source/sequencing-data.csv";
const SOURCE_PATH_ASSEMBLIES = "catalog-build/source/assemblies.csv";
const SOURCE_PATH_ANNOTATIONS = "catalog-build/source/annotations.csv";
const SOURCE_PATH_ALIGNMENTS = "catalog-build/source/alignments.csv";

buildCatalog();

/**
 * Builds the raw sequencing data, assemblies, annotations, and alignments
 * data sets, logs the number of entries in each, and hands each data set to
 * `saveJson` together with its output path.
 *
 * NOTE(review): this span comes from a diff view, so both the
 * `files/out/...` and the `${CATALOG_DIR}/...` save calls appear below —
 * confirm which of them are present in the committed file.
 *
 * @returns A promise that resolves once every `saveJson` call has been awaited.
 */
async function buildCatalog(): Promise<void> {
console.log("Building catalog...");
// Each data set is built in turn; every builder is awaited before the next starts.
const rawSequencingData = await buildRawSequencingData();
const assemblies = await buildAssemblies();
const annotations = await buildAnnotations();
const alignments = await buildAlignments();

// The same rawSequencingData array is logged and saved under two labels/paths here.
console.log("Raw sequencing data:", rawSequencingData.length);
await saveJson("files/out/raw-sequencing-data.json", rawSequencingData);
console.log("Sequencing data:", rawSequencingData.length);
await saveJson(`${CATALOG_DIR}/sequencing-data.json`, rawSequencingData);

console.log("Assemblies:", assemblies.length);
await saveJson("files/out/assemblies.json", assemblies);
await saveJson(`${CATALOG_DIR}/assemblies.json`, assemblies);

console.log("Annotations:", annotations.length);
await saveJson("files/out/annotations.json", annotations);
await saveJson(`${CATALOG_DIR}/annotations.json`, annotations);

console.log("Alignments:", alignments.length);
await saveJson("files/out/alignments.json", alignments);
await saveJson(`${CATALOG_DIR}/alignments.json`, alignments);

console.log("Done");
}
Expand All @@ -55,7 +59,7 @@ async function buildRawSequencingData(): Promise<
);
const mappedRows = sourceRows.map(
(row): HPRCDataExplorerRawSequencingData => ({
Gb: parseNumberOrNA(row.Gb).toString(),
Gb: LABEL.NA,
accession: parseStringOrNull(row.Accession),
assembly: parseStringOrNull(row.assembly),
basecaller: row.basecaller,
Expand Down Expand Up @@ -101,7 +105,7 @@ async function buildRawSequencingData(): Promise<
quartile25: parseNumberOrNA(row.quartile_25).toString(),
quartile50: parseNumberOrNA(row.quartile_50).toString(),
quartile75: parseNumberOrNA(row.quartile_75).toString(),
readN50: parseNumberOrNA(row.read_N50).toString(),
readN50: LABEL.NA,
result: row.result,
sampleId: row.sample_ID,
seqKit: row.seq_kit,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import os
import pandas as pd
import numpy as np
from buildHelp import downloadFile

STORAGE_FOLDER_PATH = "./files/unprocessed_files/"
OUTPUT_PATH = "./files/source/raw-sequencing-data.csv"
# Determine the base directory of the script
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Define paths relative to the script's directory
STORAGE_FOLDER_PATH = os.path.join(BASE_DIR, "unprocessed_files/")
OUTPUT_PATH = os.path.join(BASE_DIR, "source/sequencing-data.csv")

HIC_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_HiC.tsv"
ONT_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_ONT.tsv"
PACBIO_HIFI_URL = "https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/main/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.tsv"
Expand Down
File renamed without changes.
File renamed without changes.
4 changes: 4 additions & 0 deletions catalog-build/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
numpy==2.0.1
pandas==2.2.2
requests==2.32.3

File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit fc082da

Please sign in to comment.