-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #93 from LiuzLab/hpo_update
HPO database update
- Loading branch information
Showing
3 changed files
with
225 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,3 +17,7 @@ out/ | |
trace-*txt | ||
report-*html | ||
params.yaml | ||
|
||
# Ignore R history and session files | ||
.Rhistory | ||
.RData |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#NOTE: Replace the download links with newest ones and then parse the files | ||
|
||
library(tidyverse) | ||
library(httr) | ||
|
||
#download new hpo-related data files | ||
hp_obo_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/hp.obo" | ||
hpo_omim_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/phenotype.hpoa" | ||
genemap2_file <- "https://data.omim.org/downloads/E8eFWaP3SOu67gXVTVKiGA/genemap2.txt" | ||
|
||
GET(hp_obo_file, write_disk("hp.obo", overwrite = TRUE)) | ||
GET(hpo_omim_file, write_disk("phenotype.hpoa", overwrite = TRUE)) | ||
GET(genemap2_file, write_disk("genemap2.txt", overwrite = TRUE)) | ||
|
||
|
||
#read the new data file | ||
df_new <- read_delim("phenotype.hpoa", delim = '\t', skip = 4) | ||
|
||
old_colnames <- c("OMIM_ID", | ||
"HPO_ID", | ||
"DiseaseName", | ||
"Onset", | ||
"Frequency", | ||
"Sex", | ||
"Aspect") | ||
|
||
#make the format of new data file same as the old one | ||
df_new_omim <- df_new %>% | ||
separate( | ||
database_id, | ||
into = c("database", "omim"), | ||
sep = ":", | ||
remove = FALSE | ||
) %>% | ||
filter(database == "OMIM") | ||
|
||
df_new_omim_clean <- df_new_omim %>% | ||
select(omim, hpo_id, disease_name, onset, frequency, sex, aspect) | ||
|
||
colnames(df_new_omim_clean) <- old_colnames | ||
df_new_omim_clean$OMIM_ID <- as.numeric(df_new_omim_clean$OMIM_ID) | ||
|
||
#save the new HPO_OMIM.tsv | ||
write_tsv(df_new_omim_clean, 'HPO_OMIM.tsv') | ||
|
||
#parse the genemap2 table | ||
system("cat genemap2.txt | ./parseGeneMap2_output.py > genemap2_parsed.txt", | ||
intern = FALSE) | ||
|
||
#organize the table | ||
old_colnames <- c( | ||
"MIM_Number", | ||
"Pheno_ID", | ||
"Phenotypes", | ||
"Approved_Gene_Symbol", | ||
"Gene_Symbols", | ||
"Ensembl_Gene_ID", | ||
"Entrez_Gene_ID", | ||
"inheritance", | ||
"Pheno_mapKey" | ||
) | ||
|
||
gm2_new_parsed <- read_delim('genemap2_parsed.txt') | ||
gm2_new_parsed_selected <- gm2_new_parsed |> | ||
select( | ||
MIM_Number, | ||
Phenotype_MIM_Number, | ||
Phenotype_Name, | ||
Approved_Gene_Symbol, | ||
Gene_Symbols, | ||
Ensembl_Gene_ID, | ||
Entrez_Gene_ID, | ||
Inheritance, | ||
Mapping_Key | ||
) | ||
colnames(gm2_new_parsed_selected) <- old_colnames | ||
|
||
#save genemap2 data as rds | ||
#the file name here is not changed, even though the newest data is from 2024 | ||
saveRDS(gm2_new_parsed_selected, file = "genemap2_v2022.rds") | ||
|
||
#remove unnecessary file | ||
file.remove("phenotype.hpoa", "genemap2.txt", "genemap2_parsed.txt") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
# | ||
# This script parses the genemap2.txt file from OMIM and outputs | ||
# all fields along with processed phenotype information. | ||
# Usage: cat genemap2.txt | ./parseGeneMap2.py > parsed.txt | ||
# | ||
|
||
#originially from: https://github.com/OMIM-org/genemap2-parser/tree/master | ||
|
||
# Imports | ||
import sys | ||
import re | ||
|
||
# Define the expected number of fields in genemap2.txt | ||
EXPECTED_NUM_FIELDS = 14 | ||
|
||
# Print header for the output file (TSV format) | ||
header = [ | ||
"Chromosome", | ||
"Genomic_Position_Start", | ||
"Genomic_Position_End", | ||
"Cyto_Location", | ||
"Computed_Cyto_Location", | ||
"MIM_Number", | ||
"Gene_Symbols", | ||
"Gene_Name", | ||
"Approved_Gene_Symbol", | ||
"Entrez_Gene_ID", | ||
"Ensembl_Gene_ID", | ||
"Comments", | ||
"Mouse", | ||
"Phenotype_Name", | ||
"Phenotype_MIM_Number", | ||
"Mapping_Key", | ||
"Inheritance" | ||
] | ||
print("\t".join(header)) | ||
|
||
# Read from stdin | ||
for line_number, line in enumerate(sys.stdin, 1): | ||
# Skip comments | ||
if line.startswith('#'): | ||
continue | ||
|
||
# Strip trailing new line and carriage return | ||
line = line.strip('\n').strip('\r') | ||
|
||
# Split the line into fields based on tab | ||
valueList = line.split('\t') | ||
|
||
# Validate the number of fields | ||
if len(valueList) < EXPECTED_NUM_FIELDS: | ||
sys.stderr.write(f"Warning: Line {line_number} has fewer fields ({len(valueList)}) than expected ({EXPECTED_NUM_FIELDS}). Skipping.\n") | ||
continue # Skip malformed lines | ||
|
||
# Assign fields to variables | ||
chromosome = valueList[0] | ||
genomicPositionStart = valueList[1] | ||
genomicPositionEnd = valueList[2] | ||
cytoLocation = valueList[3] | ||
computedCytoLocation = valueList[4] | ||
mimNumber = valueList[5] | ||
geneSymbols = valueList[6] | ||
geneName = valueList[7] | ||
approvedGeneSymbol = valueList[8] | ||
entrezGeneID = valueList[9] | ||
ensemblGeneID = valueList[10] | ||
comments = valueList[11] | ||
phenotypeString = valueList[12] | ||
mouse = valueList[13] | ||
|
||
# Skip empty phenotypes | ||
if not phenotypeString: | ||
continue | ||
|
||
# Parse the phenotypes | ||
phenotypes = phenotypeString.split(';') | ||
for phenotype in phenotypes: | ||
# Clean the phenotype string | ||
phenotype = phenotype.strip() | ||
|
||
# Initialize phenotype-related variables | ||
phenotype_name = "" | ||
phenotype_mim_number = "" | ||
phenotype_mapping_key = "" | ||
inheritances = "" | ||
|
||
# Attempt to match the long phenotype pattern | ||
matcher = re.match(r'^(.*),\s(\d{6})\s\((\d)\)(?:,\s*(.*))?$', phenotype) | ||
if matcher: | ||
phenotype_name = matcher.group(1).strip() | ||
phenotype_mim_number = matcher.group(2).strip() | ||
phenotype_mapping_key = matcher.group(3).strip() | ||
inheritances_raw = matcher.group(4) | ||
|
||
if inheritances_raw: | ||
inheritances = ','.join([inh.strip() for inh in inheritances_raw.split(',')]) | ||
|
||
else: | ||
# Attempt to match the short phenotype pattern | ||
matcher = re.match(r'^(.*)\((\d)\)(?:,\s*(.*))?$', phenotype) | ||
if matcher: | ||
phenotype_name = matcher.group(1).strip() | ||
# phenotype_mim_number remains empty for short phenotypes | ||
phenotype_mapping_key = matcher.group(2).strip() | ||
inheritances_raw = matcher.group(3) | ||
|
||
if inheritances_raw: | ||
inheritances = ','.join([inh.strip() for inh in inheritances_raw.split(',')]) | ||
|
||
# Prepare the output row | ||
output_fields = [ | ||
chromosome, | ||
genomicPositionStart, | ||
genomicPositionEnd, | ||
cytoLocation, | ||
computedCytoLocation, | ||
mimNumber, | ||
geneSymbols, | ||
geneName, | ||
approvedGeneSymbol, | ||
entrezGeneID, | ||
ensemblGeneID, | ||
comments, | ||
mouse, | ||
phenotype_name, | ||
phenotype_mim_number, | ||
phenotype_mapping_key, | ||
inheritances | ||
] | ||
|
||
# Replace any None or missing inheritances with empty string | ||
output_fields = [field if field else "" for field in output_fields] | ||
|
||
# Join fields with tabs and print | ||
print("\t".join(output_fields)) |