Skip to content

Commit

Permalink
Merge pull request #93 from LiuzLab/hpo_update
Browse files Browse the repository at this point in the history
HPO database update
  • Loading branch information
jylee-bcm authored Oct 22, 2024
2 parents e193d8a + dc0847a commit 0e3f346
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,7 @@ out/
trace-*txt
report-*html
params.yaml

# Ignore R history and session files
.Rhistory
.RData
83 changes: 83 additions & 0 deletions utils/hpo_update/hpo_update.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#NOTE: Replace the download links with newest ones and then parse the files

library(tidyverse)
library(httr)

#download new hpo-related data files
hp_obo_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/hp.obo"
hpo_omim_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/phenotype.hpoa"
genemap2_file <- "https://data.omim.org/downloads/E8eFWaP3SOu67gXVTVKiGA/genemap2.txt"

GET(hp_obo_file, write_disk("hp.obo", overwrite = TRUE))
GET(hpo_omim_file, write_disk("phenotype.hpoa", overwrite = TRUE))
GET(genemap2_file, write_disk("genemap2.txt", overwrite = TRUE))


#read the new data file
df_new <- read_delim("phenotype.hpoa", delim = '\t', skip = 4)

old_colnames <- c("OMIM_ID",
"HPO_ID",
"DiseaseName",
"Onset",
"Frequency",
"Sex",
"Aspect")

#make the format of new data file same as the old one
df_new_omim <- df_new %>%
separate(
database_id,
into = c("database", "omim"),
sep = ":",
remove = FALSE
) %>%
filter(database == "OMIM")

df_new_omim_clean <- df_new_omim %>%
select(omim, hpo_id, disease_name, onset, frequency, sex, aspect)

colnames(df_new_omim_clean) <- old_colnames
df_new_omim_clean$OMIM_ID <- as.numeric(df_new_omim_clean$OMIM_ID)

#save the new HPO_OMIM.tsv
write_tsv(df_new_omim_clean, 'HPO_OMIM.tsv')

#parse the genemap2 table
system("cat genemap2.txt | ./parseGeneMap2_output.py > genemap2_parsed.txt",
intern = FALSE)

#organize the table
old_colnames <- c(
"MIM_Number",
"Pheno_ID",
"Phenotypes",
"Approved_Gene_Symbol",
"Gene_Symbols",
"Ensembl_Gene_ID",
"Entrez_Gene_ID",
"inheritance",
"Pheno_mapKey"
)

gm2_new_parsed <- read_delim('genemap2_parsed.txt')
gm2_new_parsed_selected <- gm2_new_parsed |>
select(
MIM_Number,
Phenotype_MIM_Number,
Phenotype_Name,
Approved_Gene_Symbol,
Gene_Symbols,
Ensembl_Gene_ID,
Entrez_Gene_ID,
Inheritance,
Mapping_Key
)
colnames(gm2_new_parsed_selected) <- old_colnames

#save genemap2 data as rds
#the file name here is not changed, even though the newest data is from 2024
saveRDS(gm2_new_parsed_selected, file = "genemap2_v2022.rds")

#remove unnecessary file
file.remove("phenotype.hpoa", "genemap2.txt", "genemap2_parsed.txt")
138 changes: 138 additions & 0 deletions utils/hpo_update/parseGeneMap2_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#
# This script parses the genemap2.txt file from OMIM and outputs
# all fields along with processed phenotype information.
# Usage: cat genemap2.txt | ./parseGeneMap2.py > parsed.txt
#

#originially from: https://github.com/OMIM-org/genemap2-parser/tree/master

# Imports
import sys
import re

# Define the expected number of fields in genemap2.txt
EXPECTED_NUM_FIELDS = 14

# Print header for the output file (TSV format)
header = [
"Chromosome",
"Genomic_Position_Start",
"Genomic_Position_End",
"Cyto_Location",
"Computed_Cyto_Location",
"MIM_Number",
"Gene_Symbols",
"Gene_Name",
"Approved_Gene_Symbol",
"Entrez_Gene_ID",
"Ensembl_Gene_ID",
"Comments",
"Mouse",
"Phenotype_Name",
"Phenotype_MIM_Number",
"Mapping_Key",
"Inheritance"
]
print("\t".join(header))

# Read from stdin
for line_number, line in enumerate(sys.stdin, 1):
# Skip comments
if line.startswith('#'):
continue

# Strip trailing new line and carriage return
line = line.strip('\n').strip('\r')

# Split the line into fields based on tab
valueList = line.split('\t')

# Validate the number of fields
if len(valueList) < EXPECTED_NUM_FIELDS:
sys.stderr.write(f"Warning: Line {line_number} has fewer fields ({len(valueList)}) than expected ({EXPECTED_NUM_FIELDS}). Skipping.\n")
continue # Skip malformed lines

# Assign fields to variables
chromosome = valueList[0]
genomicPositionStart = valueList[1]
genomicPositionEnd = valueList[2]
cytoLocation = valueList[3]
computedCytoLocation = valueList[4]
mimNumber = valueList[5]
geneSymbols = valueList[6]
geneName = valueList[7]
approvedGeneSymbol = valueList[8]
entrezGeneID = valueList[9]
ensemblGeneID = valueList[10]
comments = valueList[11]
phenotypeString = valueList[12]
mouse = valueList[13]

# Skip empty phenotypes
if not phenotypeString:
continue

# Parse the phenotypes
phenotypes = phenotypeString.split(';')
for phenotype in phenotypes:
# Clean the phenotype string
phenotype = phenotype.strip()

# Initialize phenotype-related variables
phenotype_name = ""
phenotype_mim_number = ""
phenotype_mapping_key = ""
inheritances = ""

# Attempt to match the long phenotype pattern
matcher = re.match(r'^(.*),\s(\d{6})\s\((\d)\)(?:,\s*(.*))?$', phenotype)
if matcher:
phenotype_name = matcher.group(1).strip()
phenotype_mim_number = matcher.group(2).strip()
phenotype_mapping_key = matcher.group(3).strip()
inheritances_raw = matcher.group(4)

if inheritances_raw:
inheritances = ','.join([inh.strip() for inh in inheritances_raw.split(',')])

else:
# Attempt to match the short phenotype pattern
matcher = re.match(r'^(.*)\((\d)\)(?:,\s*(.*))?$', phenotype)
if matcher:
phenotype_name = matcher.group(1).strip()
# phenotype_mim_number remains empty for short phenotypes
phenotype_mapping_key = matcher.group(2).strip()
inheritances_raw = matcher.group(3)

if inheritances_raw:
inheritances = ','.join([inh.strip() for inh in inheritances_raw.split(',')])

# Prepare the output row
output_fields = [
chromosome,
genomicPositionStart,
genomicPositionEnd,
cytoLocation,
computedCytoLocation,
mimNumber,
geneSymbols,
geneName,
approvedGeneSymbol,
entrezGeneID,
ensemblGeneID,
comments,
mouse,
phenotype_name,
phenotype_mim_number,
phenotype_mapping_key,
inheritances
]

# Replace any None or missing inheritances with empty string
output_fields = [field if field else "" for field in output_fields]

# Join fields with tabs and print
print("\t".join(output_fields))

0 comments on commit 0e3f346

Please sign in to comment.