Merge pull request #93 from LiuzLab/hpo_update

HPO database update
LiuzLab · Oct 22, 2024 · 0e3f346 · 0e3f346
2 parents e193d8a + dc0847a
commit 0e3f346
Show file tree

Hide file tree

Showing 3 changed files with 225 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -17,3 +17,7 @@ out/
 trace-*txt
 report-*html
 params.yaml
+
+# Ignore R history and session files
+.Rhistory
+.RData
diff --git a/utils/hpo_update/hpo_update.r b/utils/hpo_update/hpo_update.r
@@ -0,0 +1,83 @@
+#NOTE: Replace the download links with newest ones and then parse the files
+
+library(tidyverse)
+library(httr)
+
+#download new hpo-related data files
+hp_obo_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/hp.obo"
+hpo_omim_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/phenotype.hpoa"
+genemap2_file <- "https://data.omim.org/downloads/E8eFWaP3SOu67gXVTVKiGA/genemap2.txt"
+
+GET(hp_obo_file, write_disk("hp.obo", overwrite = TRUE))
+GET(hpo_omim_file, write_disk("phenotype.hpoa", overwrite = TRUE))
+GET(genemap2_file, write_disk("genemap2.txt", overwrite = TRUE))
+
+
+#read the new data file
+df_new <- read_delim("phenotype.hpoa", delim = '\t', skip = 4)
+
+old_colnames <- c("OMIM_ID",
+                  "HPO_ID",
+                  "DiseaseName",
+                  "Onset",
+                  "Frequency",
+                  "Sex",
+                  "Aspect")
+
+#make the format of new data file same as the old one
+df_new_omim <- df_new %>%
+  separate(
+    database_id,
+    into = c("database", "omim"),
+    sep = ":",
+    remove = FALSE
+  ) %>%
+  filter(database == "OMIM")
+
+df_new_omim_clean <- df_new_omim %>%
+  select(omim, hpo_id, disease_name, onset, frequency, sex, aspect)
+
+colnames(df_new_omim_clean) <- old_colnames
+df_new_omim_clean$OMIM_ID <- as.numeric(df_new_omim_clean$OMIM_ID)
+
+#save the new HPO_OMIM.tsv
+write_tsv(df_new_omim_clean, 'HPO_OMIM.tsv')
+
+#parse the genemap2 table
+system("cat genemap2.txt | ./parseGeneMap2_output.py > genemap2_parsed.txt",
+       intern = FALSE)
+
+#organize the table
+old_colnames <- c(
+  "MIM_Number",
+  "Pheno_ID",
+  "Phenotypes",
+  "Approved_Gene_Symbol",
+  "Gene_Symbols",
+  "Ensembl_Gene_ID",
+  "Entrez_Gene_ID",
+  "inheritance",
+  "Pheno_mapKey"
+)
+
+gm2_new_parsed <- read_delim('genemap2_parsed.txt')
+gm2_new_parsed_selected <- gm2_new_parsed |>
+  select(
+    MIM_Number,
+    Phenotype_MIM_Number,
+    Phenotype_Name,
+    Approved_Gene_Symbol,
+    Gene_Symbols,
+    Ensembl_Gene_ID,
+    Entrez_Gene_ID,
+    Inheritance,
+    Mapping_Key
+  )
+colnames(gm2_new_parsed_selected) <- old_colnames
+
+#save genemap2 data as rds
+#the file name here is not changed, even though the newest data is from 2024
+saveRDS(gm2_new_parsed_selected, file = "genemap2_v2022.rds")
+
+#remove unnecessary file
+file.remove("phenotype.hpoa", "genemap2.txt", "genemap2_parsed.txt")
diff --git a/utils/hpo_update/parseGeneMap2_output.py b/utils/hpo_update/parseGeneMap2_output.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+#
+# This script parses the genemap2.txt file from OMIM and outputs
+# all fields along with processed phenotype information.
+# Usage: cat genemap2.txt | ./parseGeneMap2.py > parsed.txt
+#
+
+#originially from: https://github.com/OMIM-org/genemap2-parser/tree/master
+
+# Imports
+import sys
+import re
+
+# Define the expected number of fields in genemap2.txt
+EXPECTED_NUM_FIELDS = 14
+
+# Print header for the output file (TSV format)
+header = [
+    "Chromosome",
+    "Genomic_Position_Start",
+    "Genomic_Position_End",
+    "Cyto_Location",
+    "Computed_Cyto_Location",
+    "MIM_Number",
+    "Gene_Symbols",
+    "Gene_Name",
+    "Approved_Gene_Symbol",
+    "Entrez_Gene_ID",
+    "Ensembl_Gene_ID",
+    "Comments",
+    "Mouse",
+    "Phenotype_Name",
+    "Phenotype_MIM_Number",
+    "Mapping_Key",
+    "Inheritance"
+]
+print("\t".join(header))
+
+# Read from stdin
+for line_number, line in enumerate(sys.stdin, 1):
+    # Skip comments
+    if line.startswith('#'):
+        continue
+
+    # Strip trailing new line and carriage return
+    line = line.strip('\n').strip('\r')
+
+    # Split the line into fields based on tab
+    valueList = line.split('\t')
+
+    # Validate the number of fields
+    if len(valueList) < EXPECTED_NUM_FIELDS:
+        sys.stderr.write(f"Warning: Line {line_number} has fewer fields ({len(valueList)}) than expected ({EXPECTED_NUM_FIELDS}). Skipping.\n")
+        continue  # Skip malformed lines
+
+    # Assign fields to variables
+    chromosome = valueList[0]
+    genomicPositionStart = valueList[1]
+    genomicPositionEnd = valueList[2]
+    cytoLocation = valueList[3]
+    computedCytoLocation = valueList[4]
+    mimNumber = valueList[5]
+    geneSymbols = valueList[6]
+    geneName = valueList[7]
+    approvedGeneSymbol = valueList[8]
+    entrezGeneID = valueList[9]
+    ensemblGeneID = valueList[10]
+    comments = valueList[11]
+    phenotypeString = valueList[12]
+    mouse = valueList[13]
+
+    # Skip empty phenotypes
+    if not phenotypeString:
+        continue
+
+    # Parse the phenotypes
+    phenotypes = phenotypeString.split(';')
+    for phenotype in phenotypes:
+        # Clean the phenotype string
+        phenotype = phenotype.strip()
+
+        # Initialize phenotype-related variables
+        phenotype_name = ""
+        phenotype_mim_number = ""
+        phenotype_mapping_key = ""
+        inheritances = ""
+
+        # Attempt to match the long phenotype pattern
+        matcher = re.match(r'^(.*),\s(\d{6})\s\((\d)\)(?:,\s*(.*))?$', phenotype)
+        if matcher:
+            phenotype_name = matcher.group(1).strip()
+            phenotype_mim_number = matcher.group(2).strip()
+            phenotype_mapping_key = matcher.group(3).strip()
+            inheritances_raw = matcher.group(4)
+
+            if inheritances_raw:
+                inheritances = ','.join([inh.strip() for inh in inheritances_raw.split(',')])
+
+        else:
+            # Attempt to match the short phenotype pattern
+            matcher = re.match(r'^(.*)\((\d)\)(?:,\s*(.*))?$', phenotype)
+            if matcher:
+                phenotype_name = matcher.group(1).strip()
+                # phenotype_mim_number remains empty for short phenotypes
+                phenotype_mapping_key = matcher.group(2).strip()
+                inheritances_raw = matcher.group(3)
+
+                if inheritances_raw:
+                    inheritances = ','.join([inh.strip() for inh in inheritances_raw.split(',')])
+
+        # Prepare the output row
+        output_fields = [
+            chromosome,
+            genomicPositionStart,
+            genomicPositionEnd,
+            cytoLocation,
+            computedCytoLocation,
+            mimNumber,
+            geneSymbols,
+            geneName,
+            approvedGeneSymbol,
+            entrezGeneID,
+            ensemblGeneID,
+            comments,
+            mouse,
+            phenotype_name,
+            phenotype_mim_number,
+            phenotype_mapping_key,
+            inheritances
+        ]
+
+        # Replace any None or missing inheritances with empty string
+        output_fields = [field if field else "" for field in output_fields]
+
+        # Join fields with tabs and print
+        print("\t".join(output_fields))