From dc0847a4d86f1f31a1777e6122d03ed50ece0a05 Mon Sep 17 00:00:00 2001 From: Zhijian Yu Date: Wed, 25 Sep 2024 17:22:46 -0500 Subject: [PATCH] add ignoring R related history files in .gitignore --- .gitignore | 4 + utils/hpo_update/.Rhistory | 257 ------------------------------------- 2 files changed, 4 insertions(+), 257 deletions(-) delete mode 100644 utils/hpo_update/.Rhistory diff --git a/.gitignore b/.gitignore index 055eff2..9861bc9 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,7 @@ out/ trace-*txt report-*html params.yaml + +# Ignore R history and session files +.Rhistory +.RData diff --git a/utils/hpo_update/.Rhistory b/utils/hpo_update/.Rhistory deleted file mode 100644 index 9d99e45..0000000 --- a/utils/hpo_update/.Rhistory +++ /dev/null @@ -1,257 +0,0 @@ -library(tidyverse) -library(httr) -#download new hpo-related data files -hp_obo_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/hp.obo" -hpo_omim_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/phenotype.hpoa" -genemap2_file <- "https://data.omim.org/downloads/E8eFWaP3SOu67gXVTVKiGA/genemap2.txt" -getwd() -GET(hp_obo_file, write_disk("hp.obo", overwrite = TRUE)) -GET(hpo_omim_file, write_disk("phenotype.hpoa", overwrite = TRUE)) -GET(genemap2_file, write_disk("genemap2.txt", overwrite = TRUE)) -#use old dataframe for reference; read the new data file -old_colnames <- c("OMIM_ID", -"HPO_ID", -"DiseaseName", -"Onset", -"Frequency", -"Sex", -"Aspect") -df_new <- read_delim("phenotype.hpoa", delim = '\t', skip = 4) -#make the format of new data file same as the old one -df_new_omim <- df_new %>% -separate( -database_id, -into = c("database", "omim"), -sep = ":", -remove = FALSE -) %>% -filter(database == "OMIM") -df_new_omim_clean <- df_new_omim %>% -select(omim, hpo_id, disease_name, onset, frequency, sex, aspect) -colnames(df_new_omim_clean) <- old_colnames -df_new_omim_clean$OMIM_ID <- as.numeric(df_new_omim_clean$OMIM_ID) -#remove unnecessary file -file.remove("phenotype.hpoa") -#save the new HPO_OMIM.tsv -write_tsv(df_new_omim_clean, 'HPO_OMIM.tsv') -load('genemap2_v2022.rds') -readRDS('genemap2_v2022.rds') -gm2<-readRDS('genemap2_v2022.rds') -View(gm2) -gm2_new<-read_delim('genemap2.txt') -View(gm2_new) -gm2_new<-read_delim('genemap2.txt', skip = 3) -View(gm2) -View(gm2_new) -gm2_new_selected<-gm2_new |> -select(`MIM Number`) -select(`MIM Number`, `Phenotypes`, `Approved Gene Symbol`, `Gene/Locus And Other Related Symbols`,`Ensembl Gene ID`, `Entrez Gene ID`) -gm2_new_selected<-gm2_new |> -select(`MIM Number`, `Phenotypes`, `Approved Gene Symbol`, `Gene/Locus And Other Related Symbols`,`Ensembl Gene ID`, `Entrez Gene ID`) -#save the genemap2 as rds -gm2_new<-read_delim('genemap2.txt', skip=3) -gm2_new_selected<-gm2_new |> -+ select(`MIM Number`, `Phenotypes`, `Approved Gene Symbol`, `Gene/Locus And Other Related Symbols`,`Ensembl Gene ID`, `Entrez Gene ID`) -gm2_new_selected<-gm2_new |> -select(`MIM Number`, `Phenotypes`, `Approved Gene Symbol`, `Gene/Locus And Other Related Symbols`,`Ensembl Gene ID`, `Entrez Gene ID`) -View(gm2_new_selected) -gm2_new_selected_pheno<-gm2_new_selected |> -filter(!is.na(Phenotypes)) -View(gm2_new_selected_pheno) -View(gm2) -str_split("{Otitis media, susceptibility to}, 166760 (3)", ",") -str_split("{Otitis media, susceptibility to}, 166760 (3)", ";") -str_split([Blood group, P1Pk system, P(2) phenotype], 111400 (3); NOR polyagglutination syndrome, 111400 (3); [Blood group, P1Pk system, p phenotype], 111400 (3), ";") -str_split('[Blood group, P1Pk system, P(2) phenotype], 111400 (3); NOR polyagglutination syndrome, 111400 (3); [Blood group, P1Pk system, p phenotype], 111400 (3)', ";") -View(gm2_new_selected_pheno) -gm2_new_selected_pheno<-gm2_new_selected |> -filter(!is.na(Phenotypes)) |> -separate_rows(Phenotypes, sep=";") -View(gm2_new_selected_pheno) -gm2_new_parsed<-read_delim('genemap2_parsed.txt') -View(gm2_new_parsed) -gm2_new_parsed_selected<-gm2_new_parsed |> -select(MIM_Number,Phenotype_MIM_Number,Phenotype_Name,Approved_Gene_Symbol,Gene_Symbols,Ensembl_Gene_ID,Entrez_Gene_ID,Inheritance,Mapping_Key) -View(gm2_new_parsed_selected) -View(gm2_new) -#parse the genemap2 table and save as rds -colnames(gm2) -cat(colnames(gm2)) -old_colnames<-c("MIM_Number","Pheno_ID","Phenotypes","Approved_Gene_Symbol","Gene_Symbols","Ensembl_Gene_ID","Entrez_Gene_ID","inheritance","Pheno_mapKey") -gm2_new_parsed<-read_delim('genemap2_parsed.txt') -gm2_new_parsed_selected<-gm2_new_parsed |> -select(MIM_Number,Phenotype_MIM_Number,Phenotype_Name,Approved_Gene_Symbol,Gene_Symbols,Ensembl_Gene_ID,Entrez_Gene_ID,Inheritance,Mapping_Key) -colnames(gm2_new_parsed_selected)<-old_colnames -View(gm2_new_parsed_selected) -View(gm2) -file_list <- system("ls -1", intern = TRUE) -print(file_list) -system("cat genemap2.txt | ./parseGeneMap2_output.py > genemap2_parsed.txt", intern = FALSE) -View(gm2) -library(tidyverse) -library(httr) -#download new hpo-related data files -hp_obo_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/hp.obo" -hpo_omim_file <- "https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-08-13/phenotype.hpoa" -genemap2_file <- "https://data.omim.org/downloads/E8eFWaP3SOu67gXVTVKiGA/genemap2.txt" -GET(hp_obo_file, write_disk("hp.obo", overwrite = TRUE)) -GET(hpo_omim_file, write_disk("phenotype.hpoa", overwrite = TRUE)) -GET(genemap2_file, write_disk("genemap2.txt", overwrite = TRUE)) -#use old dataframe for reference; read the new data file -old_colnames <- c("OMIM_ID", -"HPO_ID", -"DiseaseName", -"Onset", -"Frequency", -"Sex", -"Aspect") -df_new <- read_delim("phenotype.hpoa", delim = '\t', skip = 4) -#make the format of new data file same as the old one -df_new_omim <- df_new %>% -separate( -database_id, -into = c("database", "omim"), -sep = ":", -remove = FALSE -) %>% -filter(database == "OMIM") -df_new_omim_clean <- df_new_omim %>% -select(omim, hpo_id, disease_name, onset, frequency, sex, aspect) -colnames(df_new_omim_clean) <- old_colnames -df_new_omim_clean$OMIM_ID <- as.numeric(df_new_omim_clean$OMIM_ID) -#save the new HPO_OMIM.tsv -write_tsv(df_new_omim_clean, 'HPO_OMIM.tsv') -#parse the genemap2 table -system("cat genemap2.txt | ./parseGeneMap2_output.py > genemap2_parsed.txt", intern = FALSE) -#organize the table -old_colnames<-c("MIM_Number","Pheno_ID","Phenotypes","Approved_Gene_Symbol","Gene_Symbols","Ensembl_Gene_ID","Entrez_Gene_ID","inheritance","Pheno_mapKey") -gm2_new_parsed<-read_delim('genemap2_parsed.txt') -gm2_new_parsed_selected<-gm2_new_parsed |> -select(MIM_Number,Phenotype_MIM_Number,Phenotype_Name,Approved_Gene_Symbol,Gene_Symbols,Ensembl_Gene_ID,Entrez_Gene_ID,Inheritance,Mapping_Key) -colnames(gm2_new_parsed_selected)<-old_colnames -#save genemap2 data as rds -#the file name here is not changed, even though the newest data is from 2024 -saveRDS(gm2_new_parsed_selected, file="genemap2_v2022.rds") -#remove unnecessary file -file.remove("phenotype.hpoa","genemap2.txt","genemap2_parsed.txt") -library(ontologyIndex) -library(ontologySimilarity) -PATIENT_HPO <- 'vpa_hpo_040924.txt' -OMIM_OBO <- 'hp.obo' -OMIM_GENEMAP2 <- 'genemap2_v2022.rds' -OMIM_PHENO <- 'HPO_OMIM.tsv' -OUTFILE_CZ_NAME <- "test_cz" -OUTFILE_DX_NAME <- "test_dx" -library(dplyr) -# Load HPO_obo -HPO_obo <- get_OBO(OMIM_OBO, propagate_relationships = c("is_a", "part_of"), extract_tags = "minimal") -# set simi_thresh -simi_thresh <- 0 -# Load OMIM gene-disease data -genemap2 <- readRDS(OMIM_GENEMAP2) -## ---- Load OMIM Phenotype ---- -HPO_orig <- read.table(OMIM_PHENO, sep = "\t", header = T, stringsAsFactors = FALSE, comment.char = "", fill = TRUE, quote = "\"") -OMIM_HPO <- HPO_orig[, c("OMIM_ID", "DiseaseName", "HPO_ID")] -# rename colnames -colnames(OMIM_HPO) <- c("OMIM_ID", "Disease_Name", "HPO_ID") -OMIM_HPO_cl <- unique(OMIM_HPO) -# Function to get_HPO_list -get_HPO_list <- function(df1) { -df2 <- df1 %>% -dplyr::group_by(OMIM_ID, Disease_Name) %>% -dplyr::summarise(HPO = paste0(HPO_ID, collapse = " ")) %>% -dplyr::ungroup() %>% -as.data.frame() -df2$HPO_list <- lapply(df2$HPO, function(i) unlist(strsplit(i, " "))) -# rownames(df2) <- ifelse(is.na(df2$OMIM_ID), df2$Disease_Name, df2$OMIM_ID) -return(df2) -} -# Prepare OMIM HPO list for ontology similarity comparision -OMIM_HPO_all <- get_HPO_list(OMIM_HPO_cl[, c("OMIM_ID", "Disease_Name", "HPO_ID")]) -View(OMIM_HPO_all) -HPO.orig <- read.table(PATIENT_HPO, sep = "\t", fill = T, header = F, stringsAsFactors = F) -HPO <- HPO.orig$V1 -# remove terms without a HPO ID -HPO <- HPO[grepl("HP:", HPO)] -HPO <- list(HPO) -sim_mat <- get_asym_sim_grid(HPO, OMIM_HPO_all$HPO_list, ontology = HPO_obo) -OMIM_HPO_all$Similarity_Score <- as.vector(sim_mat) -View(OMIM_HPO_all) -# convert HPO ID to HPO term -OMIM_HPO_all$HPO_term <- unlist(lapply(OMIM_HPO_all$HPO_list, function(x) paste0(HPO_obo$name[unlist(x)], collapse = "|"))) -## Add gene - disease relationship -OMIM_HPO_all_wGene <- merge(unique(genemap2[, c("Pheno_ID", "Approved_Gene_Symbol", "Ensembl_Gene_ID", "Entrez_Gene_ID")]), -OMIM_HPO_all[, c("OMIM_ID", "Disease_Name", "Similarity_Score", "HPO_term")], -by.y = "OMIM_ID", by.x = "Pheno_ID" -) -colnames(OMIM_HPO_all_wGene)[2] <- "Gene_Symbol" -OMIM_HPO_all_order <- OMIM_HPO_all_wGene[order(OMIM_HPO_all_wGene$Similarity_Score, decreasing = T), ] -# OMIM_HPO_all_filt <- head(OMIM_HPO_all_order, n = No_candidate) -OMIM_HPO_all_filt <- OMIM_HPO_all_order[OMIM_HPO_all_order$Similarity_Score >= simi_thresh, ] -library(ontologyIndex) -library(ontologySimilarity) -PATIENT_HPO <- 'vpa_hpo_040924.txt' -OMIM_HGMD <- 'HGMD_phen.tsv' -OMIM_OBO <- 'hp.obo' -OMIM_GENEMAP2 <- 'genemap2_v2022.rds' -OMIM_PHENO <- 'HPO_OMIM.tsv' -OUTFILE_CZ_NAME <- "test_cz" -OUTFILE_DX_NAME <- "test_dx" -dat <- read.csv(OMIM_HGMD, sep = "\t") -library(dplyr) -get_HPO_list <- function(df1) { -df2 <- df1 %>% -dplyr::group_by(acc_num, phen_id, gene_sym) %>% -dplyr::summarise(HPO = paste0(hpo_id, collapse = " ")) %>% -dplyr::ungroup() %>% -as.data.frame() -df2$HPO_list <- lapply(df2$HPO, function(i) unlist(strsplit(i, " "))) -return(df2) -} -# Load HPO_obo -HPO_obo <- get_OBO(OMIM_OBO, propagate_relationships = c("is_a", "part_of"), extract_tags = "minimal") -# set simi_thresh -simi_thresh <- 0 -# In public release, there might be empty HGMD phenotype file -if (dim(dat)[1] == 0) { -col_names <- c("acc_num", "phen_id", "gene_sym", "HPO", "HPO_list", "Similarity_Score") -dat2 <- data.frame(matrix(nrow = 0, ncol = length(col_names))) -colnames(dat2) <- col_names -} else { -dat <- dat[!is.na(dat$hpo_id), ] -dat2_ori <- get_HPO_list(dat) -dat2 <- dat2_ori -# Load patient HPO -HPO.orig <- read.table(PATIENT_HPO, sep = "\t", fill = T, header = F, stringsAsFactors = F) -HPO <- HPO.orig$V1 -# remove terms without a HPO ID -HPO <- HPO[grepl("HP:", HPO)] -HPO <- list(HPO) -sim_mat <- get_asym_sim_grid(HPO, dat2$HPO_list, ontology = HPO_obo) -dat2$Similarity_Score <- as.vector(sim_mat) -dat2$HPO_list <- unlist(lapply(dat2$HPO_list, function(x) paste0(unlist(x), collapse = "|"))) -dat2 <- dat2[order(dat2$Similarity_Score, decreasing = T), ] -} -write.table(dat2, OUTFILE_CZ_NAME, sep = "\t", quote = F, row.names = F) -# In public release, there might be empty HGMD phenotype file -if (dim(dat)[1] == 0) { -col_names <- c("acc_num", "phen_id", "gene_sym", "HPO", "HPO_list", "Similarity_Score") -dat2 <- data.frame(matrix(nrow = 0, ncol = length(col_names))) -colnames(dat2) <- col_names -} else { -dat <- dat[!is.na(dat$hpo_id), ] -dat2_ori <- get_HPO_list(dat) -dat2 <- dat2_ori -# Load patient HPO -HPO.orig <- read.table(PATIENT_HPO, sep = "\t", fill = T, header = F, stringsAsFactors = F) -HPO <- HPO.orig$V1 -# remove terms without a HPO ID -HPO <- HPO[grepl("HP:", HPO)] -HPO <- list(HPO) -sim_mat <- get_asym_sim_grid(HPO, dat2$HPO_list, ontology = HPO_obo) -dat2$Similarity_Score <- as.vector(sim_mat) -dat2$HPO_list <- unlist(lapply(dat2$HPO_list, function(x) paste0(unlist(x), collapse = "|"))) -dat2 <- dat2[order(dat2$Similarity_Score, decreasing = T), ] -} -View(HPO_orig) -View(HPO.orig)