Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

major performance upgrade for get_ssm_by_regions #83

Merged
merged 8 commits into from
Jul 9, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
new feature: implement overlaps
  • Loading branch information
Kdreval committed May 13, 2024
commit 30f327dc613c7ce057585f500e59f22b93fedac6
204 changes: 149 additions & 55 deletions R/get_ssm_by_regions.R
Original file line number Diff line number Diff line change
@@ -22,6 +22,9 @@
#' @param projection Obtain variants projected to this reference (one of grch37 or hg38), default is grch37.
#' @param verbose Set to TRUE to maximize the output to console. Default is TRUE.
#' This parameter also dictates the verbosity of any helper function internally called inside the main function.
#' @param engine String indicating which approach to use. Accepted values are
#' "default" (legacy behaviour with for loop for each region) and
#' "overlaps" (more efficient approach using overlaps).
#' @param ... Any additional parameters.
#'
#' @return Returns a data frame of variants in MAF-like format.
@@ -51,18 +54,18 @@ get_ssm_by_regions = function(these_sample_ids = NULL,
use_name_column = FALSE,
projection = "grch37",
verbose = FALSE,
engine="default",
engine = "default",
...){

# check provided projection
# first, get valid projections
valid_projections = grep("meta", names(GAMBLR.data::sample_data),
valid_projections = grep("meta", names(GAMBLR.data::sample_data),
value = TRUE, invert = TRUE)
if(! projection %in% valid_projections){
stop("Please provide a valid projection. The following are available: ",
paste(valid_projections, collapse=", "), ".")
}

#check if any invalid parameters are provided
check_excess_params(...)

@@ -79,77 +82,168 @@ get_ssm_by_regions = function(these_sample_ids = NULL,
}else{
regions = regions_list
}

# handle the correct chr prefixing
regions <- gsub("chr", "", regions)
if(projection == "hg38"){
regions <- paste0("chr", regions)
}

if(verbose){
print(regions)
}

#get samples with the dedicated helper function
metadata = id_ease(these_samples_metadata = these_samples_metadata,
these_sample_ids = these_sample_ids,
verbose = verbose,
this_seq_type = this_seq_type)

# The following has these steps to return the maf:
# 1. First check if the maf data is present
# 1a. If maf data is not provided, check whether this_study is specified
# 1b. For both this_study present and absent, check which engine is specified
# 2. If maf data is present, trust that the maf file is pre-defined accordingly
# and no need to handle the this_study parameter explicitly.
# 2a. Check which engine is specified and handle maf_data accordingly

if(missing(maf_data)){
#warn/notify the user what version of this function they are using
message("Using the bundled SSM calls (.maf) calls in GAMBLR.data...")

if(missing(this_study)){
region_mafs = lapply(
regions, function(x){
get_ssm_by_region(
region = x,
these_samples_metadata = metadata,
this_seq_type = this_seq_type,
streamlined = streamlined,
projection = projection,
mode = mode,
verbose = FALSE, #force to FALSE, suppressing noisy output
...
)
if(engine == "overlaps"){
if(verbose){
print("Using the non-default engine for efficiency...")
}
)
}else if(engine == "foverlaps"){
sample_maf = get_ssm_by_samples(these_samples_metadata=these_samples_metadata)
setkey(sample_maf, Chromosome, Start_Position,End_Position)
regions_dt = data.frame(all_lymphome_gene_regions) %>% rownames_to_column("Hugo_Symbol") %>%
separate(all_lymphome_gene_regions,c("Chromosome","region"),sep = ":") %>%
separate(region,c("Start_Position","End_Position"),"-") %>%
mutate(Start_Position=as.numeric(Start_Position),End_Position=as.numeric(End_Position)) %>%
as.data.table()
setkey(regions_dt,Chromosome,Start_Position,End_Position)
maf_regions = foverlaps(sample_maf, regions_dt, type="within", which=TRUE,
by.x=c("Chromosome","Start_Position","End_Position"),
by.y=c("Chromosome","Start_Position","End_Position"))

match_rows=filter(maf_regions,!is.na(yid)) %>% pull(xid)
match_maf = sample_maf[match_rows,]
return(match_maf)

sample_maf <- get_ssm_by_samples(
these_samples_metadata = these_samples_metadata,
this_seq_type = this_seq_type,
projection = projection,
tool_name = mode
)
regions_df <- as.data.frame(regions) %>%
`names<-`("regions") %>%
separate(
regions,
c("Chromosome", "Start_Position","End_Position"),
":|-"
) %>%
mutate(
Start_Position = as.numeric(Start_Position),
End_Position = as.numeric(End_Position)
) %>%
mutate(region = row_number())
region_mafs <- cool_overlaps(
sample_maf,
regions_df
) %>%
dplyr::rename_with(~ gsub(".x", "", .x, fixed = TRUE)) %>%
dplyr::select(all_of(c(names(sample_maf), "region"))) %>%
dplyr::group_split(region)
} else {
region_mafs = lapply(
regions, function(x){
get_ssm_by_region(
region = x,
these_samples_metadata = metadata,
this_seq_type = this_seq_type,
streamlined = streamlined,
projection = projection,
mode = mode,
verbose = FALSE, #force to FALSE, suppressing noisy output
...
)
}
)
}
}else{
region_mafs = lapply(
regions, function(x){
get_ssm_by_region(
region = x,
these_samples_metadata = metadata,
this_seq_type = this_seq_type,
streamlined = streamlined,
projection = projection,
mode = mode,
this_study = this_study,
verbose = FALSE, #force to FALSE, suppressing noisy output
...
)
if(engine == "overlaps"){
if(verbose){
print("Using the non-default engine for efficiency...")
}
)

sample_maf <- get_ssm_by_samples(
these_samples_metadata = these_samples_metadata,
this_seq_type = this_seq_type,
projection = projection,
tool_name = mode,
this_study = this_study
)
regions_df <- as.data.frame(regions) %>%
`names<-`("regions") %>%
separate(
regions,
c("Chromosome", "Start_Position","End_Position"),
":|-"
) %>%
mutate(
Start_Position = as.numeric(Start_Position),
End_Position = as.numeric(End_Position)
) %>%
mutate(region = row_number())

region_mafs <- cool_overlaps(
sample_maf,
regions_df
) %>%
dplyr::rename_with(~ gsub(".x", "", .x, fixed = TRUE)) %>%
dplyr::select(all_of(c(names(sample_maf), "region"))) %>%
dplyr::group_split(region)

} else {
region_mafs = lapply(
regions, function(x){
get_ssm_by_region(
region = x,
these_samples_metadata = metadata,
this_seq_type = this_seq_type,
streamlined = streamlined,
projection = projection,
mode = mode,
this_study = this_study,
verbose = FALSE, #force to FALSE, suppressing noisy output
...
)
}
)
}
}
}else{
region_mafs = lapply(regions, function(x){get_ssm_by_region(region = x,
maf_data = maf_data,
these_samples_metadata = metadata,
this_seq_type = this_seq_type,
streamlined = streamlined,
projection = projection,
verbose = FALSE)})
if(engine == "overlaps"){
if(verbose){
print("Using the non-default engine for efficiency...")
}
regions_df <- as.data.frame(regions) %>%
`names<-`("regions") %>%
separate(
regions,
c("Chromosome", "Start_Position","End_Position"),
":|-"
) %>%
mutate(
Start_Position = as.numeric(Start_Position),
End_Position = as.numeric(End_Position)
) %>%
mutate(region = row_number())

region_mafs <- cool_overlaps(
sample_maf,
regions_df
) %>%
dplyr::rename_with(~ gsub(".x", "", .x, fixed = TRUE)) %>%
dplyr::select(all_of(c(names(sample_maf), "region"))) %>%
dplyr::group_split(region)
}else{
region_mafs = lapply(regions, function(x){get_ssm_by_region(region = x,
maf_data = maf_data,
these_samples_metadata = metadata,
this_seq_type = this_seq_type,
streamlined = streamlined,
projection = projection,
verbose = FALSE)})
}
}

#deal with region names
1 change: 1 addition & 0 deletions man/get_ssm_by_regions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading