From 03ca328668e84767b3ea04a97478649837d74301 Mon Sep 17 00:00:00 2001 From: krassowski Date: Wed, 7 Apr 2021 21:59:42 +0100 Subject: [PATCH 1/4] Add a test for set overlap --- R/tests.R | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/R/tests.R b/R/tests.R index 5e01925..8b40027 100644 --- a/R/tests.R +++ b/R/tests.R @@ -116,3 +116,144 @@ upset_test = function( result = comparison[order(comparison$fdr), ] result } + + +test_set_overlaps_of_degree = function( + data, intersect, degree, remaining_degrees, diagnosis_rates, test, min_size +) { + if (remaining_degrees == 0) { + data_subset = data[diagnosis_rates$name] + diagnosed_with_all = sum(apply(data_subset, 1, all), na.rm=TRUE) + + if (is.na(diagnosed_with_all)) { + diagnosed_with_all = 0 + } + + if (diagnosed_with_all < min_size) { + return(list()) + } + + having_data_for_all = sum(apply(!is.na(data_subset), 1, all), na.rm=TRUE) + + expected_overlap = prod(diagnosis_rates$rate) + + if (degree == 1) { + # testing overlap does not make sense for degree=1 and could mislead FDR calculations + p = list( + statistic=NA, + p.value=NA + ) + } else { + p = test( + x=c( + 'with_all_overlaping'=diagnosed_with_all, + 'without_all_overlapping'=having_data_for_all - diagnosed_with_all + ), + p=c( + 'with_all_overlapping'=expected_overlap, + 'without_all_overlapping'=1 - expected_overlap + ) + ) + } + + result = list( + 'expected_overlap'=expected_overlap, + 'expected_n'=expected_overlap * having_data_for_all, + 'observed_n'=diagnosed_with_all, + 'observed_overlap'=diagnosed_with_all / having_data_for_all, + 'sample_size'=having_data_for_all, + 'identifier'=paste(sanitize_names(diagnosis_rates$name), collapse='-'), + 'statistic'=p$statistic, + 'p_value'=p$p.value + ) + return (result) + } else { + expected_overlaps = list() + + for (disease in intersect) { + + if (disease %in% diagnosis_rates$name) { + next() + } + + data_disease = data[[disease]] + diagnoses = sum(data_disease, na.rm=TRUE) + + if (diagnoses < min_size) { + next() + } + + status_known_for = sum(!is.na(data_disease)) + diagnosis_rate = diagnoses / status_known_for + + diagnosis_rates_temp = rbind( + diagnosis_rates, + list( + 'name'=disease, + 'rate'=diagnosis_rate, + 'diagnoses'=diagnoses, + 'status_known_for'=status_known_for + ) + ) + + expected_overlap = test_set_overlaps_of_degree( + data=data, + intersect=intersect, + diagnosis_rates=diagnosis_rates_temp, + degree=degree, + remaining_degrees=remaining_degrees - 1, + test=test, + min_size=min_size + ) + + if (length(expected_overlap)) { + expected_overlaps[[length(expected_overlaps) + 1]] = expected_overlap + } + } + return (data.frame(do.call(rbind, expected_overlaps))) + } +} + + +test_set_overlaps = function( + data, intersect, + min_degre=1, max_degree=3, + test=chisq.test, + min_size=0, + encode=FALSE, + fdr_method='fdr' +) { + data_subset = data[intersect] + + if (encode) { + intersect = encode_names(intersect, avoid=c()) + colnames(data_subset) = intersect + } + + all_expected_overlaps = list() + + for (degree in seq(min_degre, max_degree)) { + + expected_overlaps = test_set_overlaps_of_degree( + data=data_subset, + intersect=intersect, + degree=degree, + remaining_degrees=degree, + diagnosis_rates=data.frame(name=character(), rate=numeric()), + test=test, + min_size=min_size + ) + + if (length(expected_overlaps)) { + expected_overlaps$degree = degree + all_expected_overlaps[[length(all_expected_overlaps) + 1]] = expected_overlaps + } + } + + df = data.frame(do.call(rbind, all_expected_overlaps)) + df = data.frame(lapply(df, unlist)) + df$enrichment = df$observed_overlap - df$expected_overlap + df$phi = sqrt(df$statistic^2 / df$sample_size) + df$fdr = p.adjust(df$p_value, method = fdr_method) + df = df[order(-df$phi), ] +} From 958be5ad0df5330f5bb6f11f1fa0d2083ff73e7b Mon Sep 17 00:00:00 2001 From: krassowski Date: Wed, 7 Apr 2021 22:39:16 +0100 Subject: [PATCH 2/4] Prevent redundancy (use combinations, not permutations), convert to logical --- R/tests.R | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/R/tests.R b/R/tests.R index 8b40027..ed4c5db 100644 --- a/R/tests.R +++ b/R/tests.R @@ -170,7 +170,8 @@ test_set_overlaps_of_degree = function( } else { expected_overlaps = list() - for (disease in intersect) { + for (i in 1:length(intersect)) { + disease = intersect[i] if (disease %in% diagnosis_rates$name) { next() @@ -198,7 +199,7 @@ test_set_overlaps_of_degree = function( expected_overlap = test_set_overlaps_of_degree( data=data, - intersect=intersect, + intersect=intersect[i:length(intersect)], diagnosis_rates=diagnosis_rates_temp, degree=degree, remaining_degrees=remaining_degrees - 1, @@ -223,11 +224,18 @@ test_set_overlaps = function( encode=FALSE, fdr_method='fdr' ) { - data_subset = data[intersect] + data = data[intersect] + + # TODO: shared? + is_column_logical = sapply(data[, intersect], is.logical) + if (any(!is_column_logical)) { + non_logical = names(is_column_logical[is_column_logical == FALSE]) + data[, non_logical] = sapply(data[, non_logical], as.logical) + } if (encode) { intersect = encode_names(intersect, avoid=c()) - colnames(data_subset) = intersect + colnames(data) = intersect } all_expected_overlaps = list() @@ -235,7 +243,7 @@ test_set_overlaps = function( for (degree in seq(min_degre, max_degree)) { expected_overlaps = test_set_overlaps_of_degree( - data=data_subset, + data=data, intersect=intersect, degree=degree, remaining_degrees=degree, From f4cc80d0e9c9b904c62af09e8eca7316650e2167 Mon Sep 17 00:00:00 2001 From: krassowski Date: Sat, 10 Apr 2021 12:16:09 +0100 Subject: [PATCH 3/4] Allow to store sets as columns --- R/tests.R | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/R/tests.R b/R/tests.R index ed4c5db..7c2eee6 100644 --- a/R/tests.R +++ b/R/tests.R @@ -119,7 +119,8 @@ upset_test = function( test_set_overlaps_of_degree = function( - data, intersect, degree, remaining_degrees, diagnosis_rates, test, min_size + data, intersect, degree, remaining_degrees, diagnosis_rates, test, min_size, max_degree, + sets_as_columns=FALSE ) { if (remaining_degrees == 0) { data_subset = data[diagnosis_rates$name] @@ -155,7 +156,6 @@ test_set_overlaps_of_degree = function( ) ) } - result = list( 'expected_overlap'=expected_overlap, 'expected_n'=expected_overlap * having_data_for_all, @@ -166,6 +166,16 @@ test_set_overlaps_of_degree = function( 'statistic'=p$statistic, 'p_value'=p$p.value ) + + if (sets_as_columns) { + sets = c( + diagnosis_rates$name, + rep(NA, max_degree - length(diagnosis_rates$name)) + ) + names(sets) = paste0('set_', seq(1, max_degree)) + result = c(result, sets) + } + return (result) } else { expected_overlaps = list() @@ -204,7 +214,9 @@ test_set_overlaps_of_degree = function( degree=degree, remaining_degrees=remaining_degrees - 1, test=test, - min_size=min_size + min_size=min_size, + sets_as_columns=sets_as_columns, + max_degree=max_degree ) if (length(expected_overlap)) { @@ -222,7 +234,8 @@ test_set_overlaps = function( test=chisq.test, min_size=0, encode=FALSE, - fdr_method='fdr' + fdr_method='fdr', + sets_in_columns=FALSE ) { data = data[intersect] @@ -249,7 +262,9 @@ test_set_overlaps = function( remaining_degrees=degree, diagnosis_rates=data.frame(name=character(), rate=numeric()), test=test, - min_size=min_size + min_size=min_size, + max_degree=max_degree, + sets_as_columns=sets_in_columns ) if (length(expected_overlaps)) { From a6b1feb7e1cc9ae72e2b89e6e2e8de11ef751e20 Mon Sep 17 00:00:00 2001 From: krassowski Date: Sat, 10 Apr 2021 12:23:24 +0100 Subject: [PATCH 4/4] Put sets columns upfront for igraph convenience --- R/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tests.R b/R/tests.R index 7c2eee6..80a66a3 100644 --- a/R/tests.R +++ b/R/tests.R @@ -173,7 +173,7 @@ test_set_overlaps_of_degree = function( rep(NA, max_degree - length(diagnosis_rates$name)) ) names(sets) = paste0('set_', seq(1, max_degree)) - result = c(result, sets) + result = c(sets, result) } return (result)