10
10
# ' @param features File containing total genomic lengths of features [Default 'data/genomic_features.txt']
11
11
# ' @param genome_length The total legnth of the genome [Default 118274340 (mappable regions on chroms 2, 3, 4, X & Y for Drosophila melanogastor Dmel6.12)]
12
12
# ' @keywords enrichment
13
- # ' @import dplyr
14
- # ' @return A data frame with FC scores for all genes seen at least n times in snv data
15
- # ' @export
13
+ # ' @import dplyr ggpubr
14
+ # ' @return A snv_data frame with FC scores for all genes seen at least n times in snv snv_data
15
+ # ' @export
16
+ featureEnrichment <- function (... , snv_data = NULL , features = ' data/genomic_features.txt' , genome_length = 118274340 , write = FALSE ){
16
17
17
- featureEnrichment <- function (features = ' data/genomic_features.txt' , genome_length = 118274340 ){
18
18
genome_features <- read.delim(features , header = T )
19
- data <- getData()
20
- mutCount <- nrow(data )
21
-
19
+
20
+ if (missing(snv_data )){
21
+ snv_data <- getData(... )
22
+ }
23
+ mutCount <- nrow(snv_data )
24
+
22
25
# To condense exon counts into "exon"
23
- data $ feature <- as.factor(gsub(" exon_.*" , " exon" , data $ feature ))
24
-
25
- classCount <- table(data $ feature )
26
+ snv_data $ feature <- as.factor(gsub(" exon_.*" , " exon" , snv_data $ feature ))
27
+
28
+ classCount <- table(snv_data $ feature )
26
29
classLengths <- setNames(as.list(genome_features $ length ), genome_features $ feature )
27
-
30
+
28
31
fun <- function (f ) {
29
32
# Calculate the fraction of geneome occupied by each feature
30
33
featureFraction <- classLengths [[f ]]/ genome_length
31
-
32
- # How many times should we expect to see this feature hit in our data (given number of obs. and fraction)?
34
+
35
+ # How many times should we expect to see this feature hit in our snv_data (given number of obs. and fraction)?
33
36
featureExpect <- (mutCount * featureFraction )
34
-
35
- # observed/expected
37
+
38
+ # observed/expected
36
39
fc <- classCount [[f ]]/ featureExpect
37
- fc <- round (fc , digits = 1 )
40
+ Log2FC <- log2 (fc )
38
41
featureExpect <- round(featureExpect ,digits = 3 )
39
-
42
+
40
43
# Binomial test
41
44
if (! is.null(classLengths [[f ]])){
42
45
if (classCount [f ] > = featureExpect ){
@@ -47,17 +50,63 @@ featureEnrichment <- function(features='data/genomic_features.txt', genome_lengt
47
50
stat <- binom.test(x = classCount [f ], n = mutCount , p = featureFraction , alternative = " less" )
48
51
test <- " depletion"
49
52
}
50
- sig_val <- ' F'
51
- if (stat $ p.value < = 0.05 ){ sig_val <- ' T' }
53
+ sig_val <- ifelse(stat $ p.value < = 0.001 , " ***" ,
54
+ ifelse(stat $ p.value < = 0.01 , " **" ,
55
+ ifelse(stat $ p.value < = 0.05 , " *" , " " )))
56
+
52
57
p_val <- format.pval(stat $ p.value , digits = 3 , eps = 0.0001 )
53
- list (feature = f , observed = classCount [f ], expected = featureExpect , fc = fc , test = test , sig = sig_val , p_val = p_val )
58
+ list (feature = f , observed = classCount [f ], expected = featureExpect , Log2FC = Log2FC , test = test , sig = sig_val , p_val = p_val )
54
59
}
55
60
}
56
-
57
- enriched <- lapply(levels(data $ feature ), fun )
61
+
62
+ enriched <- lapply(levels(snv_data $ feature ), fun )
58
63
enriched <- do.call(rbind , enriched )
59
64
featuresFC <- as.data.frame(enriched )
60
65
# Sort by FC value
61
- featuresFC <- arrange(featuresFC ,desc(as.integer(fc )))
62
- return (featuresFC )
66
+ featuresFC <- dplyr :: arrange(featuresFC ,desc(abs(as.numeric(Log2FC ))))
67
+ featuresFC $ Log2FC <- round(as.numeric(featuresFC $ Log2FC ), 1 )
68
+
69
+ if (write ){
70
+ featuresFC <- filter(featuresFC , observed > = 5 )
71
+ first.step <- lapply(featuresFC , unlist )
72
+ second.step <- as.data.frame(first.step , stringsAsFactors = F )
73
+ ggpubr :: ggtexttable(second.step , rows = NULL , theme = ttheme(" mGreen" ))
74
+ feat_enrichment_table <- paste(" feature_enrichment_table.tiff" )
75
+ cat(" Writing to file: " , ' plots/' , feat_enrichment_table , sep = ' ' )
76
+ ggsave(paste(" plots/" , feat_enrichment_table , sep = " " ), width = 5.5 , height = (nrow(featuresFC )/ 3 ), dpi = 300 )
77
+ } else {
78
+ return (featuresFC )
79
+ }
80
+ }
81
+
82
+
83
+ featureEnrichmentPlot <- function (write = FALSE ) {
84
+ feature_enrichment <- featureEnrichment()
85
+
86
+ feature_enrichment $ feature <- as.character(feature_enrichment $ feature )
87
+ feature_enrichment $ Log2FC <- as.numeric(feature_enrichment $ Log2FC )
88
+
89
+ feature_enrichment <- transform(feature_enrichment , feature = reorder(feature , - Log2FC ))
90
+
91
+ feature_enrichment <- filter(feature_enrichment , observed > = 5 )
92
+
93
+ # Custom sorting
94
+ # feature_enrichment$feature <- factor(feature_enrichment$feature, levels=c("intron", "intergenic", "exon", "3UTR", "ncRNA", "5UTR"))
95
+
96
+ p <- ggplot(feature_enrichment )
97
+ p <- p + geom_bar(aes(feature , Log2FC , fill = as.character(test )), stat = " identity" )
98
+ p <- p + guides(fill = FALSE )
99
+ p <- p + ylim(- 2 ,2 )
100
+ p <- p + cleanTheme() +
101
+ theme(panel.grid.major.y = element_line(color = " grey80" , size = 0.5 , linetype = " dotted" ),
102
+ axis.text.x = element_text(angle = 45 , hjust = 1 ),
103
+ axis.text = element_text(size = 20 )
104
+ )
105
+
106
+ if (write ){
107
+ feat_plot <- paste(" feat_plot.pdf" )
108
+ cat(" Writing file" , feat_plot , " \n " )
109
+ ggsave(paste(" plots/" , feat_plot , sep = " " ), width = 5 , height = 10 )
110
+ }
111
+ p
63
112
}
0 commit comments