-
Notifications
You must be signed in to change notification settings - Fork 2
/
report.rmd
111 lines (96 loc) · 5.7 KB
/
report.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
---
title: "ScNapBar results"
output: pdf_document
fontsize: 12pt
---
```{r global_options, include=FALSE}
Sys.setlocale("LC_NUMERIC","C")
options(stringsAsFactors = FALSE)
args = commandArgs(trailingOnly=TRUE)
library(reshape2)
library(ggplot2)
library(knitr)
names(args)=c('sim_barcodes','sim_label','sim_prob','real_label','real_prob','real_log','cutoff','sis_pred')
args = as.list(args)
args$cutoff=as.integer(args$cutoff)
feat = c("adaptor_score", "barcode_score", "barcode_indel", "barcode_start", "barcode_mismatch", "umi_length", "polyT_length")
```
ScNapBar outputs the probability scores for each barcode alignment which allow user to select cutoffs according to their preferences on sensitivity and specificity. In Figure \ref{fig:simbench} we see the performance metrics of the simulated reads and the selected cutoff.
# Simulated reads
## Benchmarking
```{r, eval=TRUE, echo=FALSE, cache=TRUE, cache.lazy=FALSE, message=FALSE, warning=FALSE, fig.width=6, fig.height=4, fig.cap="\\label{fig:simbench}Performance metrics of the simulated reads"}
v = seq(0,100)
t1=read.table(args$sim_barcodes)
m =read.table(args$sim_label, header=TRUE)
df=do.call(rbind,lapply(v,function(d) {
t2=m[m[,3]>d,1:2]
i=match(t1[,1], t2[,1])
pred=as.integer(is.na(t2[i,2]))
fal=t2[i,2]!=t1[,3]
fal[is.na(fal)]=FALSE
unal=grepl('^chr',t1[,1])
gtruth=as.integer(unal | fal)
res=caret::confusionMatrix(factor(pred), factor(gtruth))
c(res$overall,res$byClass)[c(1,8,9,12,14)]
}))
rownames(df)=v
df=df[!is.na(df[,4]),]
df=melt(df)
colnames(df)[2]='Measures'
ggplot(df,aes(Var1,value,col=Measures))+geom_line()+geom_vline(xintercept=args$cutoff, linetype=4)+
labs(x='Predicted probability cutoff',y='Scores')
t2=m[m[,3]>args$cutoff,1:2]
i=match(t1[,1], t2[,1])
pred=as.integer(is.na(t2[i,2]))
fal=t2[i,2]!=t1[,3]
fal[is.na(fal)]=FALSE
unal=grepl('^chr',t1[,1])
gtruth=as.integer(unal | fal)
res=caret::confusionMatrix(factor(pred), factor(gtruth))
colnames(res$table)=rownames(res$table)=c('P','N')
kable(res$table, format='pandoc', caption='Confusion matrix')
```
ScNapBar does supervised learning using the features from adapter and barcode sequences alignments. We label them as correct or wrong assignment from the simulated reads (Figure \ref{fig:simfeat}), and train the model based on that.
## Feature distributions
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=6, fig.cap="\\label{fig:simfeat}Features from the simulated reads"}
x=read.table(args$sim_prob,header=TRUE)
df=data.frame(assignment=c('wrong','correct')[as.integer(x$pred>args$cutoff)+1],melt(x[,feat],id.vars=NULL))
ggplot(df,aes(value,fill=assignment))+geom_histogram()+facet_wrap(~variable,scale="free")
```
We apply the model to the simulated reads and output the score. Looking at the distribution of the scores, the scores of correct assignment are much higher than the wrong assignment (Figure \ref{fig:simscore}). User may choose a cutoff based on the distributions.
## Assignment score distributions
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=6, fig.cap="\\label{fig:simscore}Score distributions of the simulated reads"}
df=read.table(args$sim_prob,header=TRUE)
df$Assignment=c('correct','wrong')[df$label+1]
ggplot(df, aes(x = pred, group = Assignment)) + geom_density(aes(color = Assignment))+ labs(x='Scores', y='Density')
```
# Nanopore reads
## Number of reads each step
In Figure \ref{fig:realcount} we see the number of reads left in each step of ScNapBar.
```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=6, fig.height=3, fig.cap="\\label{fig:realcount}Number of Nanopore reads each step"}
x=read.table(args$real_log,sep='\t')
a=system(paste0('wc -l ',args$real_label),intern=TRUE)
x[nrow(x)+1,]=c('Assigned to barcode', gsub('[^0-9]', '', a))
x[,2]=as.integer(x[,2])
x$Steps=factor(x[,1],levels=rev(x[,1]))
ggplot(x, aes(x=Steps, y=V2)) + geom_bar(stat="identity", position=position_dodge(), fill = "#ffff99") +coord_flip()+ labs(x="Processing steps", y="Number of simulated reads")+geom_text(aes(label=V2), position=position_dodge(width=0.9), hjust=1)
```
In Figure \ref{fig:realfeat} we see the features from adapter and barcode sequences alignments of the Nanopore reads. We use the model trained on the simulated reads, and predict whether the barcode alignment is correct ('assigned') or wrong ('unassigned').
## Feature distributions
```{r, echo=FALSE, message=FALSE, warning=FALSE, cache=TRUE, cache.lazy=FALSE, fig.width=8, fig.height=6, fig.cap="\\label{fig:realfeat}Features from the Nanopore reads"}
x=read.table(args$real_prob,header=TRUE,nrow=1000000)
df=data.frame(assignment=c('unassigned','assigned')[as.integer(x$pred>args$cutoff)+1],melt(x[,feat],id.vars=NULL))
ggplot(df,aes(value,fill=assignment))+geom_histogram()+facet_wrap(~variable,scale="free")
```
```{r, eval=FALSE, echo=FALSE, message=FALSE, warning=FALSE, cache=TRUE, cache.lazy=FALSE, fig.width=8, fig.height=6, fig.cap="\\label{fig:realcompfeat}Features from the Nanopore reads"}
x=read.table(args$sis_pred,header=TRUE)
a=c('Not found by Sicelore','Same with Sicelore','Diff. with Sicelore')[x$X.1+2]
df=data.frame(assignment=a,melt(x[,feat],id.vars=NULL))
ggplot(df,aes(value,fill=assignment))+geom_histogram()+facet_wrap(~variable,scale="free")+ labs(title='Feature distributions compared with Sicelore')
```
```{r, eval=FALSE, echo=FALSE, message=FALSE, warning=FALSE, cache=TRUE, cache.lazy=FALSE, fig.width=8, fig.height=6, fig.cap="\\label{fig:realcompscore}Features from the Nanopore reads"}
x=read.table(args$sis_pred,header=TRUE)
a=c('Not found by Sicelore','Same with Sicelore','Diff. with Sicelore')[x$X.1+2]
x$assignment=a
ggplot(x, aes(x=pred, group=assignment)) + geom_density(aes(color=assignment))+ labs(title='Score distributions compared with Sicelore', x='Scores', y='Density')
```