# conPCR_needleSam.R
# Note, below is all taken directly from needleSam.R
# here, applied to the control PCR on sample #56635
# will make as few modifications as possible, so it will look convoluted, as the original script is applied to all the samples
# needle in a SAM
# finding a possible somatic mutation in the reads
# like a needle in a haystack!
# needle = somatic mutation / haystack = SAM files
oprlength <- 123
# threshold used in readSam() to exclude reads that start or terminate in the OPR by CIGAR lengths
# some common things we may need to change often --------------------------
hapcol <- 21 # nth field in the SAM file contains the HP tags (looks like HP:i:1; it was 20th, then 21st in a newer version of minimap2)
hapcols <- c('#f1876b', '#4D4D4D', '#B3BDC4') # colours for the haplotypes, it goes 1, 2, 0
# packages and functions --------------------------------------------------
library(here)
library(stringr)
library(data.table)
library(openxlsx)
library(msa)
library(ggplot2)
library(tictoc)
library(dplyr)
library(tidyr)
library(ggpubr)
library(ggbeeswarm)
# substrEnding = last n characters of a string
# x = string (or vector of strings); n = last n characters
substrEnding <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))
}
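# quick illustrative check of substrEnding (toy value, not part of the original pipeline); expected '.sam'
substrEnding('pdg56635.sam', 4)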
# beforeFirstCharacter = given a string, takes everything before first instance of a character
# eg. beforeFirstCharacter('59060_file.bam', '_') >> '59060'
beforeFirstCharacter <- function(stri, cha) {
as.character(sapply(stri, function(st){
substr(st,
1,
which(strsplit(st, '')[[1]] == cha)[1] - 1) # position of the first instance of cha, minus 1
}))
}
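# quick illustrative checks of beforeFirstCharacter (toy values, not part of the original pipeline); expected '59060' in both cases
beforeFirstCharacter('59060_file.bam', '_')
beforeFirstCharacter('59060_file_sorted.bam', '_')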
###
# getReadlengthCigar() is a small helper function for readSam() below
# the goal is to exclude reads which start or terminate in the OPR
# these get categorised as reference because their CIGARs are, for example, 56M
# However, these reads do not span the whole OPR, so they should be excluded
# but excluding by read length does not work: for example, a 103-bp read could be an OPRD1 or a reference read that terminated early in the OPR
# one solution is to compute the alignment length as told by the CIGAR (only Match/Insertion/Deletion)
# this essentially corresponds to the window of hg38 that is covered by the read (as it will include deletions)
# get read length as told by CIGAR
getReadlengthCigar <- function(read) { # works for one row of a sam file
cigar <- read[6]
tmp <- str_extract_all(cigar, '[0-9]+[MDI]')[[1]] # all the Match/Deletion/Insertion components of the CIGAR
rlci <- sum(as.numeric(unlist(strsplit(tmp, '[A-Z]')))) # rlci = readlength cigar
return(rlci)
}
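# quick illustrative check of getReadlengthCigar (toy values, not part of the original pipeline)
# a dummy 'row' where only field 6 (the CIGAR) matters; 50M + 2D + 48M covers 100 bp of hg38, the soft-clip (10S) is ignored
getReadlengthCigar(c(rep(NA, 5), '10S50M2D48M')) # expected 100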
###
# readSam = reads SAM properly into R
readSam <- function(sampath) {
# check the path actually looks like it leads to a SAM file
if(substrEnding(sampath, 4) != '.sam') stop('\t \t \t \t >>> Error: that does not look like the path to a SAM file... \n')
# if ok -- import the SAM file
# numcols <- max(count.fields(sampath, sep='\t'), na.rm=TRUE)
# above is to count maximum number of columns to be used below as sprintf('col%i', numcols)
# works well, but when importing more than one SAM, can have variable number of columns
# alternative: set to a large number. It will create NA columns at the end, but at least we know all the SAMs look the same
# (can delete the NA columns after rbind all the SAMs, if that is what we want to do)
sam <- read.table(file=sampath, fill=TRUE, comment.char='@', sep='\t', na.strings=c('', 'NA'),
col.names=sprintf('col%i', 1:30))
# check what should be the haplotype tag column, can be messy for different reasons
hpcol <- 21
taghp <- substr(sam[,hpcol], 1, 2) # will take mostly NA or HP and we do not want anything else, hence below
cat('\t \t \t \t \t >>> Found', length(which(taghp=='HP')), 'haplotags in the expected column \n')
if (length(which(taghp!='HP')) > 0) {
cat('\t \t \t \t \t >>> Removed', length(which(taghp!='HP')), 'unexpected tags from the HP column \n')
sam[which(taghp!='HP'), hpcol] <- NA # the rows that are not NA or HP, replace them with NA
# ! for these rows, it is possible that the HP tag was pushed to the next column
# no need to look only at these rows, just look in the next column for any HP
taghpnxt <- substr(sam[,hpcol+1], 1, 2)
if(length(which(taghpnxt=='HP')) > 0) { # if found any HP tag to salvage from the next column
cat('\t \t \t \t \t \t >>> for', length(which(taghpnxt=='HP')), 'of these reads, salvaged the HP tag found in the next column \n')
sam[which(taghpnxt=='HP'), hpcol] <- sam[which(taghpnxt=='HP'), hpcol+1] # copy it to the HP column
# ! Note this is far from an ideal solution, will copy the HP tag and it also means the columns are shifted for these rows
# but at the moment I am not using any other tag than HP
}
}
# check there is no HP tag in other columns before or after HP column
for (co in ((hpcol-4):(hpcol+4))[-5]) { # check all 4 columns before HP column, all 4 columns after HP column, except HP column itself
tagco <- substr(sam[,co], 1, 2)
if(length(which(tagco=='HP')) > 0) {
cat('\t \t \t \t \t >>> found ', length(which(tagco=='HP')), ' HP tags in column', co ,'\n')
}
}
# check there is no duplicate read ID
# as I removed secondary alignments in filterBam.command it should be good
if (sum(duplicated(sam[1])) != 0) {
stop('\t \t \t \t \t >>> Error: found ', sum(duplicated(sam[1])), ' duplicated read IDs \n')
} else {
cat('\t \t \t \t \t >>> All read IDs are unique \n')
}
# exclude reads that start or terminate in the OPR (see explanations for getReadlengthCigar() above)
reads2del <- which(apply(sam, 1, getReadlengthCigar) < oprlength)
if (length(reads2del) == 0) {
cat('\t \t \t \t \t >>> 0 read that start or terminate in the OPR \n')
} else {
cat('\t \t \t \t \t >>> Deleting', length(reads2del),'reads that start or terminate in the OPR \n')
sam <- sam[ - reads2del ,]
}
# exclude reads that include letters that are not ACGT
# I think removing the reads that start/terminate in the OPR above already covers this, but just to be safe
totnotACGT <- sapply(1:nrow(sam), function(ri) { # for each read, count the total of letters that are not ACGT
sum( ! strsplit(sam[ri,10], '')[[1]] %in% c('A', 'C', 'G', 'T'))
})
reads2del <- which(totnotACGT!=0)
if (length(reads2del) != 0) {
cat('\t \t \t \t \t >>> Deleting', length(reads2del),'reads with non-ACGT nucleotide \n')
sam <- sam[ - reads2del ,]
}
# say final number of reads we are importing
cat('\t \t \t \t \t >>> IMPORTING', nrow(sam),'READS \n')
return(sam)
}
###
# from CIGAR of one read; get total insertion
getTotalInsertion <- function(read) {
cigar <- read[6]
if(length(grep(pattern='I', cigar))!=0) {
tmp <- str_extract_all(cigar, '[0-9]+I')[[1]] # extracts the I components; eg. 59I129M32I >> 59I 32I
totin <- sum(as.numeric(unlist(strsplit(tmp, '[A-Z]'))))
# split at any letter (here will be only I), so example above: 59 32
# convert to numeric & sum
} else {
totin <- 0
}
return(totin) # return the total insertion in both cases
}
###
# from CIGAR of one read; get total deletion
getTotalDeletion <- function(read) {
cigar <- read[6]
if(length(grep(pattern='D', cigar))!=0) {
tmp <- str_extract_all(cigar, '[0-9]+D')[[1]] # extracts the D components; eg. 100M24D23M >> 24D
totdel <- -sum(as.numeric(unlist(strsplit(tmp, '[A-Z]'))))
# split at any letter (here will be only D), so example above: 24
# convert to numeric & sum & turn to negative for deletion
return(totdel)
} else {
totdel <- 0
return(totdel)
}
}
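# quick illustrative checks of the two CIGAR parsers (toy values, not part of the original pipeline)
# again dummy 'rows' where only field 6 (the CIGAR) matters
getTotalInsertion(c(rep(NA, 5), '59I129M32I')) # expected 59 + 32 = 91
getTotalDeletion(c(rep(NA, 5), '100M24D23M')) # expected -24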
###
# counts & frequencies within bins of a continuous variable (each bin = one range of values) per group
# e.g. data is number of mismatches for each read
# and 2 groups: reference / mutated
# for each group; how many reads in bin 0--10 mismatches; how many reads in bin 11-20 etc.
# both in counts and frequencies (% of total within each group)
# Note; bins are labelled by their right boundary, e.g. with proportions, 0.1 means the bin that contains the data from 0 to 0.1
# same in the dataframe it returns
# example usage: freqInBins(df = mydata, datcol = 'mismatchpercentage', grpcol = 'treated', bins=seq(0, 1, by=0.1))
freqInBins <- function(df, datcol, grpcol, bins) {
ngrps <- length(unique(df[,grpcol])) # number of groups
grpnms <- unique(df[,grpcol]) # group names
cfpg <- vector(mode='list', length=ngrps) # preallocate list counts + frequencies per group
for(i in 1:length(grpnms)) { # loop through groups
dat <- as.numeric(df[ which(df[,grpcol] == grpnms[i]) , datcol]) # data for this group (the data we need to bin)
cos <- hist(dat, breaks=bins, include.lowest=TRUE, plot=FALSE)$counts # counts
fre <- cos / sum(cos) # frequencies
gdf <- data.frame(grp=grpnms[i], bin=bins[-1], counts=cos, freq=fre) # small dataframe for this group, two columns counts and frequencies
# group column should be called the same as grpcol
colnames(gdf)[1] <- grpcol
# add it to the list
cfpg[[i]] <- gdf
}
# stick the list together and return it
fbing <- as.data.frame(rbindlist(cfpg)) # frequencies in each bin by group
return(fbing)
}
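# minimal illustrative usage of freqInBins on toy data (hypothetical column names; wrapped in local() so nothing persists)
local({
  toyd <- data.frame(mismatchfreq=runif(n=50), grp=sample(c('reference', 'mutated'), size=50, replace=TRUE))
  freqInBins(df=toyd, datcol='mismatchfreq', grpcol='grp', bins=seq(0, 1, by=0.1))
})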
# 1- Import what we need --------------------------------------------------
# 1- Catalog of consensus OPR sequences
# OPRConsensusCatalog.csv written by generateOPRCatalog.R
cato <- read.csv(here('needleSam', 'OPRConsensusCatalog.csv')) # consensus sequence from OPRD4 up to OPRI24
# 2- Meta information of samples
meta <- read.xlsx(here('AdditionalFile1.xlsx'), sheet='samples')
# make the formatting match
# - >> NA
meta$SV[which(meta$SV=='-')] <- NA
# special case: PDG 46345 is 5 OPRI/1 OPRD
# make a new column SV2 for it (just for the 1 OPRD)
meta <- meta %>%
mutate(SV2=NA, .after=SV)
meta[which(meta$sample==46345), 'SV2'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[2]
meta[which(meta$sample==46345), 'SV'] <- unlist(strsplit(meta[which(meta$sample==46345), 'SV'], '/'))[1]
# 2 OPRD >> OPRD2, etc.
# first lapply takes what is after the space = OPRD or OPRI
# second lapply takes what is before the space = 2, 1, ...
meta$SV <- paste0(lapply(strsplit(meta$SV, ' '), function(i) {i[2]}),
lapply(strsplit(meta$SV, ' '), function(i) {i[1]}))
# the NA above become 'NANA', put them back to NA
meta$SV[which(meta$SV=='NANA')] <- NA
# same for SV2
meta$SV2 <- paste0(lapply(strsplit(meta$SV2, ' '), function(i) {i[2]}),
lapply(strsplit(meta$SV2, ' '), function(i) {i[1]}))
meta$SV2[which(meta$SV2=='NANA')] <- NA
# sample: 52331 >> pdg52331, etc.
meta$sample <- sprintf('pdg%s', meta$sample)
# 4- Import the SAMs
# preallocate list, each element = the SAM of one sample
# 2 folders where to find the SAMs:
# one for SAMs that could not be haplotagged
# the other for SAMs that could be haplotagged
samfold <- here('controlPCR', 'filt', 'OPRtrim_sam') # folder of no-tag SAMs
samnms <- list.files(samfold) # names of all the SAMs
sams <- vector(mode='list', length=length(samnms)) # preallocate list that will store all the SAMs
pdgs <- 'pdg56635'
names(sams) <- pdgs # put pdg56635 as the name of the element in the list
# gets the full paths where to find each SAM
sampaths <- list.files(samfold, full.names=TRUE)
# fill in the list using function readSam from above
for (s in 1:length(sams)) {
cat('\t \t \t \t >>>', s, '/', length(sams), 'Reading SAM of sample', names(sams)[s], '\n')
sams[[s]] <- readSam(sampaths[s])
}
# 2- Calculate total insertion/deletion of each read ----------------------
# from the CIGAR; eg. 3I 2D 100M 3I 5D >> insertion = 6 bp // deletion = 5 bp
# readSam() should have removed anything funky in HP column, but check before running below just to be sure
hpcol <- 21
sum(unlist(lapply(sams, function(sa) {
which( substr(as.character(sa[,hpcol]), 1, 2) != 'HP') # any elements in all the HP columns that is not NA or does not start with HP
}))) == 0
# preallocate list of indel dataframes
# same length as sams
ies <- vector(mode='list', length=length(sams)) # ies for Insertion/dEletion dataframeS
names(ies) <- pdgs # put pdg56635 as the name of the element in the list
# each element in the list will be a dataframe;
# column1 = sample
# column2 = haplotype (from whatshap haplotag)
# column3 = haplotype quality
# column4 = read name
# column5 = sam flag
# column6 = read sequence
# column7 = total insertion (in bp, as per CIGAR)
# column8 = total deletion (in bp, as per CIGAR)
for (i in 1:length(ies)) { # for each element in the list (each sample)
cat('\t \t \t \t >>> Format & parsing CIGARs for SAM', i, '/', length(ies), '\n')
# preallocate the dataframe
colnms <- c('sample', 'haplotype', 'hapqual', 'read', 'samflag', 'seq', 'ins', 'del')
tmp <- as.data.frame(matrix(nrow=nrow(sams[[i]]), ncol=length(colnms)))
colnames(tmp) <- colnms
# fill in sample name
tmp$sample <- rep(names(ies)[i], nrow(sams[[i]]))
# fill in haplotype
tmp$haplotype <- as.numeric(substr(sams[[i]][,hpcol], 6, 6)) # simplifies the haplotype tag, from HP:i:1 / HP:i:2 to 1 / 2; will return NA if not present
# fill in haplotype quality
tmp$hapqual <- as.numeric(substr(sams[[i]][,hpcol+1], 6, 999)) # simplifies the haplotype quality, from eg. PC:i:150 to 150
# fill in read names
tmp$read <- sams[[i]][,1]
# fill in sam flags
tmp$samflag <- sams[[i]][,2]
# fill in read sequences
tmp$seq <- sams[[i]][,10]
# fill in total insertions
tmp$ins <- apply(sams[[i]], 1, getTotalInsertion)
# fill in total deletions
tmp$del <- apply(sams[[i]], 1, getTotalDeletion)
# place that sample's dataframe in the list
ies[[i]] <- tmp
}
# turn the ies list into one big dataframe
ied <- as.data.frame(rbindlist(ies)) # for Insertion/dEletion Dataframe
# haplotype tags: set all NA to 0
ied$haplotype[is.na(ied$haplotype)] <- 0
# make sure haplotype is stored as factor
ied$haplotype <- factor(ied$haplotype, levels=c(1, 2, 0))
# check no duplicated read ID after pooling all samples
sum(duplicated(ied$read))
# 5- Plot insertions/deletions for each sample ----------------------------
# will then assign each read to a most likely OPR based on total insertion/deletion
# should leave some space for nanopore noise
# eg. an OPRI often appears as a 23-bp insertion
# here: take the opportunity to look how much space we should leave
# tentative thresholds
lownoise <- 6
highnoise <- 3
# eg. for OPRI1: ok from 24 - lownoise up to 24 + highnoise
# for example, if lownoise = 6 // highnoise = 3; considers potential OPRI1 anything from insertion of 18 bp to insertion of 27 bp
# makes sense for lownoise to be larger, as there is always a drag towards lower lengths
# (or in other words, nanopore makes more deletions errors than insertions)
# plot an insertion + a deletion histogram for each sample
# preallocate list to store the plots
# ! one list for all deletion plots and one list for all insertion plots
ips <- vector(mode='list', length=length(pdgs)) # Insertions PlotS
names(ips) <- pdgs
dps <- vector(mode='list', length=length(pdgs)) # Deletions PlotS
names(dps) <- pdgs
# make a table of target insertion/deletion lengths
# eg. OPRD1 : deletion -24, etc.
# will do from OPRD4 up to OPRI24, so 4 OPRD + reference + 24 OPRI
tarl <- as.data.frame(matrix(nrow=29, ncol=2)) # target lengths
colnames(tarl) <- c('oprg', 'targetindel') # OPR genotype // target indel length
tarl$oprg <- c(sprintf('OPRD%i', 4:1) , 'reference' , sprintf('OPRI%i', 1:24))
tarl$targetindel <- c( seq(-24*4, -24*1, 24) , 0 , seq(24*1, 24*24, 24)) # (one OPR = 24 bp)
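# quick illustrative look-up in the target-length table (not part of the original pipeline)
# expected targetindel: -48 bp for OPRD2, 0 bp for reference, +24 bp for OPRI1
subset(tarl, oprg %in% c('OPRD2', 'reference', 'OPRI1'))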
# first; DELETION plots
for (s in 1:length(dps)) {
# PDG we are plotting
pd <- names(dps)[s]
# get its OPR variant (if any)
sva <- as.character(subset(meta, sample==pd, SV))
# and from it the target length
target <- as.numeric(subset(tarl, oprg==sva, targetindel))
# eg. PDG 1906 >> OPRD2 >> -48 bp
# for that sample, count the reads with each deletion length
# can also do it automatically with geom_histogram(), but I found it leaves a bar when count = 0; computing the counts beforehand avoids this, which I prefer
tmp <- ied %>%
subset(sample==pd) %>%
group_by(del, haplotype) %>%
tally(name='nreads') %>%
mutate(freq=nreads/nrow(subset(ied, sample==pd)))
# above: compute frequency by dividing number of reads with each insertion length by total number of reads from that sample
# if sample has an OPRD: version with marker lines
# ! need to specify manually for PDG 46345 = compound homozygous 1 OPRD / 5 OPRI
if (pd =='pdg46345') {
target <- -24
delp <- ggplot(tmp, aes(x=del, y=freq ,colour=haplotype)) +
geom_col(fill='white', alpha=0.0, position='identity') +
scale_color_manual(values=hapcols) +
coord_cartesian(ylim=c(0,0.2), xlim=c(-250,0)) +
geom_vline(xintercept=target) +
geom_vline(xintercept=c(target-lownoise, target+lownoise), linetype=2) +
theme_minimal() +
theme(
legend.position='none'
) +
xlab('deletion') + ylab('frequency') +
ggtitle(pd)
} else if (!is.na(target) & target<0) { # if there is a target (i.e. sample has an OPRD) >> add the marker lines
delp <- ggplot(tmp, aes(x=del, y=freq ,colour=haplotype)) +
geom_col(fill='white', alpha=0.0, position='identity') +
scale_color_manual(values=hapcols) +
coord_cartesian(ylim=c(0,0.2), xlim=c(-250,0)) +
geom_vline(xintercept=target) +
geom_vline(xintercept=c(target-lownoise, target+lownoise), linetype=2) +
theme_minimal() +
theme(
legend.position='none'
) +
xlab('deletion') + ylab('frequency') +
ggtitle(pd)
} else { # if not (i.e. reference or OPRI), skip the marker lines
delp <- ggplot(tmp, aes(x=del, y=freq , colour=haplotype)) +
geom_col(fill='white', alpha=0.0, position='identity') +
scale_color_manual(values=hapcols) +
coord_cartesian(ylim=c(0,0.2), xlim=c(-250,0)) +
theme_minimal() +
theme(
legend.position='none'
) +
xlab('deletion') + ylab('frequency') +
ggtitle(pd)
}
# in any case, add deletion plot to the list
dps[[s]] <- delp
}
# second; INSERTION plots
for (s in 1:length(ips)) {
# PDG we are plotting
pd <- names(ips)[s]
# get its OPR variant (if any)
sva <- as.character(subset(meta, sample==pd, SV))
# and from it the target length
target <- as.numeric(subset(tarl, oprg==sva, targetindel))
# eg. PDG 1906 >> OPRD2 >> -48 bp
# for that sample, count the reads with each insertion length
# can also do it automatically with geom_histogram(), but I found it leaves a bar when count = 0; computing the counts beforehand avoids this, which I prefer
tmp <- ied %>%
subset(sample==pd) %>%
group_by(ins, haplotype) %>%
tally(name='nreads') %>%
mutate(freq=nreads/nrow(subset(ied, sample==pd)))
# above: compute frequency by dividing number of reads with each insertion length by total number of reads from that sample
# if sample has an OPRI: version with marker lines
if (!is.na(target) & target>0) {
insp <- ggplot(tmp, aes(x=ins, y=freq ,colour=haplotype)) +
geom_col(fill='white', alpha=0.0, position='identity') +
scale_color_manual(values=hapcols) +
coord_cartesian(ylim=c(0,0.2), xlim=c(0,250)) +
geom_vline(xintercept=target) +
geom_vline(xintercept=c(target-lownoise, target+lownoise), linetype=2) +
theme_minimal() +
theme(
legend.position='none'
) +
xlab('insertion') + ylab('frequency') +
ggtitle(pd)
} else { # if not (i.e. reference or OPRD), skip the marker lines
insp <- ggplot(tmp, aes(x=ins, y=freq ,colour=haplotype)) +
geom_col(fill='white', alpha=0.0, position='identity') +
scale_color_manual(values=hapcols) +
coord_cartesian(ylim=c(0,0.2), xlim=c(0,250)) +
theme_minimal() +
theme(
legend.position='none'
) +
xlab('insertion') + ylab('frequency') +
ggtitle(pd)
}
# in any case, add insertion plot to the list
ips[[s]] <- insp
}
# arrange each list of plots in a grid of one column and export
# grid of deletion plots
delgrid <- ggarrange(plotlist=dps, ncol=1, nrow=length(dps))
ggsave(filename=here('controlPCR', 'deletionGrid.pdf'), plot=delgrid, height=100, width=100, units='mm', useDingbats=FALSE)
insgrid <- ggarrange(plotlist=ips, ncol=1, nrow=length(ips))
ggsave(filename=here('controlPCR', 'insertionGrid.pdf'), plot=insgrid, height=100, width=100, units='mm', useDingbats=FALSE)
# Note there are 3 options for the histograms above:
# position='stack' (default): each bar is a stacked barplot with proportion of each haplotype
# in a sense it is the best way to plot all the information, but intuitively when I see the tip of the bars in orange (for example),
# I imagine that there is another plot below it, which completely changes the interpretation
# position='identity': histograms for each haplotype are effectively separate, overlaid
# position='dodge': same, but the bars are shifted slightly so there is less overlap
# I think 'identity' is the least likely to create confusion (even though it is the one with the most overlap)
# 6- Assign each read to a most likely OPR --------------------------------
# eg. 23-bp insertion >> could be an OPRI1
# below also allows OPRI + OPRD in the same read, although that is unlikely
# create the possible intervals for OPRDs
delInts <- vector(mode='list', length=4) # OPR deletion intervals
# minimum = 4 OPRD = - 24 * 4
# maximum = 1 OPRD = -24 * 1
delOpr <- seq(-24*4, -24*1, 24)
for (o in 1:length(delOpr)) {
delInts[[o]] <- (delOpr[o] - lownoise) : (delOpr[o] + highnoise)
}
names(delInts) <- sprintf('OPRD%i', 4:1)
# create the possible intervals for OPRIs
inInts <- vector(mode='list', length=24) # OPR insertion intervals
# minimum = 1 OPRI = 24 * 1
# maximum = 24 OPRI = 24 * 24
inOpr <- seq(24*1, 24*24, 24)
for (o in 1:length(inOpr)) {
inInts[[o]] <- (inOpr[o] - lownoise) : (inOpr[o] + highnoise)
}
names(inInts) <- sprintf('OPRI%i', 1:24)
# create the possible interval for reference
# reference interval is a bit different as will overlap deletions/insertions
# i.e. centered around 0, - lownoise / + highnoise
refInt <- (0-lownoise):(0+highnoise)
# given one total insertion (in bp), check if belongs to one OPRI interval
# eg. +23 bp >> return OPRI1
checkOPRI <- function(totin) {
check <- unlist(lapply(inInts, function(int){totin %in% int}))
if (length(which(check))==0){
return(NA)
} else {
return(names(which(check))) # name of the matching OPRI interval, eg. 'OPRI1'
}
}
# given one total deletion (in bp), check if belongs to one OPRD interval
# eg. -23 bp >> return OPRD1
checkOPRD <- function(totdel) {
check <- unlist(lapply(delInts, function(int){totdel %in% int}))
if (length(which(check))==0){
return(NA)
} else {
return(names(which(check))) # name of the matching OPRD interval, eg. 'OPRD1'
}
}
# given both total deletion / total insertion, check if it should be categorised as reference
# eg. -3 bp >> return reference
# ! flipping around here: functions above are meant to be applied to all column; checkRef is meant to be apply to each row of ied
# (because need both totdel & totins for each read)
checkRef <- function(read) { # read = row of ied
totdel <- read['del']
totins <- read['ins']
# del and ins should be both in interval
# eg. -5 del / +1 ins >> reference
# eg. -12 del / +2 ins >> NA (in this case read will not have a category as deletion too small for an OPRD1)
check <- (totdel %in% refInt) & (totins %in% refInt) # check will be Logical, are both totdel / totins in Ref interval
if (check) {
read['del'] <- 'reference'
read['ins'] <- 'reference'
return(read)
} else {
return(read)
}
}
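# quick illustrative checks of the assignment helpers (toy values, not part of the original pipeline)
checkOPRI(23) # expected 'OPRI1' (23 bp is within 24 - lownoise to 24 + highnoise)
checkOPRD(-47) # expected 'OPRD2' (-47 bp is within -48 - lownoise to -48 + highnoise)
checkRef(c(del=-3, ins=1)) # expected del & ins both replaced by 'reference'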
# now assign reads to possible OPR mutations
ied$opri <- sapply(ied$ins, checkOPRI)
ied$oprd <- sapply(ied$del, checkOPRD)
# put reads that were not assigned to an OPRI and/or an OPRD in either 'reference' or 'unassigned'
# 'reference' = total insertion & total deletion are in reference length interval (i.e. no or small insertion + no or small deletion)
# 'unassigned' = none of total insertion or total deletion are in any interval (so anything falling between the cracks, eg. a 10 bp insertion)
# create a new column for these categories (prevents from plotting same reads in different plots below)
ied$othercategory <- NA
ied[which((ied$ins %in% refInt) & (ied$del %in% refInt)) , 'othercategory'] <- 'reference'
# explicitly call all the leftover reads 'unassigned'; leftover =
# do not have an OPRD
# do not have an OPRI
# are NA for othercategory (i.e. were not categorised as reference above)
ied[which(is.na(ied$opri) & is.na(ied$oprd) & is.na(ied$othercategory)) , 'othercategory'] <- 'unassigned'
# Note1; reads categorised as 'reference' or 'unassigned' will always be OPRI NA / OPRD NA
# Note2; it is a bit of a pain to keep 3 columns when really each read should be in a single category
# (i.e. has an OPRI OR has OPRD OR is reference OR is unassigned)
# the only advantage is to leave the opportunity for a single read to be both OPRD & OPRI
# which are (hopefully) cases which will go away when aligning later
# Check
# that means all reads should be in at least one category
# i.e. all three columns cannot be NA at once
subset(ied, is.na(opri) & is.na(oprd) & is.na(othercategory))
### ...
# Check strand directions -------------------------------------------------
# check nothing else than 0 (= forward) or 16 (= reverse complement) in samflag column
which(ied$samflag!=0 & ied$samflag!=16) # ok
# if yes -- replace all 0 to +, all 16 to -
ied[which(ied$samflag==0), 'samflag'] <- '+'
ied[which(ied$samflag==16), 'samflag'] <- '-'
# check all +/- now
which(ied$samflag!='+' & ied$samflag!='-')
# ok
# change column name to strand
colnames(ied)[which(colnames(ied)=='samflag')] <- 'strand'
# quick sanity checks with strand direction
strandcount <- ied %>%
group_by(strand) %>%
tally(name='count') %>%
mutate(prop=count/nrow(ied))
strandPro <- ggplot(strandcount, aes(x=strand, y=prop)) +
geom_col() +
theme_minimal() +
xlab('strand') + ylab('proportion of reads')
strandPro
ggsave(here('controlPCR', 'strand.pdf'), strandPro, width=100, height=100, units='mm')
# add a dummy column
strandcount$dummy <- factor(1)
# same but stack
strandProStack <- ggplot(strandcount, aes(x=dummy, y=prop, fill=strand)) +
geom_col() +
theme_minimal() +
xlab(NULL) + ylab('proportion of reads')
strandProStack
ggsave(here('controlPCR', 'strandProStack.pdf'), strandProStack, width=100, height=100, units='mm')
# ...
# 8- Alignment to consensus OPR sequences ---------------------------------
# for example, say a read has an insertion of 92 bp >> it could be an OPRI4
# how can we measure how convincing it is?
# we will align to a consensus sequence of what an OPRI4 looks like
# this makes use of the consensus R sequence written by Simon, which uses flexible nucleotides (eg. Y = C or T)
# >> see OPRconsensus.xlsx / generateOPRCatalog.R / OPRConsensusCatalog.xlsx for more information
# previously: was trying alignment to published OPR sequences
# but there is some logic to which nucleotides can change, so better to define a consensus sequence
# advantage: it includes more possible OPRs (OPR sequences that are probably out there but were never detected or published)
# function msaConsensusFlexible takes an alignment and outputs a corrected consensus sequence
# ! msa supports alignments with flexible nucleotides, see https://github.com/UBod/msa/issues/12
# but eg. G vs A returns R; which is correct but not what I want here
# I want to leave flexibility only if the reference leaves that option open
# where eg. Y vs T returns Y
# build look-up table for flexible nucleotides
fnu <- c('R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N') # flexible nucleotides
lut <- as.data.frame(matrix(nrow=4, ncol=length(fnu)))
colnames(lut) <- fnu
lut$R[1:2] <- c('A', 'G')
lut$Y[1:2] <- c('C', 'T')
lut$S[1:2] <- c('G', 'C')
lut$W[1:2] <- c('A', 'T')
lut$K[1:2] <- c('G', 'T')
lut$M[1:2] <- c('A', 'C')
lut$B[1:3] <- c('C', 'G', 'T')
lut$D[1:3] <- c('A', 'G', 'T')
lut$H[1:3] <- c('A', 'C', 'T')
lut$V[1:3] <- c('A', 'C', 'G')
lut$N[1:4] <- c('A', 'C', 'G', 'T')
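# quick illustrative look-up (not part of the original pipeline); Y allows C or T, so expected 'C' 'T' NA NA
lut$Y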
# msaConsensusFlexible
# use instead of msaConsensusSequence; it will return the consensus sequence of an alignment, but
# will replace the ? in the consensus sequence that were thrown only because of a flexible nucleotide where the alignment was actually fine (eg. Y vs T)
# ! Note; it assumes a one-to-one alignment and that only one of the sequences (typically the reference) allows flexible nucleotides
msaConsensusFlexible <- function (alignment){
conm <- consensusMatrix(alignment)
cons <- msaConsensusSequence(alignment)
sapply(1:ncol(conm), function(pos) { # for each position of the alignment
al <- conm[,pos] # get the alignment matrix at that position
# count number of flexible nucleotides at that position
nfnu <- sum(al[which(names(al) %in% fnu)])
if (nfnu==0) { # if no flexible nucleotide at that position, do nothing
return(NULL)
}
if (nfnu>0) { # if any flexible nucleotide at that position
# get the flexible nucleotide
f <- names(which(al[which(names(al) %in% fnu)] > 0))
# get the standard nucleotide
s <- names(which(al[which(!names(al) %in% fnu)] > 0))
# is the alignment valid? Check in the look-up table; eg. T vs Y is ok
if (!s %in% lut[,f]) { # if not -- do NOT modify the consensus sequence at that position (should leave ?)
return(NULL)
}
else if (s %in% lut[,f]) { # if yes -- modify the consensus sequence at that position (should put the flexible nucleotide instead of the ?)
substr(cons, pos, pos) <<- f
}
}
})
return(cons)
}
# function alignCon_NumMis:
# takes a query DNA sequence (i.e. possible somatic mutation)
# looks at the possible OPRD/OPRI for that read
# takes consensus DNA sequence from the catalog of OPRs (by allele)
# try aligning to the consensus sequence, gets number of mismatches
# reports the number of mismatches vs consensus
alignCon_NumMis <- function(query, allele) {
# eg. alignCon_NumMis('CGAAATGC', 'OPRD1'), i.e. try aligning CGAAATGC to the consensus OPRD1 sequence and tell me the number of mismatches
# can also be alignCon_NumMis('CGAAATGC', 'reference')
if( is.na(allele) | allele=='unassigned') { # if allele is NA (eg. a 8OPRI read will usually have NA in oprd column; all unassigned reads are opri/oprd NA; etc.)
return(NA)
}
if(! allele %in% cato$genotype) { # check the allele is in catalog (I think should always be ok as catalog covers all possible cases)
stop('\t \t \t \t >>> Allele ', allele, ' was not found in catalog \n')
}
# 1- take the consensus sequence (used as reference)
ref <- as.character(subset(cato, genotype==allele, consensusseq))
# 2- align query to that reference
ali <- msa(c(ref, query), type='dna', method='ClustalW')
# 3- correct the consensus sequence to allow flexible nucleotides
# using msaConsensusFlexible()
con <- msaConsensusFlexible(ali)
# 4- count number of mismatches (?)
nmis <- str_count(con, '\\?')
# return that
return(nmis)
}
# sanity checks
# align reference OPR to the consensus sequence
alignCon_NumMis('CCTCAGGGCGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCTCATGGTGGTGGCTGGGGGCAGCCCCATGGTGGTGGCTGGGGACAGCCTCATGGTGGTGGCTGGGGTCAA',
'reference')
# gives 0 mismatch
# check eg. G vs A should NOT return R; only edit consensus if reference leaves the option open
alitest <- msa(c('CYGTK', 'CTAAT'), type='dna', method='ClustalW') # should return C Y ? ? K
msaConsensusFlexible(alitest) # ok
tic('\t \t \t \t Aligning reads took... \n')
# align all possible OPRI reads to their consensus sequence
opri_mis <- sapply(1:nrow(ied), function(ri){ # ri = read index; opri_mis = OPRI mismatches
# Note; will be one element per row of ied, so can be added to ied as is
cat('\t \t \t \t >>> Read', ri, 'out of', nrow(ied), '\n')
readseq <- as.character(ied[ri, 'seq']) # read sequence
oprgeno <- as.character(ied[ri, 'opri']) # possible OPR allele
alignCon_NumMis(readseq, oprgeno)
})
# put the results in ied
ied$opri_nummis <- opri_mis
# align all possible OPRD reads to their consensus sequence
oprd_mis <- sapply(1:nrow(ied), function(ri){ # ri = read index; oprd_mis = OPRD mismatches
cat('\t \t \t \t >>> Read', ri, 'out of', nrow(ied), '\n')
readseq <- as.character(ied[ri, 'seq']) # read sequence
oprgeno <- as.character(ied[ri, 'oprd']) # possible OPR genotype
alignCon_NumMis(readseq, oprgeno)
})
# put the results in ied
ied$oprd_nummis <- oprd_mis
# align reference reads to the consensus sequence
ref_mis <- sapply(1:nrow(ied), function(ri){ # ri = read index; ref_mis = reference mismatches
cat('\t \t \t \t >>> Read', ri, 'out of', nrow(ied), '\n')
readseq <- as.character(ied[ri, 'seq']) # read sequence
oprgeno <- as.character(ied[ri, 'othercategory']) # take what is othercategory column as allele
# Note alignCon_NumMis takes care of 'unassigned' already, i.e. returns NA (as there is nothing to align to)
alignCon_NumMis(readseq, oprgeno)
})
# put the results in ied
ied$reference_nummis <- ref_mis
toc()
# 9- Threshold for maximum number of mismatches? --------------------------
# what is a good threshold for maximum number of mismatches?
# eg. if 25% of the nucleotides in one read do not match the consensus OPR sequence, might not be so convincing
# >> look at the 'positive control' distribution of number of mismatches
# eg. take sample genotyped as OPRD2 by Sanger
# take all the reads that are consistent with an OPRD2
# and look at their number of mismatches with OPRD2 consensus sequence
# add samples metadata
ied <- left_join(x=ied, y=meta, by='sample')
# we do not need the colour column
ied$colour <- NULL
# plot mismatches plot
# list to store mismatches plots
# will basically do one list for each copy of the genome, i.e.
# first list = all the SVs or reference for the samples w/o SV
# second list = reference for the samples that have a SV / OPRD1 for the compound homozygous sample
# List 1
misPlots1 <- vector(mode='list', length=length(pdgs))
names(misPlots1) <- pdgs
for (s in 1:length(pdgs)) {
# get the PDG we are plotting
pdg <- pdgs[[s]]
# get its Sanger genotype
sang <- as.character(unique(subset(ied, sample==pdg, SV)))
# if Sanger genotype is NA (example sCJD case)
if (is.na(sang)) { # then plot the mismatches vs reference
# then it means its genotype is reference
sang <- 'reference'
tmp <- subset(ied, sample==pdg & othercategory==sang) # subset of ied for plot
misplot <- ggplot(tmp, aes(reference_nummis, after_stat(density))) + # mismatches plot
geom_histogram(binwidth=1) +
theme_minimal() +
coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
labs(title=pdg,
subtitle=sang)
misPlots1[[s]] <- misplot # add it to the list
# if Sanger genotype is an insertion...
} else if (substr(sang, 1, 4) == 'OPRI') {
tmp <- subset(ied, sample==pdg & opri==sang) # subset of ied for plot
misplot <- ggplot(tmp, aes(opri_nummis, after_stat(density))) + # mismatches plot
geom_histogram(binwidth=1) +
theme_minimal() +
coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
labs(title=pdg,
subtitle=sang)
misPlots1[[s]] <- misplot # add it to the list
# if Sanger genotype is a deletion...
} else if (substr(sang, 1, 4) == 'OPRD') {
tmp <- subset(ied, sample==pdg & oprd==sang) # subset of ied for plot
misplot <- ggplot(tmp, aes(oprd_nummis, after_stat(density))) + # mismatches plot
geom_histogram(binwidth=1) +
theme_minimal() +
coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
labs(title=pdg,
subtitle=sang)
misPlots1[[s]] <- misplot # add it to the list
}
}
# List 2
misPlots2 <- vector(mode='list', length=length(pdgs))
names(misPlots2) <- pdgs
for (s in 1:length(pdgs)) {
# get the PDG we are plotting
pdg <- pdgs[[s]]
# get its first Sanger genotype
sang <- as.character(unique(subset(ied, sample==pdg, SV)))
if (is.na(sang)) { # if its first Sanger genotype is NA, we already plotted the reference reads above
# so do not repeat the plot but instead plot an empty plot to fill the list
misplot <- ggplot() +
theme_void() +
labs(title=pdg)
misPlots2[[s]] <- misplot # add it to the list
# if first Sanger genotype is not NA, then we should plot the reference reads now
# (or the 1 OPRD reads for the compound homozygous; will be in SV2 column)
} else if(!is.na(sang)) {
sang2 <- as.character(unique(subset(ied, sample==pdg, SV2))) # look at SV2 column, the only one not NA is the compound homozygous
if(is.na(sang2)) {
# then it means we should plot the reference reads
sang2 <- 'reference'
tmp <- subset(ied, sample==pdg & othercategory==sang2) # subset of ied for plot
misplot <- ggplot(tmp, aes(reference_nummis, after_stat(density))) + # mismatches plot
geom_histogram(binwidth=1) +
theme_minimal() +
coord_cartesian(xlim=c(0,100), ylim=c(0, 0.3)) +
labs(title=pdg,
subtitle=sang2)
misPlots2[[s]] <- misplot # add it to the list
# if not NA, i.e. we are dealing with the compound homozygous; so it will be 1 OPRD
} else if (!is.na(sang2)) {
if(sang2!='OPRD1') stop('\t \t \t \t >>> Something unexpected with the compound homozygous sample \n') # check that is correct