diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fba548e --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.[oa] +bwa +test +test64 +.*.swp +Makefile.bak +bwamem-lite diff --git a/Makefile b/Makefile index 5a2f792..a1a099a 100644 --- a/Makefile +++ b/Makefile @@ -1,50 +1,82 @@ CC= gcc -CXX= g++ -#CFLAGS= -g -Wall -#CFLAGS= -pg -Wall -O2 -#CFLAGS= -O3 -L/scr/plastilin/pkerp/local/lib -#CFLAGS = -pg -#CFLAGS = -O3 -pg -CFLAGS =-O3 -Wall -CXXFLAGS= $(CFLAGS) -DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 -OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ - is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ - bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ +#CC= clang --analyze +CFLAGS= -g -Wall -O2 -Wno-unused-function +WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS +AR= ar +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o +AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ + is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o bamlite.o fastmap.o bwtpssm.o seq2pssm.o \ - pssm.o bwtpssmgap.c bwtge.o + bwtsw2_chain.o fastmap.o bwtpssm.o seq2pssm.o bwtsw2_pair.o \ + pssm.o bwtpssmgap.o bwtge.o PROG= bwa -INCLUDES= +INCLUDES= LIBS= -lm -lz -lpthread -lgdsl -SUBDIRS= . bwt_gen +SUBDIRS= . .SUFFIXES:.c .o .cc .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ -.cc.o: - $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ all:$(PROG) -bwa:$(OBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(OBJS) main.o -o $@ $(LIBS) +bwa:libbwa.a $(AOBJS) main.o + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) -QSufSort.o:QSufSort.h +bwamem-lite:libbwa.a example.o + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. 
-lbwa $(LIBS) -bwt.o:bwt.h -bwtio.o:bwt.h -bwtaln.o:bwt.h bwtaln.h kseq.h -bwt1away.o:bwt.h bwtaln.h -bwt2fmv.o:bwt.h -bntseq.o:bntseq.h -bwtgap.o:bwtgap.h bwtaln.h bwt.h -fastmap:bwt.h - -bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h -bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h -bwtsw2_main.o:bwtsw2.h +libbwa.a:$(LOBJS) + $(AR) -csru $@ $(LOBJS) clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a + +depend: + ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c ) + +# DO NOT DELETE THIS LINE -- make depend depends on it. + +QSufSort.o: QSufSort.h +bamlite.o: bamlite.h malloc_wrap.h +bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h +bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h malloc_wrap.h kseq.h +bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h +bwamem.o: ksort.h utils.h kbtree.h +bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h +bwamem_pair.o: utils.h ksw.h +bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h +bwape.o: ksw.h khash.h +bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h +bwase.o: bwa.h ksw.h +bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h +bwt.o: utils.h bwt.h kvec.h malloc_wrap.h +bwt_gen.o: QSufSort.h malloc_wrap.h +bwt_lite.o: bwt_lite.h malloc_wrap.h +bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h +bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h +bwtge.o: bwt.h +bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h +bwtpssm.o: pssm.h bwtaln.h bwtgap.h bwtpssm.h bwtpssmgap.h utils.h seq2pssm.h +bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h +bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h +bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h +bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h +bwtsw2_core.o: khash.h ksort.h +bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h +bwtsw2_pair.o: utils.h bwt.h bntseq.h 
bwtsw2.h bwt_lite.h kstring.h +bwtsw2_pair.o: malloc_wrap.h ksw.h +example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h +fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h +is.o: malloc_wrap.h +kopen.o: malloc_wrap.h +kstring.o: kstring.h malloc_wrap.h +ksw.o: ksw.h malloc_wrap.h +main.o: utils.h +malloc_wrap.o: malloc_wrap.h +pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h +pssm.o: pssm.h bwtaln.h +seq2pssm.o: bwtaln.h pssm.h probs.h +utils.o: utils.h ksort.h malloc_wrap.h kseq.h diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..40f4433 --- /dev/null +++ b/NEWS @@ -0,0 +1,997 @@ +Release 0.7.8 (31 March, 2014) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes in BWA-MEM: + + * Bugfix: off-diagonal X-dropoff (option -d) not working as intended. + Short-read alignment is not affected. + + * Bugfix: unnecessarily large bandwidth used during global alignment, + which reduces the mapping speed by ~5% for short reads. Results are not + affected. + + * Bugfix: when the matching score is not one, paired-end mapping quality is + inaccurate. + + * When the matching score (option -A) is changed, scale all score-related + options accordingly unless overridden by users. + + * Allow to specify different gap open (or extension) penalties for deletions + and insertions separately. + + * Allow to specify the insert size distribution. + + * Better and more detailed debugging information. + +With the default setting, 0.7.8 and 0.7.7 gave identical output on one million +100bp read pairs. + +(0.7.8: 31 March 2014, r455) + + + +Release 0.7.7 (25 February, 2014) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release fixes incorrect MD tags in the BWA-MEM output. + +A note about short-read mapping to GRCh38. The new human reference genome +GRCh38 contains 60Mbp program generated alpha repeat arrays, some of which are +hard masked as they cannot be localized. These highly repetitive arrays make +BWA-MEM ~50% slower. 
If you are concerned with the performance of BWA-MEM, you +may consider using option "-c2000 -m50". On simulated data, this setting helps +the performance at a very minor cost on accuracy. I may consider changing the +default in future releases. + +(0.7.7: 25 February 2014, r441) + + + +Release 0.7.6 (31 January, 2014) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes in BWA-MEM: + + * Changed the way mapping quality is estimated. The new method tends to give + the same alignment a higher mapping quality. On paired-end reads, the change + is minor as with pairing, the mapping quality is usually high. For short + single-end reads, the difference is considerable. + + * Improved load balance when many threads are spawned. However, bwa-mem is + still not very thread efficient, probably due to the frequent heap memory + allocation. Further improvement is a little difficult and may affect the + code stability. + + * Allow to use different clipping penalties for 5'- and 3'-ends. This helps + when we do not want to clip one end. + + * Print the @PG line, including the command line options. + + * Improved the band width estimate: a) fixed a bug causing the band + width estimated from extension not used in the final global alignment; b) + try doubled band width if the global alignment score is smaller. + Insufficient band width leads to wrong CIGAR and spurious mismatches/indels. + + * Added a new option -D to fine tune a heuristic on dropping suboptimal hits. + Reducing -D increases accuracy but decreases the mapping speed. If unsure, + leave it to the default. + + * Bugfix: for a repetitive single-end read, the reported hit is not randomly + distributed among equally best hits. + + * Bugfix: missing paired-end hits due to unsorted list of SE hits. + + * Bugfix: incorrect CIGAR caused by a defect in the global alignment. + + * Bugfix: incorrect CIGAR caused by failed SW rescue. 
+ + * Bugfix: alignments largely mapped to the same position are regarded to be + distinct from each other, which leads to underestimated mapping quality. + + * Added the MD tag. + +There are no changes to BWA-backtrack in this release. However, it has a few +known issues yet to be fixed. If you prefer BWA-backtrack, it is still advised to +use bwa-0.6.x. + +While I developed BWA-MEM, I also found a few issues with BWA-SW. It is now +possible to improve BWA-SW with the lessons learned from BWA-MEM. However, as +BWA-MEM is usually better, I will not improve BWA-SW until I find applications +where BWA-SW may excel. + +(0.7.6: 31 January 2014, r432) + + + +Release 0.7.5a (30 May, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fixed a bug in BWA-backtrack which leads to off-by-one mapping errors in rare +cases. + +(0.7.5a: 30 May 2013, r405) + + + +Release 0.7.5 (29 May, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes in all components: + + * Improved error checking on memory allocation and file I/O. Patches provided + by Rob Davies. + + * Updated README. + + * Bugfix: return code is zero upon errors. + +Changes in BWA-MEM: + + * Changed the way a chimeric alignment is reported (conforming to the upcoming + SAM spec v1.5). With 0.7.5, if the read has a chimeric alignment, the paired + or the top hit uses soft clipping and is marked with neither 0x800 nor 0x100 + bits. All the other hits part of the chimeric alignment will use hard + clipping and be marked with 0x800 if option "-M" is not in use, or marked + with 0x100 otherwise. + + * Other hits part of a chimeric alignment are now reported in the SA tag, + conforming to the SAM spec v1.5. + + * Better method for resolving an alignment bridging two or more short + reference sequences. The current strategy maps the query to the reference + sequence that covers the middle point of the alignment. For most + applications, this change has no effects. + +Changes in BWA-backtrack: + + * Added a magic number to .sai files. 
This prevents samse/sampe from reading + corrupted .sai (e.g. a .sai file containing LSF log) or incompatible .sai + generated by a different version of bwa. + + * Bugfix: alignments in the XA:Z: tag were wrong. + + * Keep track of #ins and #del during backtracking. This simplifies the code + and reduces errors in rare corner cases. I should have done this in the + early days of bwa. + +In addition, if you use BWA-MEM or the fastmap command of BWA, please cite: + + - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs + with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN]. + +Thank you. + +(0.7.5: 29 May 2013, r404) + + + +Release 0.7.4 (23 April, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a bugfix release. Most of bugs are considered to be minor which only +occur very rarely. + + * Bugfix: wrong CIGAR when a query sequence bridges three or more target + sequences. This only happens when aligning reads to short assembly contigs. + + * Bugfix: leading "D" operator in CIGAR. + + * Extend more seeds for better alignment around tandem repeats. This is also + a cause of the leading "D" operator in CIGAR. + + * Bugfix: SSE2-SSW may occasionally find incorrect query starting position + around tandem repeat. This will lead to a suboptimal CIGAR in BWA-MEM and + a wrong CIGAR in BWA. + + * Bugfix: clipping penalty does not work as is intended when there is a gap + towards the end of a read. + + * Fixed an issue caused by a bug in the libc from Mac/Darwin. In Darwin, + fread() is unable to read a data block longer than 2GB due to an integer + overflow bug in its implementation. + +Since version 0.7.4, BWA-MEM is considered to reach similar stability to +BWA-backtrack for short-read mapping. + +(0.7.4: 23 April, r385) + + + +Release 0.7.3a (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed +in another corner case. 
+ +(0.7.3a: 15 March 2013, r367) + + + +Release 0.7.3 (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes to BWA-MEM: + + * Bugfix: pairing score is inaccurate when option -A does not take the default + value. This is a very minor issue even if it happens. + + * Bugfix: occasionally wrong CIGAR. This happens when in the alignment there + is a 1bp deletion and a 1bp insertion which are close to the end of the + reads, and there are no other substitutions or indels. BWA-MEM would not do + a gapped alignment due to the bug. + + * New feature: output other non-overlapping alignments in the XP tag such that + we can see the entire picture of alignment from one SAM line. XP gives the + position, CIGAR, NM and mapQ of each aligned subsequence of the query. + +BWA-MEM has been used to align ~300Gbp 100-700bp SE/PE reads. SNP/indel calling +has also been evaluated on part of these data. BWA-MEM generally gives better +pre-filtered SNP calls than BWA. No significant issues have been observed since +0.7.2, though minor improvements or bugs (e.g. the bug fixed in this release) +are still possible. If you find potential issues, please send bug reports to +http://biostars.org (free registration required). + +In addition, more detailed description of the BWA-MEM algorithm can be found at +http://arxiv.org/abs/1303.3997. + +(0.7.3: 15 March 2013, r366) + + + +Release 0.7.2 (9 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, +flagging `properly paired' also gets improved a little. + +(0.7.2: 9 March 2013, r351) + + + +Release 0.7.1 (8 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes to BWA-MEM: + + * Bugfix: rare segmentation fault caused by a partial hit to the end of the + last sequence. + + * Bugfix: occasional mis-pairing given an interleaved fastq. + + * Bugfix: wrong mate information when the mate is unmapped. SAM generated by + BWA-MEM can now be validated with Picard. 
+ + * Improved the performance and accuracy for ultra-long query sequences. + Short-read alignment is not affected. + +Changes to other components: + + * In BWA-backtrack and BWA-SW, replaced the code for global alignment, + Smith-Waterman and SW extension. The performance and accuracy of the two + algorithms stay the same. + + * Added an experimental subcommand to merge overlapping paired ends. The + algorithm is very conservative: it may miss true overlaps but rarely makes + mistakes. + +An important note is that like BWA-SW, BWA-MEM may output multiple primary +alignments for a read, which may cause problems to some tools. For aligning +sequence reads, it is advised to use `-M' to flag extra hits as secondary. This +option is not the default because multiple primary alignments are theoretically +possible in sequence alignment. + +(0.7.1: 8 March 2013, r347) + + + +Beta Release 0.7.0 (28 February, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query +sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap +algorithm and extends seeds with banded affine-gap-penalty dynamic programming +(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or +longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA +and BWA-SW and is more accurate. It also supports split alignments like BWA-SW +and may optionally output multiple hits like BWA. BWA-MEM does not guarantee +to find hits within a certain edit distance, but BWA is not efficient for such +task given longer reads anyway, and the edit-distance criterion is arguably +not as important in long-read alignment. + +In addition to the algorithmic improvements, BWA-MEM also implements a few +handy features in practical aspects: + + 1. BWA-MEM automatically switches between local and glocal (global wrt reads; + local wrt reference) alignment. 
It reports the end-to-end glocal alignment + if the glocal alignment is not much worse than the optimal local alignment. + Glocal alignment reduces reference bias. + + 2. BWA-MEM automatically infers pair orientation from a batch of single-end + alignments. It allows more than one orientations if there are sufficient + supporting reads. This feature has not been tested on reads from Illumina + jumping library yet. (EXPERIMENTAL) + + 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It + is possible to convert a name-sorted BAM to an interleaved fastq on the fly + and feed the data stream to BWA-MEM for mapping. + + 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which + helps to transfer individual read annotations to the output. + + 5. BWA-MEM supports more advanced piping. Users can now run: + (bwa mem ref.fa '20) CPU cores. + + * Check I/O error. + + * Increased the maximum barcode length to 63bp. + + * Automatically choose the indexing algorithm. + + * Bugfix: very rare segfault due to an uninitialized variable. The bug also + affects the placement of suboptimal alignments. The effect is very minor. + +This release involves quite a lot of tricky changes. Although it has been +tested on a few data sets, subtle bugs may be still hidden. It is *NOT* +recommended to use this release in a production pipeline. In future, however, +BWA-SW may be better when reads continue to go longer. I would encourage users +to try the 0.6 release. I would also like to hear the users' experience. Thank +you. + +(0.6.0: 12 November 2011, r85) + + + +Beta Release 0.5.9 (24 January, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Feature: barcode support via the `-B' option. + + * Feature: Illumina 1.3+ read format support via the `-I' option. + + * Bugfix: RG tags are not attached to unmapped reads. + + * Bugfix: very rare bwasw mismappings + + * Recommend options for PacBio reads in bwasw help message. 
+ + +Also, since January 13, the BWA master repository has been moved to github: + + https://github.com/lh3/bwa + +The revision number has been reset. All recent changes will be first +committed to this repository. + +(0.5.9: 24 January 2011, r16) + + + +Beta Release Candidate 0.5.9rc1 (10 December, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwasw: + + * Output unmapped reads. + + * For a repetitive read, choose a random hit instead of a fixed + one. This is not well tested. + +Notable changes in bwa-short: + + * Fixed a bug in the SW scoring system, which may lead to unexpected + gaps towards the end of a read. + + * Fixed a bug which invalidates the randomness of repetitive reads. + + * Fixed a rare memory leak. + + * Allowed to specify the read group at the command line. + + * Take name-grouped BAM files as input. + +Changes to this release are usually safe in that they do not interfere +with the key functionality. However, the release has only been tested on +small samples instead of on large-scale real data. If anything weird +happens, please report the bugs to the bio-bwa-help mailing list. + +(0.5.9rc1: 10 December 2010, r1561) + + + +Beta Release 0.5.8 (8 June, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwasw: + + * Fixed an issue of missing alignments. This should happen rarely and + only when the contig/read alignment is multi-part. Very rarely, bwasw + may still miss a segment in a multi-part alignment. This is difficult + to fix, although possible. + +Notable changes in bwa-short: + + * Discard the SW alignment when the best single-end alignment is much + better. Such a SW alignment may be caused by structural variations and + forcing it to be aligned leads to false alignment. This fix has not + been tested thoroughly. It would be great to receive more user + feedback on this issue. + + * Fixed a typo/bug in sampe which leads to unnecessarily large memory + usage in some cases. 
+ + * Further reduced the chance of reporting `weird pairing'. + +(0.5.8: 8 June 2010, r1442) + + + +Beta Release 0.5.7 (1 March, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release only has an effect on paired-end data with fat insert-size +distribution. Users are still recommended to update as the new release +improves the robustness to poor data. + + * The fix for `weird pairing' was not working in version 0.5.6, pointed + out by Carol Scott. It should work now. + + * Optionally output to a normal file rather than to stdout (by Tim + Fennel). + +(0.5.7: 1 March 2010, r1310) + + + +Beta Release 0.5.6 (10 February, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwa-short: + + * Report multiple hits in the SAM format at a new tag XA encoded as: + (chr,pos,CIGAR,NM;)*. By default, if a paired or single-end read has + 4 or fewer hits, they will all be reported; if a read in an anomalous + pair has 11 or fewer hits, all of them will be reported. + + * Perform Smith-Waterman alignment also for anomalous read pairs when + both ends have quality higher than 17. This reduces false positives + for some SV discovery algorithms. + + * Do not report "weird pairing" when the insert size distribution is + too fat or has a mean close to zero. + + * If a read is bridging two adjacent chromosomes, flag it as unmapped. + + * Fixed a small but long existing memory leak in paired-end mapping. + + * Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly + parsed by solid2fastq.pl; b) truncated quality string is resolved; c) + SOLiD read mapped to the reverse strand is complemented. + + * Bwa now calculates skewness and kurtosis of the insert size + distribution. + + * Deploy a Bayesian method to estimate the maximum distance for a read + pair considered to be paired properly. The method is proposed by + Gerton Lunter, but bwa only implements a simplified version. 
+ + * Export more functions for Java bindings, by Matt Hanna (See: + http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings) + + * Abstract bwa CIGAR for further extension, by Rodrigo Goya. + +(0.5.6: 10 February 2010, r1303) + + + +Beta Release 0.5.5 (10 November, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a bug fix release: + + * Fixed a serious bug/typo in aln which does not occur given short + reads, but will lead to segfault for >500bp reads. Of course, the aln + command is not recommended for reads longer than 200bp, but this is a + bug anyway. + + * Fixed a minor bug/typo which leads to incorrect single-end mapping + quality when one end is moved to meet the mate-pair requirement. + + * Fixed a bug in samse for mapping in the color space. This bug is + caused by quality filtration added since 0.5.1. + +(0.5.5: 10 November 2009, r1273) + + + +Beta Release 0.5.4 (9 October, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since this version, the default seed length used in the "aln" command is +changed to 32. + +Notable changes in bwa-short: + + * Added a new tag "XC:i" which gives the length of clipped reads. + + * In sampe, skip alignments in case of a bug in the Smith-Waterman + alignment module. + + * In sampe, fixed a bug in pairing when the read sequence is identical + to its reverse complement. + + * In sampe, optionally preload the entire FM-index into memory to + reduce disk operations. + +Notable changes in dBWT-SW/BWA-SW: + + * Changed name dBWT-SW to BWA-SW. + + * Optionally use "hard clipping" in the SAM output. + +(0.5.4: 9 October 2009, r1245) + + + +Beta Release 0.5.3 (15 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fixed a critical bug in bwa-short: reads mapped to the reverse strand +are not complemented. 
+ +(0.5.3: 15 September 2009, r1225) + + + +Beta Release 0.5.2 (13 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwa-short: + + * Optionally trim reads before alignment. See the manual page on `aln + -q' for detailed description. + + * Fixed a bug in calculating the NM tag for a gapped alignment. + + * Fixed a bug given a mixture of reads with some longer than the seed + length and some shorter. + + * Print SAM header. + +Notable changes in dBWT-SW: + + * Changed the default value of -T to 30. As a result, the accuracy is a + little higher for short reads at the cost of speed. + +(0.5.2: 13 September 2009, r1223) + + + +Beta Release 0.5.1 (2 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in the short read alignment component: + + * Fixed a bug in samse: do not write mate coordinates. + +Notable changes in dBWT-SW: + + * Randomly choose one alignment if the read is a repetitive. + + * Fixed a flaw when a read is mapped across two adjacent reference + sequences. However, wrong alignment reports may still occur rarely in + this case. + + * Changed the default band width to 50. The speed is slower due to this + change. + + * Improved the mapping quality a little given long query sequences. + +(0.5.1: 2 September 2009, r1209) + + + +Beta Release 0.5.0 (20 August, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release implements a novel algorithm, dBWT-SW, specifically +designed for long reads. It is 10-50 times faster than SSAHA2, depending +on the characteristics of the input data, and achieves comparable +alignment accuracy while allowing chimera detection. In comparison to +BLAT, dBWT-SW is several times faster and much more accurate especially +when the error rate is high. Please read the manual page for more +information. + +The dBWT-SW algorithm is kind of developed for future sequencing +technologies which produce much longer reads with a little higher error +rate. 
It is still at its early development stage. Some features are +missing and it may be buggy although I have evaluated on several +simulated and real data sets. But following the "release early" +paradigm, I would like the users to try it first. + +Other notable changes in BWA are: + + * Fixed a rare bug in the Smith-Waterman alignment module. + + * Fixed a rare bug about the wrong alignment coordinate when a read is + poorly aligned. + + * Fixed a bug in generating the "mate-unmap" SAM tag when both ends in + a pair are unmapped. + +(0.5.0: 20 August 2009, r1200) + + + +Beta Release 0.4.9 (19 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Interestingly, the integer overflow bug claimed to be fixed in 0.4.7 has +not in fact. Now I have fixed the bug. Sorry for this and thank Quan +Long for pointing out the bug (again). + +(0.4.9: 19 May 2009, r1075) + + + +Beta Release 0.4.8 (18 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One change to "aln -R". Now by default, if there are no more than `-R' +equally best hits, bwa will search for suboptimal hits. This change +affects the ability in finding SNPs in segmental duplications. + +I have not tested this option thoroughly, but this simple change is less +likely to cause new bugs. Hope I am right. + +(0.4.8: 18 May 2009, r1073) + + + +Beta Release 0.4.7 (12 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Output SM (single-end mapping quality) and AM (smaller mapping + quality among the two ends) tag from sam output. + + * Improved the functionality of stdsw. + + * Made the XN tag more accurate. + + * Fixed a very rare segfault caused by integer overflow. + + * Improve the insert size estimation. + + * Fixed compiling errors for some Linux systems. + +(0.4.7: 12 May 2009, r1066) + + + +Beta Release 0.4.6 (9 March, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release improves the SOLiD support. First, a script for converting +SOLiD raw data is provided. 
This script is adapted from solid2fastq.pl +in the MAQ package. Second, a nucleotide reference file can be directly +used with `bwa index'. Third, SOLiD paired-end support is +completed. Fourth, color-space reads will be converted to nucleotides +when SAM output is generated. Color errors are corrected in this +process. Please note that like MAQ, BWA cannot make use of the primer +base and the first color. + +In addition, the calculation of mapping quality is also improved a +little bit, although end-users may barely observe the difference. + +(0.4.6: 9 March 2009, r915) + + + +Beta Release 0.4.5 (18 February, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Not much happened, but I think it would be good to let the users use the +latest version. + +Notable changes (Thank Bob Handsaker for catching the two bugs): + + * Improved boundary check. Previous version may still give incorrect + alignment coordinates in rare cases. + + * Fixed a bug in SW alignment when no residue matches. This only + affects the `sampe' command. + + * Robustly estimate insert size without setting the maximum on the + command line. Since this release `sampe -a' only has an effect if + there are not enough good pairs to infer the insert size + distribution. + + * Reduced false PE alignments a little bit by using the inferred insert + size distribution. This fix may be more important for long insert + size libraries. + +(0.4.5: 18 February 2009, r829) + + + +Beta Release 0.4.4 (15 February, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is mainly a bug fix release. Notable changes are: + + * Imposed boundary check for extracting subsequence from the + genome. Previously this causes memory problem in rare cases. + + * Fixed a bug in failing to find whether an alignment overlapping with + N on the genome. + + * Changed MD tag to meet the latest SAM specification. 
+ +(0.4.4: 15 February 2009, r815) + + + +Beta Release 0.4.3 (22 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Treat an ambiguous base N as a mismatch. Previous versions will not + map reads containing any N. + + * Automatically choose the maximum allowed number of differences. This + is important when reads of different lengths are mixed together. + + * Print mate coordinate if only one end is unmapped. + + * Generate MD tag. This tag encodes the mismatching positions and the + reference bases at these positions. Deletions from the reference will + also be printed. + + * Optionally dump multiple hits from samse, in another concise format + rather than SAM. + + * Optionally disable iterative search. This is VERY SLOOOOW, though. + + * Fixed a bug in generating SAM. + +(0.4.3: 22 January 2009, r787) + + + +Beta Release 0.4.2 (9 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Aaron Quinlan found a bug in the indexer: the bwa indexer segfaults if +there are no comment texts in the FASTA header. This is a critical +bug. Nothing else was changed. + +(0.4.2: 9 January 2009, r769) + + + +Beta Release 0.4.1 (7 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +I am sorry for the quick updates these days. I like to set a milestone +for BWA and this release seems to be. For paired end reads, BWA also +does Smith-Waterman alignment for an unmapped read whose mate can be +mapped confidently. With this strategy BWA achieves similar accuracy to +maq. Benchmark is also updated accordingly. + +(0.4.1: 7 January 2009, r760) + + + +Beta Release 0.4.0 (6 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In comparison to the release two days ago, this release is mainly tuned +for performance with some tricks I learnt from Bowtie. However, as the +indexing format has also been changed, I have to increase the version +number to 0.4.0 to emphasize that *DATABASE MUST BE RE-INDEXED* with +`bwa index'. + + * Improved the speed by about 20%. 
+ + * Added multi-threading to `bwa aln'. + +(0.4.0: 6 January 2009, r756) + + + +Beta Release 0.3.0 (4 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Added paired-end support by separating SA calculation and alignment + output. + + * Added SAM output. + + * Added evaluation to the documentation. + +(0.3.0: 4 January 2009, r741) + + + +Beta Release 0.2.0 (15 August, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Take the subsequence at the 5'-end as seed. Seeding strategy greatly + improves the speed for long reads, at the cost of missing a few true + hits that contain many differences in the seed. Seeding also increases + the memory by 800MB. + + * Fixed a bug which may miss some gapped alignments. Fixing the bug + also slows the speed a little. + +(0.2.0: 15 August 2008, r428) + + + +Beta Release 0.1.6 (08 August, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Give accurate CIGAR string. + + * Add a simple interface to SW/NW alignment + +(0.1.6: 08 August 2008, r414) + + + +Beta Release 0.1.5 (27 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Improve the speed. This version is expected to give the same results. + +(0.1.5: 27 July 2008, r400) + + + +Beta Release 0.1.4 (22 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Fixed a bug which may cause missing gapped alignments. + + * More clearly define what alignments can be found by BWA (See + manual). Now BWA runs a little slower because it will visit more + potential gapped alignments. + + * A bit of code clean up. + +(0.1.4: 22 July 2008, r387) + + + +Beta Release 0.1.3 (21 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Improve the speed with some tricks on retrieving occurrences. The results +should be exactly the same as that of 0.1.2. + +(0.1.3: 21 July 2008, r382) + + + +Beta Release 0.1.2 (17 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Support gapped alignment. Code for ungapped alignment has been removed. 
+ +(0.1.2: 17 July 2008, r371) + + + +Beta Release 0.1.1 (03 June, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is the first release of BWA, Burrows-Wheeler Alignment tool. Please +read man page for more information about this software. + +(0.1.1: 03 June 2008, r349) + + + diff --git a/QSufSort.c b/QSufSort.c index 81f2105..36c5a51 100644 --- a/QSufSort.c +++ b/QSufSort.c @@ -45,7 +45,7 @@ static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t highestPos, const qsint_t numSortedChar); static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar); -static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabet_size); +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); @@ -285,7 +285,7 @@ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I Output: x is V and p is I after the initial sorting stage of the refined suffix sorting algorithm.*/ -static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabet_size) +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) { qsint_t i, c; qsint_t d; @@ -293,7 +293,7 @@ static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, con qsint_t currentIndex; // mark linked list empty - for (i=0; i0; i--) { + for (i=alphabetSize; i>0; i--) { c = I[i-1]; d = (qsint_t)(V[c]); groupNum = currentIndex; diff --git a/README b/README index 
6c950c1..c855a38 100644 --- a/README +++ b/README @@ -1,15 +1,15 @@ -BPM Readme +BWA-PSSM Readme -The usage of BPM is very similar to that of bwa. Due to some of the +The usage of BWA-PSSM is very similar to that of BWA. Due to some of the underlying differences, howerver, the recommended parameters are different. Installation: -BPM requires the gdsl library. It can be downloaded from: +BWA-PSSM requires the gdsl library. It can be downloaded from: http://home.gna.org/gdsl/ -Once gdsl is installed, BPM can be compiled by simply running 'make'. +Once gdsl is installed, BWA-PSSM can be compiled by simply running 'make'. Usage: @@ -31,7 +31,7 @@ It is presumed that maximum length reads have not been degraded and should not b In the examples below, pssm-file can be either a pssm file as generated by fastq2wm33.pl or a regular fastq file. If it is a regular fastq file, it will be converted to a pssm internally. -bwa pssm -z 3.0 -l 18 -k 3 -n 30 -m 2000 index-file pssm-file | bwa samse index-file - pssm-file > out.sam +bwa pssm -m 2000 index-file pssm-file | bwa samse index-file - pssm-file > out.sam ** Helicos data ** diff --git a/README.md b/README.md new file mode 100644 index 0000000..ac1e57e --- /dev/null +++ b/README.md @@ -0,0 +1,76 @@ +###Getting started + + git clone https://github.com/lh3/bwa.git + cd bwa; make + ./bwa index ref.fa + ./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz + ./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz + +###Introduction + +BWA is a software package for mapping low-divergent sequences against a large +reference genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the rest two for longer sequences ranged from +70bp to 1Mbp. 
BWA-MEM and BWA-SW share similar features such as the support of +long reads and chimeric alignment, but BWA-MEM, which is the latest, is +generally recommended for high-quality queries as it is faster and more +accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp +Illumina reads. + +For all the algorithms, BWA first needs to construct the FM-index for the +reference genome (the **index** command). Alignment algorithms are invoked with +different sub-commands: **aln/samse/sampe** for BWA-backtrack, +**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm. + +###Availability + +BWA is released under [GPLv3][1]. The latest source code is [freely +available][2] at github. Released packages can [be downloaded][3] at +SourceForge. After you acquire the source code, simply use `make` to compile +and copy the single executable `bwa` to the destination you want. The only +dependency of BWA is [zlib][14]. + +###Seeking help + +The detailed usage is described in the man page available together with the +source code. You can use `man ./bwa.1` to view the man page in a terminal. The +[HTML version][4] of the man page can be found at the [BWA website][5]. If you +have questions about BWA, you may [sign up for the mailing list][6] and then send +the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions +in forums such as [BioStar][8] and [SEQanswers][9]. + +###Citing BWA + +* Li H. and Durbin R. (2009) Fast and accurate short read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID: + [19451168][10]]. (if you use the BWA-backtrack algorithm) + +* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID: + [20080505][11]]. (if you use the BWA-SW algorithm) + +* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs + with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN].
(if you use the BWA-MEM + algorithm or the **fastmap** command, or want to cite the whole BWA package) + +Please note that the last reference is a preprint hosted at [arXiv.org][13]. I +do not have plan to submit it to a peer-reviewed journal in the near future. + + + +[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License +[2]: https://github.com/lh3/bwa +[3]: http://sourceforge.net/projects/bio-bwa/files/ +[4]: http://bio-bwa.sourceforge.net/bwa.shtml +[5]: http://bio-bwa.sourceforge.net/ +[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help +[7]: mailto:bio-bwa-help@sourceforge.net +[8]: http://biostars.org +[9]: http://seqanswers.com/ +[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168 +[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505 +[12]: http://arxiv.org/abs/1303.3997 +[13]: http://arxiv.org/ +[14]: http://zlib.net/ +[15]: https://github.com/lh3/bwa/tree/mem diff --git a/bamlite.c b/bamlite.c index 5aad392..3704beb 100644 --- a/bamlite.c +++ b/bamlite.c @@ -2,8 +2,13 @@ #include #include #include +#include #include "bamlite.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + /********************* * from bam_endian.c * *********************/ @@ -62,11 +67,11 @@ void bam_header_destroy(bam_header_t *header) if (header == 0) return; if (header->target_name) { for (i = 0; i < header->n_targets; ++i) - free(header->target_name[i]); + if (header->target_name[i]) free(header->target_name[i]); + if (header->target_len) free(header->target_len); free(header->target_name); - free(header->target_len); } - free(header->text); + if (header->text) free(header->text); free(header); } @@ -80,28 +85,33 @@ bam_header_t *bam_header_read(bamFile fp) magic_len = bam_read(fp, buf, 4); if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); - return 0; + return NULL; } header = bam_header_init(); // read plain text and the number of reference sequences - 
bam_read(fp, &header->l_text, 4); + if (bam_read(fp, &header->l_text, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->l_text); header->text = (char*)calloc(header->l_text + 1, 1); - bam_read(fp, header->text, header->l_text); - bam_read(fp, &header->n_targets, 4); + if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail; + if (bam_read(fp, &header->n_targets, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->n_targets); // read reference sequence names and lengths header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); header->target_len = (uint32_t*)calloc(header->n_targets, 4); for (i = 0; i != header->n_targets; ++i) { - bam_read(fp, &name_len, 4); + if (bam_read(fp, &name_len, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&name_len); header->target_name[i] = (char*)calloc(name_len, 1); - bam_read(fp, header->target_name[i], name_len); - bam_read(fp, &header->target_len[i], 4); + if (bam_read(fp, header->target_name[i], name_len) != name_len) { + goto fail; + } + if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); } return header; + fail: + bam_header_destroy(header); + return NULL; } static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) @@ -153,3 +163,48 @@ int bam_read1(bamFile fp, bam1_t *b) if (bam_is_be) swap_endian_data(c, b->data_len, b->data); return 4 + block_len; } + + +#ifdef USE_VERBOSE_ZLIB_WRAPPERS +// Versions of gzopen, gzread and gzclose that print up error messages + +gzFile bamlite_gzopen(const char *fn, const char *mode) { + gzFile fp; + if (strcmp(fn, "-") == 0) { + fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + if (!fp) { + fprintf(stderr, "Couldn't open %s : %s", + (strstr(mode, "r"))? "stdin" : "stdout", + strerror(errno)); + } + return fp; + } + if ((fp = gzopen(fn, mode)) == 0) { + fprintf(stderr, "Couldn't open %s : %s\n", fn, + errno ? 
strerror(errno) : "Out of memory"); + } + return fp; +} + +int bamlite_gzread(gzFile file, void *ptr, unsigned int len) { + int ret = gzread(file, ptr, len); + + if (ret < 0) { + int errnum = 0; + const char *msg = gzerror(file, &errnum); + fprintf(stderr, "gzread error: %s\n", + Z_ERRNO == errnum ? strerror(errno) : msg); + } + return ret; +} + +int bamlite_gzclose(gzFile file) { + int ret = gzclose(file); + if (Z_OK != ret) { + fprintf(stderr, "gzclose error: %s\n", + Z_ERRNO == ret ? strerror(errno) : zError(ret)); + } + + return ret; +} +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ diff --git a/bamlite.h b/bamlite.h index 167fa44..efab7ac 100644 --- a/bamlite.h +++ b/bamlite.h @@ -4,11 +4,25 @@ #include #include +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define USE_VERBOSE_ZLIB_WRAPPERS + typedef gzFile bamFile; -#define bam_open(fn, mode) gzopen(fn, mode) -#define bam_dopen(fd, mode) gzdopen(fd, mode) -#define bam_close(fp) gzclose(fp) -#define bam_read(fp, buf, size) gzread(fp, buf, size) +#ifdef USE_VERBOSE_ZLIB_WRAPPERS +/* These print error messages on failure */ +# define bam_open(fn, mode) bamlite_gzopen(fn, mode) +# define bam_dopen(fd, mode) gzdopen(fd, mode) +# define bam_close(fp) bamlite_gzclose(fp) +# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size) +#else +# define bam_open(fn, mode) gzopen(fn, mode) +# define bam_dopen(fd, mode) gzdopen(fd, mode) +# define bam_close(fp) gzclose(fp) +# define bam_read(fp, buf, size) gzread(fp, buf, size) +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ typedef struct { int32_t n_targets; @@ -87,6 +101,12 @@ extern "C" { bam_header_t *bam_header_read(bamFile fp); int bam_read1(bamFile fp, bam1_t *b); +#ifdef USE_VERBOSE_ZLIB_WRAPPERS + gzFile bamlite_gzopen(const char *fn, const char *mode); + int bamlite_gzread(gzFile file, void *ptr, unsigned int len); + int bamlite_gzclose(gzFile file); +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ + #ifdef __cplusplus } #endif diff --git a/bntseq.c b/bntseq.c 
index 98a5a49..eddae84 100644 --- a/bntseq.c +++ b/bntseq.c @@ -29,12 +29,17 @@ #include #include #include +#include +#include #include "bntseq.h" -#include "main.h" #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -63,25 +68,27 @@ void bns_dump(const bntseq_t *bns, const char *prefix) { // dump .ann strcpy(str, prefix); strcat(str, ".ann"); fp = xopen(str, "w"); - fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); + err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); for (i = 0; i != bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; - fprintf(fp, "%d %s", p->gi, p->name); - if (p->anno[0]) fprintf(fp, " %s\n", p->anno); - else fprintf(fp, "\n"); - fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); + err_fprintf(fp, "%d %s", p->gi, p->name); + if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno); + else err_fprintf(fp, "\n"); + err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); } - fclose(fp); + err_fflush(fp); + err_fclose(fp); } { // dump .amb strcpy(str, prefix); strcat(str, ".amb"); fp = xopen(str, "w"); - fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); + err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); for (i = 0; i != bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; - fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); + err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); } - fclose(fp); + err_fflush(fp); + err_fclose(fp); } } @@ -89,13 +96,16 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c { char str[1024]; FILE *fp; + const char *fname; bntseq_t *bns; long long xx; int i; + int scanres; bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); { // read .ann 
- fp = xopen(ann_filename, "r"); - fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); + fp = xopen(fname = ann_filename, "r"); + scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); + if (scanres != 3) goto badread; bns->l_pac = xx; bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); for (i = 0; i < bns->n_seqs; ++i) { @@ -103,39 +113,54 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c char *q = str; int c; // read gi and sequence name - fscanf(fp, "%u%s", &p->gi, str); + scanres = fscanf(fp, "%u%s", &p->gi, str); + if (scanres != 2) goto badread; p->name = strdup(str); // read fasta comments - while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + while (c != '\n' && c != EOF) c = fgetc(fp); + if (c == EOF) { + scanres = EOF; + goto badread; + } *q = 0; if (q - str > 1) p->anno = strdup(str + 1); // skip leading space else p->anno = strdup(""); // read the rest - fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); + scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); + if (scanres != 3) goto badread; p->offset = xx; } - fclose(fp); + err_fclose(fp); } { // read .amb int64_t l_pac; int32_t n_seqs; - fp = xopen(amb_filename, "r"); - fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); + fp = xopen(fname = amb_filename, "r"); + scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); + if (scanres != 3) goto badread; l_pac = xx; xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); - bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); + bns->ambs = bns->n_holes? 
(bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; for (i = 0; i < bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; - fscanf(fp, "%lld%d%s", &xx, &p->len, str); + scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str); + if (scanres != 3) goto badread; p->offset = xx; p->amb = str[0]; } - fclose(fp); + err_fclose(fp); } { // open .pac bns->fp_pac = xopen(pac_filename, "rb"); } return bns; + + badread: + if (EOF == scanres) { + err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file"); + } + err_fatal(__func__, "Parse error reading %s\n", fname); } bntseq_t *bns_restore(const char *prefix) @@ -152,7 +177,7 @@ void bns_destroy(bntseq_t *bns) if (bns == 0) return; else { int i; - if (bns->fp_pac) fclose(bns->fp_pac); + if (bns->fp_pac) err_fclose(bns->fp_pac); free(bns->ambs); for (i = 0; i < bns->n_seqs; ++i) { free(bns->anns[i].name); @@ -250,16 +275,17 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) ret = bns->l_pac; { // finalize .pac file ubyte_t ct; - fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); + err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % 4 == 0) { ct = 0; - fwrite(&ct, 1, 1, fp); + err_fwrite(&ct, 1, 1, fp); } ct = bns->l_pac % 4; - fwrite(&ct, 1, 1, fp); + err_fwrite(&ct, 1, 1, fp); // close .pac file - fclose(fp); + err_fflush(fp); + err_fclose(fp); } bns_dump(bns, prefix); bns_destroy(bns); @@ -283,25 +309,39 @@ int bwa_fa2pac(int argc, char *argv[]) } fp = xzopen(argv[optind], "r"); bns_fasta2bntseq(fp, (optind+1 < argc)? 
argv[optind+1] : argv[optind], for_only); - gzclose(fp); + err_gzclose(fp); return 0; } +int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) +{ + int left, mid, right; + if (pos_f >= bns->l_pac) return -1; + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { // binary search + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + return mid; +} + +int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) +{ + int is_rev, rid_b, rid_e; + if (rb < bns->l_pac && re > bns->l_pac) return -2; + rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); + rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1); + return rid_b == rid_e? rid_b : -1; +} + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { int left, mid, right, nn; - if (ref_id) { - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { - mid = (left + right) >> 1; - if (pos_f >= bns->anns[mid].offset) { - if (mid == bns->n_seqs - 1) break; - if (pos_f < bns->anns[mid+1].offset) break; // bracketed - left = mid + 1; - } else right = mid; - } - *ref_id = mid; - } + if (ref_id) *ref_id = bns_pos2rid(bns, pos_f); left = 0; right = bns->n_holes; nn = 0; while (left < right) { mid = (left + right) >> 1; @@ -320,3 +360,53 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) } return nn; } + +uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) +{ + uint8_t *seq = 0; + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap + if (end > l_pac<<1) end = l_pac<<1; + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + seq = malloc(end - beg); + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k 
> beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing + return seq; +} + +uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) +{ + int64_t far_beg, far_end, len; + int is_rev; + uint8_t *seq; + + if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[*rid].offset; + far_end = far_beg + bns->anns[*rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac<<1) - far_end; + far_end = (bns->l_pac<<1) - tmp; + } + *beg = *beg > far_beg? *beg : far_beg; + *end = *end < far_end? *end : far_end; + seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); + if (seq == 0 || *end - *beg != len) { + fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", + __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end); + } + assert(seq && *end - *beg == len); // assertion failure should never happen + return seq; +} diff --git a/bntseq.h b/bntseq.h index 843db64..6437cf6 100644 --- a/bntseq.h +++ b/bntseq.h @@ -28,7 +28,9 @@ #ifndef BWT_BNTSEQ_H #define BWT_BNTSEQ_H +#include #include +#include #include #ifndef BWA_UBYTE @@ -71,7 +73,11 @@ extern "C" { bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t 
*len); + uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); + int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); #ifdef __cplusplus } diff --git a/bwa.1 b/bwa.1 new file mode 100644 index 0000000..b6354e5 --- /dev/null +++ b/bwa.1 @@ -0,0 +1,772 @@ +.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools" +.SH NAME +.PP +bwa - Burrows-Wheeler Alignment Tool +.SH SYNOPSIS +.PP +bwa index ref.fa +.PP +bwa mem ref.fa reads.fq > aln-se.sam +.PP +bwa mem ref.fa read1.fq read2.fq > aln-pe.sam +.PP +bwa aln ref.fa short_read.fq > aln_sa.sai +.PP +bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam +.PP +bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam +.PP +bwa bwasw ref.fa long_read.fq > aln.sam + +.SH DESCRIPTION +.PP +BWA is a software package for mapping low-divergent sequences against a large +reference genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the rest two for longer sequences ranged from +70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read +support and split alignment, but BWA-MEM, which is the latest, is generally +recommended for high-quality queries as it is faster and more accurate. +BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina +reads. + +For all the algorithms, BWA first needs to construct the FM-index for +the reference genome (the +.B index +command). Alignment algorithms are invoked with different sub-commands: +.BR aln / samse / sampe +for BWA-backtrack, +.B bwasw +for BWA-SW and +.B mem +for the BWA-MEM algorithm. + +.SH COMMANDS AND OPTIONS +.TP +.B index +.B bwa index +.RB [ -p +.IR prefix ] +.RB [ -a +.IR algoType ] +.I db.fa + +Index database sequences in the FASTA format. 
+ +.B OPTIONS: +.RS +.TP 10 +.BI -p \ STR +Prefix of the output database [same as db filename] +.TP +.BI -a \ STR +Algorithm for constructing BWT index. BWA implements two algorithms for BWT +construction: +.B is +and +.BR bwtsw . +The first algorithm is a little faster for small database but requires large +RAM and does not work for databases with total length longer than 2GB. The +second algorithm is adapted from the BWT-SW source code. It in theory works +with database with trillions of bases. When this option is not specified, the +appropriate algorithm will be chosen automatically. +.RE + +.TP +.B mem +.B bwa mem +.RB [ -aCHMpP ] +.RB [ -t +.IR nThreads ] +.RB [ -k +.IR minSeedLen ] +.RB [ -w +.IR bandWidth ] +.RB [ -d +.IR zDropoff ] +.RB [ -r +.IR seedSplitRatio ] +.RB [ -c +.IR maxOcc ] +.RB [ -A +.IR matchScore ] +.RB [ -B +.IR mmPenalty ] +.RB [ -O +.IR gapOpenPen ] +.RB [ -E +.IR gapExtPen ] +.RB [ -L +.IR clipPen ] +.RB [ -U +.IR unpairPen ] +.RB [ -R +.IR RGline ] +.RB [ -v +.IR verboseLevel ] +.I db.prefix +.I reads.fq +.RI [ mates.fq ] + +Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the +algorithm works by seeding alignments with maximal exact matches (MEMs) and +then extending seeds with the affine-gap Smith-Waterman algorithm (SW). + +If +.I mates.fq +file is absent and option +.B -p +is not set, this command regards input reads are single-end. If +.I mates.fq +is present, this command assumes the +.IR i -th +read in +.I reads.fq +and the +.IR i -th +read in +.I mates.fq +constitute a read pair. If +.B -p +is used, the command assumes the +.RI 2 i -th +and the +.RI (2 i +1)-th +read in +.I reads.fq +constitute a read pair (such input file is said to be interleaved). In this case, +.I mates.fq +is ignored. In the paired-end mode, the +.B mem +command will infer the read orientation and the insert size distribution from a +batch of reads. + +The BWA-MEM algorithm performs local alignment. 
It may produce multiple primary +alignments for different parts of a query sequence. This is a crucial feature +for long sequences. However, some tools such as Picard's markDuplicates do +not work with split alignments. One may consider using option +.B -M +to flag shorter split hits as secondary. + +.RS +.TP 10 +.B ALGORITHM OPTIONS: +.TP +.BI -t \ INT +Number of threads [1] +.TP +.BI -k \ INT +Minimum seed length. Matches shorter than +.I INT +will be missed. The alignment speed is usually insensitive to this value unless +it significantly deviates from 20. [19] +.TP +.BI -w \ INT +Band width. Essentially, gaps longer than +.I INT +will not be found. Note that the maximum gap length is also affected by the +scoring matrix and the hit length, not solely determined by this option. [100] +.TP +.BI -d \ INT +Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between +the best and the current extension score is above +.RI | i - j |* A + INT , +where +.I i +and +.I j +are the current positions of the query and reference, respectively, and +.I A +is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it +doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not +only avoids unnecessary extension, but also reduces poor alignments inside a +long good alignment. [100] +.TP +.BI -r \ FLOAT +Trigger re-seeding for a MEM longer than +.IR minSeedLen * FLOAT . +This is a key heuristic parameter for tuning the performance. Larger value +yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5] +.TP +.BI -c \ INT +Discard a MEM if it has more than +.I INT +occurrences in the genome. This is an insensitive parameter. [10000] +.TP +.B -P +In the paired-end mode, perform SW to rescue missing hits only but do not try to find +hits that fit a proper pair. +.TP +.BI -A \ INT +Matching score. [1] +.TP +.BI -B \ INT +Mismatch penalty.
The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4] +.TP +.BI -O \ INT[,INT] +Gap open penalty. If two numbers are specified, the first is the penalty of +opening a deletion and the second for opening an insertion. [6] +.TP +.BI -E \ INT[,INT] +Gap extension penalty. If two numbers are specified, the first is the penalty +of extending a deletion and second for extending an insertion. A gap of length +k costs O + k*E (i.e. +.B -O +is for opening a zero-length gap). [1] +.TP +.BI -L \ INT[,INT] +Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best +score reaching the end of query. If this score is larger than the best SW score +minus the clipping penalty, clipping will not be applied. Note that in this +case, the SAM AS tag reports the best SW score; clipping penalty is not +deducted. If two numbers are provided, the first is for 5'-end clipping and +second for 3'-end clipping. [5] +.TP +.BI -U \ INT +Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as +.RI scoreRead1+scoreRead2- INT +and scores a pair as scoreRead1+scoreRead2-insertPenalty. It compares these +two scores to determine whether we should force pairing. A larger value leads to +more aggressive read pairing. [17] + +.TP +.B INPUT/OUTPUT OPTIONS: +.TP +.B -p +Assume the first input query file is interleaved paired-end FASTA/Q. See the +command description for details. +.TP +.BI -R \ STR +Complete read group header line. '\\t' can be used in +.I STR +and will be converted to a TAB in the output SAM. The read group ID will be +attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. +[null] +.TP +.BI -T \ INT +Don't output alignment with score lower than +.IR INT . +This option affects output and occasionally SAM flag 2. [30] +.TP +.B -a +Output all found alignments for single-end or unpaired paired-end reads. These +alignments will be flagged as secondary alignments.
+.TP +.B -C +Append FASTA/Q comment to SAM output. This option can be used to +transfer read meta information (e.g. barcode) to the SAM output. Note that the +FASTA/Q comment (the string after a space in the header line) must conform to the SAM +spec (e.g. BC:Z:CGTAC). Malformed comments lead to incorrect SAM output. +.TP +.B -M +Mark shorter split hits as secondary (for Picard compatibility). +.TP +.BI -v \ INT +Control the verbose level of the output. This option has not been fully +supported throughout BWA. Ideally, a value 0 for disabling all the output to +stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for +all normal messages; 4 or higher for debugging. When this option takes value +4, the output is not SAM. [3] +.TP +.BI -I \ FLOAT[,FLOAT[,INT[,INT]]] +Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma +from the mean if absent) and min (4 sigma if absent) of the insert size +distribution. Only applicable to the FR orientation. By default, BWA-MEM infers +these numbers and the pair orientations given enough reads. [inferred] + +.RE + +.TP +.B aln +bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i +nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc] +[-O gapOsc] [-E gapEsc] [-q trimQual] <in.db.fasta> <in.query.fq> > +<out.sai> + +Find the SA coordinates of the input reads. Maximum +.I maxSeedDiff +differences are allowed in the first +.I seedLen +subsequence and maximum +.I maxDiff +differences are allowed in the whole sequence. + +.B OPTIONS: +.RS +.TP 10 +.BI -n \ NUM +Maximum edit distance if the value is INT, or the fraction of missing +alignments given 2% uniform base error rate if FLOAT. In the latter +case, the maximum edit distance is automatically chosen for different +read lengths.
[0.04] +.TP +.BI -o \ INT +Maximum number of gap opens [1] +.TP +.BI -e \ INT +Maximum number of gap extensions, -1 for k-difference mode (disallowing +long gaps) [-1] +.TP +.BI -d \ INT +Disallow a long deletion within INT bp towards the 3'-end [16] +.TP +.BI -i \ INT +Disallow an indel within INT bp towards the ends [5] +.TP +.BI -l \ INT +Take the first INT subsequence as seed. If INT is larger than the query +sequence, seeding will be disabled. For long reads, this option is +typically ranged from 25 to 35 for `-k 2'. [inf] +.TP +.BI -k \ INT +Maximum edit distance in the seed [2] +.TP +.BI -t \ INT +Number of threads (multi-threading mode) [1] +.TP +.BI -M \ INT +Mismatch penalty. BWA will not search for suboptimal hits with a score +lower than (bestScore-misMsc). [3] +.TP +.BI -O \ INT +Gap open penalty [11] +.TP +.BI -E \ INT +Gap extension penalty [4] +.TP +.BI -R \ INT +Proceed with suboptimal alignments if there are no more than INT equally +best hits. This option only affects paired-end mapping. Increasing this +threshold helps to improve the pairing accuracy at the cost of speed, +especially for short reads (~32bp). +.TP +.B -c +Reverse query but not complement it, which is required for alignment in +the color space. (Disabled since 0.6.x) +.TP +.B -N +Disable iterative search. All hits with no more than +.I maxDiff +differences will be found. This mode is much slower than the default. +.TP +.BI -q \ INT +Parameter for read trimming. BWA trims a read down to +argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l 1.sai + bwa aln ref.fa -b2 reads.bam > 2.sai + bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam +.TP +.B -0 +When +.B -b +is specified, only use single-end reads in mapping. +.TP +.B -1 +When +.B -b +is specified, only use the first read in a read pair in mapping (skip +single-end reads and the second reads). +.TP +.B -2 +When +.B -b +is specified, only use the second read in a read pair in mapping. 
+.B +.RE + +.TP +.B samse +bwa samse [-n maxOcc] > + +Generate alignments in the SAM format given single-end reads. Repetitive +hits will be randomly chosen. + +.B OPTIONS: +.RS +.TP 10 +.BI -n \ INT +Maximum number of alignments to output in the XA tag for reads paired +properly. If a read has more than INT hits, the XA tag will not be +written. [3] +.TP +.BI -r \ STR +Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] +.RE + +.TP +.B sampe +bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis] +[-P] > + +Generate alignments in the SAM format given paired-end reads. Repetitive +read pairs will be placed randomly. + +.B OPTIONS: +.RS +.TP 8 +.BI -a \ INT +Maximum insert size for a read pair to be considered being mapped +properly. Since 0.4.5, this option is only used when there are not +enough good alignments to infer the distribution of insert sizes. [500] +.TP +.BI -o \ INT +Maximum occurrences of a read for pairing. A read with more occurrences +will be treated as a single-end read. Reducing this parameter speeds up +pairing. [100000] +.TP +.B -P +Load the entire FM-index into memory to reduce disk operations +(base-space reads only). With this option, at least 1.25N bytes of +memory are required, where N is the length of the genome. +.TP +.BI -n \ INT +Maximum number of alignments to output in the XA tag for reads paired +properly. If a read has more than INT hits, the XA tag will not be +written. [3] +.TP +.BI -N \ INT +Maximum number of alignments to output in the XA tag for discordant +read pairs (excluding singletons). If a read has more than INT hits, the +XA tag will not be written. [10] +.TP +.BI -r \ STR +Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'.
[null] +.RE + +.TP +.B bwasw +bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t +nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N +nHspRev] [-c thresCoef] [mate.fq] + +Align query sequences in the +.I in.fq +file. When +.I mate.fq +is present, perform paired-end alignment. The paired-end mode only works +for reads from Illumina short-insert libraries. In the paired-end mode, BWA-SW +may still output split alignments but they are all marked as not properly +paired; the mate positions will not be written if the mate has multiple +local hits. + +.B OPTIONS: +.RS +.TP 10 +.BI -a \ INT +Score of a match [1] +.TP +.BI -b \ INT +Mismatch penalty [3] +.TP +.BI -q \ INT +Gap open penalty [5] +.TP +.BI -r \ INT +Gap extension penalty. The penalty for a contiguous gap of size k is +q+k*r. [2] +.TP +.BI -t \ INT +Number of threads in the multi-threading mode [1] +.TP +.BI -w \ INT +Band width in the banded alignment [33] +.TP +.BI -T \ INT +Minimum score threshold divided by a [37] +.TP +.BI -c \ FLOAT +Coefficient for threshold adjustment according to query length. Given an +l-long query, the threshold for a hit to be retained is +a*max{T,c*log(l)}. [5.5] +.TP +.BI -z \ INT +Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1] +.TP +.BI -s \ INT +Maximum SA interval size for initiating a seed. Higher -s increases +accuracy at the cost of speed. [3] +.TP +.BI -N \ INT +Minimum number of seeds supporting the resultant alignment to skip +reverse alignment. [5] +.RE + +.SH SAM ALIGNMENT FORMAT +.PP +The output of the +.B `aln' +command is binary and designed for BWA use only. BWA outputs the final +alignment in the SAM (Sequence Alignment/Map) format. Each line consists +of: + +.TS +center box; +cb | cb | cb +n | l | l .
+Col Field Description +_ +1 QNAME Query (pair) NAME +2 FLAG bitwise FLAG +3 RNAME Reference sequence NAME +4 POS 1-based leftmost POSition/coordinate of clipped sequence +5 MAPQ MAPping Quality (Phred-scaled) +6 CIGAR extended CIGAR string +7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) +8 MPOS 1-based Mate POSition +9 ISIZE Inferred insert SIZE +10 SEQ query SEQuence on the same strand as the reference +11 QUAL query QUALity (ASCII-33 gives the Phred base quality) +12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE +.TE + +.PP +Each bit in the FLAG field is defined as: + +.TS +center box; +cb | cb | cb +c | l | l . +Chr Flag Description +_ +p 0x0001 the read is paired in sequencing +P 0x0002 the read is mapped in a proper pair +u 0x0004 the query sequence itself is unmapped +U 0x0008 the mate is unmapped +r 0x0010 strand of the query (1 for reverse) +R 0x0020 strand of the mate +1 0x0040 the read is the first read in a pair +2 0x0080 the read is the second read in a pair +s 0x0100 the alignment is not primary +f 0x0200 QC failure +d 0x0400 optical or PCR duplicate +.TE + +.PP +Please check the Samtools website for the format +specification and the tools for post-processing the alignment. + +BWA generates the following optional fields. Tags starting with `X' are +specific to BWA. + +.TS +center box; +cb | cb +cB | l .
+Tag Meaning +_ +NM Edit distance +MD Mismatching positions/bases +AS Alignment score +BC Barcode sequence +SA Supplementary alignments +_ +X0 Number of best hits +X1 Number of suboptimal hits found by BWA +XN Number of ambiguous bases in the reference +XM Number of mismatches in the alignment +XO Number of gap opens +XG Number of gap extensions +XT Type: Unique/Repeat/N/Mate-sw +XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/ +_ +XS Suboptimal alignment score +XF Support from forward/reverse alignment +XE Number of supporting seeds +_ +XP Alt primary hits; format: /(chr,pos,CIGAR,mapQ,NM;)+/ +.TE + +.PP +Note that XO and XG are generated by BWT search while the CIGAR string +by Smith-Waterman alignment. These two tags may be inconsistent with the +CIGAR string. This is not a bug. + +.SH NOTES ON SHORT-READ ALIGNMENT +.SS Alignment Accuracy +.PP +When seeding is disabled, BWA guarantees to find an alignment +containing maximum +.I maxDiff +differences including +.I maxGapO +gap opens which do not occur within +.I nIndelEnd +bp towards either end of the query. Longer gaps may be found if +.I maxGapE +is positive, but it is not guaranteed to find all hits. When seeding is +enabled, BWA further requires that the first +.I seedLen +subsequence contains no more than +.I maxSeedDiff +differences. +.PP +When gapped alignment is disabled, BWA is expected to generate the same +alignment as Eland version 1, the Illumina alignment program. However, as BWA +changes `N' in the database sequence to random nucleotides, hits to these +random sequences will also be counted. As a consequence, BWA may mark a +unique hit as a repeat, if the random sequences happen to be identical +to the sequences which should be unique in the database. +.PP +By default, if the best hit is not highly repetitive (controlled by -R), BWA +also finds all hits containing one more mismatch; otherwise, BWA finds all +equally best hits only. Base quality is NOT considered in evaluating +hits.
In the paired-end mode, BWA pairs all hits it found. It further +performs Smith-Waterman alignment for unmapped reads to rescue reads with a +high error rate, and for high-quality anomalous pairs to fix potential alignment +errors. + +.SS Estimating Insert Size Distribution +.PP +BWA estimates the insert size distribution per 256*1024 read pairs. It +first collects pairs of reads with both ends mapped with a single-end +quality 20 or higher and then calculates median (Q2), lower and higher +quartile (Q1 and Q3). It estimates the mean and the variance of the +insert size distribution from pairs whose insert sizes are within +interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair +considered to be properly paired (SAM flag 0x2) is calculated by solving +equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the +standard deviation of the insert size distribution, L is the length of the +genome, p0 is the prior probability of an anomalous pair and Phi() is the standard +cumulative distribution function. For mapping Illumina short-insert +reads to the human genome, x is about 6-7 sigma away from the +mean. Quartiles, mean, variance and x will be printed to the standard +error output. + +.SS Memory Requirement +.PP +With the bwtsw algorithm, 5GB memory is required for indexing the complete +human genome sequences. For short reads, the +.B aln +command uses ~3.2GB memory and the +.B sampe +command uses ~5.4GB. + +.SS Speed +.PP +Indexing the human genome sequences takes 3 hours with the bwtsw +algorithm. Indexing smaller genomes with IS algorithms is +faster, but requires more memory. +.PP +The speed of alignment is largely determined by the error rate of the query +sequences (r). Firstly, BWA runs much faster for near perfect hits than +for hits with many differences, and it stops searching for a hit with +l+2 differences if an l-difference hit is found.
This means BWA will be +very slow if r is high because in this case BWA has to visit hits with +many differences and looking for these hits is expensive. Secondly, the +alignment algorithm behind makes the speed sensitive to [k log(N)/m], +where k is the maximum allowed differences, N the size of database and m +the length of a query. In practice, we choose k w.r.t. r and therefore r +is the leading factor. I would not recommend using BWA on data with +r>0.02. +.PP +Pairing is slower for shorter reads. This is mainly because shorter +reads have more spurious hits and converting SA coordinates to +chromosomal coordinates is very costly. + +.SH CHANGES IN BWA-0.6 +.PP +Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. +This feature makes it possible to integrate the forward and reverse complemented +genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff, +BWA uses more memory because it has to keep all positions and ranks in 64-bit +integers, twice as large as the 32-bit integers used in the previous versions. + +The latest BWA-SW also works for paired-end reads longer than 100bp. In +comparison to BWA-short, BWA-SW tends to be more accurate for highly unique +reads and more robust to relatively long INDELs and structural variants. +Nonetheless, BWA-short usually has higher power to distinguish the optimal hit +from many suboptimal hits. The choice of the mapping algorithm may depend on +the application. + +.SH SEE ALSO +BWA website , Samtools website + + +.SH AUTHOR +Heng Li at the Sanger Institute wrote the key source codes and +integrated the following codes for BWT construction: bwtsw +, implemented by Chi-Kwong Wong at +the University of Hong Kong and IS + originally proposed by Nong Ge + at the Sun Yat-Sen University and +implemented by Yuta Mori. + +.SH LICENSE AND CITATION +.PP +The full BWA package is distributed under GPLv3 as it uses source codes +from BWT-SW which is covered by GPL.
Sorting, hash table, BWT and IS +libraries are distributed under the MIT license. +.PP +If you use the BWA-backtrack algorithm, please cite the following +paper: +.PP +Li H. and Durbin R. (2009) Fast and accurate short read alignment with +Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] +.PP +If you use the BWA-SW algorithm, please cite: +.PP +Li H. and Durbin R. (2010) Fast and accurate long-read alignment with +Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] +.PP +If you use BWA-MEM or the fastmap component of BWA, please cite: +.PP +Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with +BWA-MEM. arXiv:1303.3997v1 [q-bio.GN]. +.PP +It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed +journal. + +.SH HISTORY +BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW +and mimics its binary file formats; BWA-SW resembles BWT-SW in several +ways. The initial idea about BWT-based alignment also came from the +group who developed BWT-SW. At the same time, BWA is different enough +from BWT-SW. The short-read alignment algorithm bears no similarity to +Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it +introduces heuristics that can hardly be applied to the original +algorithm. In all, BWA does not guarantee to find all local hits as what +BWT-SW is designed to do, but it is much faster than BWT-SW on both +short and long query sequences. + +I started to write the first piece of codes on 24 May 2008 and got the +initial stable version on 02 June 2008. During this period, I was +acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper, +was collaborating with Beijing Genomics Institute on SOAP2, the successor +to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in +November 2008. According to the SourceForge download page, the third +BWT-based short read aligner, bowtie, was first released in August +2008. 
At the time of writing this manual, at least three more BWT-based +short-read aligners are being implemented. + +The BWA-SW algorithm is a new component of BWA. It was conceived in +November 2008 and implemented ten months later. + +The BWA-MEM algorithm is based on an algorithm finding super-maximal exact +matches (SMEMs), which was first published with the fermi assembler paper +in 2012. I first implemented the basic SMEM algorithm in the +.B fastmap +command for an experiment and then extended the basic algorithm and added the +extension part in Feburary 2013 to make BWA-MEM a fully featured mapper. + diff --git a/bwa.c b/bwa.c new file mode 100644 index 0000000..db3b947 --- /dev/null +++ b/bwa.c @@ -0,0 +1,318 @@ +#include +#include +#include +#include +#include "bntseq.h" +#include "bwa.h" +#include "ksw.h" +#include "utils.h" +#include "kstring.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +int bwa_verbose = 3; +char bwa_rg_id[256]; + +/************************ + * Batch FASTA/Q reader * + ************************/ + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(ks->comment.s) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? 
strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size && (n&1) == 0) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} + +/***************** + * CIGAR related * + *****************/ + +void bwa_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b; + mat[k++] = -1; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = -1; +} + +// Generate CIGAR when the alignment end points are known +uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i; + int64_t rlen; + kstring_t str; + const char *int2base; + + *n_cigar = 0; *NM = -1; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; + } + if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP + // FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance. + cigar = malloc(4); + cigar[0] = l_query<<4 | 0; + *n_cigar = 1; + for (i = 0, *score = 0; i < l_query; ++i) + *score += mat[rseq[i]*5 + query[i]]; + } else { + int w, max_gap, max_ins, max_del, min_w; + // set the band-width + max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.); + max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.); + max_gap = max_ins > max_del? max_ins : max_del; + max_gap = max_gap > 1? max_gap : 1; + w = (max_gap + abs(rlen - l_query) + 1) >> 1; + w = w < w_? w : w_; + min_w = abs(rlen - l_query) + 3; + w = w > min_w? 
w : min_w; + // NW alignment + if (bwa_verbose >= 4) { + printf("* Global bandwidth: %d\n", w); + printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + } + *score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar); + } + {// compute NM and MD + int k, x, y, u, n_mm = 0, n_gap = 0; + str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR + int2base = rb < l_pac? "ACGTN" : "TGCAN"; + for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { + int op, len; + cigar = (uint32_t*)str.s; + op = cigar[k]&0xf, len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) { + if (query[x + i] != rseq[y + i]) { + kputw(u, &str); + kputc(int2base[rseq[y+i]], &str); + ++n_mm; u = 0; + } else ++u; + } + x += len; y += len; + } else if (op == 2) { // deletion + if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR + kputw(u, &str); kputc('^', &str); + for (i = 0; i < len; ++i) + kputc(int2base[rseq[y+i]], &str); + u = 0; n_gap += len; + } + y += len; + } else if (op == 1) x += len, n_gap += len; // insertion + } + kputw(u, &str); kputc(0, &str); + *NM = n_mm + n_gap; + cigar = (uint32_t*)str.s; + } + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} + +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) +{ + return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); +} + +/********************* + * Full index reader * + *********************/ + +char *bwa_idx_infer_prefix(const char *hint) 
+{ + char *prefix; + int l_hint; + FILE *fp; + l_hint = strlen(hint); + prefix = malloc(l_hint + 3 + 4 + 1); + strcpy(prefix, hint); + strcpy(prefix + l_hint, ".64.bwt"); + if ((fp = fopen(prefix, "rb")) != 0) { + fclose(fp); + prefix[l_hint + 3] = 0; + return prefix; + } else { + strcpy(prefix + l_hint, ".bwt"); + if ((fp = fopen(prefix, "rb")) == 0) { + free(prefix); + return 0; + } else { + fclose(fp); + prefix[l_hint] = 0; + return prefix; + } + } +} + +bwt_t *bwa_idx_load_bwt(const char *hint) +{ + char *tmp, *prefix; + bwt_t *bwt; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + tmp = calloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) + bwt_restore_sa(tmp, bwt); + free(tmp); free(prefix); + return bwt; +} + +bwaidx_t *bwa_idx_load(const char *hint, int which) +{ + bwaidx_t *idx; + char *prefix; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + idx = calloc(1, sizeof(bwaidx_t)); + if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); + if (which & BWA_IDX_BNS) { + idx->bns = bns_restore(prefix); + if (which & BWA_IDX_PAC) { + idx->pac = calloc(idx->bns->l_pac/4+1, 1); + err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + err_fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; + } + } + free(prefix); + return idx; +} + +void bwa_idx_destroy(bwaidx_t *idx) +{ + if (idx == 0) return; + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + free(idx); +} + +/*********************** + * SAM header routines * + ***********************/ + +void bwa_print_sam_hdr(const bntseq_t *bns, 
const char *rg_line) +{ + int i; + extern char *bwa_pg; + for (i = 0; i < bns->n_seqs; ++i) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (rg_line) err_printf("%s\n", rg_line); + err_printf("%s\n", bwa_pg); +} + +static char *bwa_escape(char *s) +{ + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + ++p; + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; + } + *q = '\0'; + return s; +} + +char *bwa_set_rg(const char *s) +{ + char *p, *q, *r, *rg_line = 0; + memset(bwa_rg_id, 0, 256); + if (strstr(s, "@RG") != s) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); + goto err_set_rg; + } + rg_line = strdup(s); + bwa_escape(rg_line); + if ((p = strstr(rg_line, "\tID:")) == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__); + goto err_set_rg; + } + p += 4; + for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + if (q - p + 1 > 256) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); + goto err_set_rg; + } + for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + return rg_line; + +err_set_rg: + free(rg_line); + return 0; +} + diff --git a/bwa.h b/bwa.h new file mode 100644 index 0000000..bbc2525 --- /dev/null +++ b/bwa.h @@ -0,0 +1,50 @@ +#ifndef BWA_H_ +#define BWA_H_ + +#include +#include "bntseq.h" +#include "bwt.h" + +#define BWA_IDX_BWT 0x1 +#define BWA_IDX_BNS 0x2 +#define BWA_IDX_PAC 0x4 +#define BWA_IDX_ALL 0x7 + +typedef struct { + bwt_t *bwt; // FM-index + bntseq_t *bns; // information on the reference sequences + uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base +} bwaidx_t; + +typedef struct { + int l_seq; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + +extern int bwa_verbose; 
+extern char bwa_rg_id[256]; + +#ifdef __cplusplus +extern "C" { +#endif + + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + + void bwa_fill_scmat(int a, int b, int8_t mat[25]); + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); + uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); + + char *bwa_idx_infer_prefix(const char *hint); + bwt_t *bwa_idx_load_bwt(const char *hint); + + bwaidx_t *bwa_idx_load(const char *hint, int which); + void bwa_idx_destroy(bwaidx_t *idx); + + void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); + char *bwa_set_rg(const char *s); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwamem.c b/bwamem.c new file mode 100644 index 0000000..346715c --- /dev/null +++ b/bwamem.c @@ -0,0 +1,1120 @@ +#include +#include +#include +#include +#include +#ifdef HAVE_PTHREAD +#include +#endif + +#include "kstring.h" +#include "bwamem.h" +#include "bntseq.h" +#include "ksw.h" +#include "kvec.h" +#include "ksort.h" +#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +/* Theory on probability and scoring *ungapped* alignment + * + * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution + * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate + * + * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. 
Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x + * + * If the matching score is x and mismatch penalty is -y, we can compute error rate e: + * e = .75 * exp[-log(4) * y/x] + * + * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)} + * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l) + * + * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale: + * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x) + * + * + * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) + * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) + * + * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR) + */ + +static const bntseq_t *global_bns = 0; // for debugging only + +mem_opt_t *mem_opt_init() +{ + mem_opt_t *o; + o = calloc(1, sizeof(mem_opt_t)); + o->flag = 0; + o->a = 1; o->b = 4; + o->o_del = o->o_ins = 6; + o->e_del = o->e_ins = 1; + o->w = 100; + o->T = 30; + o->zdrop = 100; + o->pen_unpaired = 17; + o->pen_clip5 = o->pen_clip3 = 5; + o->min_seed_len = 19; + o->split_width = 10; + o->max_occ = 10000; + o->max_chain_gap = 10000; + o->max_ins = 10000; + o->mask_level = 0.50; + o->drop_ratio = 0.50; + o->split_factor = 1.5; + o->chunk_size = 10000000; + o->n_threads = 1; + o->max_matesw = 100; + o->mask_level_redun = 0.95; + o->min_chain_weight = 0; + o->max_chain_extend = 1<<30; + o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); + bwa_fill_scmat(o->a, o->b, o->mat); + return o; +} + +/*************************** + * Collection SA invervals * + ***************************/ + +#define intv_lt(a, b) ((a).info < (b).info) +KSORT_INIT(mem_intv, bwtintv_t, intv_lt) + +typedef struct { + bwtintv_v mem, mem1, *tmpv[2]; +} smem_aux_t; + +static smem_aux_t *smem_aux_init() +{ + smem_aux_t *a; + a = calloc(1, sizeof(smem_aux_t)); + a->tmpv[0] = calloc(1, 
sizeof(bwtintv_v)); + a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); + return a; +} + +static void smem_aux_destroy(smem_aux_t *a) +{ + free(a->tmpv[0]->a); free(a->tmpv[0]); + free(a->tmpv[1]->a); free(a->tmpv[1]); + free(a->mem.a); free(a->mem1.a); + free(a); +} + +static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) +{ + int i, k, x = 0, old_n; + int start_width = (opt->flag & MEM_F_SELF_OVLP)? 2 : 1; + int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + a->mem.n = 0; + // first pass: find all SMEMs + while (x < len) { + if (seq[x] < 4) { + x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); + for (i = 0; i < a->mem1.n; ++i) { + bwtintv_t *p = &a->mem1.a[i]; + int slen = (uint32_t)p->info - (p->info>>32); // seed length + if (slen >= opt->min_seed_len && p->x[2] <= opt->max_occ) + kv_push(bwtintv_t, a->mem, *p); + } + } else ++x; + } + // second pass: find MEMs inside a long SMEM + old_n = a->mem.n; + for (k = 0; k < old_n; ++k) { + bwtintv_t *p = &a->mem.a[k]; + int start = p->info>>32, end = (int32_t)p->info; + if (end - start < split_len || p->x[2] > opt->split_width) continue; + bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); + for (i = 0; i < a->mem1.n; ++i) + kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + } + // sort + ks_introsort(mem_intv, a->mem.n, a->mem.a); +} + +/************ + * Chaining * + ************/ + +typedef struct { + int64_t rbeg; + int32_t qbeg, len; +} mem_seed_t; // unaligned memory + +typedef struct { + int n, m, first, rid; + int w, kept; + int64_t pos; + mem_seed_t *seeds; +} mem_chain_t; + +typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; + +#include "kbtree.h" + +#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) +KBTREE_INIT(chn, mem_chain_t, chain_cmp) + +// return 1 if the seed is merged into the chain +static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const 
mem_seed_t *p, int seed_rid) +{ + int64_t qend, rend, x, y; + const mem_seed_t *last = &c->seeds[c->n-1]; + qend = last->qbeg + last->len; + rend = last->rbeg + last->len; + if (seed_rid != c->rid) return 0; // different chr; request a new chain + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) + return 1; // contained seed; do nothing + if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand + x = p->qbeg - last->qbeg; // always non-negtive + y = p->rbeg - last->rbeg; + if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain + if (c->n == c->m) { + c->m <<= 1; + c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); + } + c->seeds[c->n++] = *p; + return 1; + } + return 0; // request to add a new chain +} + +int mem_chain_weight(const mem_chain_t *c) +{ + int64_t end; + int j, w = 0, tmp; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->qbeg >= end) w += s->len; + else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + tmp = w; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->rbeg >= end) w += s->len; + else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + return w < tmp? 
w : tmp; +} + +void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) +{ + int i, j; + for (i = 0; i < chn->n; ++i) { + mem_chain_t *p = &chn->a[i]; + err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p)); + for (j = 0; j < p->n; ++j) { + bwtint_t pos; + int is_rev, ref_id; + pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); + if (is_rev) pos -= p->seeds[j].len - 1; + bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); + err_printf("\t%d;%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } + err_putchar('\n'); + } +} + +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq) +{ + int i; + int64_t l_pac = bns->l_pac; + mem_chain_v chain; + kbtree_t(chn) *tree; + smem_aux_t *aux; + + kv_init(chain); + if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match + tree = kb_init(chn, KB_DEFAULT_SIZE); + + aux = smem_aux_init(); + mem_collect_intv(opt, bwt, len, seq, aux); + for (i = 0; i < aux->mem.n; ++i) { + bwtintv_t *p = &aux->mem.a[i]; + int slen = (uint32_t)p->info - (p->info>>32); // seed length + int64_t k; + if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive + for (k = 0; k < p->x[2]; ++k) { + mem_chain_t tmp, *lower, *upper; + mem_seed_t s; + int rid, to_add = 0; + s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference + s.qbeg = p->info>>32; + s.len = slen; + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); + if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary + if (kb_size(tree)) { + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain + if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1; + } else to_add = 1; + if (to_add) { // add the 
seed as a new chain + tmp.n = 1; tmp.m = 4; + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); + tmp.seeds[0] = s; + tmp.rid = rid; + kb_putp(chn, tree, &tmp); + } + } + } + smem_aux_destroy(aux); + + kv_resize(mem_chain_t, chain, kb_size(tree)); + + #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) + __kb_traverse(mem_chain_t, tree, traverse_func); + #undef traverse_func + + kb_destroy(chn, tree); + return chain; +} + +/******************** + * Filtering chains * + ********************/ + +#define chn_beg(ch) ((ch).seeds->qbeg) +#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len) + +#define flt_lt(a, b) ((a).w > (b).w) +KSORT_INIT(mem_flt, mem_chain_t, flt_lt) + +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) +{ + int i, k; + kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains + if (n_chn == 0) return 0; // no need to filter + // compute the weight of each chain and drop chains with small weight + for (i = k = 0; i < n_chn; ++i) { + mem_chain_t *c = &a[i]; + c->first = -1; c->kept = 0; + c->w = mem_chain_weight(c); + if (c->w < opt->min_chain_weight) free(c->seeds); + else a[k++] = *c; + } + n_chn = k; + ks_introsort(mem_flt, n_chn, a); + // pairwise chain comparisons + a[0].kept = 3; + kv_push(int, chains, 0); + for (i = 1; i < n_chn; ++i) { + int large_ovlp = 0; + for (k = 0; k < chains.n; ++k) { + int j = chains.a[k]; + int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); + int e_min = chn_end(a[j]) < chn_end(a[i])? chn_end(a[j]) : chn_end(a[i]); + if (e_min > b_max) { // have overlap + int li = chn_end(a[i]) - chn_beg(a[i]); + int lj = chn_end(a[j]) - chn_beg(a[j]); + int min_l = li < lj? li : lj; + if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap + large_ovlp = 1; + if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. 
mapq can be more accurate + if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) + break; + } + } + } + if (k == chains.n) { + kv_push(int, chains, i); + a[i].kept = large_ovlp? 2 : 3; + } + } + for (i = 0; i < chains.n; ++i) { + mem_chain_t *c = &a[chains.a[i]]; + if (c->first >= 0) a[c->first].kept = 1; + } + free(chains.a); + for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains + if (a[i].kept == 0 || a[i].kept == 3) continue; + if (++k >= opt->max_chain_extend) break; + } + for (; i < n_chn; ++i) + if (a[i].kept < 3) a[i].kept = 0; + for (i = k = 0; i < n_chn; ++i) { // free discarded chains + mem_chain_t *c = &a[i]; + if (c->kept == 0) free(c->seeds); + else a[k++] = a[i]; + } + n_chn = k; + return k; +} + +/****************************** + * De-overlap single-end hits * + ******************************/ + +#define alnreg_slt2(a, b) ((a).re < (b).re) +KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) + +#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) +KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) + +#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) +KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) + +int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) +{ + int m, i, j; + if (n <= 1) return n; + ks_introsort(mem_ars2, n, a); + for (i = 1; i < n; ++i) { + mem_alnreg_t *p = &a[i]; + if (p->rb >= a[i-1].re) continue; + for (j = i - 1; j >= 0 && p->rb < a[j].re; --j) { + mem_alnreg_t *q = &a[j]; + int64_t or, oq, mr, mq; + if (q->qe == q->qb) continue; // a[j] has been excluded + or = q->re - p->rb; // overlap length on the reference + oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query + mr = q->re - q->rb < p->re - p->rb? 
q->re - q->rb : p->re - p->rb; // min ref len in alignment + mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment + if (or > mask_level_redun * mr && oq > mask_level_redun * mq) { // one of the hits is redundant + if (p->score < q->score) { + p->qe = p->qb; + break; + } else q->qe = q->qb; + } + } + } + for (i = 0, m = 0; i < n; ++i) // exclude identical hits + if (a[i].qe > a[i].qb) { + if (m != i) a[m++] = a[i]; + else ++m; + } + n = m; + ks_introsort(mem_ars, n, a); + for (i = 1; i < n; ++i) { // mark identical hits + if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) + a[i].qe = a[i].qb; + } + for (i = 1, m = 1; i < n; ++i) // exclude identical hits + if (a[i].qe > a[i].qb) { + if (m != i) a[m++] = a[i]; + else ++m; + } + return m; +} + +int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen) +{ + if (!(opt->flag & MEM_F_SELF_OVLP) || n == 0 || a->truesc != qlen * opt->a) return n; + memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t)); + return n - 1; +} + +void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function +{ // similar to the loop in mem_chain_flt() + int i, k, tmp; + kvec_t(int) z; + if (n == 0) return; + kv_init(z); + for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); + ks_introsort(mem_ars_hash, n, a); + tmp = opt->a + opt->b; + tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp; + tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp; + kv_push(int, z, 0); + for (i = 1; i < n; ++i) { + for (k = 0; k < z.n; ++k) { + int j = z.a[k]; + int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; + int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; + if (e_min > b_max) { // have overlap + int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? 
a[i].qe - a[i].qb : a[j].qe - a[j].qb; + if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap + if (a[j].sub == 0) a[j].sub = a[i].score; + if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; + break; + } + } + } + if (k == z.n) kv_push(int, z, i); + else a[i].secondary = z.a[k]; + } + free(z.a); +} + +/**************************************** + * Construct the alignment from a chain * + ****************************************/ + +/* mem_chain2aln() vs mem_chain2aln_short() + * + * mem_chain2aln() covers all the functionality of mem_chain2aln_short(). + * However, it may waste time on extracting the reference sequences given a + * very long query. mem_chain2aln_short() is faster for very short chains in a + * long query. It may fail when the matches are long or reach the end of the + * query. In this case, mem_chain2aln() will be called again. + * mem_chain2aln_short() is almost never used for short-read alignment. + */ + +#define MEM_SHORT_EXT 50 +#define MEM_SHORT_LEN 200 + +#define MEM_HSP_COEF 1.5 + +#define MAX_BAND_TRY 2 + +/* mem_test_chain_sw() uses SSE2-SW to align a short chain with 50bp added to + * each end of the chain. If the SW score is below min_HSP_score, it will + * return 0, informing the caller to discard the chain. This heuristic is + * somewhat similar to BLAST which drops a seed hit if ungapped extension is + * below a certain score (true for old BLAST; don't know how BLAST+ works). + * + * For PacBio data, we need to set high matching score and low gap penalties; + * otherwise we are likely to get fragmented alignments. However, with such + * settings, we can often extend most random seed hits to the end. These + * extensions are wasteful and time consuming. By testing the chain with SW, + * we can discard bad chains before performing the expensive extension. + * + * Although probably it is not a bad idea to use this function for + * low-divergence sequences, more testing is needed. 
For now, I only recommend + * to use mem_test_chain_sw() for PacBio data. It is disabled by default. + */ +int mem_test_chain_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) +{ + int i, qb, qe, rid; + int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499); + int64_t rb, re, l_pac = bns->l_pac; + uint8_t *rseq = 0; + kswr_t x; + + if (c->n == 0) return -1; + qb = l_query; qe = 0; + rb = l_pac<<1; re = 0; + for (i = 0; i < c->n; ++i) { + const mem_seed_t *s = &c->seeds[i]; + qb = qb < s->qbeg? qb : s->qbeg; + qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len; + rb = rb < s->rbeg? rb : s->rbeg; + re = re > s->rbeg + s->len? re : s->rbeg + s->len; + } + qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT; + qb = qb > 0? qb : 0; + qe = qe < l_query? qe : l_query; + rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT; + rb = rb > 0? rb : 0; + re = re < l_pac<<1? re : l_pac<<1; + if (rb < l_pac && l_pac < re) { + if (c->seeds[0].rbeg < l_pac) re = l_pac; + else rb = l_pac; + } + if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1; + if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; + if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; + + rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid); + assert(c->rid == rid); + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); + free(rseq); + if (x.score >= min_HSP_score) return 1; + if (bwa_verbose >= 4) printf("** give up the chain due to small HSP score %d.\n", x.score); + return 0; +} + +int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +{ + int i, qb, qe, xtra, rid; + int64_t rb, re, l_pac = bns->l_pac; + uint8_t *rseq = 0; + mem_alnreg_t a; + kswr_t x; + + if (c->n == 0) 
return -1; + qb = l_query; qe = 0; + rb = l_pac<<1; re = 0; + memset(&a, 0, sizeof(mem_alnreg_t)); + for (i = 0; i < c->n; ++i) { + const mem_seed_t *s = &c->seeds[i]; + qb = qb < s->qbeg? qb : s->qbeg; + qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len; + rb = rb < s->rbeg? rb : s->rbeg; + re = re > s->rbeg + s->len? re : s->rbeg + s->len; + a.seedcov += s->len; + } + qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT; + if (qb <= 10 || qe >= l_query - 10) return 1; // because ksw_align() does not support end-to-end alignment + rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT; + rb = rb > 0? rb : 0; + re = re < l_pac<<1? re : l_pac<<1; + if (rb < l_pac && l_pac < re) { + if (c->seeds[0].rbeg < l_pac) re = l_pac; + else rb = l_pac; + } + if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1; + if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; + if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; + + rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid); + assert(c->rid == rid); + xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? 
KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); + free(rseq); + a.rb = rb + x.tb; a.re = rb + x.te + 1; + a.qb = qb + x.qb; a.qe = qb + x.qe + 1; + a.score = x.score; + a.csub = x.score2; + a.rid = c->rid; + if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); + if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; + kv_push(mem_alnreg_t, *av, a); + return 0; +} + +static inline int cal_max_gap(const mem_opt_t *opt, int qlen) +{ + int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.); + int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.); + int l = l_del > l_ins? l_del : l_ins; + l = l > 1? l : 1; + return l < opt->w<<1? l : opt->w<<1; +} + +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +{ + int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension + int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; + const mem_seed_t *s; + uint8_t *rseq = 0; + uint64_t *srt; + + if (c->n == 0) return; + // get the max possible span + rmax[0] = l_pac<<1; rmax[1] = 0; + for (i = 0; i < c->n; ++i) { + int64_t b, e; + const mem_seed_t *t = &c->seeds[i]; + b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg)); + e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); + rmax[0] = rmax[0] < b? rmax[0] : b; + rmax[1] = rmax[1] > e? rmax[1] : e; + if (t->len > max) max = t->len; + } + rmax[0] = rmax[0] > 0? rmax[0] : 0; + rmax[1] = rmax[1] < l_pac<<1? 
rmax[1] : l_pac<<1; + if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side + if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand + else rmax[0] = l_pac; + } + // retrieve the reference sequence + rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); + assert(c->rid == rid); + + srt = malloc(c->n * 8); + for (i = 0; i < c->n; ++i) + srt[i] = (uint64_t)c->seeds[i].len<<32 | i; + ks_introsort_64(c->n, srt); + + for (k = c->n - 1; k >= 0; --k) { + mem_alnreg_t *a; + s = &c->seeds[(uint32_t)srt[k]]; + + for (i = 0; i < av->n; ++i) { // test whether extension has been made before + mem_alnreg_t *p = &av->a[i]; + int64_t rd; + int qd, w, max_gap; + if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained + // qd: distance ahead of the seed on query; rd: on reference + qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; + max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed + w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width + if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + // similar to the previous four lines, but this time we look at the region behind + qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); + max_gap = cal_max_gap(opt, qd < rd? qd : rd); + w = max_gap < opt->w? max_gap : opt->w; + if (qd - rd < w && rd - qd < w) break; + } + if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln + if (bwa_verbose >= 4) + printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment. 
Confirming whether extension is needed...\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); + for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain + const mem_seed_t *t; + if (srt[i] == 0) continue; + t = &c->seeds[(uint32_t)srt[i]]; + if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping + if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; + if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; + } + if (i == c->n) { // no overlapping seeds; then skip extension + srt[k] = 0; // mark that seed extension has not been performed + continue; + } + if (bwa_verbose >= 4) + printf("** Seed(%d) might lead to a different alignment even though it is contained. Extension will be performed.\n", k); + } + + a = kv_pushp(mem_alnreg_t, *av); + memset(a, 0, sizeof(mem_alnreg_t)); + a->w = aw[0] = aw[1] = opt->w; + a->score = a->truesc = -1; + a->rid = c->rid; + + if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); + if (s->qbeg) { // left extension + uint8_t *rs, *qs; + int qle, tle, gtle, gscore; + qs = malloc(s->qbeg); + for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; + tmp = s->rbeg - rmax[0]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[0] = opt->w << i; + if (bwa_verbose >= 4) { + int j; + printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); + printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); + } + a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, 
&gscore, &max_off[0]); + if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } + if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; + } + // check whether we prefer to reach the end of the query + if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension + a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; + a->truesc = a->score; + } else { // to-end extension + a->qb = 0, a->rb = s->rbeg - gtle; + a->truesc = gscore; + } + free(qs); free(rs); + } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; + + if (s->qbeg + s->len != l_query) { // right extension + int qle, tle, qe, re, gtle, gscore, sc0 = a->score; + qe = s->qbeg + s->len; + re = s->rbeg + s->len - rmax[0]; + assert(re >= 0); + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[1] = opt->w << i; + if (bwa_verbose >= 4) { + int j; + printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); + printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); + } + a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); + if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } + if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; + } + // similar to the above + if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension + a->qe = qe + qle, a->re = rmax[0] + re + tle; + a->truesc += a->score - sc0; + } else { // to-end extension + a->qe = l_query, a->re = rmax[0] + re + gtle; + a->truesc += gscore - 
sc0; + } + } else a->qe = l_query, a->re = s->rbeg + s->len; + if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]); + + // compute seedcov + for (i = 0, a->seedcov = 0; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained + a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough + } + a->w = aw[0] > aw[1]? aw[0] : aw[1]; + } + free(srt); free(rseq); +} + +/***************************** + * Basic hit->SAM conversion * + *****************************/ + +static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) +{ + int w; + if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps + w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.); + if (w < abs(l1 - l2)) w = abs(l1 - l2); + return w; +} + +static inline int get_rlen(int n_cigar, const uint32_t *cigar) +{ + int k, l; + for (k = l = 0; k < n_cigar; ++k) { + int op = cigar[k]&0xf; + if (op == 0 || op == 2) + l += cigar[k]>>4; + } + return l; +} + +void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_) +{ + int i; + mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert + + if (m_) mtmp = *m_, m = &mtmp; + // set flag + p->flag |= m? 0x1 : 0; // is paired in sequencing + p->flag |= p->rid < 0? 0x4 : 0; // is mapped + p->flag |= m && m->rid < 0? 
0x8 : 0; // is mate mapped + if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment + p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0; + if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate + m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0; + p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand + p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand + + // print up to CIGAR + kputs(s->name, str); kputc('\t', str); // QNAME + kputw((p->flag&0xffff) | (p->flag&0x10000? 0x100 : 0), str); kputc('\t', str); // FLAG + if (p->rid >= 0) { // with coordinate + kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME + kputl(p->pos + 1, str); kputc('\t', str); // POS + kputw(p->mapq, str); kputc('\t', str); // MAPQ + if (p->n_cigar) { // aligned + for (i = 0; i < p->n_cigar; ++i) { + int c = p->cigar[i]&0xf; + if (c == 3 || c == 4) c = which? 4 : 3; // use hard clipping for supplementary alignments + kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); + } + } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) + } else kputsn("*\t0\t0\t*", 7, str); // without coordinte + kputc('\t', str); + + // print the mate position if applicable + if (m && m->rid >= 0) { + if (p->rid == m->rid) kputc('=', str); + else kputs(bns->anns[m->rid].name, str); + kputc('\t', str); + kputl(m->pos + 1, str); kputc('\t', str); + if (p->rid == m->rid) { + int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0); + int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0); + if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str); + else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? 
-1 : 0)), str); + } else kputc('0', str); + } else kputsn("*\t0\t0", 5, str); + kputc('\t', str); + + // print SEQ and QUAL + if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL + kputsn("*\t*", 3, str); + } else if (!p->is_rev) { // the forward strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if (which && ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3)) qb += p->cigar[0]>>4; + if (which && ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3)) qe -= p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } else { // the reverse strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if (which && ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3)) qe -= p->cigar[0]>>4; + if (which && ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3)) qb += p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } + + // print optional tags + if (p->n_cigar) { + kputsn("\tNM:i:", 6, str); kputw(p->NM, str); + kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str); + } + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } + if (!(p->flag & 0x100)) { // not multi-hit + for (i = 0; i < n; ++i) + if (i != which && !(list[i].flag&0x100)) break; + 
if (i < n) { // there are other primary hits; output them + kputsn("\tSA:Z:", 6, str); + for (i = 0; i < n; ++i) { + const mem_aln_t *r = &list[i]; + int k; + if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit + kputs(bns->anns[r->rid].name, str); kputc(',', str); + kputl(r->pos+1, str); kputc(',', str); + kputc("+-"[r->is_rev], str); kputc(',', str); + for (k = 0; k < r->n_cigar; ++k) { + kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + } + kputc(',', str); kputw(r->mapq, str); + kputc(',', str); kputw(r->NM, str); + kputc(';', str); + } + } + } + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } + kputc('\n', str); +} + +/************************ + * Integrated interface * + ************************/ + +int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) +{ + int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; + double identity; + sub = a->csub > sub? a->csub : sub; + if (sub >= a->score) return 0; + l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; + identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; + if (a->score == 0) { + mapq = 0; + } else if (opt->mapQ_coef_len > 0) { + double tmp; + tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l); + tmp *= identity * identity; + mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499); + } else { + mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499); + mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; + } + if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); + if (mapq > 60) mapq = 60; + if (mapq < 0) mapq = 0; + return mapq; +} + +// TODO (future plan): group hits into a uint64_t[] array. 
This will be cleaner and more flexible +void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +{ + kstring_t str; + kvec_t(mem_aln_t) aa; + int k; + + kv_init(aa); + str.l = str.m = 0; str.s = 0; + for (k = 0; k < a->n; ++k) { + mem_alnreg_t *p = &a->a[k]; + mem_aln_t *q; + if (p->score < opt->T) continue; + if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; + q = kv_pushp(mem_aln_t, aa); + *q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name); + if (q->rid < 0) { + --aa.n; + continue; + } + q->flag |= extra_flag; // flag secondary + if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score + if (k && p->secondary < 0) // if supplementary + q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; + if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + } + if (aa.n == 0) { // no alignments good enough; then write an unaligned record + mem_aln_t t; + t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + t.flag |= extra_flag; + mem_aln2sam(bns, &str, s, 1, &t, 0, m); + } else { + for (k = 0; k < aa.n; ++k) + mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m); + for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); + free(aa.a); + } + s->sam = str.s; +} + +mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) +{ + int i; + mem_chain_v chn; + mem_alnreg_v regs; + + for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so + seq[i] = seq[i] < 4? 
seq[i] : nst_nt4_table[(int)seq[i]]; + + chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq); + chn.n = mem_chain_flt(opt, chn.n, chn.a); + if (bwa_verbose >= 4) mem_print_chain(bns, &chn); + + kv_init(regs); + for (i = 0; i < chn.n; ++i) { + mem_chain_t *p = &chn.a[i]; + int ret; + if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); + if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns, pac, l_seq, (uint8_t*)seq, p); + else ret = mem_chain2aln_short(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs); + if (ret > 0) mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs); + free(chn.a[i].seeds); + } + free(chn.a); + regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); + if (opt->flag & MEM_F_SELF_OVLP) + regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); + if (bwa_verbose >= 4) { + err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); + for (i = 0; i < regs.n; ++i) { + mem_alnreg_t *p = &regs.a[i]; + printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); + } + } + return regs; +} + +mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar, const char *name) +{ + mem_aln_t a; + int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; + int64_t pos, rb, re; + uint8_t *query; + + memset(&a, 0, sizeof(mem_aln_t)); + if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record + a.rid = -1; a.pos = -1; a.flag |= 0x4; + return a; + } + qb = ar->qb, qe = ar->qe; + rb = ar->rb, re = ar->re; + query = malloc(l_query); + for (i = 0; i < l_query; ++i) // convert to the nt4 encoding + query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; + a.mapq = ar->secondary < 0? 
mem_approx_mapq_se(opt, ar) : 0; + if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment + tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); + w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); + w2 = w2 > tmp? w2 : tmp; + if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); + if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; + i = 0; a.cigar = 0; + do { + free(a.cigar); + a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); + if (score == last_sc) break; // it is possible that global alignment and local alignment give different scores + last_sc = score; + w2 <<= 1; + } while (++i < 3 && score < ar->truesc - opt->a); + l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; + a.NM = NM; + pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); + a.is_rev = is_rev; + if (a.n_cigar > 0) { // squeeze out leading or trailing deletions + if ((a.cigar[0]&0xf) == 2) { + pos += a.cigar[0]>>4; + --a.n_cigar; + memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD); + } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) { + --a.n_cigar; + memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly + } + } + if (qb != 0 || qe != l_query) { // add clipping to CIGAR + int clip5, clip3; + clip5 = is_rev? l_query - qe : qb; + clip3 = is_rev? 
qb : l_query - qe; + a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD); + if (clip5) { + memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping + a.cigar[0] = clip5<<4 | 3; + ++a.n_cigar; + } + if (clip3) { + memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping + a.cigar[a.n_cigar++] = clip3<<4 | 3; + } + } + a.rid = bns_pos2rid(bns, pos); + assert(a.rid == ar->rid); + a.pos = pos - bns->anns[a.rid].offset; + a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; + free(query); + return a; +} + +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) +{ + return mem_reg2aln2(opt, bns, pac, l_query, query_, ar, 0); +} + +typedef struct { + const mem_opt_t *opt; + const bwt_t *bwt; + const bntseq_t *bns; + const uint8_t *pac; + const mem_pestat_t *pes; + bseq1_t *seqs; + mem_alnreg_v *regs; + int64_t n_processed; +} worker_t; + +static void worker1(void *data, int i, int tid) +{ + worker_t *w = (worker_t*)data; + if (!(w->opt->flag&MEM_F_PE)) { + if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); + w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + } else { + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); + w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); + } +} + +static void worker2(void *data, int i, int tid) +{ + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); + extern void 
mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); + worker_t *w = (worker_t*)data; + if (!(w->opt->flag&MEM_F_PE)) { + if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); + if (w->opt->flag & MEM_F_ALN_REG) { + mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + } else { + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); + mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + } + free(w->regs[i].a); + } else { + if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); + free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); + } +} + +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +{ + extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); + worker_t w; + mem_alnreg_v *regs; + mem_pestat_t pes[4]; + double ctime, rtime; + + ctime = cputime(); rtime = realtime(); + global_bns = bns; + regs = malloc(n * sizeof(mem_alnreg_v)); + w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; + w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; + w.pes = &pes[0]; + kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions + if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided + if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 + else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data + } + kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? 
n>>1 : n); // generate alignment + free(regs); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); +} diff --git a/bwamem.h b/bwamem.h new file mode 100644 index 0000000..53472fe --- /dev/null +++ b/bwamem.h @@ -0,0 +1,172 @@ +#ifndef BWAMEM_H_ +#define BWAMEM_H_ + +#include "bwt.h" +#include "bntseq.h" +#include "bwa.h" + +#define MEM_MAPQ_COEF 30.0 +#define MEM_MAPQ_MAX 60 + +struct __smem_i; +typedef struct __smem_i smem_i; + +#define MEM_F_PE 0x2 +#define MEM_F_NOPAIRING 0x4 +#define MEM_F_ALL 0x8 +#define MEM_F_NO_MULTI 0x10 +#define MEM_F_NO_RESCUE 0x20 +#define MEM_F_SELF_OVLP 0x40 +#define MEM_F_ALN_REG 0x80 + +typedef struct { + int a, b; // match score and mismatch penalty + int o_del, e_del; + int o_ins, e_ins; + int pen_unpaired; // phred-scaled penalty for unpaired reads + int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. + int w; // band width + int zdrop; // Z-dropoff + + int T; // output score threshold; only affecting output + int flag; // see MEM_F_* macros + int min_seed_len; // minimum seed length + int min_chain_weight; + int max_chain_extend; + float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor + int split_width; // split into a seed if its occurence is smaller than this value + int max_occ; // skip a seed if its occurence is larger than this value + int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed + int n_threads; // number of threads + int chunk_size; // process chunk_size-bp sequences in a batch + float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits + float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain + float mask_level_redun; + 
float mapQ_coef_len; + int mapQ_coef_fac; + int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value + int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset +} mem_opt_t; + +typedef struct { + int64_t rb, re; // [rb,re): reference sequence in the alignment + int qb, qe; // [qb,qe): query sequence in the alignment + int rid; // reference seq ID + int score; // best local SW score + int truesc; // actual score corresponding to the aligned region; possibly smaller than $score + int sub; // 2nd best SW score + int csub; // SW score of a tandem hit + int sub_n; // approximate number of suboptimal hits + int w; // actual band width used in extension + int seedcov; // length of regions coverged by seeds + int secondary; // index of the parent hit shadowing the current hit; <0 if primary + uint64_t hash; +} mem_alnreg_t; + +typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; + +typedef struct { + int low, high; // lower and upper bounds within which a read pair is considered to be properly paired + int failed; // non-zero if the orientation is not supported by sufficient data + double avg, std; // mean and stddev of the insert size distribution +} mem_pestat_t; + +typedef struct { // This struct is only used for the convenience of API. 
+ int64_t pos; // forward strand 5'-end mapping position + int rid; // reference sequence index in bntseq_t; <0 for unmapped + int flag; // extra flag + uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance + int n_cigar; // number of CIGAR operations + uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 + + int score, sub; +} mem_aln_t; + +#ifdef __cplusplus +extern "C" { +#endif + + smem_i *smem_itr_init(const bwt_t *bwt); + void smem_itr_destroy(smem_i *itr); + void smem_set_query(smem_i *itr, int len, const uint8_t *query); + const bwtintv_v *smem_next(smem_i *itr); + + mem_opt_t *mem_opt_init(void); + void mem_fill_scmat(int a, int b, int8_t mat[25]); + + /** + * Align a batch of sequences and generate the alignments in the SAM format + * + * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. + * Note that $seqs[i].sam may consist of several SAM lines if the + * corresponding sequence has multiple primary hits. + * + * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query + * sequences must be interleaved: $n must be an even number and the 2i-th + * sequence and the (2i+1)-th sequence constitute a read pair. In this + * mode, there should be enough (typically >50) unique pairs for the + * routine to infer the orientation and insert size. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param n number of query sequences + * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call + * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, + * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. 
+ */ + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + + /** + * Find the aligned regions for one query sequence + * + * Note that this routine does not generate CIGAR. CIGAR should be + * generated later by mem_reg2aln() below. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence + * + * @return list of aligned regions. + */ + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); + + /** + * Generate CIGAR and forward-strand position from alignment region + * + * @param opt alignment parameters + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence + * @param ar one alignment region + * + * @return CIGAR, strand, mapping quality and forward-strand position + */ + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); + mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); + + /** + * Infer the insert size distribution from interleaved alignment regions + * + * This function can be called after mem_align1(), as long as paired-end + * reads are properly interleaved. 
+ * + * @param opt alignment parameters + * @param l_pac length of concatenated reference sequence + * @param n number of query sequences; must be an even number + * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair + * @param pes inferred insert size distribution (output) + */ + void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwamem_extra.c b/bwamem_extra.c new file mode 100644 index 0000000..96cdbcd --- /dev/null +++ b/bwamem_extra.c @@ -0,0 +1,107 @@ +#include "bwa.h" +#include "bwamem.h" +#include "bntseq.h" +#include "kstring.h" + +/*************************** + * SMEM iterator interface * + ***************************/ + +struct __smem_i { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + bwtintv_v *matches; // matches; to be returned by smem_next() + bwtintv_v *sub; // sub-matches inside the longest match; temporary + bwtintv_v *tmpvec[2]; // temporary arrays +}; + +smem_i *smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + itr->sub = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); + free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); + free(itr->matches->a); free(itr->matches); + free(itr->sub->a); free(itr->sub); + free(itr); +} + +void smem_set_query(smem_i *itr, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; +} + +const bwtintv_v *smem_next(smem_i *itr) +{ + int i, max, max_i, ori_start; + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; + if (itr->start >= itr->len || itr->start < 0) return 0; + while (itr->start < itr->len && 
itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return 0; + ori_start = itr->start; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM + if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here + for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match + bwtintv_t *p = &itr->matches->a[i]; + int len = (uint32_t)p->info - (p->info>>32); + if (max < len) max = len, max_i = i; + } + return itr->matches; +} + +/*********************** + *** Extra functions *** + ***********************/ + +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) +{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence + extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + mem_alnreg_v ar; + char *seq; + seq = malloc(l_seq); + memcpy(seq, seq_, l_seq); // makes a copy of seq_ + ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); + mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); + free(seq); + return ar; +} + +void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) +{ + int i; + kstring_t str = {0,0,0}; + for (i = 0; i < a->n; ++i) { + const mem_alnreg_t *p = &a->a[i]; + int is_rev, rid, qb = p->qb, qe = p->qe; + int64_t pos, rb = p->rb, re = p->re; + pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); + rid = bns_pos2rid(bns, pos); + assert(rid == p->rid); + pos -= bns->anns[rid].offset; + kputs(s->name, &str); kputc('\t', &str); + kputw(s->l_seq, &str); kputc('\t', &str); + if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap + kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); + kputw(bns->anns[rid].len, &str); kputc('\t', &str); + kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); + ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb)); + kputc('\n', &str); + } + s->sam = str.s; +} + diff --git a/bwamem_pair.c b/bwamem_pair.c new file mode 100644 index 0000000..bbd2cdb --- /dev/null +++ b/bwamem_pair.c @@ -0,0 +1,337 @@ +#include +#include +#include +#include +#include "kstring.h" +#include "bwamem.h" +#include "kvec.h" +#include "utils.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + +#define MIN_RATIO 0.8 +#define MIN_DIR_CNT 10 +#define MIN_DIR_RATIO 0.05 +#define OUTLIER_BOUND 2.0 +#define MAPPING_BOUND 3.0 +#define MAX_STDDEV 4.0 + +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) +{ + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1? p2 - b1 : b1 - p2; + return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); +} + +static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) +{ + int j; + for (j = 1; j < r->n; ++j) { // choose unique alignment + int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb; + int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe; + if (e_min > b_max) { // have overlap + int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? 
r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; + if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap + } + } + return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; +} + +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) +{ + int i, d, max; + uint64_v isize[4]; + memset(pes, 0, 4 * sizeof(mem_pestat_t)); + memset(isize, 0, sizeof(kvec_t(int)) * 4); + for (i = 0; i < n>>1; ++i) { + int dir; + int64_t is; + mem_alnreg_v *r[2]; + r[0] = (mem_alnreg_v*)®s[i<<1|0]; + r[1] = (mem_alnreg_v*)®s[i<<1|1]; + if (r[0]->n == 0 || r[1]->n == 0) continue; + if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; + if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; + if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr + dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); + if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); + } + if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); + for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. 
+ mem_pestat_t *r = &pes[d]; + uint64_v *q = &isize[d]; + int p25, p50, p75, x; + if (q->n < MIN_DIR_CNT) { + fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + r->failed = 1; + continue; + } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + ks_introsort_64(q->n, q->a); + p25 = q->a[(int)(.25 * q->n + .499)]; + p50 = q->a[(int)(.50 * q->n + .499)]; + p75 = q->a[(int)(.75 * q->n + .499)]; + r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + if (r->low < 1) r->low = 1; + r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high); + for (i = x = 0, r->avg = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->avg += q->a[i], ++x; + r->avg /= x; + for (i = 0, r->std = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg); + r->std = sqrt(r->std / x); + fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std); + r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499); + r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499); + if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499); + if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); + if (r->low < 1) r->low = 1; + fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); + free(q->a); + } + for (d = 0, max = 0; d < 4; ++d) + max = max > isize[d].n? 
max : isize[d].n; + for (d = 0; d < 4; ++d) + if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) { + pes[d].failed = 1; + fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + } +} + +int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +{ + extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun); + int64_t l_pac = bns->l_pac; + int i, r, skip[4], n = 0, rid; + for (r = 0; r < 4; ++r) + skip[r] = pes[r].failed? 1 : 0; + for (i = 0; i < ma->n; ++i) { // check which orinentation has been found + int64_t dist; + r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip[r] = 1; + } + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW + for (r = 0; r < 4; ++r) { + int is_rev, is_larger; + uint8_t *seq, *rev = 0, *ref; + int64_t rb, re; + if (skip[r]) continue; + is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate + is_larger = !(r>>1); // whether the mate has larger coordinate + if (is_rev) { + rev = malloc(l_ms); // this is the reverse complement of $ms + for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4; + seq = rev; + } else seq = (uint8_t*)ms; + if (!is_rev) { + rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high; + re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length + } else { + rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands + re = is_larger? 
a->rb + pes[r].high: a->rb - pes[r].low; + } + if (rb < 0) rb = 0; + if (re > l_pac<<1) re = l_pac<<1; + ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid); + if (a->rid == rid) { // no funny things happening + kswr_t aln; + mem_alnreg_t b; + int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); + memset(&b, 0, sizeof(mem_alnreg_t)); + if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 + b.rid = a->rid; + b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; + b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; + b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; + b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; +// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + // move b s.t. 
ma is sorted + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; + ma->a[i] = b; + } + ++n; + } + if (n) ma->n = mem_sort_and_dedup(ma->n, ma->a, opt->mask_level_redun); + if (rev) free(rev); + free(ref); + } + return n; +} + +int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) +{ + pair64_v v, u; + int r, i, k, y[4], ret; // y[] keeps the last hit + int64_t l_pac = bns->l_pac; + kv_init(v); kv_init(u); + for (r = 0; r < 2; ++r) { // loop through read number + for (i = 0; i < a[r].n; ++i) { + pair64_t key; + mem_alnreg_t *e = &a[r].a[i]; + key.x = e->rb < l_pac? e->rb : (l_pac<<1) - 1 - e->rb; // forward position + key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset); + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; + kv_push(pair64_t, v, key); + } + } + ks_introsort_128(v.n, v.a); + y[0] = y[1] = y[2] = y[3] = -1; + //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); + for (i = 0; i < v.n; ++i) { + for (r = 0; r < 2; ++r) { // loop through direction + int dir = r<<1 | (v.a[i].y>>1&1), which; + if (pes[dir].failed) continue; // invalid orientation + which = r<<1 | ((v.a[i].y&1)^1); + if (y[which] < 0) continue; // no previous hits + for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) + int64_t dist; + int q; + double ns; + pair64_t *p; + if ((v.a[k].y&3) != which) continue; + dist = (int64_t)v.a[i].x - v.a[k].x; + //printf("%d: %lld\n", k, dist); + if (dist > pes[dir].high) break; + if (dist < pes[dir].low) continue; + ns = (dist - pes[dir].avg) / pes[dir].std; + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. 
* erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4) + if (q < 0) q = 0; + p = kv_pushp(pair64_t, u); + p->y = (uint64_t)k<<32 | i; + p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); + //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); + } + } + y[v.a[i].y&3] = i; + } + if (u.n) { // found at least one proper pair + int tmp = opt->a + opt->b; + tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del; + tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins; + ks_introsort_128(u.n, u.a); + i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; + z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair + z[v.a[k].y&1] = v.a[k].y<<32>>34; + ret = u.a[u.n-1].x >> 32; + *sub = u.n > 1? u.a[u.n-2].x>>32 : 0; + for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) + if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; + } else ret = 0, *sub = 0, *n_sub = 0; + free(u.a); free(v.a); + return ret; +} + +#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) + +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) +{ + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); + extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); + extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); + + int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; + kstring_t str; + mem_aln_t h[2]; + + str.l = str.m = 0; str.s = 0; + if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment + mem_alnreg_v b[2]; + kv_init(b[0]); kv_init(b[1]); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if 
(a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); + } + mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); + mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); + if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; + // pairing single-end hits + if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { + int is_multi[2], q_pe, score_un, q_se[2]; + // check if an end has multiple hits even after mate-SW + for (i = 0; i < 2; ++i) { + for (j = 1; j < a[i].n; ++j) + if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; + is_multi[i] = j < a[i].n? 1 : 0; + } + if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score + // compute mapQ for the best SE hit + score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + subo = subo > score_un? subo : score_un; + q_pe = raw_mapq(o - subo, opt->a); + if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); + if (q_pe < 0) q_pe = 0; + if (q_pe > 60) q_pe = 60; + // the following assumes no split hits + if (o > score_un) { // paired alignment is preferred + mem_alnreg_t *c[2]; + c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]]; + for (i = 0; i < 2; ++i) { + if (c[i]->secondary >= 0) + c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; + q_se[i] = mem_approx_mapq_se(opt, c[i]); + } + q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; + q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? 
q_pe : q_se[1] + 40; + extra_flag |= 2; + // cap at the tandem repeat score + q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); + q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); + } else { // the unpaired alignment is preferred + z[0] = z[1] = 0; + q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); + q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); + } + // write SAM + h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; + h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; + mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; + mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; + if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + free(h[0].cigar); free(h[1].cigar); + } else goto no_pairing; + return n; + +no_pairing: + for (i = 0; i < 2; ++i) { + if (a[i].n && a[i].a[0].score >= opt->T) + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); + else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); + } + if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. 
+ int64_t dist; + int d; + d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); + if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; + } + mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); + mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); + if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); + free(h[0].cigar); free(h[1].cigar); + return n; +} diff --git a/bwape.c b/bwape.c index 6fe2c36..63203ce 100644 --- a/bwape.c +++ b/bwape.c @@ -8,8 +8,13 @@ #include "kvec.h" #include "bntseq.h" #include "utils.h" -#include "stdaln.h" #include "bwase.h" +#include "bwa.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif typedef struct { int n; @@ -21,24 +26,15 @@ typedef struct { bwtint_t low, high, high_bayesian; } isize_info_t; -typedef struct { - uint64_t x, y; -} b128_t; - -#define b128_lt(a, b) ((a).x < (b).x) #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) #define b128_hash(a) ((uint32_t)(a).x) #include "khash.h" -KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq) - -#include "ksort.h" -KSORT_INIT(b128, b128_t, b128_lt) -KSORT_INIT_GENERIC(uint64_t) +KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq) typedef struct { - kvec_t(b128_t) arr; - kvec_t(b128_t) pos[2]; + pair64_v arr; + pair64_v pos[2]; kvec_t(bwt_aln1_t) aln[2]; } pe_data_t; @@ -47,16 +43,14 @@ typedef struct { extern int g_log_n[256]; // in bwase.c static kh_b128_t *g_hash; -void adjust_pssm_score(const bntseq_t *bns, bwa_seq_t *seq, float prior); -void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); -void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); +void bwa_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int 
set_main, int n_multi); +void bwa_pssm_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); +void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s); int bwa_approx_mapQ(const bwa_seq_t *p, int mm); int bwa_pssm_approx_mapQ(const bwa_seq_t *p, int mm); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); bntseq_t *bwa_open_nt(const char *prefix); void bwa_print_sam_SQ(const bntseq_t *bns); -void bwa_print_sam_PG(); pe_opt_t *bwa_init_pe_opt() { @@ -72,19 +66,6 @@ pe_opt_t *bwa_init_pe_opt() po->ap_prior = 1e-5; return po; } - -static inline uint64_t hash_64(uint64_t key) -{ - key += ~(key << 32); - key ^= (key >> 22); - key += ~(key << 13); - key ^= (key >> 8); - key += (key << 3); - key ^= (key >> 15); - key += ~(key << 27); - key ^= (key >> 31); - return key; -} /* static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); { @@ -123,13 +104,18 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double free(isizes); return -1; } - ks_introsort(uint64_t, tot, isizes); + ks_introsort_64(tot, isizes); p25 = isizes[(int)(tot*0.25 + 0.5)]; p50 = isizes[(int)(tot*0.50 + 0.5)]; p75 = isizes[(int)(tot*0.75 + 0.5)]; tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); ii->low = tmp > max_len? 
tmp : max_len; // ii->low is unsigned ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (ii->low > ii->high) { + fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n"); + free(isizes); + return -1; + } for (i = 0, x = n = 0; i < tot; ++i) if (isizes[i] >= ii->low && isizes[i] <= ii->high) ++n, x += isizes[i]; @@ -173,7 +159,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, { int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; uint64_t o_score, subo_score; - b128_t last_pos[2][2], o_pos[2]; + pair64_t last_pos[2][2], o_pos[2]; max_len = p[0]->full_len; if (max_len < p[1]->full_len) max_len = p[1]->full_len; if (low_bound < max_len) low_bound = max_len; @@ -209,11 +195,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, o_score = subo_score = (uint64_t)-1; o_n = subo_n = 0; - ks_introsort(b128, d->arr.n, d->arr.a); + ks_introsort_128(d->arr.n, d->arr.a); for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; if (opt->type == BWA_PET_STD) { for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; + pair64_t x = d->arr.a[i]; int strand = x.y>>1&1; if (strand == 1) { // reverse strand, then check int y = 1 - (x.y&1); @@ -224,19 +210,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, last_pos[x.y&1][1] = x; } } - } else if (opt->type == BWA_PET_SOLID) { - for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; - int strand = x.y>>1&1; - if ((strand^x.y)&1) { // push - int y = 1 - (x.y&1); - __pairing_aux(last_pos[y][1], x); - __pairing_aux(last_pos[y][0], x); - } else { // check - last_pos[x.y&1][0] = last_pos[x.y&1][1]; - last_pos[x.y&1][1] = x; - } - } } else { fprintf(stderr, "[paring] not implemented yet!\n"); exit(1); @@ -312,11 +285,11 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw p[j] 
= seqs[j] + i; p[j]->n_multi = 0; p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2); - fread(&n_aln, 4, 1, fp_sa[j]); + err_fread_noeof(&n_aln, 4, 1, fp_sa[j]); if (n_aln > kv_max(d->aln[j])) kv_resize(bwt_aln1_t, d->aln[j], n_aln); d->aln[j].n = n_aln; - fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); + err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] // generate SE alignment and mapping quality bwa_aln2seq(n_aln, d->aln[j].a, p[j]); @@ -330,7 +303,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw } else { p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); } - p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len, &strand); + p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand); p[j]->strand = strand; } } @@ -354,8 +327,9 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) { // only when both ends mapped - b128_t x; - int j, k, n_occ[2]; + pair64_t x; + int j, k; + long long n_occ[2]; for (j = 0; j < 2; ++j) { n_occ[j] = 0; for (k = 0; k < d->aln[j].n; ++k) @@ -368,7 +342,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw bwt_aln1_t *r = d->aln[j].a + k; bwtint_t l; if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table - b128_t key; + pair64_t key; int ret; key.x = r->k; key.y = r->l; khint_t iter = kh_put(b128, g_hash, key, &ret); @@ -378,21 +352,21 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); for (l = r->k; l <= r->l; ++l) { int strand; - z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand)<<1; + z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1; 
z->a[l - r->k] |= strand; } } for (l = 0; l < kh_val(g_hash, iter).n; ++l) { x.x = kh_val(g_hash, iter).a[l]>>1; x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } else { // then calculate on the fly for (l = r->k; l <= r->l; ++l) { int strand; - x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); + x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand); x.y = k<<2 | strand<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } } @@ -403,16 +377,19 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if (opt->N_multi || opt->n_multi) { for (j = 0; j < 2; ++j) { if (p[j]->type != BWA_TYPE_NO_MATCH) { - int k; + int k, n_multi; if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi); } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); - for (k = 0; k < p[j]->n_multi; ++k) { + for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) { int strand; bwt_multi1_t *q = p[j]->multi + k; - q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len, &strand); + q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand); q->strand = strand; + if (q->pos != p[j]->pos) + p[j]->multi[n_multi++] = *q; } + p[j]->n_multi = n_multi; } } } @@ -436,16 +413,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw #define SW_MIN_MAPQ 17 // cnt = n_mm<<16 | n_gapo<<8 | n_gape -bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, - int *n_cigar, uint32_t *_cnt) +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) { + kswr_t r; + uint32_t *cigar32 = 0; bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; - int 
path_len, ret; - AlnParam ap = aln_param_bwa; - path_t *path, *p; + int xtra, gscore; + int8_t mat[25]; + bwa_fill_scmat(1, 3, mat); // check whether there are too many N's if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; for (k = 0, x = 0; k < len; ++k) @@ -456,15 +434,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u ref_seq = (ubyte_t*)calloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - path = (path_t*)calloc(l+len, sizeof(path_t)); // do alignment - ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0); - if (ret < 0) { - free(path); free(cigar); free(ref_seq); *n_cigar = 0; + xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); + r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); + gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); // check whether the alignment is good enough for (k = 0, x = y = 0; k < *n_cigar; ++k) { @@ -474,17 +456,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u else y += __cigar_len(c); } if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough - free(path); free(cigar); free(ref_seq); + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } { // update cigar and coordinate; - int start, end; - p = path + path_len - 1; - *beg += (p->i? p->i : 1) - 1; - start = (p->j? 
p->j : 1) - 1; - end = path->j; + int start = r.qb, end = r.qe + 1; + *beg += r.tb; cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); @@ -501,8 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u { // set *cnt int n_mm, n_gapo, n_gape; n_mm = n_gapo = n_gape = 0; - p = path + path_len - 1; - x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; + x = r.tb; y = r.qb; for (k = 0; k < *n_cigar; ++k) { bwa_cigar_t c = cigar[k]; if (__cigar_op(c) == FROM_M) { @@ -518,7 +496,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; } - free(ref_seq); free(path); + free(ref_seq); return cigar; } @@ -531,8 +509,8 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, // load reference sequence if (_pacseq == 0) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - rewind(bns->fp_pac); - fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + err_rewind(bns->fp_pac); + err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = (ubyte_t*)_pacseq; if (!popt->is_sw || ii->avg < 0.0) return pacseq; @@ -581,11 +559,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, ++n_tot[is_singleton]; cigar[0] = cigar[1] = 0; n_cigar[0] = n_cigar[1] = 0; - if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered + if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified ubyte_t *seq; if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip - if (popt->type == BWA_PET_STD) { + { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads if 
(p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate __set_rght_coor(beg[k], end[k], p[1-k], p[k]); seq = p[k]->rseq; @@ -594,17 +572,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, seq = p[k]->seq; seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly } - } else { // BWA_PET_SOLID - if (p[1-k]->strand == 0) { // R3-F3 pairing - if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->rseq; - seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed - } else { // F3-R3 pairing - if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->seq; - } } // perform SW alignment cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); @@ -661,19 +628,19 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, return pacseq; } -void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, j, n_seqs, tot_seqs = 0; bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa[2]; gap_opt_t opt, opt0; khint_t iter; isize_info_t last_ii; // this is for the last batch of reads - char str[1024]; + char str[1024], magic[2][4]; bwt_t *bwt; uint8_t *pac; @@ -688,27 +655,29 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f g_hash = kh_init(b128); last_ii.avg = -1.0; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); + err_fread_noeof(magic[0], 1, 4, fp_sa[0]); + 
err_fread_noeof(magic[1], 1, 4, fp_sa[1]); + if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) { + fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); opt0 = opt; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); - if (!(opt.mode & BWA_MODE_COMPREAD)) { - popt->type = BWA_PET_SOLID; - ntbns = bwa_open_nt(prefix); - } else { // for Illumina alignment only + { // for Illumina alignment only if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - rewind(bns->fp_pac); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + err_rewind(bns->fp_pac); + err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); } } // core loop - bwa_print_sam_SQ(bns); - bwa_print_sam_PG(); + bwa_print_sam_hdr(bns, rg_line); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; @@ -729,7 +698,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... 
"); for (j = 0; j < 2; ++j) - bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); if (pac == 0) free(pacseq); @@ -743,6 +712,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2); bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2); + if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); @@ -754,10 +724,9 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // destroy bns_destroy(bns); - if (ntbns) bns_destroy(ntbns); for (i = 0; i < 2; ++i) { bwa_seq_close(ks[i]); - fclose(fp_sa[i]); + err_fclose(fp_sa[i]); } for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); @@ -769,18 +738,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f int bwa_sai2sam_pe(int argc, char *argv[]) { - extern char *bwa_rg_line, *bwa_rg_id; - extern int bwa_set_rg(const char *s); int c; pe_opt_t *popt; + char *prefix, *rg_line = 0; + popt = bwa_init_pe_opt(); while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { switch (c) { case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'a': popt->max_isize = atoi(optarg); break; case 'o': popt->max_occ = atoi(optarg); break; @@ -814,8 +780,11 @@ int bwa_sai2sam_pe(int argc, char *argv[]) fprintf(stderr, "\n"); return 1; } - bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt); - free(bwa_rg_line); free(bwa_rg_id); - free(popt); + if ((prefix = 
bwa_idx_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 1; + } + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); + free(prefix); free(popt); return 0; } diff --git a/bwase.c b/bwase.c index 7d9f129..835db14 100644 --- a/bwase.c +++ b/bwase.c @@ -4,25 +4,27 @@ #include #include #include -#include -#include "stdaln.h" +#include #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" #include "utils.h" #include "kstring.h" +#include "bwa.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif int g_log_n[256]; -char *bwa_rg_line, *bwa_rg_id; float exp2f(float e); double exp2(double e); -void bwa_print_sam_PG(); - -void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) +void bwa_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) { int i, cnt, best; - if (n_aln == 0 || !aln) { + if (n_aln == 0) { s->type = BWA_TYPE_NO_MATCH; s->c1 = s->c2 = 0; return; @@ -35,6 +37,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma if (p->score > best) break; if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; + s->ref_shift = (int)p->n_del - (int)p->n_ins; s->score = p->score; s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); } @@ -70,6 +73,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma for (l = q->k; l <= q->l; ++l) { s->multi[z].pos = l; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest -= q->l - q->k + 1; @@ -80,6 +84,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - 
(int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest = 0; @@ -87,12 +92,6 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma } } s->n_multi = z; - /*// the following code removes the primary hit, but this leads to a bug in the PE mode - for (k = z = 0; k < s->n_multi; ++k) - if (s->multi[k].pos != s->sa) - s->multi[z++] = s->multi[k]; - s->n_multi = z < n_multi? z : n_multi; - */ } } @@ -100,7 +99,7 @@ float itf(int score) { return ((float)score) / 1000.; } -void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) +void bwa_pssm_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) { int i, cnt; double best_score, total_prob=0.0; @@ -128,7 +127,9 @@ void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int s s->best_pssm_score = best_score; s->posterior_prob = exp2((double)s->best_pssm_score) / total_prob; for (i = 0; i < n_aln; i++) { - const bwt_aln1_t *p = aln + i; + bwt_aln1_t *p = aln + i; + p->posterior_p = exp2((double)itf(p->pssm_score)) / total_prob; + if (itf(p->pssm_score) == best_score) s->c1 += p->l - p->k + 1; } @@ -160,6 +161,7 @@ void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int s for (l = q->k; l <= q->l; ++l) { s->multi[z].pos = l; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].posterior_p = q->posterior_p; s->multi[z++].mm = q->n_mm; } rest -= q->l - q->k + 1; @@ -170,6 +172,7 @@ void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int s while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].posterior_p = q->posterior_p; s->multi[z++].mm = q->n_mm; } rest = 0; @@ -194,14 +197,19 @@ void adjust_pssm_score(const bntseq_t *bns, bwa_seq_t *seq, float prior) { double P = prior; double p = seq->posterior_prob; double new_pp; + int i; new_pp = e / ((e / p) + L * ((1 - P) / P)); seq->posterior_prob = new_pp; - 
//fprintf(stderr, "best_score: %f e: %f L: %f P: %f p: %f new_pp: %f\n", seq->best_pssm_score, e, L, P, p, new_pp); + for (i = 0; i < seq->n_multi; i++) { + bwt_multi1_t *q = seq->multi + i; + p = q->posterior_p; + q->posterior_p = e / ((e / p) + L * ((1 - P) / P)); + } } -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) +void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s) { if (n_aln == 0 || !aln) { s->type = BWA_TYPE_NO_MATCH; @@ -234,17 +242,15 @@ int bwa_pssm_approx_mapQ(const bwa_seq_t *p, int mm) return (-10 * (logf(1.0 - p->posterior_prob) / logf(10))); } - -bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) +bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand) { bwtint_t pos_f; int is_rev; - pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f + pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate + if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1; + pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base *strand = !is_rev; - /* NB: For gapped alignment, pacpos may not be correct, which will be fixed - * in refine_gapped_core(). This line also determines the way "x" is - * calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */ - if (is_rev) pos_f = pos_f + 1 < len? 0 : pos_f - len + 1; // mapped to the forward strand + if (is_rev) pos_f = pos_f + 1 < ref_len? 
0 : pos_f - ref_len + 1; // position of the first base return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } @@ -265,85 +271,68 @@ void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, else seq->seQ = seq->mapQ = bwa_pssm_approx_mapQ(seq, max_diff); - seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand); + seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand); seq->strand = strand; if (! seq->pssm ) seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); else seq->seQ = seq->mapQ = bwa_pssm_approx_mapQ(seq, max_diff); + if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH; } void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) { - int i, j, strand; + int i, j, strand, n_multi; char str[1024]; bwt_t *bwt; // load forward SA strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); for (i = 0; i != n_seqs; ++i) { - bwa_cal_pac_pos_core(bns, bwt, &seqs[i], max_mm, fnr); - for (j = 0; j < seqs[i].n_multi; ++j) { - bwt_multi1_t *p = seqs[i].multi + j; - p->pos = bwa_sa2pos(bns, bwt, p->pos, seqs[i].len, &strand); - p->strand = strand; + bwa_seq_t *p = &seqs[i]; + bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); + for (j = n_multi = 0; j < p->n_multi; ++j) { + bwt_multi1_t *q = p->multi + j; + q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand); + q->strand = strand; + if (q->pos != p->pos && q->pos != (bwtint_t)-1) + p->multi[n_multi++] = *q; } + p->n_multi = n_multi; } bwt_destroy(bwt); } -/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on - * forward strand. This happens when p->pos is calculated by - * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct - * coordinate. This happens only for color-converted alignment. 
*/ -static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, - int ext, int *n_cigar, int is_end_correct) +#define SW_BW 50 + +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar) { bwa_cigar_t *cigar = 0; - ubyte_t *ref_seq; - int l = 0, path_len, ref_len; - AlnParam ap = aln_param_bwa; - path_t *path; - int64_t k, __pos = *_pos; - - ref_len = len + abs(ext); - if (ext > 0) { - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } else { - int64_t x = __pos + (is_end_correct? len : ref_len); - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } - path = (path_t*)calloc(l+len, sizeof(path_t)); - - aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); - - if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand - for (l = k = 0; k < *n_cigar; ++k) { - if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); - else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); - } - __pos += l; - } - - if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end - __pos += __cigar_len(cigar[0]); - for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; - --(*n_cigar); + uint32_t *cigar32 = 0; + ubyte_t *rseq; + int64_t k, rb, re, rlen; + int8_t mat[25]; + + bwa_fill_scmat(1, 3, mat); + rb = *_rb; re = rb + len + ref_shift; + assert(re <= l_pac); + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + assert(re - rb == rlen); + ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > abs(rlen - len) * 1.5? 
SW_BW : abs(rlen - len) * 1.5, n_cigar, &cigar32); + assert(*n_cigar > 0); + if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping + if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping + if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del + if ((cigar32[0]&0xf) == 2) { // delete beginning del + *_rb += cigar32[0]>>4; + --*n_cigar; + memmove(cigar32, cigar32+1, (*n_cigar) * 4); } - if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end - - // change "I" at either end of the read to S. just in case. This should rarely happen... - if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); - - *_pos = (bwtint_t)__pos; - free(ref_seq); free(path); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + free(rseq); return cigar; } @@ -382,7 +371,7 @@ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_ } } } else { // no gaps - for (z = u = 0; z < (bwtint_t)len; ++z) { + for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) { c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { ksprintf(str, "%d", u); @@ -433,79 +422,50 @@ void bwa_correct_trimmed(bwa_seq_t *s) s->len = s->full_len; } -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { - ubyte_t *pacseq, *ntpac = 0; - int i, j; + ubyte_t *pacseq; + int i, j, k; kstring_t *str; - if (ntbns) { // in color space - ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); - rewind(ntbns->fp_pac); - fread(ntpac, 1, 
ntbns->l_pac/4 + 1, ntbns->fp_pac); - } - if (!_pacseq) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - rewind(bns->fp_pac); - fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + err_rewind(bns->fp_pac); + err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = _pacseq; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! - for (j = 0; j < s->n_multi; ++j) { + for (j = k = 0; j < s->n_multi; ++j) { bwt_multi1_t *q = s->multi + j; int n_cigar; - if (q->gap == 0) continue; - q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 1 : -1) * q->gap, &n_cigar, 1); - q->n_cigar = n_cigar; - } - if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); - } - - if (ntbns) { // in color space - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - bwa_cs2nt_core(s, bns->l_pac, ntpac); - for (j = 0; j < s->n_multi; ++j) { - bwt_multi1_t *q = s->multi + j; - int n_cigar; - if (q->gap == 0) continue; - free(q->cigar); - q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 1 : -1) * q->gap, &n_cigar, 0); + if (q->gap) { // gapped alignment + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); q->n_cigar = n_cigar; - } - if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again - free(s->cigar); - s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 
1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); - } + if (q->cigar) s->multi[k++] = *q; + } else s->multi[k++] = *q; } + s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation + if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar); + if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } - // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; if (s->type != BWA_TYPE_NO_MATCH) { int nm; - s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, ntbns? ntpac : pacseq, str, &nm); + s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); s->nm = nm; } } free(str->s); free(str); // correct for trimmed reads - if (!ntbns) // trimming is only enabled for Illumina reads - for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); if (!_pacseq) free(pacseq); - free(ntpac); } int64_t pos_end(const bwa_seq_t *p) @@ -541,6 +501,26 @@ static int64_t pos_5(const bwa_seq_t *p) return -1; } +void bwa_print_seq(FILE *stream, bwa_seq_t *seq) { + char buffer[4096]; + const int bsz = sizeof(buffer); + int i, j, l; + + if (seq->strand == 0) { + for (i = 0; i < seq->full_len; i += bsz) { + l = seq->full_len - i > bsz ? bsz : seq->full_len - i; + for (j = 0; j < l; j++) buffer[j] = "ACGTN"[seq->seq[i + j]]; + err_fwrite(buffer, 1, l, stream); + } + } else { + for (i = seq->full_len - 1; i >= 0; i -= bsz) { + l = i + 1 > bsz ? 
bsz : i + 1; + for (j = 0; j < l; j++) buffer[j] = "TGCAN"[seq->seq[i - j]]; + err_fwrite(buffer, 1, l, stream); + } + } +} + void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) { int j; @@ -592,16 +572,14 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in else err_printf("\t*\t0\t0\t"); // print sequence and quality - if (p->strand == 0) - for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]); - else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]); - putchar('\t'); + bwa_print_seq(stdout, p); + err_putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { @@ -639,86 +617,34 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } } - putchar('\n'); + err_putchar('\n'); } else { // this read has no match - ubyte_t *s = p->strand? p->rseq : p->seq; + //ubyte_t *s = p->strand? p->rseq : p->seq; int flag = p->extra_flag | SAM_FSU; if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); - for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); - putchar('\t'); + //Why did this work differently to the version above?? 
+ //for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); + bwa_print_seq(stdout, p); + err_putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); - putchar('\n'); + err_putchar('\n'); } } -bntseq_t *bwa_open_nt(const char *prefix) -{ - bntseq_t *ntbns; - char *str; - str = (char*)calloc(strlen(prefix) + 10, 1); - strcat(strcpy(str, prefix), ".nt"); - ntbns = bns_restore(str); - free(str); - return ntbns; -} - -void bwa_print_sam_SQ(const bntseq_t *bns) -{ - int i; - for (i = 0; i < bns->n_seqs; ++i) - err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (bwa_rg_line) err_printf("%s\n", bwa_rg_line); -} - void bwase_initialize() { int i; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); } -char *bwa_escape(char *s) -{ - char *p, *q; - for (p = q = s; *p; ++p) { - if (*p == '\\') { - ++p; - if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; - else if (*p == '\\') *q++ = '\\'; - } else *q++ = *p; - } - *q = '\0'; - return s; -} - -int bwa_set_rg(const char *s) -{ - char *p, *q, *r; - if (strstr(s, "@RG") != s) return -1; - if (bwa_rg_line) free(bwa_rg_line); - if (bwa_rg_id) free(bwa_rg_id); - bwa_rg_line = strdup(s); - bwa_rg_id = 0; - bwa_escape(bwa_rg_line); - p = strstr(bwa_rg_line, "\tID:"); - if (p == 0) return -1; - p += 4; - for (q = p; *q && *q != '\t' && *q != '\n'; ++q); - bwa_rg_id = calloc(q - p + 1, 1); - for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) - *r++ = *q; - return 0; -} - -void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) +void bwa_sai2sam_se_core(const char *prefix, const char 
*fn_sa, const char *fn_fa, int n_occ, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, n_seqs, tot_seqs = 0, m_aln; @@ -726,9 +652,10 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; + char magic[4]; // initialization bwase_initialize(); @@ -737,11 +664,13 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fp_sa = xopen(fn_sa, "r"); m_aln = 0; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac - ntbns = bwa_open_nt(prefix); - bwa_print_sam_SQ(bns); - bwa_print_sam_PG(); + err_fread_noeof(magic, 1, 4, fp_sa); + if (strncmp(magic, SAI_MAGIC, 4) != 0) { + fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); + bwa_print_sam_hdr(bns, rg_line); // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop @@ -754,13 +683,13 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bwa_seq_t *p = seqs + i; p->posterior_prob = 5.0; int n_aln; - fread(&n_aln, 4, 1, fp_sa); + err_fread_noeof(&n_aln, 4, 1, fp_sa); if (n_aln > m_aln) { m_aln = n_aln; aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); } - fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); - if (aln && aln->pssm) { + err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); + if (aln && aln->pssm) { p->pssm = 1; bwa_pssm_aln2seq_core(n_aln, aln, p, 1, n_occ); adjust_pssm_score(bns, p, opt.prior); @@ -768,14 +697,14 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f p->pssm = 0; bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); } - } + } fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... 
"); bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); - bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] print alignments... "); @@ -789,23 +718,20 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // destroy bwa_seq_close(ks); - if (ntbns) bns_destroy(ntbns); bns_destroy(bns); - fclose(fp_sa); + err_fclose(fp_sa); free(aln); } int bwa_sai2sam_se(int argc, char *argv[]) { int c, n_occ = 3; + char *prefix, *rg_line = 0; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { switch (c) { case 'h': break; case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'n': n_occ = atoi(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; @@ -817,7 +743,11 @@ int bwa_sai2sam_se(int argc, char *argv[]) fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); return 1; } - bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ); - free(bwa_rg_line); free(bwa_rg_id); + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 1; + } + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); + free(prefix); return 0; } diff --git a/bwase.h b/bwase.h index f8e9b0a..3f1d2fa 100644 --- a/bwase.h +++ b/bwase.h @@ -14,11 +14,12 @@ extern "C" { // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. 
void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); // Refine the approximate position of the sequence to an actual placement for the sequence. - void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); // Backfill certain alignment properties mainly centering around number of matches. - void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. int64_t pos_end(const bwa_seq_t *p); + void adjust_pssm_score(const bntseq_t *bns, bwa_seq_t *seq, float prior); // bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); diff --git a/bwaseqio.c b/bwaseqio.c index edf90e7..bd6c6f6 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -6,7 +6,11 @@ #include "seq2pssm.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; @@ -27,6 +31,7 @@ bwa_seqio_t *bwa_bam_open(const char *fn, int which) bs->is_bam = 1; bs->which = which; bs->fp = bam_open(fn, "r"); + if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); h = bam_header_read(bs->fp); bam_header_destroy(h); return bs; @@ -45,9 +50,10 @@ bwa_seqio_t *bwa_seq_open(const char *fn) void bwa_seq_close(bwa_seqio_t *bs) { if (bs == 0) return; - if (bs->is_bam) bam_close(bs->fp); - else { - gzclose(bs->ks->f->f); + if (bs->is_bam) { + if (0 != bam_close(bs->fp)) err_fatal_simple("Error closing bam file"); + } else { + err_gzclose(bs->ks->f->f); kseq_destroy(bs->ks); } free(bs); @@ -74,16 +80,14 @@ void seq_reverse(int len, ubyte_t *seq, int is_comp) 
int bwa_trim_read(int trim_qual, bwa_seq_t *p) { - int s = 0, l, max = 0, max_l = p->len - 1; + int s = 0, l, max = 0, max_l = p->len; if (trim_qual < 1 || p->qual == 0) return 0; - for (l = p->len - 1; l >= BWA_MIN_RDLEN - 1; --l) { + for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) { s += trim_qual - (p->qual[l] - 33); if (s < 0) break; - if (s > max) { - max = s; max_l = l; - } + if (s > max) max = s, max_l = l; } - p->clip_len = p->len = max_l + 1; + p->clip_len = p->len = max_l; return p->full_len - p->len; } @@ -93,11 +97,12 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com int n_seqs, l, i; long n_trimmed = 0, n_tot = 0; bam1_t *b; + int res; b = bam_init1(); n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); - while (bam_read1(bs->fp, b) >= 0) { + while ((res = bam_read1(bs->fp, b)) >= 0) { uint8_t *s, *q; int go = 0; if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; @@ -129,6 +134,7 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com p->name = strdup((const char*)bam1_qname(b)); if (n_seqs == n_needed) break; } + if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); @@ -154,20 +160,21 @@ bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, in int n_seqs, l, i,j, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; - if (l_bc > 15) { - fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__); + if (l_bc > BWA_MAX_BCLEN) { + fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); return 0; } if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { 
- if (seq->qual.l == 0 && seq->type != KSEQ_TYPE_PSSM) { - fprintf(stderr, "Need either quality scores or a PSSM as input.\n"); - continue; - //exit(1); - } - + if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { + // skip reads that are marked to be filtered by Casava + char *s = index(seq->comment.s, ':'); + if (s && *(++s) == 'Y') { + continue; + } + } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length @@ -190,8 +197,7 @@ bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, in p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; - p->seq = (ubyte_t*)calloc(p->len, 1); - + p->seq = (ubyte_t*)calloc(p->full_len, 1); for (i = 0; i != p->full_len; ++i) p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; if (seq->qual.l) { // copy quality diff --git a/bwt.c b/bwt.c index a19f6d8..c9bf6a3 100644 --- a/bwt.c +++ b/bwt.c @@ -34,6 +34,10 @@ #include "bwt.h" #include "kvec.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + void bwt_gen_cnt_table(bwt_t *bwt) { int i, j; @@ -45,21 +49,28 @@ void bwt_gen_cnt_table(bwt_t *bwt) } } +static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA +{ + bwtint_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary? 0 : x; +} + // bwt->bwt and bwt->occ must be precalculated void bwt_cal_sa(bwt_t *bwt, int intv) { bwtint_t isa, sa, i; // S(isa) = sa + int intv_round = intv; + kv_roundup32(intv_round); + xassert(intv_round == intv, "SA sample interval is not a power of 2."); xassert(bwt->bwt, "bwt_t::bwt is not initialized."); if (bwt->sa) free(bwt->sa); bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); - if (bwt->sa == 0) { - fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. 
Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0); - abort(); - } // calculate SA value isa = 0; sa = bwt->seq_len; for (i = 0; i < bwt->seq_len; ++i) { @@ -73,8 +84,8 @@ void bwt_cal_sa(bwt_t *bwt, int intv) bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) { - bwtint_t sa = 0; - while (k % bwt->sa_intv != 0) { + bwtint_t sa = 0, mask = bwt->sa_intv - 1; + while (k & mask) { ++sa; k = bwt_invPsi(bwt, k); } @@ -92,23 +103,22 @@ static inline int __occ_aux(uint64_t y, int c) return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; } -inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) +bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) { - bwtint_t n, l, j; - uint32_t *p; + bwtint_t n; + uint32_t *p, *end; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; if (k == (bwtint_t)(-1)) return 0; - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt // retrieve Occ at k/OCC_INTERVAL n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; p += sizeof(bwtint_t); // jump to the start of the first BWT cell // calculate Occ up to the last k/32 - j = k >> 5 << 5; - for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) - n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1); + for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); // calculate Occ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); @@ -118,7 +128,7 @@ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) } // an analogy to bwt_occ() but more efficient, requiring k <= l -inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) +void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) { bwtint_t _k, _l; _k = (k >= bwt->primary)? 
k-1 : k; @@ -155,52 +165,53 @@ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) -inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) +void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { - bwtint_t l, j, x; - uint32_t *p; + bwtint_t x; + uint32_t *p, tmp, *end; if (k == (bwtint_t)(-1)) { memset(cnt, 0, 4 * sizeof(bwtint_t)); return; } - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); memcpy(cnt, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); - j = k >> 4 << 4; - for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) - x += __occ_aux4(bwt, *p); - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) + end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop + for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p); + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } // an analogy to bwt_occ4() but more efficient, requiring k <= l -inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) +void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { bwtint_t _k, _l; - _k = (k >= bwt->primary)? k-1 : k; - _l = (l >= bwt->primary)? 
l-1 : l; - if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + _k = k - (k >= bwt->primary); + _l = l - (l >= bwt->primary); + if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { bwt_occ4(bwt, k, cntk); bwt_occ4(bwt, l, cntl); } else { - bwtint_t i, j, x, y; - uint32_t *p; - if (k >= bwt->primary) --k; // because $ is not in bwt - if (l >= bwt->primary) --l; + bwtint_t x, y; + uint32_t *p, tmp, *endk, *endl; + k -= (k >= bwt->primary); // because $ is not in bwt + l -= (l >= bwt->primary); p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) // prepare cntk[] - j = k >> 4 << 4; - for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p) - x += __occ_aux4(bwt, *p); + endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); + endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4)); + for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p); y = x; - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); // calculate cntl[] and finalize cntk[] - j = l >> 4 << 4; - for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); - y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); + for (; p < endl; ++p) y += __occ_aux4(bwt, *p); + tmp = *p & ~((1U<<((~l&15)<<1)) - 1); + y += __occ_aux4(bwt, tmp) - (~l&15); memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; @@ -274,7 +285,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } -int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) 
{ int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -282,41 +293,45 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem mem->n = 0; if (q[x] > 3) return x + 1; + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); kv_init(a[1]); - prev = tmpvec[0]? tmpvec[0] : &a[0]; - curr = tmpvec[1]? tmpvec[1] : &a[1]; - bwt_set_intv(bwt, q[x], ik); + prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided + curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] > 3) break; - c = 3 - q[i]; - bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] != ik.x[2]) // change of the interval size + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] != ik.x[2]) { // change of the interval size + kv_push(bwtintv_t, *curr, ik); + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + } + ik = ok[c]; ik.info = i + 1; + } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] == 0) break; // cannot be extended - ik = ok[c]; ik.info = i + 1; + break; // always terminate extension at an ambiguous base; in this case, ia[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs - if (q[i] > 3) break; - c = i < 0? 0 : q[i]; + c = i < 0? -1 : q[i] < 4? 
q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further - if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; kv_push(bwtintv_t, *mem, ik); } } // otherwise the match is contained in another longer match - } - if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } @@ -326,27 +341,97 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem } bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate - if (tmpvec[0] == 0) free(a[0].a); - if (tmpvec[1] == 0) free(a[1].a); + if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); + if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } -int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]) +/************************* + * Read/write BWT and SA * + *************************/ + +void bwt_dump_bwt(const char *fn, const bwt_t *bwt) { - int x = 0, i; - bwtintv_v a[3], *tvec[2], *mem1; - kv_init(a[0]); kv_init(a[1]); kv_init(a[2]); // no memory allocation here - tvec[0] = tmpvec[0]? tmpvec[0] : &a[0]; - tvec[1] = tmpvec[1]? tmpvec[1] : &a[1]; - mem1 = tmpvec[2]? 
tmpvec[2] : &a[2]; - mem->n = 0; - do { - x = bwt_smem1(bwt, len, q, x, mem1, tvec); - for (i = 0; i < mem1->n; ++i) - kv_push(bwtintv_t, *mem, mem1->a[i]); - } while (x < len); - if (tmpvec[0] == 0) free(a[0].a); - if (tmpvec[1] == 0) free(a[1].a); - if (tmpvec[2] == 0) free(a[2].a); - return mem->n; + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); + err_fflush(fp); + err_fclose(fp); +} + +void bwt_dump_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + err_fflush(fp); + err_fclose(fp); +} + +static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) +{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks + const int bufsize = 0x1000000; // 16M block + bwtint_t offset = 0; + while (size) { + int x = bufsize < size? 
bufsize : size; + if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break; + size -= x; offset += x; + } + return offset; +} + +void bwt_restore_sa(const char *fn, bwt_t *bwt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + + fp = xopen(fn, "rb"); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); + err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip + err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + + bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa[0] = -1; + + fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); + err_fclose(fp); +} + +bwt_t *bwt_restore_bwt(const char *fn) +{ + bwt_t *bwt; + FILE *fp; + + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + fp = xopen(fn, "rb"); + err_fseek(fp, 0, SEEK_END); + bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); + err_fseek(fp, 0, SEEK_SET); + err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fread_fix(fp, bwt->bwt_size<<2, bwt->bwt); + bwt->seq_len = bwt->L2[4]; + err_fclose(fp); + bwt_gen_cnt_table(bwt); + + return bwt; +} + +void bwt_destroy(bwt_t *bwt) +{ + if (bwt == 0) return; + free(bwt->sa); free(bwt->bwt); + free(bwt); } diff --git a/bwt.h b/bwt.h index d7a219f..d2ff0ac 100644 --- a/bwt.h +++ b/bwt.h @@ -29,16 +29,18 @@ #define BWA_BWT_H #include +#include -// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line -#define OCC_INTERVAL 0x80 +// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 +#define OCC_INTV_SHIFT 7 +#define OCC_INTERVAL (1LL<>((~(k)&0xf)<<1)&3) 
-// inverse Psi function -#define bwt_invPsi(bwt, k) \ - (((k) == (bwt)->primary)? 0 : \ - ((k) < (bwt)->primary)? \ - (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ - : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) - #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) #ifdef __cplusplus @@ -101,20 +96,30 @@ extern "C" { void bwt_bwtupdate_core(bwt_t *bwt); - inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); - inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); + bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); + void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k); // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values void bwt_gen_cnt_table(bwt_t *bwt); - inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); - inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); + void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); + void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); + /** + * Extend bi-SA-interval _ik_ + */ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); - int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]); + + /** + * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. + * Return the end of the longest exact match starting from _x_. 
+ */ + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + + // SMEM iterator interface #ifdef __cplusplus } diff --git a/bwt_gen.c b/bwt_gen.c index ef64e0f..6139d80 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -27,8 +27,13 @@ #include #include #include +#include #include "QSufSort.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef uint64_t bgint_t; typedef int64_t sbgint_t; @@ -235,22 +240,22 @@ static unsigned int ceilLog2(const unsigned int input) } // for ConvertBytePackedToWordPacked() -static unsigned int BitPerBytePackedChar(const unsigned int alphabet_size) +static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) { unsigned int bitPerChar; - bitPerChar = ceilLog2(alphabet_size); + bitPerChar = ceilLog2(alphabetSize); // Return the largest number of bit that does not affect packing efficiency if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); return bitPerChar; } // for ConvertBytePackedToWordPacked() -static unsigned int BitPerWordPackedChar(const unsigned int alphabet_size) +static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) { - return ceilLog2(alphabet_size); + return ceilLog2(alphabetSize); } -static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabet_size, +static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, const bgint_t textLength) { bgint_t i; @@ -266,8 +271,8 @@ static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned i unsigned int buffer[BITS_IN_WORD]; - bitPerBytePackedChar = BitPerBytePackedChar(alphabet_size); - bitPerWordPackedChar = BitPerWordPackedChar(alphabet_size); + bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); + bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); 
charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; charPerWord = BITS_IN_WORD / bitPerWordPackedChar; @@ -870,6 +875,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r bgint_t i, c; bgint_t s, r; bgint_t lastRank, lastIndex; + bgint_t oldInverseSa0RelativeRank = 0; bgint_t freq; lastIndex = numItem; @@ -880,6 +886,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r s = seq[numItem]; relativeRank[s] = numItem; if (lastRank == oldInverseSa0) { + oldInverseSa0RelativeRank = numItem; oldInverseSa0++; // so that this segment of code is not run again lastRank++; // so that oldInverseSa0 become a sorted group with 1 item } @@ -912,6 +919,7 @@ static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __r lastRank = r; relativeRank[s] = i; if (r == oldInverseSa0) { + oldInverseSa0RelativeRank = i; oldInverseSa0++; // so that this segment of code is not run again lastRank++; // so that oldInverseSa0 become a sorted group with 1 item } @@ -941,12 +949,15 @@ static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) { + unsigned int bitsInWordMinusBitPerChar; bgint_t leftShift, rightShift; bgint_t o; bgint_t oIndex, iIndex, mIndex; bgint_t mWord, mChar, oWord, oChar; bgint_t numInsert; + bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; + oIndex = 0; iIndex = 0; mIndex = 0; @@ -1437,13 +1448,29 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB packedFile = (FILE*)fopen(inputFileName, "rb"); if (packedFile == NULL) { - fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); + fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open %s : %s\n", + inputFileName, strerror(errno)); exit(1); } - 
fseek(packedFile, -1, SEEK_END); + if (fseek(packedFile, -1, SEEK_END) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } packedFileLen = ftell(packedFile); - fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); + if (packedFileLen == -1) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't ftell on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(&lastByteLength, sizeof(unsigned char), 1, packedFile) != 1) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? strerror(errno) : "Unexpected end of file"); + exit(1); + } totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); @@ -1457,10 +1484,23 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB } textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte - fseek(packedFile, -2, SEEK_CUR); - fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); - fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); - fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); + if (fseek(packedFile, -((long)textSizeInByte + 2), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile) != textSizeInByte + 1) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? 
strerror(errno) : "Unexpected end of file"); + exit(1); + } + if (fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); @@ -1473,9 +1513,23 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB textToLoad = totalTextLength - processedTextLength; } textSizeInByte = textToLoad / CHAR_PER_BYTE; - fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); - fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); - fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile) != textSizeInByte) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? 
strerror(errno) : "Unexpected end of file"); + exit(1); + } + if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; @@ -1520,15 +1574,28 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o bwtFile = (FILE*)fopen(bwtFileName, "wb"); if (bwtFile == NULL) { - fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); + fprintf(stderr, + "BWTSaveBwtCodeAndOcc(): Cannot open %s for writing: %s\n", + bwtFileName, strerror(errno)); exit(1); } - fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); - fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); bwtLength = BWTFileSizeInWord(bwt->textLength); - fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); - fclose(bwtFile); + + if (fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile) != 1 + || fwrite(bwt->cumulativeFreq + 1, + sizeof(bgint_t), ALPHABET_SIZE, bwtFile) != ALPHABET_SIZE + || fwrite(bwt->bwtCode, + sizeof(unsigned int), bwtLength, bwtFile) != bwtLength) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error writing to %s : %s\n", + bwtFileName, strerror(errno)); + exit(1); + } + if (fclose(bwtFile) != 0) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error on closing %s : %s\n", + bwtFileName, strerror(errno)); + exit(1); + } } void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) diff --git a/bwt_lite.c b/bwt_lite.c index dd411e1..9b47270 100644 --- a/bwt_lite.c +++ b/bwt_lite.c @@ -3,6 +3,10 @@ #include #include "bwt_lite.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + int is_sa(const uint8_t *T, uint32_t *SA, int n); int is_bwt(uint8_t *T, int n); @@ -52,7 +56,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) 
} return b; } -inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) +uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) { uint32_t n, b; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; @@ -65,7 +69,7 @@ inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) if (c == 0) n -= 15 - (k&15); // corrected for the masked bits return n; } -inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) +void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) { uint32_t x, b; if (k == (uint32_t)(-1)) { @@ -80,7 +84,7 @@ inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) x -= 15 - (k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } -inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) +void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) { bwtl_occ4(bwt, k, cntk); bwtl_occ4(bwt, l, cntl); diff --git a/bwt_lite.h b/bwt_lite.h index 0096b93..4fadcce 100644 --- a/bwt_lite.h +++ b/bwt_lite.h @@ -17,9 +17,9 @@ extern "C" { #endif bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq); - inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); - inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); - inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); + uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); + void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); + void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); void bwtl_destroy(bwtl_t *bwt); #ifdef __cplusplus diff --git a/bwtaln.c b/bwtaln.c index 5ecf03b..cd2a887 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -11,11 +11,16 @@ #include "bwtaln.h" #include "bwtgap.h" #include "utils.h" +#include "bwa.h" #ifdef HAVE_PTHREAD #include #endif +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif 
+ gap_opt_t *gap_init_opt() { gap_opt_t *o; @@ -45,291 +50,280 @@ gap_opt_t *gap_init_opt() int bwa_cal_maxdiff(int l, double err, double thres) { - double elambda = exp(-l * err); - double sum, y = 1.0; - int k, x = 1; - for (k = 1, sum = elambda; k < 1000; ++k) { - y *= l * err; - x *= k; - sum += elambda * y / x; - if (1.0 - sum < thres) return k; - } - return 2; + double elambda = exp(-l * err); + double sum, y = 1.0; + int k, x = 1; + for (k = 1, sum = elambda; k < 1000; ++k) { + y *= l * err; + x *= k; + sum += elambda * y / x; + if (1.0 - sum < thres) return k; + } + return 2; } // width must be filled as zero -static int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) +int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) { - bwtint_t k, l, ok, ol; - int i, bid; - bid = 0; - k = 0; l = bwt->seq_len; - for (i = 0; i < len; ++i) { - ubyte_t c = str[i]; - if (c < 4) { - bwt_2occ(bwt, k - 1, l, c, &ok, &ol); - k = bwt->L2[c] + ok + 1; - l = bwt->L2[c] + ol; - } - if (k > l || c > 3) { // then restart - k = 0; - l = bwt->seq_len; - ++bid; - } - width[i].w = l - k + 1; - width[i].bid = bid; - } - width[len].w = 0; - width[len].bid = ++bid; - return bid; + bwtint_t k, l, ok, ol; + int i, bid; + bid = 0; + k = 0; l = bwt->seq_len; + for (i = 0; i < len; ++i) { + ubyte_t c = str[i]; + if (c < 4) { + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + } + if (k > l || c > 3) { // then restart + k = 0; + l = bwt->seq_len; + ++bid; + } + width[i].w = l - k + 1; + width[i].bid = bid; + } + width[len].w = 0; + width[len].bid = ++bid; + return bid; } void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) { - int i, j, max_l = 0, max_len; - gap_stack_t *stack; - bwt_width_t *w, *seed_w; - gap_opt_t local_opt = *opt; + int i, j, max_l = 0, max_len; + gap_stack_t *stack; + bwt_width_t *w, *seed_w; + gap_opt_t local_opt = *opt; - 
// initiate priority stack - for (i = max_len = 0; i != n_seqs; ++i) - if (seqs[i].len > max_len) max_len = seqs[i].len; - if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); - if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; - stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); + // initiate priority stack + for (i = max_len = 0; i != n_seqs; ++i) + if (seqs[i].len > max_len) max_len = seqs[i].len; + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); + if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; + stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); - seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - w = 0; - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p = seqs + i; + seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + w = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; #ifdef HAVE_PTHREAD - if (i % opt->n_threads != tid) continue; + if (i % opt->n_threads != tid) continue; #endif - p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; - if (max_l < p->len) { - max_l = p->len; - w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); - memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); - } - bwt_cal_width(bwt, p->len, p->seq, w); - if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); - local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; - if (p->len > opt->seed_len) - bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); - // core function - for (j = 0; j < p->len; ++j) // we need to complement - p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; - p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 
0 : seed_w, &local_opt, &p->n_aln, stack); - // clean up the unused data in the record - free(p->name); free(p->seq); free(p->rseq); free(p->qual); - p->name = 0; p->seq = p->rseq = p->qual = 0; - } - free(seed_w); free(w); - gap_destroy_stack(stack); + p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; + if (max_l < p->len) { + max_l = p->len; + w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); + memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); + } + bwt_cal_width(bwt, p->len, p->seq, w); + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); + local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; + if (p->len > opt->seed_len) + bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); + // core function + for (j = 0; j < p->len; ++j) // we need to complement + p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; + p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 
0 : seed_w, &local_opt, &p->n_aln, stack); + //fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo); + // clean up the unused data in the record + free(p->name); free(p->seq); free(p->rseq); free(p->qual); + p->name = 0; p->seq = p->rseq = p->qual = 0; + } + free(seed_w); free(w); + gap_destroy_stack(stack); } #ifdef HAVE_PTHREAD typedef struct { - int tid; - bwt_t *bwt; - int n_seqs; - bwa_seq_t *seqs; - const gap_opt_t *opt; + int tid; + bwt_t *bwt; + int n_seqs; + bwa_seq_t *seqs; + const gap_opt_t *opt; } thread_aux_t; static void *worker(void *data) { - thread_aux_t *d = (thread_aux_t*)data; - bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); - return 0; + thread_aux_t *d = (thread_aux_t*)data; + bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); + return 0; } #endif bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) { - bwa_seqio_t *ks; - if (mode & BWA_MODE_BAM) { // open BAM - int which = 0; - if (mode & BWA_MODE_BAM_SE) which |= 4; - if (mode & BWA_MODE_BAM_READ1) which |= 1; - if (mode & BWA_MODE_BAM_READ2) which |= 2; - if (which == 0) which = 7; // then read all reads - ks = bwa_bam_open(fn_fa, which); - } else ks = bwa_seq_open(fn_fa); - return ks; + bwa_seqio_t *ks; + if (mode & BWA_MODE_BAM) { // open BAM + int which = 0; + if (mode & BWA_MODE_BAM_SE) which |= 4; + if (mode & BWA_MODE_BAM_READ1) which |= 1; + if (mode & BWA_MODE_BAM_READ2) which |= 2; + if (which == 0) which = 7; // then read all reads + ks = bwa_bam_open(fn_fa, which); + } else ks = bwa_seq_open(fn_fa); + return ks; } void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) { - int i, n_seqs, tot_seqs = 0; - bwa_seq_t *seqs; - bwa_seqio_t *ks; - clock_t t; - bwt_t *bwt; + int i, n_seqs, tot_seqs = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bwt_t *bwt; - g_visited = 0; - // initialization - ks = bwa_open_reads(opt->mode, fn_fa); + // initialization + ks 
= bwa_open_reads(opt->mode, fn_fa); - { // load BWT - char *str = (char*)calloc(strlen(prefix) + 10, 1); - strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); - free(str); - } + { // load BWT + char *str = (char*)calloc(strlen(prefix) + 10, 1); + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + free(str); + } - // core loop - err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { - tot_seqs += n_seqs; - t = clock(); + // core loop + err_fwrite(SAI_MAGIC, 1, 4, stdout); + err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); - fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); + fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); #ifdef HAVE_PTHREAD - if (opt->n_threads <= 1) { // no multi-threading at all - bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); - } else { - pthread_t *tid; - pthread_attr_t attr; - thread_aux_t *data; - int j; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (j = 0; j < opt->n_threads; ++j) { - data[j].tid = j; data[j].bwt = bwt; - data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; - pthread_create(&tid[j], &attr, worker, data + j); - } - for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); - free(data); free(tid); - } + if (opt->n_threads <= 1) { // no multi-threading at all + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = 
(pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + data[j].tid = j; data[j].bwt = bwt; + data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; + pthread_create(&tid[j], &attr, worker, data + j); + } + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + free(data); free(tid); + } #else - bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); #endif - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - t = clock(); - fprintf(stderr, "[bwa_aln_core] write to the disk... "); - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - err_fwrite(&p->n_aln, 4, 1, stdout); - if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); - } - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + t = clock(); + fprintf(stderr, "[bwa_aln_core] write to the disk... 
"); + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + err_fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - bwa_free_read_seq(n_seqs, seqs); - fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); - } + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } - fprintf(stderr, "g_visited: %lu\n", g_visited); - // destroy - bwt_destroy(bwt); - bwa_seq_close(ks); + // destroy + bwt_destroy(bwt); + bwa_seq_close(ks); } int bwa_aln(int argc, char *argv[]) { - int c, opte = -1; - gap_opt_t *opt; + int c, opte = -1; + gap_opt_t *opt; + char *prefix; - opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { - switch (c) { - case 'n': - if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; - else opt->max_diff = atoi(optarg), opt->fnr = -1.0; - break; - case 'o': opt->max_gapo = atoi(optarg); break; - case 'e': opte = atoi(optarg); break; - case 'M': opt->s_mm = atoi(optarg); break; - case 'O': opt->s_gapo = atoi(optarg); break; - case 'E': opt->s_gape = atoi(optarg); break; - case 'd': opt->max_del_occ = atoi(optarg); break; - case 'i': opt->indel_end_skip = atoi(optarg); break; - case 'l': opt->seed_len = atoi(optarg); break; - case 'k': opt->max_seed_diff = atoi(optarg); break; - case 'm': opt->max_entries = atoi(optarg); break; - case 't': opt->n_threads = atoi(optarg); break; - case 'L': opt->mode |= BWA_MODE_LOGGAP; break; - case 'R': opt->max_top2 = atoi(optarg); break; - case 'q': opt->trim_qual = atoi(optarg); break; - case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; - case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; - case 'f': xreopen(optarg, "wb", stdout); break; - case 'b': opt->mode |= BWA_MODE_BAM; break; - case '0': 
opt->mode |= BWA_MODE_BAM_SE; break; - case '1': opt->mode |= BWA_MODE_BAM_READ1; break; - case '2': opt->mode |= BWA_MODE_BAM_READ2; break; - case 'I': opt->mode |= BWA_MODE_IL13; break; - case 'Y': opt->mode |= BWA_MODE_CFY; break; - case 'B': opt->mode |= atoi(optarg) << 24; break; - default: return 1; - } - } - if (opte > 0) { - opt->max_gape = opte; - opt->mode &= ~BWA_MODE_GAPE; - } + opt = gap_init_opt(); + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + switch (c) { + case 'n': + if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; + else opt->max_diff = atoi(optarg), opt->fnr = -1.0; + break; + case 'o': opt->max_gapo = atoi(optarg); break; + case 'e': opte = atoi(optarg); break; + case 'M': opt->s_mm = atoi(optarg); break; + case 'O': opt->s_gapo = atoi(optarg); break; + case 'E': opt->s_gape = atoi(optarg); break; + case 'd': opt->max_del_occ = atoi(optarg); break; + case 'i': opt->indel_end_skip = atoi(optarg); break; + case 'l': opt->seed_len = atoi(optarg); break; + case 'k': opt->max_seed_diff = atoi(optarg); break; + case 'm': opt->max_entries = atoi(optarg); break; + case 't': opt->n_threads = atoi(optarg); break; + case 'L': opt->mode |= BWA_MODE_LOGGAP; break; + case 'R': opt->max_top2 = atoi(optarg); break; + case 'q': opt->trim_qual = atoi(optarg); break; + case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; + case 'f': xreopen(optarg, "wb", stdout); break; + case 'b': opt->mode |= BWA_MODE_BAM; break; + case '0': opt->mode |= BWA_MODE_BAM_SE; break; + case '1': opt->mode |= BWA_MODE_BAM_READ1; break; + case '2': opt->mode |= BWA_MODE_BAM_READ2; break; + case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; + case 'B': opt->mode |= atoi(optarg) << 24; break; + default: return 1; + } + } + if (opte > 0) { + opt->max_gape = opte; + opt->mode &= ~BWA_MODE_GAPE; + } - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, 
"Usage: bwa aln [options] \n\n"); - fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", - BWA_AVG_ERR, opt->fnr); - fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); - fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); - fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); - fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); - fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); - fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); - fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); - fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); - fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); - fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); - fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa aln [options] \n\n"); + fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", + BWA_AVG_ERR, opt->fnr); + fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); + fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); + fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); + fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); + fprintf(stderr, " -l INT seed length [%d]\n", 
opt->seed_len); + fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); + fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); + fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); + fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); + fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); + fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); - fprintf(stderr, " -B INT length of barcode\n"); - fprintf(stderr, " -c input sequences are in the color space\n"); - fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); - fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); - fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); - fprintf(stderr, " -b the input read file is in the BAM format\n"); - fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); - fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); - fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); - fprintf(stderr, " -Y filter Casava-filtered sequences\n"); - fprintf(stderr, "\n"); - return 1; - } - if (opt->fnr > 0.0) { - int i, k; - for (i = 17, k = 0; i <= 250; ++i) { - int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); - if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); - k = l; - } - } - bwa_aln_core(argv[optind], argv[optind+1], opt); - free(opt); - return 0; -} - -/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, - __cigar_op and __cigar_len while keeping stdaln stand alone */ -bwa_cigar_t *bwa_aln_path2cigar(const 
path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - bwa_cigar_t *cigar; - int i; - cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); - cigar = (bwa_cigar_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); - return cigar; + fprintf(stderr, " -B INT length of barcode\n"); + fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); + fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); + fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); + fprintf(stderr, " -b the input read file is in the BAM format\n"); + fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); + fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); + fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); + fprintf(stderr, "\n"); + return 1; + } + if (opt->fnr > 0.0) { + int i, k; + for (i = 17, k = 0; i <= 250; ++i) { + int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); + k = l; + } + } + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(opt); + return 1; + } + bwa_aln_core(prefix, argv[optind+1], opt); + free(opt); free(prefix); + return 0; } - diff --git a/bwtaln.h b/bwtaln.h index 0b16859..d92cdcb 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -2,6 +2,7 @@ #define BWTALN_H #include +#include #include "pssm.h" #include "bwt.h" @@ -29,6 +30,13 @@ #define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) #endif +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + +#define SAI_MAGIC "SAI\1" + typedef struct { bwtint_t w; int bid; @@ -36,10 +44,10 @@ typedef struct { } bwt_width_t; typedef struct { - uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1; + uint64_t n_mm:8, 
n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; bwtint_t k, l; - int score; float pssm_score; + float posterior_p; char pssm; //indicate whether this alignment was made with a PSSM search } bwt_aln1_t; @@ -56,8 +64,10 @@ typedef uint16_t bwa_cigar_t; typedef struct { uint32_t n_cigar:15, gap:8, mm:8, strand:1; + int ref_shift; bwtint_t pos; bwa_cigar_t *cigar; + float posterior_p; } bwt_multi1_t; typedef struct { @@ -76,6 +86,7 @@ typedef struct { // alignment information bwtint_t sa, pos; uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ + int ref_shift; int n_cigar; bwa_cigar_t *cigar; // for multi-threading only @@ -129,7 +140,6 @@ typedef struct { } gap_opt_t; #define BWA_PET_STD 1 -#define BWA_PET_SOLID 2 typedef struct { int max_isize, force_isize; @@ -149,7 +159,6 @@ extern "C" { gap_opt_t *gap_init_opt(); void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt); - bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); bwa_seqio_t *bwa_seq_open(const char *fn); bwa_seqio_t *bwa_bam_open(const char *fn, int which); void bwa_seq_close(bwa_seqio_t *bs); @@ -162,13 +171,6 @@ extern "C" { void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); - - /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, - __cigar_op and __cigar_len while keeping stdaln stand alone */ -#include "stdaln.h" - - bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - #ifdef __cplusplus } #endif diff --git a/bwtgap.c b/bwtgap.c index c4e06f7..08bc1f4 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -4,27 +4,30 @@ #include "bwtgap.h" #include "bwtaln.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + #define STATE_M 0 #define STATE_I 1 #define STATE_D 2 #define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) -gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +gap_stack_t *gap_init_stack2(int max_score) { - 
int i; gap_stack_t *stack; stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); - stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt); + stack->n_stacks = max_score; stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); - for (i = 0; i != stack->n_stacks; ++i) { - gap_stack1_t *p = stack->stacks + i; - p->m_entries = 4; - p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t)); - } return stack; } +gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +{ + return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); +} + void gap_destroy_stack(gap_stack_t *stack) { int i; @@ -42,7 +45,7 @@ static void gap_reset_stack(gap_stack_t *stack) stack->n_entries = 0; } -static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, +static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del, int state, int is_diff, const gap_opt_t *opt) { int score; @@ -51,12 +54,14 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, i score = aln_score(n_mm, n_gapo, n_gape, opt); q = stack->stacks + score; if (q->n_entries == q->m_entries) { - q->m_entries <<= 1; + q->m_entries = q->m_entries? q->m_entries<<1 : 4; q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; - p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; + p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; + p->n_ins = n_ins; p->n_del = n_del; + p->state = state; p->last_diff_pos = is_diff? 
i : 0; ++(q->n_entries); ++(stack->n_entries); @@ -103,7 +108,7 @@ static inline int int_log2(uint32_t v) bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) -{ +{ // $seq is the reverse complement of the input read int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; int best_cnt = 0; @@ -123,57 +128,39 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); gap_reset_stack(stack); // reset stack - gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt); + gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt); while (stack->n_entries) { gap_entry_t e; int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; bwtint_t k, l, cnt_k[4], cnt_l[4], occ; - g_visited++; if (max_entries < stack->n_entries) max_entries = stack->n_entries; - if (stack->n_entries > opt->max_entries) { - //fprintf(stderr, "break1\n"); - break; - } + if (stack->n_entries > opt->max_entries) break; gap_pop(stack, &e); // get the best entry k = e.k; l = e.l; // SA interval i = e.info&0xffff; // length - if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) { - //fprintf(stderr, "break2\n"); - break; // no need to proceed - } + if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed m = max_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; - if (m < 0) { - //fprintf(stderr, "continue1\n"); - continue; - } + if (m < 0) continue; if (seed_width) { // apply seeding m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; } - //ooga - //fprintf(stderr, 
"#1\t[%d,%d,%d,%c]\t[%d,%d,%d][%d]\t[%u,%lu]\t[%lu,%lu]\t%d\n", stack->n_entries, i, seq[i], "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, max_diff, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); - if (i > 0 && m < width[i-1].bid) { - //fprintf(stderr, "continue2\n"); - continue; - } + //printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); + if (i > 0 && m < width[i-1].bid) continue; // check whether a hit is found hit_found = 0; if (i == 0) hit_found = 1; else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; - else { - //fprintf(stderr, "continue3\n"); - continue; // no hit, skip - } + else continue; // no hit, skip } if (hit_found) { // action for found hits - //fprintf(stderr, "#hit_found\n"); int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt); int do_add = 1; //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l); @@ -185,16 +172,10 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour } if (score == best_score) best_cnt += l - k + 1; - else if (best_cnt > opt->max_top2) { - //fprintf(stderr, "break3\n"); - break; // top2b behaviour - } + else if (best_cnt > opt->max_top2) break; // top2b behaviour if (e.n_gapo) { // check whether the hit has been found. 
this may happen when a gap occurs in a tandem repeat for (j = 0; j != n_aln; ++j) - if (aln[j].k == k && aln[j].l == l) { - //fprintf(stderr, "break4\n"); - break; - } + if (aln[j].k == k && aln[j].l == l) break; if (j < n_aln) do_add = 0; } if (do_add) { // append @@ -207,11 +188,12 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid } p = aln + n_aln; p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; + p->n_ins = e.n_ins; p->n_del = e.n_del; p->k = k; p->l = l; p->score = score; + //fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del); ++n_aln; } - //fprintf(stderr, "continue4\n"); continue; } @@ -236,24 +218,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid if (e.state == STATE_M) { // gap open if (e.n_gapo < opt->max_gapo) { // gap open is allowed // insertion - gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt); // deletion for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } else if (e.state == STATE_I) { // extention of an insertion if (e.n_gape < opt->max_gape) // gap extention is allowed - gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt); } else if (e.state == STATE_D) { // extention of a deletion if (e.n_gape < opt->max_gape) { // gap extention is allowed if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, 
i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } @@ -266,17 +248,16 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid int is_mm = (j != 4 || seq[i] > 3); k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt); } } else if (seq[i] < 4) { // try exact match only int c = seq[i] & 3; k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt); } } - //fprintf(stderr, "finished, stack->n_entries: %d\n", stack->n_entries); *_n_aln = n_aln; //fprintf(stderr, "max_entries = %d\n", max_entries); return aln; diff --git a/bwtgap.h b/bwtgap.h index 01ee359..7dd6165 100644 --- a/bwtgap.h +++ b/bwtgap.h @@ -7,8 +7,9 @@ typedef struct { // recursion stack u_int32_t info; // score<<21 | i u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; - bwtint_t k, l; // (k,l) is the SA region of [i,n-1] + u_int32_t n_ins:16, n_del:16; int last_diff_pos; + bwtint_t k, l; // (k,l) is the SA region of [i,n-1] } gap_entry_t; typedef struct { @@ -25,6 +26,7 @@ typedef struct { extern "C" { #endif + gap_stack_t *gap_init_stack2(int max_score); gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); void gap_destroy_stack(gap_stack_t *stack); bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, diff --git a/bwtindex.c b/bwtindex.c index 6056988..9e3ec15 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -33,20 +33,167 @@ #include #include "bntseq.h" #include "bwt.h" 
-#include "main.h" #include "utils.h" -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is); -void bwa_pac_rev_core(const char *fn, const char *fn_rev); +#ifdef _DIVBWT +#include "divsufsort.h" +#endif -int bwa_index(int argc, char *argv[]) +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) +{ + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + err_fseek(fp, -1, SEEK_END); + pac_len = err_ftell(fp); + err_fread_noeof(&c, 1, 1, fp); + err_fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + err_fread_noeof(buf2, 1, pac_size, fp); + err_fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! 
+{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command { + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + 
bwt_destroy(bwt); + return 0; +} + +int bwa_index(int argc, char *argv[]) // the "index" command +{ + extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); + char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_color = 0; + int c, algo_type = 0, is_64 = 0; clock_t t; int64_t l_pac; - while ((c = getopt(argc, argv, "ca:p:")) >= 0) { + while ((c = getopt(argc, argv, "6a:p:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; @@ -55,50 +202,39 @@ int bwa_index(int argc, char *argv[]) else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = strdup(optarg); break; - case 'c': is_color = 1; break; + case '6': is_64 = 1; break; default: return 1; } } if (optind + 1 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa index [-a bwtsw|div|is] [-c] \n\n"); - fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [is]\n"); + fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] \n\n"); + fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); - fprintf(stderr, " -c build color-space index\n\n"); + fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); + fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); - fprintf(stderr, " `-a div' does not work not for long genomes. Please choose `-a'\n"); + fprintf(stderr, " `-a div' do not work not for long genomes. 
Please choose `-a'\n"); fprintf(stderr, " according to the length of the genome.\n\n"); return 1; } - if (prefix == 0) prefix = strdup(argv[optind]); + if (prefix == 0) { + prefix = malloc(strlen(argv[optind]) + 4); + strcpy(prefix, argv[optind]); + if (is_64) strcat(prefix, ".64"); + } str = (char*)calloc(strlen(prefix) + 10, 1); str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); - if (is_color == 0) { // nucleotide indexing + { // nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); - } else { // color indexing - gzFile fp = xzopen(argv[optind], "r"); - strcat(strcpy(str, prefix), ".nt"); - t = clock(); - fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); - l_pac = bns_fasta2bntseq(fp, str, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); - { - char *tmp_argv[3]; - tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; - t = clock(); - fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); - bwa_pac2cspac(3, tmp_argv); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } + err_gzclose(fp); } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { @@ -132,7 +268,7 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... 
"); l_pac = bns_fasta2bntseq(fp, prefix, 1); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); + err_gzclose(fp); } { bwt_t *bwt; diff --git a/bwtpssm.c b/bwtpssm.c index 6458655..37825fc 100644 --- a/bwtpssm.c +++ b/bwtpssm.c @@ -251,7 +251,6 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) qualprobs = phred_ascii_quality_scores(qbase); // initialization - g_visited = 0; ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT @@ -264,7 +263,8 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) mc = markov_chain(bwt->L2, 4); // core loop - fwrite(opt, sizeof(gap_opt_t), 1, stdout); + err_fwrite(SAI_MAGIC, 1, 4, stdout); + err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_pssm_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual, mc, qualprobs, opt)) != 0) { tot_seqs += n_seqs; t = clock(); @@ -301,16 +301,14 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) fprintf(stderr, "[bwa_pssm_core] write to the disk... 
"); for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; - fwrite(&p->n_aln, 4, 1, stdout); - if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + err_fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); bwa_free_read_seq(n_seqs, seqs); fprintf(stderr, "[bwa_pssm_core] %d sequences have been processed.\n", tot_seqs); } - fprintf(stderr, "g_visited: %lu\n", g_visited); - free(mc->p); free(mc->powers); free(mc->counts); @@ -373,6 +371,7 @@ int bwa_pssm(int argc, char *argv[]) default: return 1; } } + if (opte > 0) { opt->max_gape = opte; opt->mode &= ~BWA_MODE_GAPE; @@ -420,6 +419,8 @@ int bwa_pssm(int argc, char *argv[]) opt->prior = 0.8; } + /* The maximum number of mismatches is set to 30 when using PSSMs + * if (opt->fnr > 0.0) { int i, k; for (i = 17, k = 0; i <= 250; ++i) { @@ -428,6 +429,8 @@ int bwa_pssm(int argc, char *argv[]) k = l; } } + + */ bwa_pssm_core(argv[optind], argv[optind+1], opt); free(opt); return 0; diff --git a/bwtpssm.h b/bwtpssm.h index fcef371..5993295 100644 --- a/bwtpssm.h +++ b/bwtpssm.h @@ -13,6 +13,7 @@ extern "C" { bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual, Probs *mc, float *qualprobs,const gap_opt_t *opt); void bwa_cal_pssm_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); + bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, __cigar_op and __cigar_len while keeping stdaln stand alone */ diff --git a/bwtpssmgap.c b/bwtpssmgap.c index afbc4ad..d5218ef 100644 --- a/bwtpssmgap.c +++ b/bwtpssmgap.c @@ -292,30 +292,16 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const int curr_threshold; min_score = -INT_MAX; - g_visited++; visited++; - //no more space - /* - while 
(gp_heap->empty_left < 10) { - //fprintf(stderr, "burning entries\n"); - min_score = gap_destroy_min(gp_heap); - } - */ - gap_pop(gp_heap, mat->id, &e); // get the best entry k = e.k; l = e.l; // SA interval a = e.info>>20&1; i = e.info&0xffff; // strand, length - // fprintf(stderr, "best_found: %f mat->be[mat->length-1]-e.score_offset: %f\n", best_found, mat->be[mat->length-1] + e.score_offset); if (!(opt->mode & BWA_MODE_NONSTOP) && best_found > mat->be[mat->length-1] + e.score_offset + desired_mapq) { break; } - //fprintf(stderr, "e.score_offset: %f min_score: %f\n", e.score_offset, min_score); - // - //if (i == 4) - //fprintf(stderr, "yay"); //fprintf(stderr, "pssm #1 id:%d %d \t[%d][%d,%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%lu]\t[%lu,%lu]\t%d\t[%6d, **%6d**, %6d, %6d]\n", mat->id, i, max_entries, gp_heap->empty_left, a, i, seq[i], "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos, curr_score, e.score_offset, mat->thresholds[i], mat->bi[i]); @@ -342,7 +328,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const if (i == 0) { if (curr_score > best_found) { if (num_hits_found >= 2) { - //fprintf(stderr, "moving thresholds\n"); calc_and_set_reverse_thresholds(mat, 1, get_length(mat), curr_score); addMinWidthToThresholds(mat, width); } @@ -355,7 +340,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const hit_found = 1; num_hits_found += 1; e.score_offset = 0; - //e.pssm_score = mat->be[i-1]; } else { continue; // no hit, skip @@ -393,7 +377,7 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); } p = aln + n_aln; - p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a; + p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->k = k; p->l = l; p->score = score; p->pssm_score = curr_score; @@ -507,7 +491,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, 
const ubyte_t *seq, const curr_offset = -((mat->be[i] - mat->be[i-1]) - base_score) + e.score_offset; } - //fprintf(stderr, "base_score: %d\n", base_score); if (curr_offset > min_score) { k = bwt->L2[c] + cnt_k[c] + 1; @@ -517,7 +500,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const if (k <= l) gap_push(gp_heap, mat->id, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt, curr_score + base_score, curr_offset); } } - //fprintf(stderr, "id: %d gp_heap->empty_left: %d gp_heap->size-2: %d\n", mat->id, gp_heap->empty_left , (gp_heap->size - 2)); gap_finish_push(gp_heap); } diff --git a/bwtsw2.h b/bwtsw2.h index 3c93509..0ec9676 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -6,10 +6,16 @@ #include "bwt_lite.h" #include "bwt.h" +#define BSW2_FLAG_MATESW 0x100 +#define BSW2_FLAG_TANDEM 0x200 +#define BSW2_FLAG_MOVED 0x400 +#define BSW2_FLAG_RESCUED 0x800 + typedef struct { - int a, b, q, r, t, qr, bw; - int z, is, t_seeds, hard_clip; - float yita, mask_level, coef; + int skip_sw:8, cpy_cmt:8, hard_clip:16; + int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; + int z, is, t_seeds, multi_2nd; + float mask_level, coef; int n_threads, chunk_size; } bsw2opt_t; @@ -20,11 +26,15 @@ typedef struct { int beg, end; } bsw2hit_t; +typedef struct { + int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm; + uint32_t *cigar; +} bsw2aux_t; + typedef struct { int n, max; bsw2hit_t *hits; - int *n_cigar; - uint32_t **cigar; + bsw2aux_t *aux; } bwtsw2_t; typedef struct { @@ -33,18 +43,25 @@ typedef struct { uint8_t *aln_mem; } bsw2global_t; +typedef struct { + int l, tid; + char *name, *seq, *qual, *sam, *comment; +} bsw2seq1_t; + #ifdef __cplusplus extern "C" { #endif bsw2opt_t *bsw2_init_opt(); bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn); + void bsw2_aln(const 
bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2); void bsw2_destroy(bwtsw2_t *b); bsw2global_t *bsw2_global_init(); void bsw2_global_destroy(bsw2global_t *_pool); + void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); + #ifdef __cplusplus } #endif diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 447ffe0..d225187 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -11,16 +11,22 @@ #include "bwt_lite.h" #include "utils.h" #include "bwtsw2.h" -#include "stdaln.h" #include "kstring.h" +#include "bwa.h" +#include "ksw.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) #include "ksort.h" #define __left_lt(a, b) ((a).end > (b).end) KSORT_INIT(hit, bsw2hit_t, __left_lt) +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + extern unsigned char nst_nt4_table[256]; unsigned char nt_comp_table[256] = { @@ -50,9 +56,12 @@ bsw2opt_t *bsw2_init_opt() bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; o->bw = 50; - o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; - o->mask_level = 0.50f; o->yita = 5.5f; o->coef = 5.5f; + o->max_ins = 20000; + o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; + o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; + o->max_chain_gap = 10000; + o->cpy_cmt = 0; return o; } @@ -60,12 +69,25 @@ void bsw2_destroy(bwtsw2_t *b) { int i; if (b == 0) return; - if (b->cigar) - for (i = 0; i < b->n; ++i) free(b->cigar[i]); - free(b->cigar); free(b->n_cigar); free(b->hits); + if (b->aux) + for (i = 0; i < b->n; ++i) free(b->aux[i].cigar); + free(b->aux); free(b->hits); free(b); } +bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) +{ + bwtsw2_t *p; + p = calloc(1, sizeof(bwtsw2_t)); + p->max = p->n = b->n; + if (b->n) { + kroundup32(p->max); + p->hits = calloc(p->max, sizeof(bsw2hit_t)); + memcpy(p->hits, 
b->hits, p->n * sizeof(bsw2hit_t)); + } + return p; +} + #define __gen_ap(par, opt) do { \ int i; \ for (i = 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \ @@ -77,13 +99,12 @@ void bsw2_destroy(bwtsw2_t *b) void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target = 0, *query; - AlnParam par; + int8_t mat[25]; - par.matrix = matrix; - __gen_ap(par, opt); + bwa_fill_scmat(opt->a, opt->b, mat); query = calloc(lq, 1); // sort according to the descending order of query end ks_introsort(hit, b->n, b->hits); @@ -94,8 +115,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int score, j; - path_t path; + int score, j, qle, tle; p->n_seeds = 1; if (p->l || p->k == 0) continue; for (j = score = 0; j < i; ++j) { @@ -110,12 +130,12 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! 
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; - p->len += path.i; - p->beg -= path.j; - p->k -= path.i; + p->k -= tle; + p->len += tle; + p->beg -= qle; } } free(query); free(target); @@ -123,84 +143,72 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target; - AlnParam par; - - par.matrix = matrix; - __gen_ap(par, opt); + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int j, score; - path_t path; + int j, score, qle, tle; if (p->l) continue; for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; - p->len = path.i; - p->end = path.j + p->beg; + p->len = tle; + p->end = p->beg + qle; } } free(target); } /* generate CIGAR array(s) in b->cigar[] */ -static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b) +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name) { - uint8_t *target; - 
int i, matrix[25]; - AlnParam par; - path_t *path; - - par.matrix = matrix; - __gen_ap(par, opt); - i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length - target = calloc(i, 1); - path = calloc(i + lq, sizeof(path_t)); - // memory clean up for b - if (b->n < b->max) { - b->max = b->n; - b->hits = realloc(b->hits, b->n * sizeof(bsw2hit_t)); - } - if (b->cigar) free(b->cigar); - if (b->n_cigar) free(b->n_cigar); - b->cigar = (uint32_t**)calloc(b->max, sizeof(void*)); - b->n_cigar = (int*)calloc(b->max, sizeof(int)); - // generate CIGAR + int i; + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; + bsw2aux_t *q = b->aux + i; uint8_t *query; - bwtint_t k; - int path_len, beg, end; + int beg, end, score; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 1 : 0] + beg; - for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here - target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; - aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); - b->cigar[i] = aln_path2cigar32(path, path_len, &b->n_cigar[i]); - if (beg != 0 || end < lq) { // write soft clipping - b->cigar[i] = realloc(b->cigar[i], 4 * (b->n_cigar[i] + 2)); + q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm); +#if 0 + if (name && score != p->G) { // debugging only + int j, glen = 0; + for (j = 0; j < q->n_cigar; ++j) + if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) + glen += q->cigar[j]>>4; + fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n", + __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); + } +#endif + if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping + q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); 
if (beg != 0) { - memmove(b->cigar[i] + 1, b->cigar[i], b->n_cigar[i] * 4); - b->cigar[i][0] = beg<<4 | 4; - ++b->n_cigar[i]; + memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); + q->cigar[0] = beg<<4 | 4; + ++q->n_cigar; } if (end < lq) { - b->cigar[i][b->n_cigar[i]] = (lq - end)<<4 | 4; - ++b->n_cigar[i]; + q->cigar[q->n_cigar] = (lq - end)<<4 | 4; + ++q->n_cigar; } } } - free(target); free(path); } /* this is for the debugging purpose only */ @@ -211,7 +219,7 @@ void bsw2_debug_hits(const bwtsw2_t *b) for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; if (p->G > 0) - printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); + printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); } } @@ -270,12 +278,13 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 } } b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" - bsw2_chain_filter(opt, l, b); + bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained for (k = 0; k < 2; ++k) { bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here bsw2_resolve_duphits(0, 0, bb[k][0], 0); bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem); + bsw2_resolve_duphits(0, 0, bb[k][0], 0); b[k] = bb[k][0]; free(bb[k]); } @@ -309,17 +318,12 @@ static void flag_fr(bwtsw2_t *b[2]) } } -typedef struct { - int l, tid; - char *name, *seq, *qual, *sam; -} bsw2seq1_t; - typedef struct { int n, max; bsw2seq1_t *seq; } bsw2seq_t; -static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) +static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) { // FIXME: this routine does not work if the query bridge three 
reference sequences int32_t coor, refl, lq; @@ -392,9 +396,85 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n return n_cigar; } +static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) +{ + int i; + // allocate for b->aux + if (b->n<<1 < b->max) { + b->max = b->n; + kroundup32(b->max); + b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t)); + } + b->aux = calloc(b->n, sizeof(bsw2aux_t)); + // generate CIGAR + gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name); + // fix CIGAR, generate mapQ, and write chromosomal position + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = &b->hits[i]; + bsw2aux_t *q = &b->aux[i]; + q->flag = p->flag & 0xfe; + q->isize = 0; + if (p->l == 0) { // unique hit + float c = 1.0; + int subo; + // fix out-of-boundary CIGAR + q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); + // compute mapQ + subo = p->G2 > opt->t? p->G2 : opt->t; + if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; + if (p->n_seeds < 2) c *= .2; + q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); + if (q->qual > 250) q->qual = 250; + if (q->qual < 0) q->qual = 0; + if (p->flag&1) q->qual = 0; // this is a random hit + q->pqual = q->qual; // set the paired qual as qual + // get the chromosomal position + q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr); + q->pos = p->k - bns->anns[q->chr].offset; + } else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0; + } +} + +static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m) +{ + int i; + if (m == 0) return; + // update flag, mchr and mpos + for (i = 0; i < b->n; ++i) { + bsw2aux_t *q = &b->aux[i]; + q->flag |= 1; // paired + if (m->n == 0) q->flag |= 8; // mate unmapped + if (m->n == 1) { + q->mchr = m->aux[0].chr; + q->mpos = m->aux[0].pos; + if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand + if (q->chr == q->mchr) { // set insert size + if 
(q->mpos + m->hits[0].len > q->pos) + q->isize = q->mpos + m->hits[0].len - q->pos; + else q->isize = q->mpos - q->pos - b->hits[0].len; + } else q->isize = 0; + } else q->mchr = q->mpos = -1; + } + // update mapping quality + if (b->n == 1 && m->n == 1) { + bsw2hit_t *p = &b->hits[0]; + if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman + if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20) + b->aux[0].pqual = 20; + if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; + } else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired + if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual] + b->aux[0].pqual += 20; + if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; + if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual; + } + } + } +} + /* generate SAM lines for a sequence in ks with alignment stored in * b. ks->name and ks->seq will be freed and set to NULL in the end. */ -static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b) +static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate) { int i, k; kstring_t str; @@ -410,51 +490,50 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks } for (i = 0; b && i < b->n; ++i) { bsw2hit_t *p = b->hits + i; - int seqid = -1; - int64_t coor = -1; - int j, qual, nn = 0; - int beg, end; - if (p->l == 0) { - b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]); - nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); - coor = p->k - bns->anns[seqid].offset; - } - ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10); - ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1); - if (p->l == 0) { - { // estimate mapping quality - float c = 1.0; - int subo = p->G2 > opt->t? 
p->G2 : opt->t; - if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; - if (p->n_seeds < 2) c *= .2; - qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); - if (qual > 250) qual = 250; - if (qual < 0) qual = 0; - if (p->flag&1) qual = 0; - } - ksprintf(&str, "\t%d\t", qual); - for (k = 0; k < b->n_cigar[i]; ++k) - ksprintf(&str, "%d%c", b->cigar[i][k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[b->cigar[i][k]&0xf]); + bsw2aux_t *q = b->aux + i; + int j, beg, end, type = 0; + // print mandatory fields before SEQ + if (q->cigar == 0) q->flag |= 0x4; + ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); + ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); + if (p->l == 0 && q->cigar) { // not a repetitive hit + ksprintf(&str, "\t%d\t", q->pqual); + for (k = 0; k < q->n_cigar; ++k) + ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); } else ksprintf(&str, "\t0\t*"); - ksprintf(&str, "\t*\t0\t0\t"); + if (!is_pe) kputs("\t*\t0\t0\t", &str); + else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); + // get the sequence begin and end beg = 0; end = ks->l; - if (opt->hard_clip) { - if ((b->cigar[i][0]&0xf) == 4) beg += b->cigar[i][0]>>4; - if ((b->cigar[i][b->n_cigar[i]-1]&0xf) == 4) end -= b->cigar[i][b->n_cigar[i]-1]>>4; + if (opt->hard_clip && q->cigar) { + if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; + if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; } for (j = beg; j < end; ++j) { if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str); else kputc(ks->seq[j], &str); } + // print base quality if present if (ks->qual) { kputc('\t', &str); for (j = beg; j < end; ++j) { if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str); else kputc(ks->qual[j], &str); } - } else ksprintf(&str, "\t*"); - ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn); + } else kputs("\t*", &str); + // print optional tags + ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm); + if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); + if (p->flag&BSW2_FLAG_MATESW) type |= 1; + if (p->flag&BSW2_FLAG_TANDEM) type |= 2; + if (type) ksprintf(&str, "\tXT:i:%d", type); + if (opt->cpy_cmt && ks->comment) { + int l = strlen(ks->comment); + if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') { + kputc('\t', &str); kputs(ks->comment, &str); + } + } kputc('\n', &str); } ks->sam = str.s; @@ -463,39 +542,41 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks free(ks->name); ks->name = 0; } +static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen) +{ + double ll = log(qlen); + int i, k; + *dst = *src; + if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499); + // set band width: the query length sets a boundary on the maximum band width + k = (qlen * dst->a - 2 * dst->q) / (2 * 
dst->r + dst->a); + i = (qlen * dst->a - dst->a - dst->t) / dst->r; + if (k > i) k = i; + if (k < 1) k = 1; // I do not know if k==0 causes troubles + dst->bw = src->bw < k? src->bw : k; +} + /* Core routine to align reads in _seq. It is separated from * process_seqs() to realize multi-threading */ -static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) +static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int x; - bsw2opt_t opt = *_opt; + bsw2opt_t opt; bsw2global_t *pool = bsw2_global_init(); + bwtsw2_t **buf; + buf = calloc(_seq->n, sizeof(void*)); for (x = 0; x < _seq->n; ++x) { bsw2seq1_t *p = _seq->seq + x; uint8_t *seq[2], *rseq[2]; int i, l, k; bwtsw2_t *b[2]; l = p->l; - -#ifdef HAVE_PTHREAD - if (x % _opt->n_threads != tid) continue; -#endif - - // set opt->t - opt.t = _opt->t; - if (opt.t < log(l) * opt.coef) opt.t = (int)(log(l) * opt.coef + .499); + update_opt(&opt, _opt, p->l); if (pool->max_l < l) { // then enlarge working space for aln_extend_core() int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; pool->max_l = l; pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); } - // set opt->bw - opt.bw = _opt->bw; - k = (l * opt.a - 2 * opt.q) / (2 * opt.r + opt.a); - i = (l * opt.a - opt.a - opt.t) / opt.r; - if (k > i) k = i; - if (k < 1) k = 1; // I do not know if k==0 causes troubles - opt.bw = _opt->bw < k? 
_opt->bw : k; // set seq[2] and rseq[2] seq[0] = calloc(l * 4, 1); seq[1] = seq[0] + l; @@ -510,7 +591,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases - print_hits(bns, &opt, p, 0); + buf[x] = calloc(1, sizeof(bwtsw2_t)); free(seq[0]); continue; } // alignment @@ -532,43 +613,65 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bsw2_resolve_query_overlaps(b[0], opt.mask_level); } else b[1] = 0; // generate CIGAR and print SAM - gen_cigar(&opt, l, seq, pac, b[0]); - print_hits(bns, &opt, p, b[0]); + buf[x] = bsw2_dup_no_cigar(b[0]); // free free(seq[0]); bsw2_destroy(b[0]); } + if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf); + for (x = 0; x < _seq->n; ++x) { + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2]; + int i; + seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; + for (i = 0; i < p->l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) c = (int)(drand48() * 4); + seq[0][i] = c; + seq[1][p->l-1-i] = 3 - c; + } + update_opt(&opt, _opt, p->l); + write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name); + free(seq[0]); + } + for (x = 0; x < _seq->n; ++x) { + if (is_pe) update_mate_aux(buf[x], buf[x^1]); + print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]); + } + for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); + free(buf); bsw2_global_destroy(pool); } #ifdef HAVE_PTHREAD typedef struct { - int tid; + int tid, is_pe; bsw2seq_t *_seq; const bsw2opt_t *_opt; const bntseq_t *bns; uint8_t *pac; - bwt_t *target; + const bwt_t *target; } thread_aux_t; /* another interface to bsw2_aln_core() to facilitate pthread_create() */ static void *worker(void *data) { thread_aux_t *p = (thread_aux_t*)data; - bsw2_aln_core(p->tid, p->_seq, p->_opt, p->bns, p->pac, p->target); + bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); return 0; } #endif /* process sequences 
stored in _seq, generate SAM lines for these * sequences and reset _seq afterwards. */ -static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) +static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int i; + is_pe = is_pe? 1 : 0; #ifdef HAVE_PTHREAD if (opt->n_threads <= 1) { - bsw2_aln_core(0, _seq, opt, bns, pac, target); + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); } else { pthread_t *tid; pthread_attr_t attr; @@ -580,72 +683,94 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; - p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; + p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; p->pac = pac; p->target = target; - pthread_create(&tid[j], &attr, worker, p); + p->_seq = calloc(1, sizeof(bsw2seq_t)); + p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; + p->_seq->n = 0; + p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); + } + for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + p->seq[p->n++] = _seq->seq[i]; } + for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]); for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0; + for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + _seq->seq[i] = p->seq[p->n++]; + } + for (j = 0; j < opt->n_threads; ++j) { + thread_aux_t *p = data + j; + free(p->_seq->seq); + free(p->_seq); + } free(data); free(tid); } #else - bsw2_aln_core(0, _seq, opt, bns, pac, target); + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); #endif // print and 
reset for (i = 0; i < _seq->n; ++i) { bsw2seq1_t *p = _seq->seq + i; - if (p->sam) printf("%s", p->sam); + if (p->sam) err_printf("%s", p->sam); free(p->name); free(p->seq); free(p->qual); free(p->sam); p->tid = -1; p->l = 0; p->name = p->seq = p->qual = p->sam = 0; } - fflush(stdout); + err_fflush(stdout); _seq->n = 0; } -void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn) +void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) { - gzFile fp; - kseq_t *ks; - int l, size = 0; + gzFile fp, fp2; + kseq_t *ks, *ks2; + int l, is_pe = 0, i, n; uint8_t *pac; bsw2seq_t *_seq; + bseq1_t *bseq; pac = calloc(bns->l_pac/4+1, 1); - if (pac == 0) { - fprintf(stderr, "[bsw2_aln] insufficient memory!\n"); - return; - } for (l = 0; l < bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); fp = xzopen(fn, "r"); ks = kseq_init(fp); _seq = calloc(1, sizeof(bsw2seq_t)); - while ((l = kseq_read(ks)) >= 0) { - bsw2seq1_t *p; - if (_seq->n == _seq->max) { - _seq->max = _seq->max? _seq->max<<1 : 1024; + if (fn2) { + fp2 = xzopen(fn2, "r"); + ks2 = kseq_init(fp2); + is_pe = 1; + } else fp2 = 0, ks2 = 0, is_pe = 0; + while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int size = 0; + if (n > _seq->max) { + _seq->max = n; + kroundup32(_seq->max); _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } - p = &_seq->seq[_seq->n++]; - p->tid = -1; - p->l = l; - p->name = strdup(ks->name.s); - p->seq = strdup(ks->seq.s); - p->qual = ks->qual.l? 
strdup(ks->qual.s) : 0; - p->sam = 0; - size += l; - if (size > opt->chunk_size) { - fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target); - size = 0; + _seq->n = n; + for (i = 0; i < n; ++i) { + bseq1_t *b = &bseq[i]; + bsw2seq1_t *p = &_seq->seq[i]; + p->tid = -1; p->l = b->l_seq; + p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; + size += p->l; } + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); + free(bseq); + process_seqs(_seq, opt, bns, pac, target, is_pe); } - fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target); + // free + free(pac); free(_seq->seq); free(_seq); kseq_destroy(ks); - gzclose(fp); - free(pac); + err_gzclose(fp); + if (fn2) { + kseq_destroy(ks2); + err_gzclose(fp2); + } } diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c index c734657..ade77e7 100644 --- a/bwtsw2_chain.c +++ b/bwtsw2_chain.c @@ -1,6 +1,10 @@ #include #include "bwtsw2.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef struct { uint32_t tbeg, tend; int qbeg, qend; @@ -23,15 +27,15 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t hsaip_t *q = chain + k; int x = p->qbeg - q->qbeg; // always positive int y = p->tbeg - q->tbeg; - if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) { + if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained if (p->qend > q->qend) q->qend = p->qend; if (p->tend > q->tend) q->tend = p->tend; ++q->chain; p->chain = shift + k; break; - } + } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains } - if (k < 0) { + if (k < 0) { // not added to any previous chains chain[m] = *p; chain[m].chain = 1; chain[m].idx = p->chain = shift + m; @@ -44,7 +48,7 @@ static int chaining(const 
bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) { hsaip_t *z[2], *chain[2]; - int i, j, k, n[2], m[2]; + int i, j, k, n[2], m[2], thres = opt->t_seeds * 2; char *flag; // initialization n[0] = b[0]->n; n[1] = b[1]->n; @@ -71,6 +75,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) int tmp = p->qbeg; p->qbeg = len - p->qend; p->qend = len - tmp; } + //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); // filtering flag = calloc(m[0] + m[1], 1); ks_introsort(hsaip, m[0] + m[1], chain[0]); @@ -79,7 +84,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) for (j = 0; j < k; ++j) { hsaip_t *q = chain[0] + j; if (flag[q->idx]) continue; - if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) { + if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) { flag[p->idx] = 1; break; } diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 398a276..1119601 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -8,6 +8,10 @@ #include "bwt.h" #include "kvec.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef struct { bwtint_t k, l; } qintv_t; @@ -327,6 +331,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int } if (!compatible) { p->G = 0; + if (q->G2 < p->G2) q->G2 = p->G2; break; } } diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 3654372..40a9e0a 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -6,18 +6,17 @@ #include "bwt.h" #include "bwtsw2.h" #include "utils.h" +#include "bwa.h" int bwa_bwtsw2(int argc, char *argv[]) { bsw2opt_t *opt; - bwt_t *target; - char buf[1024]; - bntseq_t *bns; + bwaidx_t *idx; int c; opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:y:s:c:N:Hf:")) >= 0) { + while ((c = getopt(argc, argv, 
"q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -27,53 +26,53 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'T': opt->t = atoi(optarg); break; case 't': opt->n_threads = atoi(optarg); break; case 'z': opt->z = atoi(optarg); break; - case 'y': opt->yita = atof(optarg); break; case 's': opt->is = atoi(optarg); break; case 'm': opt->mask_level = atof(optarg); break; case 'c': opt->coef = atof(optarg); break; case 'N': opt->t_seeds = atoi(optarg); break; + case 'M': opt->multi_2nd = 1; break; case 'H': opt->hard_clip = 1; break; case 'f': xreopen(optarg, "w", stdout); break; + case 'I': opt->max_ins = atoi(optarg); break; + case 'S': opt->skip_sw = 1; break; + case 'C': opt->cpy_cmt = 1; break; + case 'G': opt->max_chain_gap = atoi(optarg); break; + default: return 1; } } opt->qr = opt->q + opt->r; if (optind + 2 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa bwasw [options] \n\n"); + fprintf(stderr, "Usage: bwa bwasw [options] [query2.fa]\n\n"); fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); -// fprintf(stderr, " -y FLOAT error recurrence coef. 
(4..16) [%.1f]\n", opt->yita); - fprintf(stderr, "\n"); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, "\n"); fprintf(stderr, " -w INT band width [%d]\n", opt->bw); fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); fprintf(stderr, "\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); + fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n"); + fprintf(stderr, " -M mark multi-part alignments as secondary\n"); + fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); + fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); + fprintf(stderr, "\n"); fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); - fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); - fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); - fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds); fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); - fprintf(stderr, " -H in SAM output, use hard clipping rather than soft\n"); - fprintf(stderr, " -f FILE file to output results to instead of stdout\n\n"); + fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); + fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); + fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds); + fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap); + fprintf(stderr, "\n"); fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); fprintf(stderr, " BACs, the default setting usually works well. 
For the current PacBio\n"); fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n"); fprintf(stderr, " increase '-z' for better sensitivity.\n"); fprintf(stderr, "\n"); - if (0) { - double c, theta, eps, delta; - c = opt->a / log(opt->yita); - theta = exp(-opt->b / c) / opt->yita; - eps = exp(-opt->q / c); - delta = exp(-opt->r / c); - fprintf(stderr, "mismatch: %lf, gap_open: %lf, gap_ext: %lf\n\n", - theta, eps, delta); - } return 1; } @@ -81,14 +80,9 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - strcpy(buf, argv[optind]); target = bwt_restore_bwt(strcat(buf, ".bwt")); - strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target); - bns = bns_restore(argv[optind]); - - bsw2_aln(opt, bns, target, argv[optind+1]); - - bns_destroy(bns); - bwt_destroy(target); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; + bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? 
argv[optind+2] : 0); + bwa_idx_destroy(idx); free(opt); return 0; diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c new file mode 100644 index 0000000..24905df --- /dev/null +++ b/bwtsw2_pair.c @@ -0,0 +1,268 @@ +#include +#include +#include +#include +#include "utils.h" +#include "bwt.h" +#include "bntseq.h" +#include "bwtsw2.h" +#include "kstring.h" +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define MIN_RATIO 0.8 +#define OUTLIER_BOUND 2.0 +#define MAX_STDDEV 4.0 +#define EXT_STDDEV 4.0 + +typedef struct { + int low, high, failed; + double avg, std; +} bsw2pestat_t; + +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) +{ + int i, k, x, p25, p50, p75, tmp, max_len = 0; + uint64_t *isize; + bsw2pestat_t r; + + memset(&r, 0, sizeof(bsw2pestat_t)); + isize = calloc(n, 8); + for (i = k = 0; i < n; i += 2) { + bsw2hit_t *t[2]; + int l; + if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits + t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; + if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough + if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough + l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; + if (l >= max_ins) continue; // skip pairs with excessively large insert + max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; + max_len = max_len > t[1]->end - t[1]->beg? 
max_len : t[1]->end - t[1]->beg; + isize[k++] = l; + } + ks_introsort_64(k, isize); + p25 = isize[(int)(.25 * k + .499)]; + p50 = isize[(int)(.50 * k + .499)]; + p75 = isize[(int)(.75 * k + .499)]; + ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); + if (k < 8) { + ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__); + free(isize); + r.failed = 1; + return r; + } + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.low < 1) r.low = 1; + r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (r.low > r.high) { + ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__); + free(isize); + r.failed = 1; + return r; + } + ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); + for (i = x = 0, r.avg = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.avg += isize[i], ++x; + r.avg /= x; + for (i = 0, r.std = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.std += (isize[i] - r.avg) * (isize[i] - r.avg); + r.std = sqrt(r.std / x); + ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); + tmp = (int)(p25 - 3. * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.low < 1) r.low = 1; + r.high = (int)(p75 + 3. * (p75 - p25) + .499); + if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499); + r.low = tmp > max_len? 
tmp : max_len; + if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499); + ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + free(isize); + return r; +} + +typedef struct { + int n_cigar, beg, end, len; + int64_t pos; + uint32_t *cigar; +} pairaux_t; + +extern unsigned char nst_nt4_table[256]; + +void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25]) +{ + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); + int64_t k, beg, end; + uint8_t *seq, *ref; + int i; + // compute the region start and end + a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 + if (h->is_rev == 0) { + beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); + if (beg < h->k) beg = h->k; + end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); + a->is_rev = 1; a->flag |= 16; + } else { + beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); + end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); + if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg); + a->is_rev = 0; + } + if (beg < 1) beg = 1; + if (end > l_pac) end = l_pac; + if (end - beg < l_mseq) return; + // generate the sequence + seq = malloc(l_mseq + (end - beg)); + ref = seq + l_mseq; + for (k = beg; k < end; ++k) + ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; + if (h->is_rev == 0) { + for (i = 0; i < l_mseq; ++i) { // on the reverse strand + int c = nst_nt4_table[(int)mseq[i]]; + seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; + } + } else { + for (i = 0; i < l_mseq; ++i) // on the forward strand + seq[i] = nst_nt4_table[(int)mseq[i]]; + } + { + int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0) | opt->t; + kswr_t aln; + aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); + a->G = aln.score; + a->G2 = aln.score2; + if (a->G < opt->t) a->G = 0; + if (a->G2 < opt->t) a->G2 = 0; + if (a->G2) a->flag |= BSW2_FLAG_TANDEM; + a->k = beg + aln.tb; + a->len = aln.te - aln.tb + 1; + a->beg = aln.qb; + a->end = aln.qe + 1; + /* + printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n'); + printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n'); + printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len); + */ + } + if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; + free(seq); +} + +void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +{ + extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); + bsw2pestat_t pes; + int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0; + int8_t g_mat[25]; + kstring_t msg; + memset(&msg, 0, sizeof(kstring_t)); + pes = bsw2_stat(n, hits, &msg, opt->max_ins); + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + g_mat[k++] = i == j? 
opt->a : -opt->b; + g_mat[k++] = 0; + } + for (i = 0; i < n; i += 2) { + bsw2hit_t a[2]; + memset(&a, 0, sizeof(bsw2hit_t) * 2); + a[0].flag = 1<<6; a[1].flag = 1<<7; + for (j = 0; j < 2; ++j) { // set the read1/2 flag + if (hits[i+j] == 0) continue; + for (k = 0; k < hits[i+j]->n; ++k) { + bsw2hit_t *p = &hits[i+j]->hits[k]; + p->flag |= 1<<(6+j); + } + } + if (pes.failed) continue; + if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N + if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit + if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit + if (!opt->skip_sw) { + if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); + if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); + } // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0 + // the following enumerate all possibilities. It is tedious but necessary... + if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not; + bwtsw2_t *p[2]; + int which; + if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1; + else p[0] = hits[i+1], p[1] = hits[i], which = 0; + if (a[which].G == 0) continue; + a[which].flag |= BSW2_FLAG_RESCUED; + if (p[1]->max == 0) { + p[1]->max = 1; + p[1]->hits = malloc(sizeof(bsw2hit_t)); + } + p[1]->hits[0] = a[which]; + p[1]->n = 1; + p[0]->hits[0].flag |= 2; + p[1]->hits[0].flag |= 2; + ++n_rescued; + } else { // then both ends mapped + int is_fixed = 0; + //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end); + for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score + bsw2hit_t *p = &hits[i+j]->hits[0]; + if (p->G < a[j].G) { // the orginal mapping is suboptimal + a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM? 
+ *p = a[j]; + ++n_fixed; + is_fixed = 1; + } else if (p->k != a[j].k && p->G2 < a[j].G) { + p->G2 = a[j].G; + } else if (p->k == a[j].k && p->G2 < a[j].G2) { + p->G2 = a[j].G2; + } + } + if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved + for (j = 0; j < 2; ++j) + hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM); + } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match + for (j = 0; j < 2; ++j) { + hits[i+j]->hits[0].flag |= 2; + if (hits[i+j]->hits[0].k != a[j].k) + hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; + } + } else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end + if (a[0].G && a[1].G) { // now we have two "proper pairs" + int G[2]; + double diff; + G[0] = hits[i]->hits[0].G + a[1].G; + G[1] = hits[i+1]->hits[0].G + a[0].G; + diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.); + if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0; + } + if (a[0].G == 0 || a[1].G == 0) { // one proper pair only + bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved + int which, isize; + double dev, diff; + if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0; + else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1; + isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k; + dev = fabs(isize - pes.avg) / pes.std; + diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0; + if (diff < dev * 2.) 
{ // then move (heuristic) + a[which].G2 = a[which].G; + p[1][0] = a[which]; + p[1]->flag |= BSW2_FLAG_MOVED | 2; + p[0]->flag |= 2; + ++n_moved; + } + } + } else if (is_fixed) { + hits[i+0]->hits[0].flag |= 2; + hits[i+1]->hits[0].flag |= 2; + } + } + } + ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved); + fputs(msg.s, stderr); + free(msg.s); +} diff --git a/example.c b/example.c new file mode 100644 index 0000000..a6c9bdd --- /dev/null +++ b/example.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include "bwamem.h" +#include "kseq.h" // for the FASTA/Q parser +KSEQ_DECLARE(gzFile) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +int main(int argc, char *argv[]) +{ + bwaidx_t *idx; + gzFile fp; + kseq_t *ks; + mem_opt_t *opt; + + if (argc < 3) { + fprintf(stderr, "Usage: bwamem-lite \n"); + return 1; + } + + idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index + if (NULL == idx) { + fprintf(stderr, "Index load failed.\n"); + exit(EXIT_FAILURE); + } + fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp) { + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[2], "-") ? argv[2] : "stdin", + errno ? 
strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks = kseq_init(fp); // initialize the FASTA/Q parser + opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values + + while (kseq_read(ks) >= 0) { // read one sequence + mem_alnreg_v ar; + int i, k; + ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits + for (i = 0; i < ar.n; ++i) { // traverse each hit + mem_aln_t a; + if (ar.a[i].secondary >= 0) continue; // skip secondary alignments + a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + // print alignment + err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); + for (k = 0; k < a.n_cigar; ++k) // print CIGAR + err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); + err_printf("\t%d\n", a.NM); // print edit distance + free(a.cigar); // don't forget to deallocate CIGAR + } + free(ar.a); // and deallocate the hit list + } + + free(opt); + kseq_destroy(ks); + err_gzclose(fp); + bwa_idx_destroy(idx); + return 0; +} diff --git a/fastmap.c b/fastmap.c index 585a043..9bdacb4 100644 --- a/fastmap.c +++ b/fastmap.c @@ -2,81 +2,327 @@ #include #include #include -#include "bntseq.h" -#include "bwt.h" +#include +#include +#include +#include "bwa.h" +#include "bwamem.h" #include "kvec.h" +#include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +#include "utils.h" +KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; +void *kopen(const char *fn, int *_fd); +int kclose(void *a); + +static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) +{ + if (opt0->a) { // matching score is changed + if (!opt0->b) opt->b *= opt->a; + if (!opt0->T) opt->T *= opt->a; + if (!opt0->o_del) opt->o_del *= opt->a; + if (!opt0->e_del) opt->e_del *= opt->a; + if (!opt0->o_ins) opt->o_ins *= opt->a; + if (!opt0->e_ins) opt->e_ins *= opt->a; + if (!opt0->zdrop) opt->zdrop *= 
opt->a; + if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a; + if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a; + if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a; + } +} + +int main_mem(int argc, char *argv[]) +{ + mem_opt_t *opt, opt0; + int fd, fd2, i, c, n, copy_comment = 0; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; + bseq1_t *seqs; + bwaidx_t *idx; + char *p, *rg_line = 0; + const char *mode = 0; + void *ko = 0, *ko2 = 0; + int64_t n_processed = 0; + mem_pestat_t pes[4], *pes0 = 0; + + memset(pes, 0, 4 * sizeof(mem_pestat_t)); + for (i = 0; i < 4; ++i) pes[i].failed = 1; + + opt = mem_opt_init(); + memset(&opt0, 0, sizeof(mem_opt_t)); + while ((c = getopt(argc, argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:")) >= 0) { + if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; + else if (c == 'x') mode = optarg; + else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; + else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; + else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1; + else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; + else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; + else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; + else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; + else if (c == 'a') opt->flag |= MEM_F_ALL; + else if (c == 'p') opt->flag |= MEM_F_PE; + else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; + else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; + else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; + else if (c == 'F') opt->flag |= MEM_F_ALN_REG; + else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; + else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; + else if (c == 'v') bwa_verbose = atoi(optarg); + else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; + else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; + else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; + else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; + else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; + else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; + else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; + else if (c == 'C') copy_comment = 1; + else if (c == 'Q') { + opt0.mapQ_coef_len = 1; + opt->mapQ_coef_len = atoi(optarg); + opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? 
log(opt->mapQ_coef_len) : 0; + } else if (c == 'O') { + opt0.o_del = opt0.o_ins = 1; + opt->o_del = opt->o_ins = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->o_ins = strtol(p+1, &p, 10); + } else if (c == 'E') { + opt0.e_del = opt0.e_ins = 1; + opt->e_del = opt->e_ins = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->e_ins = strtol(p+1, &p, 10); + } else if (c == 'L') { + opt0.pen_clip5 = opt0.pen_clip3 = 1; + opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->pen_clip3 = strtol(p+1, &p, 10); + } else if (c == 'R') { + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak + } else if (c == 'I') { // specify the insert size distribution + pes0 = pes; + pes[1].failed = 0; + pes[1].avg = strtod(optarg, &p); + pes[1].std = pes[1].avg * .1; + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].std = strtod(p+1, &p); + pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499); + pes[1].low = (int)(pes[1].avg - 4. 
* pes[1].std + .499); + if (pes[1].low < 1) pes[1].low = 1; + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].high = (int)(strtod(p+1, &p) + .499); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].low = (int)(strtod(p+1, &p) + .499); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", + __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); + } + else return 1; + } + if (opt->n_threads < 1) opt->n_threads = 1; + if (optind + 1 >= argc || optind + 3 < argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); + fprintf(stderr, "Algorithm options:\n\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); + fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); +// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); + fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n"); + fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); + fprintf(stderr, " -S skip mate rescue\n"); + fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); + fprintf(stderr, " -e discard full-length exact matches\n"); + fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a); + fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", 
opt->b); + fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); + fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); + fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); + fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); + fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); + fprintf(stderr, " pacbio: -k17 -W40 -w200 -c1000 -r10 -A2 -B7 -O2 -E1 -L0\n"); + fprintf(stderr, " pbread: -k13 -W30 -w100 -c1000 -r10 -A2 -B5 -O2 -E1 -N20 -FeaD.01\n"); + fprintf(stderr, "\nInput/output options:\n\n"); + fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); + fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); + fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); + fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); + fprintf(stderr, " FR orientation only. 
[inferred]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + if (mode) { + if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { + if (!opt0.a) opt->a = 2, opt0.a = 1; + update_a(opt, &opt0); + if (!opt0.o_del) opt->o_del = 2; + if (!opt0.e_del) opt->e_del = 1; + if (!opt0.o_ins) opt->o_ins = 2; + if (!opt0.e_ins) opt->e_ins = 1; + if (!opt0.max_occ) opt->max_occ = 1000; + if (opt0.split_factor == 0.) opt->split_factor = 10.; + if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { + opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; + if (!opt0.b) opt->b = 5; + if (!opt0.w) opt->w = 100; + if (!opt0.min_seed_len) opt->min_seed_len = 13; + if (!opt0.min_chain_weight) opt->min_chain_weight = 30; + if (!opt0.max_chain_extend) opt->max_chain_extend = 20; + if (opt0.drop_ratio == 0.) 
opt->drop_ratio = .01; + } else { + if (!opt0.b) opt->b = 7; + if (!opt0.w) opt->w = 200; + if (!opt0.min_seed_len) opt->min_seed_len = 17; + if (!opt0.min_chain_weight) opt->min_chain_weight = 40; + if (!opt0.pen_clip5) opt->pen_clip5 = 0; + if (!opt0.pen_clip3) opt->pen_clip3 = 0; + } + } else { + fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode); + return 1; // FIXME memory leak + } + } else update_a(opt, &opt0); +// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF + bwa_fill_scmat(opt->a, opt->b, opt->mat); + + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + + ko = kopen(argv[optind + 1], &fd); + if (ko == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); + return 1; + } + fp = gzdopen(fd, "r"); + ks = kseq_init(fp); + if (optind + 2 < argc) { + if (opt->flag&MEM_F_PE) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); + } else { + ko2 = kopen(argv[optind + 2], &fd2); + if (ko2 == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]); + return 1; + } + fp2 = gzdopen(fd2, "r"); + ks2 = kseq_init(fp2); + opt->flag |= MEM_F_PE; + } + } + if (!(opt->flag & MEM_F_ALN_REG)) + bwa_print_sam_hdr(idx->bns, rg_line); + while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int64_t size = 0; + if ((opt->flag & MEM_F_PE) && (n&1) == 1) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__); + n = n>>1<<1; + } + if (!copy_comment) + for (i = 0; i < n; ++i) { + free(seqs[i].comment); seqs[i].comment = 0; + } + for (i = 0; i < n; ++i) size += seqs[i].l_seq; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); + 
mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); + n_processed += n; + for (i = 0; i < n; ++i) { + if (seqs[i].sam) err_fputs(seqs[i].sam, stdout); + free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); + } + free(seqs); + } + + free(opt); + bwa_idx_destroy(idx); + kseq_destroy(ks); + err_gzclose(fp); kclose(ko); + if (ks2) { + kseq_destroy(ks2); + err_gzclose(fp2); kclose(ko2); + } + return 0; +} + int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; kseq_t *seq; bwtint_t k; gzFile fp; - bwt_t *bwt; - bntseq_t *bns; - bwtintv_v a[3], mem, *tvec[3]; + smem_i *itr; + const bwtintv_v *a; + bwaidx_t *idx; - while ((c = getopt(argc, argv, "w:l:")) >= 0) { + while ((c = getopt(argc, argv, "w:l:p")) >= 0) { switch (c) { + case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; + default: return 1; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); return 1; } - fp = gzopen(argv[optind + 1], "r"); + fp = xzopen(argv[optind + 1], "r"); seq = kseq_init(fp); - { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - } - for (i = 0; i < 3; ++i) { // initiate the temporary array - kv_init(a[i]); - tvec[i] = &a[i]; - } - kv_init(mem); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; + itr = smem_itr_init(idx->bwt); while (kseq_read(seq) >= 0) { + err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); + if 
(print_seq) { + err_putchar('\t'); + err_puts(seq->seq.s); + } else err_putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); - printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l); - for (i = 0; i < mem.n; ++i) { - bwtintv_t *p = &mem.a[i]; - if ((uint32_t)p->info - (p->info>>32) < min_len) continue; - printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); - if (p->x[2] <= min_iwidth) { - for (k = 0; k < p->x[2]; ++k) { - bwtint_t pos; - int len, is_rev, ref_id; - len = (uint32_t)p->info - (p->info>>32); - pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); - if (is_rev) pos -= len - 1; - bns_cnt_ambi(bns, pos, len, &ref_id); - printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); - } + smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); + while ((a = smem_next(itr)) != 0) { + for (i = 0; i < a->n; ++i) { + bwtintv_t *p = &a->a[i]; + if ((uint32_t)p->info - (p->info>>32) < min_len) continue; + err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + if (p->x[2] <= min_iwidth) { + for (k = 0; k < p->x[2]; ++k) { + bwtint_t pos; + int len, is_rev, ref_id; + len = (uint32_t)p->info - (p->info>>32); + pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); + if (is_rev) pos -= len - 1; + bns_cnt_ambi(idx->bns, pos, len, &ref_id); + err_printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); + } + } else err_puts("\t*"); + err_putchar('\n'); } - putchar('\n'); } - puts("//"); + err_puts("//"); } - free(mem.a); - for (i = 0; i < 3; ++i) free(a[i].a); - bns_destroy(bns); - bwt_destroy(bwt); + smem_itr_destroy(itr); + bwa_idx_destroy(idx); kseq_destroy(seq); - gzclose(fp); + err_gzclose(fp); return 0; } diff --git a/is.c b/is.c index 9e50faf..46f1772 100644 --- a/is.c +++ b/is.c 
@@ -26,6 +26,10 @@ #include +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef unsigned char ubyte_t; #define chr(i) (cs == sizeof(int) ? ((const int *)T)[i]:((const unsigned char *)T)[i]) @@ -205,7 +209,8 @@ int is_bwt(ubyte_t *T, int n) { int *SA, i, primary = 0; SA = (int*)calloc(n+1, sizeof(int)); - is_sa(T, SA, n); + + if (is_sa(T, SA, n)) { free(SA); return -1; } /* release the suffix array before reporting failure */ for (i = 0; i <= n; ++i) { if (SA[i] == 0) primary = i; diff --git a/kbtree.h b/kbtree.h new file mode 100644 index 0000000..2b76953 --- /dev/null +++ b/kbtree.h @@ -0,0 +1,388 @@ +/*- + * Copyright 1997-1999, 2001, John-Mark Gurney. + * 2008-2009, Attractive Chaos + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ */ + +#ifndef __AC_KBTREE_H +#define __AC_KBTREE_H + +#include +#include +#include + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +typedef struct { + int32_t is_internal:1, n:31; +} kbnode_t; + +#define __KB_KEY(type, x) ((type*)((char*)x + 4)) +#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) + +#define __KB_TREE_T(name) \ + typedef struct { \ + kbnode_t *root; \ + int off_key, off_ptr, ilen, elen; \ + int n, t; \ + int n_keys, n_nodes; \ + } kbtree_##name##_t; + +#define __KB_INIT(name, key_t) \ + kbtree_##name##_t *kb_init_##name(int size) \ + { \ + kbtree_##name##_t *b; \ + b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ + b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ + if (b->t < 2) { \ + free(b); return 0; \ + } \ + b->n = 2 * b->t - 1; \ + b->off_ptr = 4 + b->n * sizeof(key_t); \ + b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ + b->elen = (b->off_ptr + 3) >> 2 << 2; \ + b->root = (kbnode_t*)calloc(1, b->ilen); \ + ++b->n_nodes; \ + return b; \ + } + +#define __kb_destroy(b) do { \ + int i, max = 8; \ + kbnode_t *x, **top, **stack = 0; \ + if (b) { \ + top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ + *top++ = (b)->root; \ + while (top != stack) { \ + x = *--top; \ + if (x->is_internal == 0) { free(x); continue; } \ + for (i = 0; i <= x->n; ++i) \ + if (__KB_PTR(b, x)[i]) { \ + if (top - stack == max) { \ + max <<= 1; \ + stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ + top = stack + (max>>1); \ + } \ + *top++ = __KB_PTR(b, x)[i]; \ + } \ + free(x); \ + } \ + } \ + free(b); free(stack); \ + } while (0) + +#define __kb_get_first(key_t, b, ret) do { \ + kbnode_t *__x = (b)->root; \ + while (__KB_PTR(b, __x)[0] != 0) \ + __x = __KB_PTR(b, __x)[0]; \ + (ret) = __KB_KEY(key_t, __x)[0]; \ + } while (0) + +#define __KB_GET_AUX0(name, key_t, __cmp) \ + static inline int __kb_get_aux_##name(const kbnode_t * 
__restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin, end, n = x->n >> 1; \ + if (x->n == 0) return -1; \ + if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ + begin = 0; end = n; \ + } else { begin = n; end = x->n - 1; } \ + rr = r? r : &tr; \ + n = end; \ + while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ + return n; \ + } + +#define __KB_GET_AUX1(name, key_t, __cmp) \ + static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin = 0, end = x->n; \ + if (x->n == 0) return -1; \ + rr = r? r : &tr; \ + while (begin < end) { \ + int mid = (begin + end) >> 1; \ + if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ + else end = mid; \ + } \ + if (begin == x->n) { *rr = 1; return x->n - 1; } \ + if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ + return begin; \ + } + +#define __KB_GET(name, key_t) \ + static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ + if (x->is_internal == 0) return 0; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + return 0; \ + } \ + static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_getp_##name(b, &k); \ + } + +#define __KB_INTERVAL(name, key_t) \ + static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + *lower = *upper = 0; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) { \ + *lower = *upper = &__KB_KEY(key_t, x)[i]; \ + return; \ + } \ + if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ + if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ + if (x->is_internal == 0) return; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + } \ + static 
inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ + { \ + kb_intervalp_##name(b, &k, lower, upper); \ + } + +#define __KB_PUT(name, key_t, __cmp) \ + /* x must be an internal node */ \ + static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ + { \ + kbnode_t *z; \ + z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \ + ++b->n_nodes; \ + z->is_internal = y->is_internal; \ + z->n = b->t - 1; \ + memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ + if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ + y->n = b->t - 1; \ + memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ + __KB_PTR(b, x)[i + 1] = z; \ + memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ + ++x->n; \ + } \ + static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ + { \ + int i = x->n - 1; \ + if (x->is_internal == 0) { \ + i = __kb_getp_aux_##name(x, k, 0); \ + if (i != x->n - 1) \ + memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + __KB_KEY(key_t, x)[i + 1] = *k; \ + ++x->n; \ + } else { \ + i = __kb_getp_aux_##name(x, k, 0) + 1; \ + if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ + __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ + if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ + } \ + __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ + } \ + } \ + static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *r, *s; \ + ++b->n_keys; \ + r = b->root; \ + if (r->n == 2 * b->t - 1) { \ + ++b->n_nodes; \ + s = (kbnode_t*)calloc(1, b->ilen); \ + b->root = s; s->is_internal = 1; s->n = 0; \ + __KB_PTR(b, s)[0] = r; \ + __kb_split_##name(b, s, 0, r); \ + r = s; \ + } \ + 
__kb_putp_aux_##name(b, r, k); \ + } \ + static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + kb_putp_##name(b, &k); \ + } + + +#define __KB_DEL(name, key_t) \ + static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ + { \ + int yn, zn, i, r = 0; \ + kbnode_t *xp, *y, *z; \ + key_t kp; \ + if (x == 0) return *k; \ + if (s) { /* s can only be 0, 1 or 2 */ \ + r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ + i = s == 1? x->n - 1 : -1; \ + } else i = __kb_getp_aux_##name(x, k, &r); \ + if (x->is_internal == 0) { \ + if (s == 2) ++i; \ + kp = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + --x->n; \ + return kp; \ + } \ + if (r == 0) { \ + if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ + return kp; \ + } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i + 1]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ + return kp; \ + } else if (yn == b->t - 1 && zn == b->t - 1) { \ + y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ + __KB_KEY(key_t, y)[y->n++] = *k; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ + y->n += z->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(z); \ + return __kb_delp_aux_##name(b, y, k, s); \ + } \ + } \ + ++i; \ + if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ + if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ + memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + 
if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ + __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ + if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ + --y->n; ++xp->n; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ + if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ + --y->n; \ + memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ + } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ + __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + y->n += xp->n; \ + memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ + --x->n; \ + free(xp); \ + xp = y; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ + xp->n += y->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(y); \ + } \ + } \ + return __kb_delp_aux_##name(b, xp, k, s); \ + } \ + static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + 
kbnode_t *x; \ + key_t ret; \ + ret = __kb_delp_aux_##name(b, b->root, k, 0); \ + --b->n_keys; \ + if (b->root->n == 0 && b->root->is_internal) { \ + --b->n_nodes; \ + x = b->root; \ + b->root = __KB_PTR(b, x)[0]; \ + free(x); \ + } \ + return ret; \ + } \ + static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_delp_##name(b, &k); \ + } + +typedef struct { + kbnode_t *x; + int i; +} __kbstack_t; + +#define __kb_traverse(key_t, b, __func) do { \ + int __kmax = 8; \ + __kbstack_t *__kstack, *__kp; \ + __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ + __kp->x = (b)->root; __kp->i = 0; \ + for (;;) { \ + while (__kp->x && __kp->i <= __kp->x->n) { \ + if (__kp - __kstack == __kmax - 1) { \ + __kmax <<= 1; \ + __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ + __kp = __kstack + (__kmax>>1) - 1; \ + } \ + (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ + ++__kp; \ + } \ + --__kp; \ + if (__kp >= __kstack) { \ + if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ + ++__kp->i; \ + } else break; \ + } \ + free(__kstack); \ + } while (0) + +#define KBTREE_INIT(name, key_t, __cmp) \ + __KB_TREE_T(name) \ + __KB_INIT(name, key_t) \ + __KB_GET_AUX1(name, key_t, __cmp) \ + __KB_GET(name, key_t) \ + __KB_INTERVAL(name, key_t) \ + __KB_PUT(name, key_t, __cmp) \ + __KB_DEL(name, key_t) + +#define KB_DEFAULT_SIZE 512 + +#define kbtree_t(name) kbtree_##name##_t +#define kb_init(name, s) kb_init_##name(s) +#define kb_destroy(name, b) __kb_destroy(b) +#define kb_get(name, b, k) kb_get_##name(b, k) +#define kb_put(name, b, k) kb_put_##name(b, k) +#define kb_del(name, b, k) kb_del_##name(b, k) +#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) +#define kb_getp(name, b, k) kb_getp_##name(b, k) +#define kb_putp(name, b, k) kb_putp_##name(b, k) +#define kb_delp(name, b, k) kb_delp_##name(b, k) +#define 
kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) + +#define kb_size(b) ((b)->n_keys) + +#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) +#define kb_str_cmp(a, b) strcmp(a, b) + +#endif diff --git a/khash.h b/khash.h index de6be6d..12e5542 100644 --- a/khash.h +++ b/khash.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2009 by attractor + Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -33,7 +33,6 @@ int main() { khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); @@ -47,6 +46,29 @@ int main() { */ /* + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + 2009-09-26 (0.2.4): * Improve portability @@ -86,16 +108,18 @@ int main() { @header Generic hash table library. 
- - @copyright Heng Li */ -#define AC_VERSION_KHASH_H "0.2.4" +#define AC_VERSION_KHASH_H "0.2.6" #include #include #include +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + /* compipler specific configuration */ #if UINT_MAX == 0xffffffffu @@ -111,24 +135,14 @@ typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER -#define inline __inline +#define kh_inline __inline +#else +#define kh_inline inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_HASH_PRIME_SIZE 32 -static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = -{ - 0ul, 3ul, 11ul, 23ul, 53ul, - 97ul, 193ul, 389ul, 769ul, 1543ul, - 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, - 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, - 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, - 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, - 3221225473ul, 4294967291ul -}; - #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) @@ -137,88 +151,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) ((((k)>>3 ^ (k)<<3) | 1) & (m)) /* fully parenthesized so the expansion is safe inside larger expressions */ +#endif + +#define __ac_fsize(m) ((m) < 16?
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + static const double __ac_HASH_UPPER = 0.77; -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - static inline kh_##name##_t *kh_init_##name() { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ - static inline void kh_destroy_##name(kh_##name##_t *h) \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - free(h->keys); free(h->flags); \ - free(h->vals); \ - free(h); \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ } \ } \ - static inline void 
kh_clear_##name(kh_##name##_t *h) \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khint_t inc, k, i, last; \ - k = __hash_func(key); i = k % h->n_buckets; \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ - khint_t t = __ac_HASH_PRIME_SIZE - 1; \ - while (__ac_prime_list[t] > new_n_buckets) --t; \ - new_n_buckets = __ac_prime_list[t+1]; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ - else { \ - new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) return -1; \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) return -1; \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ } \ } \ - if (j) { \ + if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ - while (1) { \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t inc, k, i; \ k = __hash_func(key); \ - i = k % new_n_buckets; \ - inc = 1 + k % 
(new_n_buckets - 1); \ - while (!__ac_isempty(new_flags, i)) { \ - if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ - else i += inc; \ - } \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); \ - } else { \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ @@ -226,35 +280,39 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (h->n_buckets > new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - free(h->flags); \ + kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ + return 0; \ } \ - static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ - if (h->n_occupied >= h->upper_bound) { \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ - else kh_resize_##name(h, h->n_buckets + 1); \ - 
} \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khint_t inc, k, i, site, last; \ - x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ - if (__ac_isempty(h->flags, i)) x = i; \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ @@ -263,20 +321,20 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (__ac_isempty(h->flags, x)) { \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ - } else *ret = 0; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ - static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ 
__ac_set_isdel_true(h->flags, x); \ @@ -284,6 +342,17 @@ static const double __ac_HASH_UPPER = 0.77; } \ } +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @@ -311,10 +380,10 @@ static const double __ac_HASH_UPPER = 0.77; @param s Pointer to a null terminated string @return The hash value */ -static inline khint_t __ac_X31_hash_string(const char *s) +static kh_inline khint_t __ac_X31_hash_string(const char *s) { - khint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @@ -328,9 +397,21 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)(k)) + /* --- END OF HASH FUNCTIONS --- */ -/* Other necessary macros... */ +/* Other convenient macros... */ /*! @abstract Type of the hash table. @@ -396,7 +477,6 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_del(name, h, k) kh_del_##name(h, k) - /*! @function @abstract Test whether a bucket contains data.
@param h Pointer to the hash table [khash_t(name)*] @@ -455,6 +535,34 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_n_buckets(h) ((h)->n_buckets) +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + /* More conenient interfaces */ /*! 
/*
 * kopen.c -- open a local file, standard input ("-"), the output of a command
 * ("<cmd args"), or an http:// / ftp:// URL for reading, and hand the caller a
 * plain file descriptor plus an opaque handle to pass to kclose().
 */
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <ctype.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#ifndef _WIN32
#include <netdb.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/select.h>
#endif

#ifdef USE_MALLOC_WRAPPERS
#  include "malloc_wrap.h"
#endif

#ifdef _WIN32
#define _KO_NO_NET
#endif

#ifndef _KO_NO_NET
/**
 * Wait up to 5 seconds for a socket to become readable or writable.
 *
 * @param fd       socket descriptor
 * @param is_read  non-zero to wait for readability, zero for writability
 * @return         select()'s result: >0 ready, 0 timed out, -1 error
 */
static int socket_wait(int fd, int is_read)
{
	fd_set fds, *fdr = 0, *fdw = 0;
	struct timeval tv;
	int ret;
	if (fd < 0 || fd >= FD_SETSIZE) return -1; // FD_SET() is undefined outside this range
	tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
	FD_ZERO(&fds);
	FD_SET(fd, &fds);
	if (is_read) fdr = &fds;
	else fdw = &fds;
	ret = select(fd+1, fdr, fdw, 0, &tv);
	if (ret == -1) perror("select");
	return ret;
}

/**
 * Resolve host:port and connect a TCP socket to the first address returned.
 *
 * @return the connected descriptor, or -1 on failure (a message is printed
 *         to stderr; nothing is leaked)
 */
static int socket_connect(const char *host, const char *port)
{
	int on = 1, fd = -1, rc;
	const char *what;
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0;

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	if ((rc = getaddrinfo(host, port, &hints, &res)) != 0) {
		// getaddrinfo() does not report through errno and leaves res unset
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return -1;
	}
	what = "socket";
	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) goto fail;
	what = "setsockopt";
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) goto fail;
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) goto fail;
	what = "connect";
	if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) goto fail;
	freeaddrinfo(res);
	return fd;

fail:
	perror(what); // report before close() can disturb errno
	if (fd != -1) close(fd);
	freeaddrinfo(res);
	return -1;
}

/**
 * Write exactly len bytes, resuming after partial writes and retrying on
 * EAGAIN/EWOULDBLOCK/EINTR.
 *
 * @return 0 on success, -1 on any other write error
 */
static int write_bytes(int fd, const char *buf, size_t len)
{
	while (len > 0) {
		ssize_t bytes = write(fd, buf, len);
		if (bytes >= 0) {
			buf += bytes; // continue after what was already sent
			len -= (size_t)bytes;
		} else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
			return -1;
		}
	}
	return 0;
}

/**
 * Issue an HTTP/1.0 GET for fn (honouring the http_proxy environment
 * variable) and consume the response header.
 *
 * @param fn  URL starting with "http://"
 * @return    descriptor positioned at the start of the body when the status
 *            is 200; -1 otherwise
 */
static int http_open(const char *fn)
{
	char *p, *proxy, *q, *http_host = 0, *host = 0, *port = 0, *path = 0, *buf = 0;
	int fd = -1, ret, l;
	ssize_t bytes = 0;
	const size_t bufsz = 0x10000;

	/* parse URL; adapted from khttp_parse_url() in knetfile.c */
	if (strstr(fn, "http://") != fn) return -1;
	// set ->http_host
	for (p = (char*)fn + 7; *p && *p != '/'; ++p);
	l = p - fn - 7;
	http_host = calloc(l + 1, 1);
	if (http_host == 0) goto out;
	memcpy(http_host, fn + 7, l);
	for (q = http_host; *q && *q != ':'; ++q);
	if (*q == ':') *q++ = 0;
	// get http_proxy
	proxy = getenv("http_proxy");
	// set host, port and path
	if (proxy == 0) {
		host = strdup(http_host); // when there is no proxy, server name is identical to http_host name.
		port = strdup(*q? q : "80");
		path = strdup(*p? p : "/");
	} else {
		host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
		if (host) {
			for (q = host; *q && *q != ':'; ++q);
			if (*q == ':') *q++ = 0;
			port = strdup(*q? q : "80");
		}
		path = strdup(fn); // a proxy is given the full URL
	}
	if (host == 0 || port == 0 || path == 0) goto out;

	/* connect; adapted from khttp_connect() in knetfile.c */
	fd = socket_connect(host, port);
	if (fd == -1) goto out;
	buf = calloc(bufsz, 1);
	if (buf == 0) goto fail;
	l = snprintf(buf, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host);
	if (l < 0 || (size_t)l >= bufsz) goto fail; // request does not fit in the buffer
	if (write_bytes(fd, buf, l) != 0) goto fail;

	// read the HTTP header one byte at a time; always keep room for the NUL
	l = 0;
	while ((size_t)l < bufsz - 1) {
		bytes = read(fd, buf + l, 1);
		if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) continue;
		if (bytes <= 0) break;
		if (buf[l] == '\n' && l >= 3 && strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
		++l;
	}
	buf[l] = 0;
	if (bytes < 0 || l < 14) goto fail; // prematured header
	ret = strtol(buf + 8, &p, 0); // HTTP return code
	if (ret == 200) goto out;

fail:
	close(fd);
	fd = -1;
out:
	free(buf); free(http_host); free(host); free(port); free(path);
	return fd;
}

typedef struct {
	int max_response, ctrl_fd;
	char *response;
} ftpaux_t;

/**
 * Read one complete reply from the FTP control connection into
 * aux->response (grown as needed) and return its numeric code.
 *
 * @return the reply code; 0 if the socket did not become readable in time;
 *         -1 on a short reply or allocation failure
 */
static int kftp_get_response(ftpaux_t *aux)
{
	unsigned char c;
	int n = 0;
	if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0;
	for (;;) { // FIXME: this is *VERY BAD* for unbuffered I/O
		ssize_t got = read(aux->ctrl_fd, &c, 1);
		if (got < 0 && errno == EINTR) continue;
		if (got <= 0) break; // EOF or a real error: stop instead of storing garbage
		if (n >= aux->max_response) {
			int max = aux->max_response? aux->max_response<<1 : 256;
			char *tmp = realloc(aux->response, max);
			if (tmp == 0) return -1;
			aux->response = tmp;
			aux->max_response = max;
		}
		aux->response[n++] = c;
		if (c == '\n') {
			// "ddd " ends a reply; "ddd-" is a continuation line
			if (n >= 4 && isdigit((unsigned char)aux->response[0]) && isdigit((unsigned char)aux->response[1])
				&& isdigit((unsigned char)aux->response[2]) && aux->response[3] != '-') break;
			n = 0;
		}
	}
	if (n < 2) return -1;
	aux->response[n-2] = 0; // overwrite the two trailing bytes (normally CR LF)
	return strtol(aux->response, 0, 0);
}

/**
 * Send one command on the FTP control connection.
 *
 * @param is_get  non-zero to also wait for and return the reply code
 * @return        -1 if the command could not be sent; otherwise the reply
 *                code (is_get) or 0
 */
static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get)
{
	if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
	if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1;
	return is_get? kftp_get_response(aux) : 0;
}

/**
 * Log in anonymously to the FTP server named in fn, enter passive mode and
 * start a binary RETR of the file.
 *
 * @param fn  URL of the form "ftp://host/path"
 * @return    descriptor of the data connection, or -1 on failure
 */
static int ftp_open(const char *fn)
{
	char *p, *host = 0, *retr = 0;
	char host2[80], port2[10];
	int v[6], i, l, fd = -1, pasv_port;
	size_t retr_len;
	ftpaux_t aux;

	memset(&aux, 0, sizeof(ftpaux_t));
	aux.ctrl_fd = -1;

	/* parse URL */
	if (strstr(fn, "ftp://") != fn) return -1;
	for (p = (char*)fn + 6; *p && *p != '/'; ++p);
	if (*p != '/') return -1;
	l = p - fn - 6;
	host = calloc(l + 1, 1);
	retr_len = strlen(p) + 8; // "RETR " + path + "\r\n" + NUL
	retr = calloc(retr_len, 1);
	if (host == 0 || retr == 0) goto ftp_open_end;
	memcpy(host, fn + 6, l);
	snprintf(retr, retr_len, "RETR %s\r\n", p);

	/* connect to ctrl */
	aux.ctrl_fd = socket_connect(host, "21");
	if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */

	/* connect to the data stream */
	kftp_get_response(&aux);
	kftp_send_cmd(&aux, "USER anonymous\r\n", 1);
	kftp_send_cmd(&aux, "PASS kopen@\r\n", 1);
	kftp_send_cmd(&aux, "TYPE I\r\n", 1);
	if (kftp_send_cmd(&aux, "PASV\r\n", 1) <= 0 || aux.response == 0) goto ftp_open_end;
	for (p = aux.response; *p && *p != '('; ++p);
	if (*p != '(') goto ftp_open_end;
	++p;
	if (sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6) goto ftp_open_end;
	for (i = 0; i < 6; ++i) // server-controlled values: each must be one octet
		if (v[i] < 0 || v[i] > 255) goto ftp_open_end;
	pasv_port = (v[4] << 8) | v[5];
	kftp_send_cmd(&aux, retr, 0);
	snprintf(host2, sizeof(host2), "%d.%d.%d.%d", v[0], v[1], v[2], v[3]);
	snprintf(port2, sizeof(port2), "%d", pasv_port);
	fd = socket_connect(host2, port2);
	if (fd == -1) goto ftp_open_end;
	if (kftp_get_response(&aux) != 150) {
		close(fd);
		fd = -1;
	}

ftp_open_end:
	if (aux.ctrl_fd != -1) close(aux.ctrl_fd); // the control connection is never handed out
	free(host); free(retr); free(aux.response);
	return fd;
}
#endif /* !defined(_KO_NO_NET) */

/**
 * Split a command line on white space into a NULL-terminated argv.
 *
 * All words share one buffer owned by argv[0]; release the result with
 * free(argv[0]); free(argv).
 *
 * @return the vector, or NULL if cmd holds no word or allocation fails
 */
static char **cmd2argv(const char *cmd)
{
	int i, beg, end, argc;
	char **argv, *str;
	end = strlen(cmd);
	for (i = end - 1; i >= 0; --i)
		if (!isspace((unsigned char)cmd[i])) break;
	end = i + 1;
	for (beg = 0; beg < end; ++beg)
		if (!isspace((unsigned char)cmd[beg])) break;
	if (beg == end) return 0;
	for (i = beg + 1, argc = 0; i < end; ++i) // count word separators
		if (isspace((unsigned char)cmd[i]) && !isspace((unsigned char)cmd[i-1]))
			++argc;
	argv = (char**)calloc(argc + 2, sizeof(char*));
	str = (char*)calloc(end - beg + 1, 1);
	if (argv == 0 || str == 0) {
		free(argv); free(str);
		return 0;
	}
	argv[0] = str;
	memcpy(str, cmd + beg, end - beg);
	for (i = argc = 1; i < end - beg; ++i)
		if (isspace((unsigned char)str[i])) str[i] = 0;
		else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i];
	return argv;
}

#define KO_STDIN    1
#define KO_FILE     2
#define KO_PIPE     3
#define KO_HTTP     4
#define KO_FTP      5

typedef struct {
	int type, fd;
	pid_t pid;
} koaux_t;

/**
 * Open fn for reading.
 *
 * fn may be "-" (standard input), an "http://" or "ftp://" URL, a command
 * introduced by '<' (its standard output is read through a pipe), or a
 * plain file name.
 *
 * @param fn   what to open
 * @param _fd  receives the descriptor to read from, or -1 on failure
 * @return     handle to pass to kclose(), or NULL if nothing could be opened
 */
void *kopen(const char *fn, int *_fd)
{
	koaux_t *aux;
	char **argv = 0;

	*_fd = -1;
	aux = calloc(1, sizeof(koaux_t));
	if (aux == 0) return 0;
	aux->fd = -1;
	if (strstr(fn, "http://") == fn) {
		aux->type = KO_HTTP;
#ifndef _KO_NO_NET
		aux->fd = http_open(fn);
#endif
	} else if (strstr(fn, "ftp://") == fn) {
		aux->type = KO_FTP;
#ifndef _KO_NO_NET
		aux->fd = ftp_open(fn);
#endif
	} else if (strcmp(fn, "-") == 0) {
		aux->type = KO_STDIN;
		aux->fd = STDIN_FILENO;
	} else {
		const char *p, *q;
		for (p = fn; *p; ++p)
			if (!isspace((unsigned char)*p)) break;
		if (*p == '<') { // pipe open
			int need_shell, pfd[2];
			pid_t pid;
			// a simple check to see if we need to invoke a shell; not always working
			for (q = p + 1; *q; ++q)
				if (ispunct((unsigned char)*q) && *q != '.' && *q != '_' && *q != '-' && *q != ':')
					break;
			need_shell = (*q != 0);
			if (!need_shell) {
				// build argv in the parent: a vfork() child must not allocate
				argv = cmd2argv(p + 1);
				if (argv == 0) goto fail; // empty command
			}
			if (pipe(pfd) != 0) goto fail;
			pid = vfork();
			if (pid == -1) { /* vfork() error */
				close(pfd[0]); close(pfd[1]);
				goto fail;
			}
			if (pid == 0) { /* the child process */
				close(pfd[0]);
				dup2(pfd[1], STDOUT_FILENO);
				close(pfd[1]);
				if (argv) execvp(argv[0], argv);
				else execl("/bin/sh", "sh", "-c", p + 1, (char*)NULL);
				_exit(1); // exec failed; exit() would flush the parent's stdio buffers
			}
			/* parent process */
			close(pfd[1]);
			aux->type = KO_PIPE;
			aux->fd = pfd[0];
			aux->pid = pid;
		} else {
			aux->type = KO_FILE;
#ifdef _WIN32
			aux->fd = open(fn, O_RDONLY | O_BINARY);
#else
			aux->fd = open(fn, O_RDONLY);
#endif
		}
	}
	if (aux->fd < 0) goto fail;
	if (argv) { free(argv[0]); free(argv); }
	*_fd = aux->fd;
	return aux;

fail:
	if (argv) { free(argv[0]); free(argv); }
	free(aux);
	return 0;
}

/**
 * Release a handle returned by kopen().
 *
 * For a pipe, the child is reaped if it has already exited and sent SIGTERM
 * otherwise. The descriptor itself is not closed here; that is left to the
 * caller. A NULL handle is accepted.
 *
 * @return always 0
 */
int kclose(void *a)
{
	koaux_t *aux = (koaux_t*)a;
	if (aux == 0) return 0;
	if (aux->type == KO_PIPE) {
		int status;
		pid_t pid;
		pid = waitpid(aux->pid, &status, WNOHANG);
		if (pid != aux->pid) kill(aux->pid, SIGTERM);
	}
	free(aux);
	return 0;
}

#ifdef _KO_MAIN
#define BUF_SIZE 0x10000
int main(int argc, char *argv[])
{
	void *x;
	int l, fd;
	unsigned char buf[BUF_SIZE];
	FILE *fp;
	if (argc == 1) {
		fprintf(stderr, "Usage: kopen <file>\n");
		return 1;
	}
	x = kopen(argv[1], &fd);
	fp = x? fdopen(fd, "r") : 0;
	if (fp == 0) {
		fprintf(stderr, "ERROR: fail to open the input\n");
		kclose(x);
		return 1;
	}
	do {
		if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0)
			fwrite(buf, 1, l, stdout);
	} while (l == BUF_SIZE);
	fclose(fp);
	kclose(x);
	return 0;
}
#endif
copy of this software and associated documentation files (the @@ -23,6 +23,8 @@ SOFTWARE. */ +/* Last Modified: 05MAR2012 */ + #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -30,7 +32,14 @@ #include #include -float strtof(const char *nptr, char **endptr); +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 #define KSEQ_TYPE_FASTQ 0 #define KSEQ_TYPE_FASTA 1 @@ -38,7 +47,7 @@ float strtof(const char *nptr, char **endptr); #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ - char *buf; \ + unsigned char *buf; \ int begin, end, is_eof; \ type_t f; \ } kstream_t; @@ -49,9 +58,9 @@ float strtof(const char *nptr, char **endptr); #define __KS_BASIC(type_t, __bufsize) \ static inline kstream_t *ks_init(type_t f) \ { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; \ - ks->buf = (char*)malloc(__bufsize); \ + ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ @@ -88,10 +97,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ - str->l = 0; \ + str->l = append? 
str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -103,17 +112,23 @@ typedef struct __kstring_t { if (ks->end == 0) break; \ } else break; \ } \ - if (delimiter) { \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ - } else { \ + } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ - } \ - if (str->m - str->l < i - ks->begin + 1) { \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ + str->s = (char*)realloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ @@ -123,9 +138,15 @@ typedef struct __kstring_t { break; \ } \ } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ - } + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -133,29 +154,19 @@ typedef struct __kstring_t { __KS_GETC(__read, __bufsize) \ __KS_GETUNTIL(__read, __bufsize) -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ { \ 
- int i; \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ - for (i = 0; i < 4; i++) \ - s->scores[i] = (float *)calloc(1, sizeof(float)); \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ - static inline void kseq_rewind(kseq_t *ks) \ + SCOPE void kseq_destroy(kseq_t *ks) \ { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ - { \ - int i; \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - for (i = 0; i < 4; i++) { \ - free(ks->scores[i]); \ - } \ ks_destroy(ks->f); \ free(ks); \ } @@ -165,55 +176,55 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c, i, j; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c, i, j; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = 
(char*)malloc(seq->seq.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@' && c != '&') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - if (!seq->seq.s) { \ - fprintf(stderr, "Invalid input sequence\n"); \ - exit(1); \ - } \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+' && c != '&') return seq->seq.l; /* FASTA */ \ - if (c == '+') { \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - } \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+' && c != '&') return seq->seq.l; /* FASTA */ \ + if (c == '+') { \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + } \ if (c == '&') { \ seq->type = KSEQ_TYPE_PSSM; \ for (i = 0; i < 4; i++) \ seq->scores[i] = (float *) realloc(seq->scores[i], seq->seq.l* sizeof(float)); \ } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' 
line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - if (seq->qual.m > 0) { \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - } else if (seq->type == KSEQ_TYPE_PSSM) { \ + if (c == -1) return -2; /* error: no quality string */ \ + if (seq->qual.m > 0) { \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } else if (seq->type == KSEQ_TYPE_PSSM) { \ kstring_t *buf = calloc(1, sizeof(kstring_t)); \ for (i = 0; i < 4; i++) { \ int lc; \ @@ -242,10 +253,19 @@ typedef struct __kstring_t { char type; \ } kseq_t; -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); #endif diff --git a/ksort.h b/ksort.h index 52812e1..5851b0d 100644 --- a/ksort.h +++ b/ksort.h @@ -58,6 +58,10 @@ #include #include +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef struct { void *left, *right; int depth; @@ -72,7 +76,7 @@ typedef struct { int curr, shift; \ \ a2[0] = array; \ - a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + a2[1] = temp? 
#include <stdlib.h>
#include <string.h>

#ifdef USE_MALLOC_WRAPPERS
#  include "malloc_wrap.h"
#endif

#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m; /* l: bytes in use (excluding the NUL); m: bytes allocated */
	char *s;
} kstring_t;
#endif

/**
 * Make sure the buffer of s holds at least `size` bytes; the capacity is
 * rounded up with kroundup32(). Existing content is kept.
 */
static inline void ks_resize(kstring_t *s, size_t size)
{
	if (s->m < size) {
		s->m = size;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
}

/**
 * Append the first l bytes of p to s and NUL-terminate.
 * @return l
 */
static inline int kputsn(const char *p, int l, kstring_t *s)
{
	ks_resize(s, s->l + l + 2);
	memcpy(s->s + s->l, p, l);
	s->l += l;
	s->s[s->l] = 0;
	return l;
}

/**
 * Append the NUL-terminated string p to s.
 * @return the number of bytes appended
 */
static inline int kputs(const char *p, kstring_t *s)
{
	return kputsn(p, strlen(p), s);
}

/**
 * Append the single character c to s and NUL-terminate.
 * @return c
 */
static inline int kputc(int c, kstring_t *s)
{
	ks_resize(s, s->l + 2);
	s->s[s->l++] = c;
	s->s[s->l] = 0;
	return c;
}

/**
 * Append the decimal representation of the signed int c to s.
 * @return 0 (or the result of kputc() when c is 0)
 */
static inline int kputw(int c, kstring_t *s)
{
	char buf[16];
	int l = 0, i;
	unsigned int x;
	if (c == 0) return kputc('0', s);
	/* take the magnitude in unsigned arithmetic: -c overflows for INT_MIN */
	x = c < 0? 0u - (unsigned int)c : (unsigned int)c;
	for (; x > 0; x /= 10) buf[l++] = (char)(x%10 + '0'); /* digits, least significant first */
	if (c < 0) buf[l++] = '-';
	ks_resize(s, s->l + l + 2);
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	s->s[s->l] = 0;
	return 0;
}

/**
 * Append the decimal representation of the unsigned int c to s.
 * @return 0 (or the result of kputc() when c is 0)
 */
static inline int kputuw(unsigned c, kstring_t *s)
{
	char buf[16];
	int l, i;
	unsigned x;
	if (c == 0) return kputc('0', s);
	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = (char)(x%10 + '0');
	ks_resize(s, s->l + l + 2);
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	s->s[s->l] = 0;
	return 0;
}

/**
 * Append the decimal representation of the signed long c to s.
 * @return 0 (or the result of kputc() when c is 0)
 */
static inline int kputl(long c, kstring_t *s)
{
	char buf[32];
	int l = 0, i;
	unsigned long x;
	if (c == 0) return kputc('0', s);
	/* take the magnitude in unsigned arithmetic: -c overflows for LONG_MIN */
	x = c < 0? 0ul - (unsigned long)c : (unsigned long)c;
	for (; x > 0; x /= 10) buf[l++] = (char)(x%10 + '0');
	if (c < 0) buf[l++] = '-';
	ks_resize(s, s->l + l + 2);
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	s->s[s->l] = 0;
	return 0;
}

int ksprintf(kstring_t *s, const char *fmt, ...);
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include "ksw.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; + +struct _kswq_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +}; + +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + kswq_t *q; + int slen, a, tmp, p; + + size = size > 1? 
2 : 1; + p = 8 * (3 - size); // # values per __m128i + slen = (qlen + p - 1) / p; // segmented length + q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 
0 : ma[query[k]]); + } + } + return q; +} + +kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_16(ret, xx) do { \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ + (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + oe_del = _mm_set1_epi8(_o_del + _e_del); + e_del = _mm_set1_epi8(_e_del); + oe_ins = _mm_set1_epi8(_o_ins + _e_ins); + e_ins = _mm_set1_epi8(_e_ins); + shift = _mm_set1_epi8(q->shift); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, cmp, imax; + __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm_adds_epu8(h, _mm_load_si128(S + j)); + h = _mm_subs_epu8(h, shift); // 
h=H'(i-1,j-1)+S(i,j) + e = _mm_load_si128(E + j); // e=E'(i,j) + h = _mm_max_epu8(h, e); + h = _mm_max_epu8(h, f); // h=H'(i,j) + max = _mm_max_epu8(max, h); // set max + _mm_store_si128(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del + t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del + e = _mm_max_epu8(e, t); // e=E'(i+1,j) + _mm_store_si128(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm_subs_epu8(f, e_ins); + t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins + f = _mm_max_epu8(f, t); + // get H'(i-1,j) and prepare for the next j + h = _mm_load_si128(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + f = _mm_slli_si128(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epu8(h, f); // h=H'(i,j) + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu8(h, oe_ins); + f = _mm_subs_epu8(f, e_ins); + cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); + if (UNLIKELY(cmp == 0xffff)) goto end_loop16; + } + } +end_loop16: + //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); + __max_16(imax, max); // imax is the maximum number in max + if (imax >= minsc) { // write the b array; this condition adds branching unfornately + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append + if (n_b == m_b) { + m_b = m_b? 
m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax + q->shift >= 255 || gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; // swap H0 and H1 + } + r.score = gmax + q->shift < 255? gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score + int max = -1, tmp, low, high, qlen = slen * 16; + uint8_t *t = (uint8_t*)Hmax; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; + else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; + //printf("%d,%d\n", max, gmax); + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + return r; +} + +kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; + uint64_t *b; + __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; + kswr_t r; + +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + + // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + oe_del = _mm_set1_epi16(_o_del + _e_del); + e_del = _mm_set1_epi16(_e_del); + oe_ins = _mm_set1_epi16(_o_ins + _e_ins); + e_ins = _mm_set1_epi16(_e_ins); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, *S++); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + e = _mm_subs_epu16(e, e_del); + t = _mm_subs_epu16(h, oe_del); + e = _mm_max_epi16(e, t); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, e_ins); + t = _mm_subs_epu16(h, oe_ins); + f = _mm_max_epi16(f, t); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, oe_ins); + f = _mm_subs_epu16(f, e_ins); + if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; + } + } +end_loop8: + __max_8(imax, max); + if (imax >= minsc) { + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { + if (n_b == m_b) { + m_b = m_b? 
m_b<<1 : 8; + b = (uint64_t*)realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; + for (j = 0; LIKELY(j < slen); ++j) + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax >= endsc) break; + } + S = H1; H1 = H0; H0 = S; + } + r.score = gmax; r.te = te; + { + int max = -1, tmp, low, high, qlen = slen * 8; + uint16_t *t = (uint16_t*)Hmax; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } + } + } + free(b); + return r; +} + +static inline void revseq(int l, uint8_t *s) +{ + int i, t; + for (i = 0; i < l>>1; ++i) + t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; +} + +kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry) +{ + int size; + kswq_t *q; + kswr_t r, rr; + kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int); + + q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); + if (qry && *qry == 0) *qry = q; + func = q->size == 2? 
ksw_i16 : ksw_u8; + size = q->size; + r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra); + if (qry == 0) free(q); + if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; + revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end + q = ksw_qinit(size, r.qe + 1, query, m, mat); + rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score); + revseq(r.qe + 1, query); revseq(r.te + 1, target); + free(q); + if (r.score == rr.score) + r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; + return r; +} + +kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +{ + return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry); +} + +/******************** + *** SW extension *** + ********************/ + +typedef struct { + int32_t h, e; +} eh_t; + +int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +{ + eh_t *eh; // score array + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; + if (h0 < 0) h0 = 0; + // allocate memory + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0; + for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j) + eh[j].h = eh[j-1].h - e_ins; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? 
max : mat[i]; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? + // DP loop + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; + max_off = 0; + beg = 0, end = qlen; + for (i = 0; LIKELY(i < tlen); ++i) { + int t, f = 0, h1, m = 0, mj = -1; + int8_t *q = &qp[target[i] * qlen]; + // compute the first column + h1 = h0 - (o_del + e_del * (i + 1)); + if (h1 < 0) h1 = 0; + // apply the band and the constraint (if provided) + if (beg < i - w) beg = i - w; + if (end > i + w + 1) end = i + w + 1; + if (end > qlen) end = qlen; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape + eh_t *p = &eh[j]; + int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + p->h = h1; // set H(i,j-1) for the next row + M += q[j]; // separating H and M to disallow a cigar like "100M3I3D20M" + h = M > e? M : e; + h = h > f? h : f; + h1 = h; // save H(i,j) to h1 for the next column + mj = m > h? mj : j; // record the position where max score is achieved + m = m > h? m : h; // m is stored at eh[mj+1] + t = M - oe_del; + t = t > 0? t : 0; + e -= e_del; + e = e > t? e : t; // computed E(i+1,j) + p->e = e; // save E(i+1,j) for the next row + t = M - oe_ins; + t = t > 0? t : 0; + f -= e_ins; + f = f > t? f : t; // computed F(i,j+1) + } + eh[end].h = h1; eh[end].e = 0; + if (j == qlen) { + max_ie = gscore > h1? max_ie : i; + gscore = gscore > h1? 
gscore : h1; + } + if (m == 0) break; + if (m > max) { + max = m, max_i = i, max_j = mj; + max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + } else if (zdrop > 0) { + if (i - max_i > mj - max_j) { + if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break; + } + } + // update beg and end for the next round + for (j = mj; j >= beg && eh[j].h; --j); + beg = j + 1; + for (j = mj + 2; j <= end && eh[j].h; ++j); + end = j; + //beg = 0; end = qlen; // uncomment this line for debugging + } + free(eh); free(qp); + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; + return max; +} + +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off) +{ + return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off); +} + +/******************** + * Global alignment * + ********************/ + +#define MINUS_INF -0x40000000 + +static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) +{ + if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { + if (*n_cigar == *m_cigar) { + *m_cigar = *m_cigar? 
(*m_cigar)<<1 : 4; + cigar = realloc(cigar, (*m_cigar) << 2); + } + cigar[(*n_cigar)++] = len<<4 | op; + } else cigar[(*n_cigar)-1] += len<<4; + return cigar; +} + +int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_) +{ + eh_t *eh; + int8_t *qp; // query profile + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col; + uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex + if (n_cigar_) *n_cigar_ = 0; + // allocate memory + n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix + z = malloc(n_col * tlen); + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = 0; eh[0].e = MINUS_INF; + for (j = 1; j <= qlen && j <= w; ++j) + eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF; + for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band + // DP loop + for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop + int32_t f = MINUS_INF, h1, beg, end, t; + int8_t *q = &qp[target[i] * qlen]; + uint8_t *zi = &z[i * n_col]; + beg = i > w? i - w : 0; + end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence + h1 = beg == 0? 
-(o_del + e_del * (i + 1)) : MINUS_INF; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Cells are computed in the following order: + // M(i,j) = H(i-1,j-1) + S(i,j) + // H(i,j) = max{M(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape + // We have to separate M(i,j); otherwise the direction may not be recorded correctly. + // However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global(). + // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k. + // In practice, this should happen very rarely given a reasonable scoring system. + eh_t *p = &eh[j]; + int32_t h, m = p->h, e = p->e; + uint8_t d; // direction + p->h = h1; + m += q[j]; + d = m >= e? 0 : 1; + h = m >= e? m : e; + d = h >= f? d : 2; + h = h >= f? h : f; + h1 = h; + t = m - oe_del; + e -= e_del; + d |= e > t? 1<<2 : 0; + e = e > t? e : t; + p->e = e; + t = m - oe_ins; + f -= e_ins; + d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two + f = f > t? f : t; + zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell + } + eh[end].h = h1; eh[end].e = MINUS_INF; + } + score = eh[qlen].h; + if (n_cigar_ && cigar_) { // backtrack + int n_cigar = 0, m_cigar = 0, which = 0; + uint32_t *cigar = 0, tmp; + i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell + while (i >= 0 && k >= 0) { + which = z[i * n_col + (k - (i > w? 
i - w : 0))] >> (which<<1) & 3; + if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; + else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; + else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; + } + if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); + if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); + for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR + tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; + *n_cigar_ = n_cigar, *cigar_ = cigar; + } + free(eh); free(qp); free(z); + return score; +} + +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) +{ + return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_); +} + +/******************************************* + * Main function (not compiled by default) * + *******************************************/ + +#ifdef _KSW_MAIN + +#include +#include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, err_gzread) + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +int 
main(int argc, char *argv[]) +{ + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; + int8_t mat[25]; + int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; + uint8_t *rseq = 0; + gzFile fpt, fpq; + kseq_t *kst, *ksq; + + // parse command line + while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { + switch (c) { + case 'a': sa = atoi(optarg); break; + case 'b': sb = atoi(optarg); break; + case 'q': gapo = atoi(optarg); break; + case 'r': gape = atoi(optarg); break; + case 't': minsc = atoi(optarg); break; + case 'f': forward_only = 1; break; + case '1': xtra |= KSW_XBYTE; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); + return 1; + } + if (minsc > 0xffff) minsc = 0xffff; + xtra |= KSW_XSUBO | minsc; + // initialize scoring matrix + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? sa : -sb; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; + // open file + fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt); + fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); + // all-pair alignment + while (kseq_read(ksq) > 0) { + kswq_t *q[2] = {0, 0}; + kswr_t r; + for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; + if (!forward_only) { // reverse + if ((int)ksq->seq.m > max_rseq) { + max_rseq = ksq->seq.m; + rseq = (uint8_t*)realloc(rseq, max_rseq); + } + for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) + rseq[j] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; + } + gzrewind(fpt); kseq_rewind(kst); + while (kseq_read(kst) > 0) { + for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); + if (r.score >= minsc) + err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); + if (rseq) { + r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); + if (r.score >= minsc) + err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); + } + } + free(q[0]); free(q[1]); + } + free(rseq); + kseq_destroy(kst); err_gzclose(fpt); + kseq_destroy(ksq); err_gzclose(fpq); + return 0; +} +#endif diff --git a/ksw.h b/ksw.h new file mode 100644 index 0000000..5d45a67 --- /dev/null +++ b/ksw.h @@ -0,0 +1,114 @@ +#ifndef __AC_KSW_H +#define __AC_KSW_H + +#include + +#define KSW_XBYTE 0x10000 +#define KSW_XSTOP 0x20000 +#define KSW_XSUBO 0x40000 +#define KSW_XSTART 0x80000 + +struct _kswq_t; +typedef struct _kswq_t kswq_t; + +typedef struct { + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Aligning two sequences + * + * @param qlen length of the query sequence (typically =0, *gscore keeps the best score such that + * the entire query sequence is aligned; *gtle keeps the position on the + * target where *gscore is achieved. Returning *gscore and *gtle helps the + * caller to decide whether an end-to-end hit or a partial hit is preferred. 
struct kt_for_t;

/* Per-thread state: owning job plus the next index this worker will claim. */
typedef struct {
	struct kt_for_t *t;
	int i;
} ktf_worker_t;

/* One parallel-for job: func(data, i, thread_index) is called for every i in [0, n). */
typedef struct kt_for_t {
	int n_threads, n;
	ktf_worker_t *w;
	void (*func)(void*,int,int);
	void *data;
} kt_for_t;

/*
 * Claim one index from the worker slot that is least advanced.
 * Returns the claimed index, or -1 when no work is left.
 */
static inline int steal_work(kt_for_t *t)
{
	int i, k, min = 0x7fffffff, min_i = -1;
	for (i = 0; i < t->n_threads; ++i)
		if (min > t->w[i].i) min = t->w[i].i, min_i = i;
	// no slot to steal from, or every slot is already past the end: do not touch the
	// counters (avoids indexing w[-1] and pointless counter growth)
	if (min_i < 0 || min >= t->n) return -1;
	k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads);
	return k >= t->n? -1 : k;
}

/*
 * Worker body: process own indices (start, start + n_threads, ...), then steal from
 * other slots. Returns normally instead of calling pthread_exit() so it can also be
 * run inline by the calling thread.
 */
static void *ktf_worker(void *data)
{
	ktf_worker_t *w = (ktf_worker_t*)data;
	int i;
	for (;;) {
		i = __sync_fetch_and_add(&w->i, w->t->n_threads);
		if (i >= w->t->n) break;
		w->t->func(w->t->data, i, w - w->t->w);
	}
	while ((i = steal_work(w->t)) >= 0)
		w->t->func(w->t->data, i, w - w->t->w);
	return 0;
}

/*
 * Call func(data, i, thread_index) for every i in [0, n) using up to n_threads threads.
 *
 * n_threads < 1 is treated as 1. Threads that cannot be created are skipped and their
 * share is stolen by the others; if no thread can be created, or the bookkeeping
 * arrays cannot be allocated, the work is done in the calling thread.
 */
void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n)
{
	int i, n_started = 0;
	kt_for_t t;
	pthread_t *tid;

	if (n <= 0) return; // nothing to do
	if (n_threads < 1) n_threads = 1;
	t.func = func, t.data = data, t.n_threads = n_threads, t.n = n;
	// heap instead of alloca(): the size depends on a caller-supplied thread count
	t.w = (ktf_worker_t*)malloc((size_t)n_threads * sizeof(ktf_worker_t));
	tid = (pthread_t*)malloc((size_t)n_threads * sizeof(pthread_t));
	if (t.w == 0 || tid == 0) { // out of memory: degrade to a serial loop
		free(t.w); free(tid);
		for (i = 0; i < n; ++i) func(data, i, 0);
		return;
	}
	for (i = 0; i < n_threads; ++i)
		t.w[i].t = &t, t.w[i].i = i;
	for (i = 0; i < n_threads; ++i)
		if (pthread_create(&tid[n_started], 0, ktf_worker, &t.w[i]) == 0)
			++n_started; // only successfully created threads are joined below
	if (n_started == 0)
		ktf_worker(&t.w[0]); // no thread available: slot 0 steals everything
	for (i = 0; i < n_started; ++i) pthread_join(tid[i], 0);
	free(tid);
	free(t.w);
}
(v).m<<1 : 2), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : 0), ((v).a + ((v).n++)) + : 0), &(v).a[(v).n++]) -#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ +#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : (v).n <= (size_t)(i)? (v).n = (i) \ - : 0), (v).a[(i)] + : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ + : 0), (v).a[(i)]) #endif diff --git a/main.c b/main.c index b485467..9faf535 100644 --- a/main.c +++ b/main.c @@ -1,12 +1,33 @@ #include #include -#include "main.h" +#include "kstring.h" #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r70-dev" +#define PACKAGE_VERSION "pssm0.7.8-r713-dirty" #endif +int bwa_fa2pac(int argc, char *argv[]); +int bwa_pac2bwt(int argc, char *argv[]); +int bwa_bwtupdate(int argc, char *argv[]); +int bwa_bwt2sa(int argc, char *argv[]); +int bwa_index(int argc, char *argv[]); +int bwt_bwtgen_main(int argc, char *argv[]); +int bwa_pssm(int argc, char *argv[]); + +int bwa_aln(int argc, char *argv[]); +int bwa_sai2sam_se(int argc, char *argv[]); +int bwa_sai2sam_pe(int argc, char *argv[]); + +int bwa_bwtsw2(int argc, char *argv[]); + +int main_fastmap(int argc, char *argv[]); +int main_mem(int argc, char *argv[]); + +int main_pemerge(int argc, char *argv[]); + +char *bwa_pg; + static int usage() { fprintf(stderr, "\n"); @@ -15,54 +36,68 @@ static int usage() fprintf(stderr, "Contact: Heng Li \n\n"); fprintf(stderr, "Usage: bwa [options]\n\n"); fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); + fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); fprintf(stderr, " aln gapped/ungapped alignment\n"); fprintf(stderr, " pssm pssm based gapped/ungapped alignment\n"); fprintf(stderr, " samse generate alignment (single 
ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); - fprintf(stderr, " fastmap identify super-maximal exact matches\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); - fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); - fprintf(stderr, " stdsw standard SW/NW alignment\n"); fprintf(stderr, "\n"); + fprintf(stderr, +"Note: To use BWA, you need to first index the genome with `bwa index'.\n" +" There are three alignment algorithms in BWA: `mem', `bwasw', and\n" +" `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n" +" first. Please `man ./bwa.1' for the manual.\n\n"); return 1; } -void bwa_print_sam_PG() -{ - printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION); -} - int main(int argc, char *argv[]) { + int i, ret; + double t_real; + kstring_t pg = {0,0,0}; + t_real = realtime(); + ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]); + for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]); + bwa_pg = pg.s; if (argc < 2) return usage(); - if (strcmp(argv[1], "fa2pac") == 0) return bwa_fa2pac(argc-1, argv+1); - else if (strcmp(argv[1], "pac2bwt") == 0) return bwa_pac2bwt(argc-1, argv+1); - else if (strcmp(argv[1], "pac2bwtgen") == 0) return bwt_bwtgen_main(argc-1, argv+1); - else if (strcmp(argv[1], "bwtupdate") == 0) return bwa_bwtupdate(argc-1, argv+1); - else if (strcmp(argv[1], "bwt2sa") == 0) return bwa_bwt2sa(argc-1, argv+1); - else if (strcmp(argv[1], "index") == 0) return bwa_index(argc-1, argv+1); - else if (strcmp(argv[1], "aln") == 0) return bwa_aln(argc-1, argv+1); + if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); + 
else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); + else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); + else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); + else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); else if (strcmp(argv[1], "pssm") == 0) return bwa_pssm(argc-1, argv+1); - else if (strcmp(argv[1], "sw") == 0) return bwa_stdsw(argc-1, argv+1); - else if (strcmp(argv[1], "samse") == 0) return bwa_sai2sam_se(argc-1, argv+1); - else if (strcmp(argv[1], "sampe") == 0) return bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "pac2cspac") == 0) return bwa_pac2cspac(argc-1, argv+1); - else if (strcmp(argv[1], "stdsw") == 0) return bwa_stdsw(argc-1, argv+1); - else if (strcmp(argv[1], "bwtsw2") == 0) return bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "dbwtsw") == 0) return bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "bwasw") == 0) return bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "fastmap") == 0) return main_fastmap(argc-1, argv+1); + else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); + else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); + else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); + else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } - err_fflush(stdout); - err_fclose(stdout); 
- return 0; + err_fflush(stdout); + err_fclose(stdout); + if (ret == 0) { + fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); + fprintf(stderr, "[%s] CMD:", __func__); + for (i = 0; i < argc; ++i) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); + } + free(bwa_pg); + return ret; } diff --git a/malloc_wrap.c b/malloc_wrap.c new file mode 100644 index 0000000..100b8cb --- /dev/null +++ b/malloc_wrap.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#ifdef USE_MALLOC_WRAPPERS +/* Don't wrap ourselves */ +# undef USE_MALLOC_WRAPPERS +#endif +#include "malloc_wrap.h" + +void *wrap_calloc(size_t nmemb, size_t size, + const char *file, unsigned int line, const char *func) { + void *p = calloc(nmemb, size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, nmemb * size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +void *wrap_malloc(size_t size, + const char *file, unsigned int line, const char *func) { + void *p = malloc(size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +void *wrap_realloc(void *ptr, size_t size, + const char *file, unsigned int line, const char *func) { + void *p = realloc(ptr, size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +char *wrap_strdup(const char *s, + const char *file, unsigned int line, const char *func) { + char *p = strdup(s); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, strlen(s), file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} diff --git a/malloc_wrap.h b/malloc_wrap.h new file mode 
100644 index 0000000..a55876a --- /dev/null +++ b/malloc_wrap.h @@ -0,0 +1,47 @@ +#ifndef MALLOC_WRAP_H +#define MALLOC_WRAP_H + +#include /* Avoid breaking the usual definitions */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + + void *wrap_calloc(size_t nmemb, size_t size, + const char *file, unsigned int line, const char *func); + void *wrap_malloc(size_t size, + const char *file, unsigned int line, const char *func); + void *wrap_realloc(void *ptr, size_t size, + const char *file, unsigned int line, const char *func); + char *wrap_strdup(const char *s, + const char *file, unsigned int line, const char *func); + +#ifdef __cplusplus +} +#endif + +#ifdef USE_MALLOC_WRAPPERS +# ifdef calloc +# undef calloc +# endif +# define calloc(n, s) wrap_calloc( (n), (s), __FILE__, __LINE__, __func__) + +# ifdef malloc +# undef malloc +# endif +# define malloc(s) wrap_malloc( (s), __FILE__, __LINE__, __func__) + +# ifdef realloc +# undef realloc +# endif +# define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__) + +# ifdef strdup +# undef strdup +# endif +# define strdup(s) wrap_strdup( (s), __FILE__, __LINE__, __func__) + +#endif /* USE_MALLOC_WRAPPERS */ + +#endif /* MALLOC_WRAP_H */ diff --git a/pemerge.c b/pemerge.c new file mode 100644 index 0000000..725885f --- /dev/null +++ b/pemerge.c @@ -0,0 +1,291 @@ +#include +#include +#include +#include +#include +#include +#include +#include "ksw.h" +#include "kseq.h" +#include "kstring.h" +#include "bwa.h" +#include "utils.h" +KSEQ_DECLARE(gzFile) + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + +#define MAX_SCORE_RATIO 0.9f +#define MAX_ERR 8 + +static const char *err_msg[MAX_ERR+1] = { + "successful merges", + "low-scoring pairs", + "pairs where the best SW alignment is not an overlap (long left end)", + "pairs where the best SW alignment is not an overlap (long right end)", + "pairs with large 2nd best SW score", + "pairs with gapped overlap", + "pairs where the end-to-end 
alignment is inconsistent with SW", + "pairs potentially with tandem overlaps", + "pairs with high sum of errors" +}; + +typedef struct { + int a, b, q, r, w; + int q_def, q_thres; + int T; + int chunk_size; + int n_threads; + int flag; // bit 1: print merged; 2: print unmerged + int8_t mat[25]; +} pem_opt_t; + +pem_opt_t *pem_opt_init() +{ + pem_opt_t *opt; + opt = calloc(1, sizeof(pem_opt_t)); + opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; + opt->T = opt->a * 10; + opt->q_def = 20; + opt->q_thres = 70; + opt->chunk_size = 10000000; + opt->n_threads = 1; + opt->flag = 3; + bwa_fill_scmat(opt->a, opt->b, opt->mat); + return opt; +} + +int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) +{ + uint8_t *s[2], *q[2], *seq, *qual; + int i, xtra, l, l_seq, sum_q, ret = 0; + kswr_t r; + + s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq); + s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq); + for (i = 0; i < x[0].l_seq; ++i) { + int c = x[0].seq[i]; + s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; + q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def; + } + for (i = 0; i < x[1].l_seq; ++i) { + int c = x[1].seq[x[1].l_seq - 1 - i]; + c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; + s[1][i] = c < 4? 3 - c : 4; + q[1][i] = x[1].qual? 
x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def; + } + + xtra = KSW_XSTART | KSW_XSUBO; + r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); + ++r.qe; ++r.te; // change to the half-close-half-open coordinates + + if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment + if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end + if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end + if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large + if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps + + { // test tandem match; O(n^2) + int max_m, max_m2, min_l, max_l, max_l2; + max_m = max_m2 = 0; max_l = max_l2 = 0; + min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq; + for (l = 1; l < min_l; ++l) { + int m = 0, o = x[0].l_seq - l; + uint8_t *s0o = &s[0][o], *s1 = s[1]; + for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! 
+ m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] + if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; + else if (m > max_m2) max_m2 = m, max_l2 = l; + } + if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; } + if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) { + ret = -7; goto pem_ret; + } + if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } + } + + l = x[0].l_seq - (r.tb - r.qb); // length to merge + l_seq = x[0].l_seq + x[1].l_seq - l; + seq = malloc(l_seq + 1); + qual = malloc(l_seq + 1); + memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); + memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); + for (i = 0, sum_q = 0; i < l; ++i) { + int k = x[0].l_seq - l + i; + if (s[0][k] == 4) { // ambiguous + seq[k] = s[1][i]; + qual[k] = q[1][i]; + } else if (s[1][i] == 4) { // do nothing + } else if (s[0][k] == s[1][i]) { + qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i]; + } else { // s[0][k] != s[1][i] and neither is N + int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i]; + sum_q += qq >= 3? qq<<1 : 1; + seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i]; + qual[k] = abs((int)q[0][k] - (int)q[1][i]); + } + } + if (sum_q>>1 > opt->q_thres) { // too many mismatches + free(seq); free(qual); + ret = -8; goto pem_ret; + } + + for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; + seq[l_seq] = qual[l_seq] = 0; + + free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment); + memset(&x[1], 0, sizeof(bseq1_t)); + free(x[0].seq); free(x[0].qual); + x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual; + +pem_ret: + free(s[0]); free(s[1]); free(q[0]); free(q[1]); + return ret; +} + +static inline void print_bseq(const bseq1_t *s, int rn) +{ + err_putchar(s->qual? 
'@' : '>'); + err_fputs(s->name, stdout); + if (rn == 1 || rn == 2) { + err_putchar('/'); err_putchar('0' + rn); err_putchar('\n'); + } else err_fputs(" merged\n", stdout); /* err_puts() is a plain err_fputs() to stdout and appends no newline, so each record line is terminated explicitly here */ + err_fputs(s->seq, stdout); err_putchar('\n'); + if (s->qual) { + err_fputs("+\n", stdout); err_fputs(s->qual, stdout); err_putchar('\n'); + } +} + +typedef struct { + int n, start; + bseq1_t *seqs; + int64_t cnt[MAX_ERR+1]; + const pem_opt_t *opt; +} worker_t; + +void *worker(void *data) +{ + worker_t *w = (worker_t*)data; + int i; + for (i = w->start; i < w->n>>1; i += w->opt->n_threads) + ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])]; + return 0; +} + +static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1]) +{ + int i, j, n = n_>>1<<1; + worker_t *w; + + w = calloc(opt->n_threads, sizeof(worker_t)); + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + p->start = i; p->n = n; + p->opt = opt; + p->seqs = seqs; + } + if (opt->n_threads == 1) { + worker(w); + } else { + pthread_t *tid; + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + free(tid); + } + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j]; + } + free(w); + for (i = 0; i < n>>1; ++i) { + if (seqs[i<<1|1].l_seq != 0) { + if (opt->flag&2) { + print_bseq(&seqs[i<<1|0], 1); + print_bseq(&seqs[i<<1|1], 2); + } + } else if (opt->flag&1) + print_bseq(&seqs[i<<1|0], 0); + } + for (i = 0; i < n; ++i) { + bseq1_t *s = &seqs[i]; + free(s->name); free(s->seq); free(s->qual); free(s->comment); + } +} + +int main_pemerge(int argc, char *argv[]) +{ + int c, flag = 0, i, n, min_ovlp = 10; + int64_t cnt[MAX_ERR+1]; + bseq1_t *bseq; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; + pem_opt_t *opt; + + opt = pem_opt_init(); + while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { + if (c == 'm') flag |= 1; + else if (c == 'u') flag |= 2; + else if (c == 'Q') 
opt->q_thres = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg); + else if (c == 'T') min_ovlp = atoi(optarg); + else { free(opt); return 1; } /* unknown option: release opt before bailing out, as the usage path does */ + } + if (flag == 0) flag = 3; + opt->flag = flag; + opt->T = opt->a * min_ovlp; if (opt->n_threads < 1) opt->n_threads = 1; /* with fewer than one thread no worker would ever run and every pair would be left unmerged */ + + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa pemerge [-mu] <read1.fq> [read2.fq]\n\n"); + fprintf(stderr, "Options: -m output merged reads only\n"); + fprintf(stderr, " -u output unmerged reads only\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp); + fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp) { + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[optind], "-") ? argv[optind] : "stdin", + errno ? strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks = kseq_init(fp); + if (optind + 1 < argc) { + fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r"); + if (NULL == fp2) { /* test the handle that was just opened (fp2), not the first input */ + fprintf(stderr, "Couldn't open %s : %s\n", + strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin", + errno ? 
strerror(errno) : "Out of memory"); + exit(EXIT_FAILURE); + } + ks2 = kseq_init(fp2); + } + + memset(cnt, 0, 8 * (MAX_ERR+1)); + while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + process_seqs(opt, n, bseq, cnt); + free(bseq); + } + + fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); + for (i = 1; i <= MAX_ERR; ++i) + fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]); + kseq_destroy(ks); + err_gzclose(fp); + if (ks2) { + kseq_destroy(ks2); + err_gzclose(fp2); + } + free(opt); + + err_fflush(stdout); + + return 0; +} diff --git a/pssm.c b/pssm.c index 2d055b4..8d78a22 100644 --- a/pssm.c +++ b/pssm.c @@ -42,7 +42,7 @@ PSSM init_matrix(int order, int length, int alphabet_size){ // Check if longer than max-length of PSSMs if(length >= MAXPSSMSIZE) { - fprintf(stderr,"Matrix is to long."); + fprintf(stderr,"Matrix is too long: %d", length); return NULL; } @@ -137,7 +137,7 @@ PSSM init_matrix_score(int order, int length, int alphabet_size, int *scores, in char errormsg[160]; sprintf(errormsg, "Mismatch between list size (%i) and size calculated from order, length and alphabet size (%i).", nScores, pssm->offsets[length]); - fprintf(stderr,errormsg); + fprintf(stderr, "%s\n", errormsg); return NULL; } diff --git a/qualfa2fq.pl b/qualfa2fq.pl new file mode 100755 index 0000000..31e1974 --- /dev/null +++ b/qualfa2fq.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +die("Usage: qualfa2fq.pl \n") if (@ARGV != 2); + +my ($fhs, $fhq, $q); +open($fhs, ($ARGV[0] =~ /\.gz$/)? "gzip -dc $ARGV[0] |" : $ARGV[0]) || die; +open($fhq, ($ARGV[1] =~ /\.gz$/)? 
"gzip -dc $ARGV[1] |" : $ARGV[1]) || die; + +$/ = ">"; <$fhs>; <$fhq>; $/ = "\n"; +while (<$fhs>) { + $q = <$fhq>; + print "\@$_"; + $/ = ">"; + $_ = <$fhs>; $q = <$fhq>; + chomp; chomp($q); + $q =~ s/\s*(\d+)\s*/chr($1+33)/eg; + print $_, "+\n"; + for (my $i = 0; $i < length($q); $i += 60) { + print substr($q, $i, 60), "\n"; + } + $/ = "\n"; +} + +close($fhs); close($fhq); diff --git a/seq2pssm.c b/seq2pssm.c index fe72035..80a5707 100644 --- a/seq2pssm.c +++ b/seq2pssm.c @@ -154,7 +154,7 @@ PSSM string_to_pssm(ubyte_t *seq, int len, int alphsize, float match, float mism } /* This function makes a matrix */ - mat = init_matrix_score(0, len, alphsize+1, base, nScores, -0.5); + mat = init_matrix_score(0, len, alphsize+1, base, nScores, 0); return mat; } @@ -528,7 +528,7 @@ int mismatch_threshold(PSSM mat, int M) { int order = mat->order; int scorediff[MAXPSSMSIZE], t, hscore; int *scores = mat->scores; - const int infty = 1.e-100; + const int infty = 0; hscore = 0.; for (i = 0; i < mat->length; ++i) diff --git a/utils.c b/utils.c index d47ec5c..00be7f0 100644 --- a/utils.c +++ b/utils.c @@ -24,6 +24,7 @@ */ /* Contact: Heng Li */ +#define FSYNC_ON_FLUSH #include #include @@ -31,41 +32,73 @@ #include #include #include +#ifdef FSYNC_ON_FLUSH +#include +#include +#include +#endif +#include +#include #include "utils.h" +#include "ksort.h" +#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) +KSORT_INIT(128, pair64_t, pair64_lt) +KSORT_INIT(64, uint64_t, ks_lt_generic) + +#include "kseq.h" +KSEQ_INIT2(, gzFile, err_gzread) + +/******************** + * System utilities * + ********************/ + FILE *err_xopen_core(const char *func, const char *fn, const char *mode) { FILE *fp = 0; if (strcmp(fn, "-") == 0) return (strstr(mode, "r"))? stdin : stdout; if ((fp = fopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. 
Abort!\n", func, fn); - abort(); + err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); } return fp; } + FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) { if (freopen(fn, mode, fp) == 0) { - fprintf(stderr, "[%s] fail to open file '%s': ", func, fn); - perror(NULL); - fprintf(stderr, "Abort!\n"); - abort(); + err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); } return fp; } + gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) { gzFile fp; - if (strcmp(fn, "-") == 0) - return gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + if (strcmp(fn, "-") == 0) { + fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + /* According to zlib.h, this is the only reason gzdopen can fail */ + if (!fp) err_fatal(func, "Out of memory"); + return fp; + } if ((fp = gzopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); - abort(); + err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory"); } return fp; } + void err_fatal(const char *header, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "[%s] ", header); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); + exit(EXIT_FAILURE); +} + +void err_fatal_core(const char *header, const char *fmt, ...) { va_list args; va_start(args, fmt); @@ -76,7 +109,13 @@ void err_fatal(const char *header, const char *fmt, ...) 
abort(); } -void err_fatal_simple_core(const char *func, const char *msg) +void _err_fatal_simple(const char *func, const char *msg) +{ + fprintf(stderr, "[%s] %s\n", func, msg); + exit(EXIT_FAILURE); +} + +void _err_fatal_simple_core(const char *func, const char *msg) { fprintf(stderr, "[%s] %s Abort!\n", func, msg); abort(); @@ -84,65 +123,162 @@ void err_fatal_simple_core(const char *func, const char *msg) size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { - size_t ret = fwrite(ptr, size, nmemb, stream); - if (ret != nmemb) - { - err_fatal_simple_core("fwrite", strerror(errno)); - } - return ret; + size_t ret = fwrite(ptr, size, nmemb, stream); + if (ret != nmemb) + _err_fatal_simple("fwrite", strerror(errno)); + return ret; } -int err_printf(const char *format, ...) +size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = fread(ptr, size, nmemb, stream); + if (ret != nmemb) + { + _err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file"); + } + return ret; +} + +int err_gzread(gzFile file, void *ptr, unsigned int len) +{ + int ret = gzread(file, ptr, len); + + if (ret < 0) + { + int errnum = 0; + const char *msg = gzerror(file, &errnum); + _err_fatal_simple("gzread", Z_ERRNO == errnum ? strerror(errno) : msg); + } + + return ret; +} + +int err_fseek(FILE *stream, long offset, int whence) { - va_list arg; - int done; + int ret = fseek(stream, offset, whence); + if (0 != ret) + { + _err_fatal_simple("fseek", strerror(errno)); + } + return ret; +} - va_start(arg, format); - done = vfprintf(stdout, format, arg); - int saveErrno = errno; - va_end(arg); +long err_ftell(FILE *stream) +{ + long ret = ftell(stream); + if (-1 == ret) + { + _err_fatal_simple("ftell", strerror(errno)); + } + return ret; +} - if (done < 0) - { - err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); - } - return done; +int err_printf(const char *format, ...) 
+{ + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stdout, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno)); + return done; } int err_fprintf(FILE *stream, const char *format, ...) { - va_list arg; - int done; + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stream, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno)); + return done; +} + +int err_fputc(int c, FILE *stream) +{ + int ret = putc(c, stream); + if (EOF == ret) + { + _err_fatal_simple("fputc", strerror(errno)); + } - va_start(arg, format); - done = vfprintf(stream, format, arg); - int saveErrno = errno; - va_end(arg); + return ret; +} + +int err_fputs(const char *s, FILE *stream) +{ + int ret = fputs(s, stream); + if (EOF == ret) + { + _err_fatal_simple("fputs", strerror(errno)); + } - if (done < 0) - { - err_fatal_simple_core("vfprintf", strerror(saveErrno)); - } - return done; + return ret; } int err_fflush(FILE *stream) { int ret = fflush(stream); - if (ret != 0) - { - err_fatal_simple_core("fflush", strerror(errno)); - } + if (ret != 0) _err_fatal_simple("fflush", strerror(errno)); + +#ifdef FSYNC_ON_FLUSH + /* Calling fflush() ensures that all the data has made it to the + kernel buffers, but this may not be sufficient for remote filesystems + (e.g. NFS, lustre) as an error may still occur while the kernel + is copying the buffered data to the file server. To be sure of + catching these errors, we need to call fsync() on the file + descriptor, but only if it is a regular file. 
*/ + { + struct stat sbuf; + if (0 != fstat(fileno(stream), &sbuf)) + _err_fatal_simple("fstat", strerror(errno)); + + if (S_ISREG(sbuf.st_mode)) + { + if (0 != fsync(fileno(stream))) + _err_fatal_simple("fsync", strerror(errno)); + } + } +#endif return ret; } int err_fclose(FILE *stream) { - int ret = fclose(stream); - if (ret != 0) - { - err_fatal_simple_core("fclose", strerror(errno)); - } - return ret; + int ret = fclose(stream); + if (ret != 0) _err_fatal_simple("fclose", strerror(errno)); + return ret; } +int err_gzclose(gzFile file) +{ + int ret = gzclose(file); + if (Z_OK != ret) + { + _err_fatal_simple("gzclose", Z_ERRNO == ret ? strerror(errno) : zError(ret)); + } + + return ret; +} + +/********* + * Timer * + *********/ + +double cputime() +{ + struct rusage r; + getrusage(RUSAGE_SELF, &r); + return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); +} + +double realtime() +{ + struct timeval tp; + struct timezone tzp; + gettimeofday(&tp, &tzp); + return tp.tv_sec + tp.tv_usec * 1e-6; +} diff --git a/utils.h b/utils.h index a7fecbc..5ef6ac4 100644 --- a/utils.h +++ b/utils.h @@ -28,6 +28,7 @@ #ifndef LH3_UTILS_H #define LH3_UTILS_H +#include #include #include @@ -38,33 +39,73 @@ #define ATTRIBUTE(list) #endif +#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg) +#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg) - -#define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) -#define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) + +#define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg) + +typedef struct { + uint64_t x, y; +} pair64_t; + +typedef struct { size_t n, m; uint64_t *a; } uint64_v; +typedef struct { size_t n, m; pair64_t *a; } 
pair64_v; #ifdef __cplusplus extern "C" { #endif - void err_fatal(const char *header, const char *fmt, ...); - void err_fatal_simple_core(const char *func, const char *msg); + void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); + void err_fatal_core(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); + void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn)); + void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn)); FILE *err_xopen_core(const char *func, const char *fn, const char *mode); FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); + size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream); + + int err_gzread(gzFile file, void *ptr, unsigned int len); + int err_fseek(FILE *stream, long offset, int whence); +#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET) + long err_ftell(FILE *stream); int err_fprintf(FILE *stream, const char *format, ...) ATTRIBUTE((format(printf, 2, 3))); int err_printf(const char *format, ...) 
ATTRIBUTE((format(printf, 1, 2))); + int err_fputc(int c, FILE *stream); +#define err_putchar(C) err_fputc((C), stdout) + int err_fputs(const char *s, FILE *stream); +#define err_puts(S) err_fputs((S), stdout) int err_fflush(FILE *stream); int err_fclose(FILE *stream); + int err_gzclose(gzFile file); + + double cputime(); + double realtime(); + + void ks_introsort_64 (size_t n, uint64_t *a); + void ks_introsort_128(size_t n, pair64_t *a); #ifdef __cplusplus } #endif +static inline uint64_t hash_64(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return key; +} + #endif diff --git a/xa2multi.pl b/xa2multi.pl new file mode 100755 index 0000000..2409c29 --- /dev/null +++ b/xa2multi.pl @@ -0,0 +1,25 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +while (<>) { + if (/\tXA:Z:(\S+)/) { + my $l = $1; + print; + my @t = split("\t"); + while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) { + my $mchr = ($t[6] eq $1)? '=' : $t[6]; # FIXME: TLEN/ISIZE is not calculated! + my $seq = $t[9]; + my $phred = $t[10]; + # if alternative alignment has other orientation than primary, + # then print the reverse (complement) of sequence and phred string + if ((($t[1]&0x10)>0) xor ($2<0)) { + $seq = reverse $seq; + $seq =~ tr/ACGTacgt/TGCAtgca/; + $phred = reverse $phred; + } + print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, $seq, $phred, "NM:i:$4"), "\n"); + } + } else { print; } +}