From 58628ff2633251b5931fbf9f5e39206b48636932 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 20:42:51 -0500 Subject: [PATCH 001/498] first commit --- README | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 README diff --git a/README b/README new file mode 100644 index 0000000..e69de29 From 007c3eb75d8ba6fb07131226b7f9081090dced54 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 20:52:12 -0500 Subject: [PATCH 002/498] Imported from my local bwa repository, the master repository. --- COPYING | 674 ++++++++ ChangeLog | 3815 ++++++++++++++++++++++++++++++++++++++++++++ Makefile | 55 + NEWS | 528 ++++++ bamlite.c | 155 ++ bamlite.h | 94 ++ bntseq.c | 303 ++++ bntseq.h | 80 + bwa.1 | 550 +++++++ bwape.c | 795 +++++++++ bwase.c | 677 ++++++++ bwase.h | 27 + bwaseqio.c | 198 +++ bwt.c | 250 +++ bwt.h | 105 ++ bwt_gen/Makefile | 23 + bwt_gen/QSufSort.c | 496 ++++++ bwt_gen/QSufSort.h | 40 + bwt_gen/bwt_gen.c | 1547 ++++++++++++++++++ bwt_gen/bwt_gen.h | 105 ++ bwt_lite.c | 94 ++ bwt_lite.h | 29 + bwtaln.c | 339 ++++ bwtaln.h | 147 ++ bwtgap.c | 264 +++ bwtgap.h | 38 + bwtindex.c | 186 +++ bwtio.c | 77 + bwtmisc.c | 267 ++++ bwtsw2.h | 51 + bwtsw2_aux.c | 650 ++++++++ bwtsw2_chain.c | 107 ++ bwtsw2_core.c | 594 +++++++ bwtsw2_main.c | 93 ++ cs2nt.c | 191 +++ is.c | 218 +++ khash.h | 506 ++++++ kseq.h | 208 +++ ksort.h | 269 ++++ kstring.c | 35 + kstring.h | 46 + kvec.h | 90 ++ main.c | 58 + main.h | 29 + qualfa2fq.pl | 27 + simple_dp.c | 162 ++ solid2fastq.pl | 111 ++ stdaln.c | 1072 +++++++++++++ stdaln.h | 162 ++ utils.c | 72 + utils.h | 52 + 51 files changed, 16761 insertions(+) create mode 100644 COPYING create mode 100644 ChangeLog create mode 100644 Makefile create mode 100644 NEWS create mode 100644 bamlite.c create mode 100644 bamlite.h create mode 100644 bntseq.c create mode 100644 bntseq.h create mode 100644 bwa.1 create mode 100644 bwape.c create mode 100644 bwase.c create mode 100644 bwase.h create mode 100644 
bwaseqio.c create mode 100644 bwt.c create mode 100644 bwt.h create mode 100644 bwt_gen/Makefile create mode 100644 bwt_gen/QSufSort.c create mode 100644 bwt_gen/QSufSort.h create mode 100644 bwt_gen/bwt_gen.c create mode 100644 bwt_gen/bwt_gen.h create mode 100644 bwt_lite.c create mode 100644 bwt_lite.h create mode 100644 bwtaln.c create mode 100644 bwtaln.h create mode 100644 bwtgap.c create mode 100644 bwtgap.h create mode 100644 bwtindex.c create mode 100644 bwtio.c create mode 100644 bwtmisc.c create mode 100644 bwtsw2.h create mode 100644 bwtsw2_aux.c create mode 100644 bwtsw2_chain.c create mode 100644 bwtsw2_core.c create mode 100644 bwtsw2_main.c create mode 100644 cs2nt.c create mode 100644 is.c create mode 100644 khash.h create mode 100644 kseq.h create mode 100644 ksort.h create mode 100644 kstring.c create mode 100644 kstring.h create mode 100644 kvec.h create mode 100644 main.c create mode 100644 main.h create mode 100755 qualfa2fq.pl create mode 100644 simple_dp.c create mode 100755 solid2fastq.pl create mode 100644 stdaln.c create mode 100644 stdaln.h create mode 100644 utils.c create mode 100644 utils.h diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..94a9ed0 --- /dev/null +++ b/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. 
We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. 
The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. 
Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..779a31a --- /dev/null +++ b/ChangeLog @@ -0,0 +1,3815 @@ +------------------------------------------------------------------------ +r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + + * fixed a small memory leak caused by the BAM reader + * fixed a memory violation, also in the BAM reader + +------------------------------------------------------------------------ +r1559 | lh3 | 2010-12-10 00:10:48 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/Makefile + +change Makefile gcc options + +------------------------------------------------------------------------ +r1558 | lh3 | 2010-12-10 00:09:22 -0500 (Fri, 10 Dec 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-6 (r1557) + * added a little more comments to BWA-SW + * randomly choosing a mapping if there are more than one + +------------------------------------------------------------------------ +r1557 | lh3 | 2010-12-09 21:58:00 -0500 (Thu, 09 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + +sometimes unmapped reads may not be printed... 
+ +------------------------------------------------------------------------ +r1556 | lh3 | 2010-12-09 21:50:26 -0500 (Thu, 09 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + +print unmapped reads + +------------------------------------------------------------------------ +r1555 | lh3 | 2010-12-09 21:17:20 -0500 (Thu, 09 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-5 (r1555) + * BAM input documentation + +------------------------------------------------------------------------ +r1544 | lh3 | 2010-11-23 11:01:41 -0500 (Tue, 23 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-4 (r1544) + * supporting adding RG tags and RG lines + +------------------------------------------------------------------------ +r1543 | lh3 | 2010-11-23 00:16:40 -0500 (Tue, 23 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-3 (r1543) + * fixed a memory leak + +------------------------------------------------------------------------ +r1542 | lh3 | 2010-11-22 23:50:56 -0500 (Mon, 22 Nov 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.8-2 (r1542) + * fixed a long existing bug in random placement of reads + +------------------------------------------------------------------------ +r1541 | lh3 | 2010-11-22 23:27:29 -0500 (Mon, 22 Nov 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bamlite.c + A /branches/prog/bwa/bamlite.h + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + +preliminary BAM input support + 
+------------------------------------------------------------------------ +r1537 | lh3 | 2010-10-16 23:46:20 -0400 (Sat, 16 Oct 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +change version number and ChangeLog + +------------------------------------------------------------------------ +r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * fixed a bug in the scoring matrix + * release bwa-0.5.8c (r1536) + +------------------------------------------------------------------------ +r1451 | lh3 | 2010-06-15 09:43:52 -0400 (Tue, 15 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +version change + +------------------------------------------------------------------------ +r1450 | lh3 | 2010-06-15 09:42:21 -0400 (Tue, 15 Jun 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.5.8b (r1450) + * fixed a bug in scoring matrix + +------------------------------------------------------------------------ +r1445 | lh3 | 2010-06-11 08:58:33 -0400 (Fri, 11 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + +fixed a serious bug + +------------------------------------------------------------------------ +r1442 | lh3 | 2010-06-08 10:22:14 -0400 (Tue, 08 Jun 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.5.8 (r1442) + +------------------------------------------------------------------------ +r1440 | lh3 | 2010-05-19 13:43:50 -0400 (Wed, 19 May 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-r1440 + * sorry, forget to remove a debugging line + +------------------------------------------------------------------------ +r1439 | lh3 | 2010-05-19 
13:43:08 -0400 (Wed, 19 May 2010) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-r1439 + * fixed a bug in bwasw caused by a recent modification + * throwing insane insert size when estimating isize + +------------------------------------------------------------------------ +r1425 | lh3 | 2010-04-29 15:15:23 -0400 (Thu, 29 Apr 2010) | 10 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-7 (r1425) + * fixed a minor bug in bwasw command-line parsing + * When band-width is not large enough, bwasw may find two highly + overlapping but not completely overlapping alignments. The old + version will filter out one of them, which leads to false + negatives. The current outputs both. This solution is obviously not + ideal. The ideal one would be to increase the band-width and redo the + alignment. + + +------------------------------------------------------------------------ +r1399 | lh3 | 2010-04-16 09:20:49 -0400 (Fri, 16 Apr 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-6 (r1399) + * fixed a typo/bug (by Vaughn Iverson) + +------------------------------------------------------------------------ +r1329 | lh3 | 2010-03-19 23:32:46 -0400 (Fri, 19 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + +small correction + +------------------------------------------------------------------------ +r1328 | lh3 | 2010-03-19 23:28:44 -0400 (Fri, 19 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.7-4 (r1328) + * automatically adjust ap_prior based on alignment + +------------------------------------------------------------------------ +r1327 | lh3 | 2010-03-19 23:02:40 -0400 (Fri, 19 Mar 2010) | 3 lines
+Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.5.7-3 (r1327) + * evaluate hits obtained from SW alignment in a more proper way. + +------------------------------------------------------------------------ +r1320 | lh3 | 2010-03-17 15:13:22 -0400 (Wed, 17 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + +fixed a potential out-of-boundary error. Need more testing. + +------------------------------------------------------------------------ +r1319 | lh3 | 2010-03-14 22:44:46 -0400 (Sun, 14 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwape.c + +insert size is `weird' if the 3rd quatile larger than 100,000bp + +------------------------------------------------------------------------ +r1318 | lh3 | 2010-03-14 22:37:35 -0400 (Sun, 14 Mar 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.7-2 (r1318) + * in sampe, allow to disable insert size estimate + +------------------------------------------------------------------------ +r1317 | lh3 | 2010-03-14 22:14:14 -0400 (Sun, 14 Mar 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/solid2fastq.pl + + * bwa-0.5.7-1 (r1317) + * fixed a potential bug in solid2fastq.pl + * fixed a bug in calculating mapping quality (by Rodrigo Goya) + * fixed a very rare bug (if ever occur) about pairing + +------------------------------------------------------------------------ +r1310 | lh3 | 2010-03-01 10:35:45 -0500 (Mon, 01 Mar 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.5.7 + +------------------------------------------------------------------------ +r1309 | lh3 | 2010-02-26 21:42:22 -0500 (Fri, 26 Feb 2010) | 4 lines +Changed 
paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.6-2 (r1309) + * fixed an unfixed bug (by Carol Scott) + * fixed some tiny formatting + +------------------------------------------------------------------------ +r1305 | lh3 | 2010-02-25 13:47:58 -0500 (Thu, 25 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.5.6-1 (r1304) + * optionally write output to a file (by Tim Fennel) + +------------------------------------------------------------------------ +r1303 | lh3 | 2010-02-10 23:43:48 -0500 (Wed, 10 Feb 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.6 + +------------------------------------------------------------------------ +r1302 | lh3 | 2010-02-10 11:11:49 -0500 (Wed, 10 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.5-10 (r1302) + * improve max insert size estimate (method suggested by Gerton Lunter) + +------------------------------------------------------------------------ +r1301 | lh3 | 2010-02-09 16:15:28 -0500 (Tue, 09 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-9 (r1301) + * improve mapping quality calculation for abnomalous pairs + * fixed a bug in multiple hits + * SOLiD multiple hits should work now + +------------------------------------------------------------------------ +r1300 | lh3 | 2010-02-09 12:50:02 -0500 (Tue, 09 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + 
* bwa-0.5.5-8 (r1300) + * output kurtosis + +------------------------------------------------------------------------ +r1299 | lh3 | 2010-02-09 12:33:34 -0500 (Tue, 09 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-7 (r1299) + * calculate skewness in sampe + * increase min_len in SW to 20 + * perform more SW to fix discordant pairs + +------------------------------------------------------------------------ +r1298 | lh3 | 2010-02-08 12:40:31 -0500 (Mon, 08 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.5.5-6 (r1297) + * prepare to replace all 16-bit CIGAR (patches by Rodrigo Goya) + +------------------------------------------------------------------------ +r1297 | lh3 | 2010-02-05 22:26:11 -0500 (Fri, 05 Feb 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/solid2fastq.pl + +the old fix seems not working! + +------------------------------------------------------------------------ +r1296 | lh3 | 2010-02-05 21:51:03 -0500 (Fri, 05 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-5 (r1296) + * fixed a minor issue that the lower bound of insert size is not correctly set. 
+ +------------------------------------------------------------------------ +r1295 | lh3 | 2010-02-05 21:01:10 -0500 (Fri, 05 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-4 (r1295) + * fixed a memory leak + * change the behaviour of -n (samse and sampe) + * change the default of -n + +------------------------------------------------------------------------ +r1294 | lh3 | 2010-02-05 17:24:06 -0500 (Fri, 05 Feb 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.5-3 (r1294) + * improved multi-hit report + +------------------------------------------------------------------------ +r1293 | lh3 | 2010-02-05 12:57:38 -0500 (Fri, 05 Feb 2010) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/solid2fastq.pl + + * bwa-0.5.5-2 (r1293) + * bugfix: truncated quality string + * bugfix: quality -1 in solid->fastq conversion + * bugfix: color reads on the reverse strand is not complemented + +------------------------------------------------------------------------ +r1279 | lh3 | 2009-11-23 22:42:34 -0500 (Mon, 23 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwase.c + A /branches/prog/bwa/bwase.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.5-1 (r1279) + * incorporate changes from Matt Hanna for Java bindings. 
+ +------------------------------------------------------------------------ +r1275 | lh3 | 2009-11-10 22:13:10 -0500 (Tue, 10 Nov 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r1273 | lh3 | 2009-11-10 22:08:16 -0500 (Tue, 10 Nov 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + A /branches/prog/bwa/qualfa2fq.pl + +Release bwa-0.5.5 (r1273) + +------------------------------------------------------------------------ +r1272 | lh3 | 2009-11-10 22:02:50 -0500 (Tue, 10 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-3 (r1272) + * fixed another typo which may lead to incorrect single-end mapping quality + +------------------------------------------------------------------------ +r1271 | lh3 | 2009-11-10 21:59:47 -0500 (Tue, 10 Nov 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-2 (r1271) + * fixed a serious typo/bug which does not hurt if we allow one gap open + and work with <200bp reads, but causes segfault for long reads. 
+ +------------------------------------------------------------------------ +r1270 | lh3 | 2009-11-09 23:12:42 -0500 (Mon, 09 Nov 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * bwa-0.5.4-1 (r1270) + * fixed a bug in color alignment + +------------------------------------------------------------------------ +r1245 | lh3 | 2009-10-09 07:42:52 -0400 (Fri, 09 Oct 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.4 + +------------------------------------------------------------------------ +r1244 | lh3 | 2009-10-09 05:53:52 -0400 (Fri, 09 Oct 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.5.3-4 (r1244) + * output the clipped length in XC:i: tag + * skip mate alignment when stdaln is buggy + * fixed a bug in NM:i: tag + +------------------------------------------------------------------------ +r1243 | lh3 | 2009-10-07 08:15:04 -0400 (Wed, 07 Oct 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.5.3-3 (r1243) + * sampe: fixed a bug when a read sequence is identical its reverse complement. 
+ +------------------------------------------------------------------------ +r1242 | lh3 | 2009-10-07 07:49:13 -0400 (Wed, 07 Oct 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.3-2 (r1242) + * sampe: optionally preload the full index into memory + * aln: change the default seed length to 32bp + +------------------------------------------------------------------------ +r1238 | lh3 | 2009-09-26 18:38:15 -0400 (Sat, 26 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/khash.h + +Improve portability of khash.h + +------------------------------------------------------------------------ +r1228 | lh3 | 2009-09-15 09:20:22 -0400 (Tue, 15 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +fixed a typo + +------------------------------------------------------------------------ +r1227 | lh3 | 2009-09-15 09:19:35 -0400 (Tue, 15 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.5.3-1 (r1226) + * in dBWT-SW, optionally use hard clipping instead of soft clipping + +------------------------------------------------------------------------ +r1225 | lh3 | 2009-09-15 08:32:30 -0400 (Tue, 15 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.5.3 (r1225) + +------------------------------------------------------------------------ +r1223 | lh3 | 2009-09-13 07:30:41 -0400 (Sun, 13 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.2 + +------------------------------------------------------------------------ +r1222 | lh3 | 2009-09-11 09:11:39 -0400 (Fri, 11 Sep 2009) | 3
lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-5 (r1222) + * fixed a typo. No real change + +------------------------------------------------------------------------ +r1221 | lh3 | 2009-09-11 09:09:44 -0400 (Fri, 11 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.1-4 (r1221) + * trim reads before alignment + +------------------------------------------------------------------------ +r1216 | lh3 | 2009-09-08 17:50:15 -0400 (Tue, 08 Sep 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.5.1-3 (r1216) + * fixed a bug about NM tags for gapped alignment + * print SAM header + +------------------------------------------------------------------------ +r1215 | lh3 | 2009-09-08 17:14:42 -0400 (Tue, 08 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-2 (r1215) + * fixed a bug when read lengths vary (by John Marshall) + +------------------------------------------------------------------------ +r1213 | lh3 | 2009-09-06 18:58:15 -0400 (Sun, 06 Sep 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.1-1 (r1213) + * change default -T to 30 + +------------------------------------------------------------------------ +r1209 | lh3 | 2009-09-02 06:06:02 -0400 (Wed, 02 Sep 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.1 + +------------------------------------------------------------------------ +r1208 | lh3 | 2009-09-02 05:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines +Changed paths: + M 
/branches/prog/bwa/ChangeLog + + * ChangeLog + +------------------------------------------------------------------------ +r1206 | lh3 | 2009-08-30 18:27:30 -0400 (Sun, 30 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-6 (r1206) + * fixed two bugs caused by previous modification + +------------------------------------------------------------------------ +r1205 | lh3 | 2009-08-30 17:28:36 -0400 (Sun, 30 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-4 (r1205) + * reduce false coordinates and CIGAR when a query bridges two reference + sequences, although some very rare cases may fail bwa. + +------------------------------------------------------------------------ +r1204 | lh3 | 2009-08-30 06:06:16 -0400 (Sun, 30 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-3 (r1204) + * choose one repetitive hit to extend + +------------------------------------------------------------------------ +r1203 | lh3 | 2009-08-29 18:11:51 -0400 (Sat, 29 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-2 (r1203) + * dBWT-SW: change a parameter in calculating mapping quality + * fixed a bug in samse + +------------------------------------------------------------------------ +r1202 | lh3 | 2009-08-28 19:48:41 -0400 (Fri, 28 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.0-1 (r1202) + * change default band width to 50 + * improve mapping quality a bit + +------------------------------------------------------------------------ +r1200 | lh3 | 2009-08-20 06:21:24 -0400 (Thu, 20 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M 
/branches/prog/bwa/main.c + +Release bwa-0.5.0 (r1200) + +------------------------------------------------------------------------ +r1199 | lh3 | 2009-08-20 04:49:15 -0400 (Thu, 20 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +Updated ChangeLog and the manual + +------------------------------------------------------------------------ +r1198 | lh3 | 2009-08-19 11:09:15 -0400 (Wed, 19 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-36 (r1198) + * simplify duphits removal. The accuracy is changed a tiny bit, sometimes better, sometimes worse. + +------------------------------------------------------------------------ +r1197 | lh3 | 2009-08-19 08:15:05 -0400 (Wed, 19 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + A /branches/prog/bwa/bwtsw2_chain.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-35 (r1197) + * further heuristic acceleration for long queries + +------------------------------------------------------------------------ +r1196 | lh3 | 2009-08-18 06:54:03 -0400 (Tue, 18 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-34 (r1196) + * updated the manual page + * output base quality if the input is fastq + +------------------------------------------------------------------------ +r1195 | lh3 | 2009-08-18 06:23:00 -0400 (Tue, 18 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + + * bwa-0.4.9-33 (r1191) + * fixed a bug in sampe/samse when gaps occur to the 5'-end in SW alignment + * in dbwtsw adjust -T and -c according to -a + +------------------------------------------------------------------------ +r1192 | lh3 | 2009-08-13 05:37:28 
-0400 (Thu, 13 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update manual + +------------------------------------------------------------------------ +r1191 | lh3 | 2009-08-12 19:40:51 -0400 (Wed, 12 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtsw2_main.c + +update documentation + +------------------------------------------------------------------------ +r1190 | lh3 | 2009-08-12 08:56:10 -0400 (Wed, 12 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-32 (r1190) + * only help messages are changed + +------------------------------------------------------------------------ +r1189 | lh3 | 2009-08-11 09:28:55 -0400 (Tue, 11 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-31 (r1189) + * in bwape/bwase, print CIGAR "*" if the read is unmapped + * improved the calculation of mapping quality + +------------------------------------------------------------------------ +r1181 | lh3 | 2009-08-03 12:09:41 -0400 (Mon, 03 Aug 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + +fflush() + +------------------------------------------------------------------------ +r1180 | lh3 | 2009-08-03 12:08:46 -0400 (Mon, 03 Aug 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-30 (r1180) + * fixed a memory problem + * multi-threading sometimes does not work... 
+ +------------------------------------------------------------------------ +r1179 | lh3 | 2009-08-03 11:04:39 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-29 (r1179) + * preliminary multi-threading support in dbwtsw + +------------------------------------------------------------------------ +r1178 | lh3 | 2009-08-03 09:14:54 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-28 (r1178) + * fixed a bug in printing repetitive hits + +------------------------------------------------------------------------ +r1177 | lh3 | 2009-08-03 05:03:42 -0400 (Mon, 03 Aug 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-27 (r1177) + * bwtsw2: fixed a hidden memory leak + +------------------------------------------------------------------------ +r1176 | lh3 | 2009-07-31 10:58:24 -0400 (Fri, 31 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-26 + * change the way mapping quality is calculated + +------------------------------------------------------------------------ +r1175 | lh3 | 2009-07-31 09:15:54 -0400 (Fri, 31 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-25 + * code clean up + * automatically adjust ->t and ->is_rev based on input + +------------------------------------------------------------------------ +r1174 | lh3 | 2009-07-30 08:50:25 -0400 (Thu, 30 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M
/branches/prog/bwa/main.c + + * bwa-0.4.9-24 + * fixed a bug in printing the hits + +------------------------------------------------------------------------ +r1173 | lh3 | 2009-07-29 18:32:43 -0400 (Wed, 29 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-23 + * allow to skip reverse alignment + * increase opt->t to 37 + +------------------------------------------------------------------------ +r1172 | lh3 | 2009-07-29 17:22:39 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-22 + * report if the hit is found in both directions + +------------------------------------------------------------------------ +r1171 | lh3 | 2009-07-29 17:12:02 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-21 + * dbwtsw: map to both forward and reverse BWT to reduce false alignment + +------------------------------------------------------------------------ +r1170 | lh3 | 2009-07-29 15:25:14 -0400 (Wed, 29 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + +save hits before cut_tail() + +------------------------------------------------------------------------ +r1169 | lh3 | 2009-07-29 08:06:01 -0400 (Wed, 29 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-19 + * use a global memory pool to reduce the CPU time spent on malloc/free(). 
+ +------------------------------------------------------------------------ +r1168 | lh3 | 2009-07-29 06:13:29 -0400 (Wed, 29 Jul 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-18 + * reduce unnecessary extension to the 5'-end + * allow to use different interval size for the 2 rounds + * change default parameters + +------------------------------------------------------------------------ +r1167 | lh3 | 2009-07-28 19:06:17 -0400 (Tue, 28 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-17 + * dbwtsw: fixed THE memory leak. + +------------------------------------------------------------------------ +r1166 | lh3 | 2009-07-28 16:31:41 -0400 (Tue, 28 Jul 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.4.9-16 + * fixed a memory leak + * a small memory leak still occurs to bwtsw2_core(). I will work on that later. + * changed the default parameters + +------------------------------------------------------------------------ +r1165 | lh3 | 2009-07-28 10:15:40 -0400 (Tue, 28 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + + * bwa-0.4.9-15 + * generate CIGAR right before output. This saves unnecessary computation. + * this version may be buggy as I have not tested it. 
+ +------------------------------------------------------------------------ +r1164 | lh3 | 2009-07-28 09:04:14 -0400 (Tue, 28 Jul 2009) | 11 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-14 + + * deplete unique hits in dbwtsw and postprocess them with standard sw + + * in principle, this strategy should be faster and more accurate, but I + have not tested this point. I may switch back to the old method if + this does not work. + + * the code looks quite nasty now. it needs clean up... + + +------------------------------------------------------------------------ +r1163 | lh3 | 2009-07-27 17:41:10 -0400 (Mon, 27 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + +change a default parameter + +------------------------------------------------------------------------ +r1162 | lh3 | 2009-07-27 17:04:35 -0400 (Mon, 27 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-13 + * dbwtsw: switch between small and large Z-best + +------------------------------------------------------------------------ +r1161 | lh3 | 2009-07-27 12:17:41 -0400 (Mon, 27 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-12 + * changed the default -z to 100 + * heuristically speed up alignments for polyA reads + +------------------------------------------------------------------------ +r1160 | lh3 | 2009-07-27 07:50:57 -0400 (Mon, 27 Jul 2009) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + *
bwa-0.4.9-11 + + * dbwtsw potentially generates less false alignments, although in + practice, the modification brings no improvement. + + +------------------------------------------------------------------------ +r1159 | lh3 | 2009-07-27 04:37:02 -0400 (Mon, 27 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-10 + * disabled debugging code + * add "BAM_FMU" if both ends are unmapped + +------------------------------------------------------------------------ +r1158 | lh3 | 2009-07-24 09:36:52 -0400 (Fri, 24 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +nothing, really + +------------------------------------------------------------------------ +r1157 | lh3 | 2009-07-24 09:05:44 -0400 (Fri, 24 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-9 + * bwtsw2: generate SAM output + +------------------------------------------------------------------------ +r1156 | lh3 | 2009-07-24 05:42:47 -0400 (Fri, 24 Jul 2009) | 6 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-8 + + * fixed a weird deadloop which only happens to icc -O3. Thanks John + Marshall for the fix. + + +------------------------------------------------------------------------ +r1155 | lh3 | 2009-07-24 05:28:40 -0400 (Fri, 24 Jul 2009) | 8 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-7 + + * fixed a typo in bwtsw2 alignment. Now score from the standard SW + seems to agree with score from bwtsw2, except that in reporting + alignments, bwtsw2 may report non-optimal segments. This is expected, + though. I will improve in future. 
+ + +------------------------------------------------------------------------ +r1154 | lh3 | 2009-07-23 17:40:20 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * aln_left_core() seems to work properly + * aln_local_core() has a bug... AN EVER EXISTING BUG!!!!!!!!!!! + +------------------------------------------------------------------------ +r1153 | lh3 | 2009-07-23 17:06:09 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +removed debugging code... + +------------------------------------------------------------------------ +r1152 | lh3 | 2009-07-23 17:01:00 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + + * radical changes failed... + * fixed a bug + +------------------------------------------------------------------------ +r1151 | lh3 | 2009-07-23 14:46:35 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +temporary changes. Will apply some radical changes to this file... + +------------------------------------------------------------------------ +r1150 | lh3 | 2009-07-23 10:09:56 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/stdaln.c + +fixed a long-existing bug in Smith-Waterman alignment + +------------------------------------------------------------------------ +r1149 | lh3 | 2009-07-23 08:50:52 -0400 (Thu, 23 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.9-6 + * unexplained inconsistency still occurs, but the results largely look reasonable. 
+ +------------------------------------------------------------------------ +r1148 | lh3 | 2009-07-23 08:07:29 -0400 (Thu, 23 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/stdaln.c + +half DP + +------------------------------------------------------------------------ +r1147 | lh3 | 2009-07-22 08:03:06 -0400 (Wed, 22 Jul 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + +a bit code clean up + +------------------------------------------------------------------------ +r1145 | lh3 | 2009-07-21 15:52:05 -0400 (Tue, 21 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-5 + * fixed a bug in determining sub-optimal hits + * removed some debugging codes + +------------------------------------------------------------------------ +r1144 | lh3 | 2009-07-21 10:17:29 -0400 (Tue, 21 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-4 + * better cmd interface + * faster speed + +------------------------------------------------------------------------ +r1143 | lh3 | 2009-07-20 16:38:18 -0400 (Mon, 20 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + +bwtsw2 (dBWT-SW) is working apparently... 
+ + +------------------------------------------------------------------------ +r1139 | lh3 | 2009-07-15 05:52:18 -0400 (Wed, 15 Jul 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.4.9-2 + * bwtsw2: change cut_tail() such that it is faster but more likely to + miss true hits + +------------------------------------------------------------------------ +r1138 | lh3 | 2009-07-15 05:18:42 -0400 (Wed, 15 Jul 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwt_lite.c + A /branches/prog/bwa/bwt_lite.h + A /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_aux.c + A /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.4.9-1 + * added back bwtsw2 + +------------------------------------------------------------------------ +r1075 | lh3 | 2009-05-19 05:14:50 -0400 (Tue, 19 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.9 + +------------------------------------------------------------------------ +r1073 | lh3 | 2009-05-18 17:13:19 -0400 (Mon, 18 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.8 + +------------------------------------------------------------------------ +r1069 | lh3 | 2009-05-14 09:54:54 -0400 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.7-2 + * change the default of "aln -R" to 30 + +------------------------------------------------------------------------ +r1068 | lh3 | 2009-05-14 09:27:55 -0400 (Thu, 14 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.7-1 + * search for suboptimal hits if the top hit is not so repetitive + +------------------------------------------------------------------------ +r1066 | lh3 | 2009-05-12 15:31:31 -0400 (Tue, 12 May 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.7 + +------------------------------------------------------------------------ +r1065 | lh3 | 2009-05-12 15:20:40 -0400 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-9 + * fixed compiling errors on some Linux machines + +------------------------------------------------------------------------ +r1064 | lh3 | 2009-05-12 07:30:46 -0400 (Tue, 12 May 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-8 + * avoid compilation error on some systems. 
+ +------------------------------------------------------------------------ +r1035 | lh3 | 2009-05-09 05:41:33 -0400 (Sat, 09 May 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-7 + * fixed an integer overflow caused by previous modifications + * made insert size estimation more robust + +------------------------------------------------------------------------ +r1008 | lh3 | 2009-04-29 05:41:58 -0400 (Wed, 29 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.6-5 + * fixed an integer overflow problem which may cause seg fault in very rare cases + * made XN tags more accurate + +------------------------------------------------------------------------ +r1005 | lh3 | 2009-04-27 07:37:23 -0400 (Mon, 27 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.6-4 + * heuristic rules to detect suboptimal alignment + * stdsw: support double-strand and protein alignment + +------------------------------------------------------------------------ +r1003 | lh3 | 2009-04-26 12:48:19 -0400 (Sun, 26 Apr 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.6-2 + * improve the functionality of stdsw + * allow to add a threshold on SW alignment. Hope this does not incur new bugs...
+ +------------------------------------------------------------------------ +r1002 | lh3 | 2009-04-22 03:56:15 -0400 (Wed, 22 Apr 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.6-1 + * output SM and AM tag + +------------------------------------------------------------------------ +r914 | lh3 | 2009-03-09 17:53:50 -0400 (Mon, 09 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.4.6 + +------------------------------------------------------------------------ +r913 | lh3 | 2009-03-09 17:23:24 -0400 (Mon, 09 Mar 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + A /branches/prog/bwa/solid2fastq.pl + + * added notes to bwa + * added a script to convert SOLiD reads + * updated documentations + +------------------------------------------------------------------------ +r912 | lh3 | 2009-03-09 16:57:05 -0400 (Mon, 09 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/kstring.c + M /branches/prog/bwa/main.c + +fixed a bug in kstring + +------------------------------------------------------------------------ +r881 | lh3 | 2009-03-02 15:36:06 -0500 (Mon, 02 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtmisc.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-7 + * fixed a bug in pac2cspac + +------------------------------------------------------------------------ +r880 | lh3 | 2009-03-01 16:34:08 -0500 (Sun, 01 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + +disable debugging + +------------------------------------------------------------------------ +r879 | lh3 | 2009-03-01 16:28:04 -0500 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/cs2nt.c + M 
/branches/prog/bwa/main.c + + * bwa-0.4.5-6 + * fixed problems with coordinates for color gapped alignment + +------------------------------------------------------------------------ +r878 | lh3 | 2009-03-01 13:43:09 -0500 (Sun, 01 Mar 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-5 + * added support for gapped color alignment + +------------------------------------------------------------------------ +r877 | lh3 | 2009-03-01 10:27:52 -0500 (Sun, 01 Mar 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/cs2nt.c + M /branches/prog/bwa/main.c + + * convert cs read to nt read (for ungapped alignment only) + +------------------------------------------------------------------------ +r860 | lh3 | 2009-02-27 08:58:39 -0500 (Fri, 27 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwase.c + A /branches/prog/bwa/cs2nt.c + +prepare to implement cs->nt conversion (have not yet...) + +------------------------------------------------------------------------ +r859 | lh3 | 2009-02-27 07:00:03 -0500 (Fri, 27 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/bwtmisc.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.4.5-3 + * generate color index from nucleotide fasta reference + +------------------------------------------------------------------------ +r857 | lh3 | 2009-02-26 10:22:58 -0500 (Thu, 26 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.5-2 + * improved mapping quality a bit if one end falls in a tandem repeat + but the mate is unique. 
+ +------------------------------------------------------------------------ +r856 | lh3 | 2009-02-26 10:02:29 -0500 (Thu, 26 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.5-1 + * make bwa work for SOLiD reads + +------------------------------------------------------------------------ +r828 | lh3 | 2009-02-18 17:36:41 -0500 (Wed, 18 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.5 + +------------------------------------------------------------------------ +r827 | lh3 | 2009-02-18 16:48:48 -0500 (Wed, 18 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/main.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.4.4-6 + * fixed a bug in SW alignment when no residue matches + +------------------------------------------------------------------------ +r824 | lh3 | 2009-02-17 05:33:07 -0500 (Tue, 17 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-5 + * fixed that boundary bug + +------------------------------------------------------------------------ +r823 | lh3 | 2009-02-17 04:54:18 -0500 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwape.c + +just change some logging information + +------------------------------------------------------------------------ +r822 | lh3 | 2009-02-17 04:20:39 -0500 (Tue, 17 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update manual + +------------------------------------------------------------------------ +r821 | lh3 | 2009-02-17 04:11:14 -0500 (Tue, 17 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-4 + * fixed a bug on boundary check in pair_sw +
+------------------------------------------------------------------------ +r820 | lh3 | 2009-02-16 17:43:37 -0500 (Mon, 16 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-3 + * allow to change mismatch penalty + +------------------------------------------------------------------------ +r819 | lh3 | 2009-02-16 17:40:28 -0500 (Mon, 16 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.4-2 + * remove timer + * allow to change default gapo and gape penalty at the command line + +------------------------------------------------------------------------ +r818 | lh3 | 2009-02-16 09:30:51 -0500 (Mon, 16 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update benchmark + +------------------------------------------------------------------------ +r817 | lh3 | 2009-02-16 08:44:40 -0500 (Mon, 16 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/kvec.h + M /branches/prog/bwa/main.c + + * bwa-0.4.4-1 + * automatically detect insert size + * use insert size in pairing. This may potentially improve accuracy (untested!) 
+ +------------------------------------------------------------------------ +r814 | lh3 | 2009-02-15 11:10:23 -0500 (Sun, 15 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.4.4 + +------------------------------------------------------------------------ +r813 | lh3 | 2009-02-15 10:22:50 -0500 (Sun, 15 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-5 + * impose boundary check in refine_gapped + +------------------------------------------------------------------------ +r811 | lh3 | 2009-02-14 09:46:13 -0500 (Sat, 14 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-4 + * change MD tag to match the latest SAM specification + +------------------------------------------------------------------------ +r810 | lh3 | 2009-02-13 04:46:04 -0500 (Fri, 13 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r799 | lh3 | 2009-02-05 12:01:17 -0500 (Thu, 05 Feb 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + +change MD tag to meet the latest SAM specification + +------------------------------------------------------------------------ +r796 | lh3 | 2009-02-05 08:35:13 -0500 (Thu, 05 Feb 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-2 + * fixed a bug on counting 'N' + +------------------------------------------------------------------------ +r795 | lh3 | 2009-02-05 07:41:27 -0500 (Thu, 05 Feb 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.4.3-1 + * fixed potential boundary problems + * update 
benchmark result + +------------------------------------------------------------------------ +r791 | lh3 | 2009-01-25 05:20:47 -0500 (Sun, 25 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update some numbers + +------------------------------------------------------------------------ +r790 | lh3 | 2009-01-24 15:13:03 -0500 (Sat, 24 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update benchmark + +------------------------------------------------------------------------ +r789 | lh3 | 2009-01-22 10:18:44 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtindex.c + +a warning message for index + +------------------------------------------------------------------------ +r788 | lh3 | 2009-01-22 09:54:06 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/main.c + +forget to change release number + +------------------------------------------------------------------------ +r786 | lh3 | 2009-01-22 06:27:39 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + +Release bwa-0.4.3 + +------------------------------------------------------------------------ +r785 | lh3 | 2009-01-22 06:27:16 -0500 (Thu, 22 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + +Release bwa-0.4.3 + +------------------------------------------------------------------------ +r784 | lh3 | 2009-01-22 06:19:59 -0500 (Thu, 22 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-10 + * update documentation + * fixed a bug on generating MD tags for SW alignment + +------------------------------------------------------------------------ +r782 | lh3 | 2009-01-19 12:08:38 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-9 + * fixed a bug in samse -n... 
+ +------------------------------------------------------------------------ +r781 | lh3 | 2009-01-19 11:26:37 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-8 + * given -N, the previous version would stop if the top hit is a repeat. Now changed. + +------------------------------------------------------------------------ +r780 | lh3 | 2009-01-19 11:20:18 -0500 (Mon, 19 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-7 + * use a bit-wise flag to replace some member variables in the option struct + * allow to switch off the iterative strategy + +------------------------------------------------------------------------ +r779 | lh3 | 2009-01-19 10:45:57 -0500 (Mon, 19 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-6 + * allow to dump multiple hits from samse, in another format, though + +------------------------------------------------------------------------ +r778 | lh3 | 2009-01-19 06:24:29 -0500 (Mon, 19 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/kseq.h + A /branches/prog/bwa/kstring.c + A /branches/prog/bwa/kstring.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/simple_dp.c + + * bwa-0.4.2-5 + * update kseq.h to the latest version + * generate MD tag + * print mate coordinate if only one end is unmapped + +------------------------------------------------------------------------ +r775 | lh3 | 2009-01-18 05:40:35 -0500 (Sun, 18 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + 
* bwa-0.4.2-4 + * fixed a bug for SAM format + +------------------------------------------------------------------------ +r774 | lh3 | 2009-01-17 13:48:52 -0500 (Sat, 17 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-3 + * change default fnr to 0.04 + * print max_diff for valid fnr + +------------------------------------------------------------------------ +r773 | lh3 | 2009-01-17 05:54:37 -0500 (Sat, 17 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.2-2 + * automatically choose max_diff + +------------------------------------------------------------------------ +r772 | lh3 | 2009-01-16 18:16:14 -0500 (Fri, 16 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.4.2-1 + * take N as a mismatch + +------------------------------------------------------------------------ +r768 | lh3 | 2009-01-09 11:57:23 -0500 (Fri, 09 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + +Release bwa-0.4.2 + +------------------------------------------------------------------------ +r759 | lh3 | 2009-01-07 09:55:43 -0500 (Wed, 07 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + +Release bwa-0.4.1 + +------------------------------------------------------------------------ +r758 | lh3 | 2009-01-07 05:36:06 -0500 (Wed, 07 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M 
/branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.0-2 + * make mate_sw fully working + +------------------------------------------------------------------------ +r757 | lh3 | 2009-01-06 18:04:29 -0500 (Tue, 06 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.4.0-1 + * do SW alignment for unmapped mate. It is working. + * I still need to do some extra work for SW alignment, but it is too late + and I am getting tired... I will do tomorrow. + +------------------------------------------------------------------------ +r755 | lh3 | 2009-01-06 10:23:29 -0500 (Tue, 06 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.4.0 + +------------------------------------------------------------------------ +r754 | lh3 | 2009-01-06 07:45:02 -0500 (Tue, 06 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.3.0-12 + * better lock + +------------------------------------------------------------------------ +r753 | lh3 | 2009-01-06 06:17:21 -0500 (Tue, 06 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-11 + * fixed a small memory leak in bwa_seq_close() + * fixed "uninitialized memory" from bwt_aln1_t + * multithreading for "aln" command + +------------------------------------------------------------------------ +r752 | lh3 | 2009-01-05 17:34:13 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + D 
/branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwt_gen/bwt_gen.c + A /branches/prog/bwa/bwtmisc.c (from /branches/prog/bwa/pac2bwt.c:748) + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + D /branches/prog/bwa/pac2bwt.c + + * bwa-0.3.0-10 + * a little bit code clean up + +------------------------------------------------------------------------ +r751 | lh3 | 2009-01-05 17:19:04 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-9 + * use 64-bit integer to speed up Occ calculate, although just a little bit + +------------------------------------------------------------------------ +r750 | lh3 | 2009-01-05 16:44:26 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-8 + * a little bit code cleanup + +------------------------------------------------------------------------ +r749 | lh3 | 2009-01-05 16:37:28 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-7 + * accelerate Occ calculation + +------------------------------------------------------------------------ +r748 | lh3 | 2009-01-05 16:12:28 -0500 (Mon, 05 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * bwa-0.3.0-6 + * put occ table along with bwt to save another cache miss + * this version is already faster than the previous and I can still improve it... 
+ +------------------------------------------------------------------------ +r747 | lh3 | 2009-01-05 10:16:18 -0500 (Mon, 05 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-5 + * remove occ_major to save a cache miss; however, OCC_INTERVAL has to be + increased to keep the same memory. As a result, the speed is a little + slower in fact. + +------------------------------------------------------------------------ +r746 | lh3 | 2009-01-05 09:50:53 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-4 + * added back optimization codes (it is a pain...) + +------------------------------------------------------------------------ +r745 | lh3 | 2009-01-05 08:23:00 -0500 (Mon, 05 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-3 + * faster bit operations + +------------------------------------------------------------------------ +r744 | lh3 | 2009-01-05 05:58:46 -0500 (Mon, 05 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-2 + * removed optimization codes again... 
+ * use a new method to count the bits + +------------------------------------------------------------------------ +r743 | lh3 | 2009-01-04 17:18:38 -0500 (Sun, 04 Jan 2009) | 5 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.3.0-1 + * added back the optimization codes + * added a new option to aln: max_entries, although this is disabled by default + * updated benchmark + +------------------------------------------------------------------------ +r742 | lh3 | 2009-01-04 07:56:12 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +add URL + +------------------------------------------------------------------------ +r740 | lh3 | 2009-01-04 07:39:43 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.3.0 + +------------------------------------------------------------------------ +r739 | lh3 | 2009-01-04 06:55:06 -0500 (Sun, 04 Jan 2009) | 2 lines +Changed paths: + A /branches/prog/bwa/COPYING + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +added licensing information + +------------------------------------------------------------------------ +r738 | lh3 | 2009-01-04 06:18:25 -0500 (Sun, 04 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-31 + * better mapping quality + * update benchmark + +------------------------------------------------------------------------ +r737 | lh3 | 2009-01-03 16:00:58 -0500 (Sat, 03 Jan 
2009) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r736 | lh3 | 2009-01-02 10:26:38 -0500 (Fri, 02 Jan 2009) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r735 | lh3 | 2009-01-02 07:10:20 -0500 (Fri, 02 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-30 + * reduce memory a little bit + * update documentation + +------------------------------------------------------------------------ +r734 | lh3 | 2009-01-01 13:45:45 -0500 (Thu, 01 Jan 2009) | 8 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-29 + * sampe: removed -O option; changed default -o to 100000 + * sampe: fixed a bug in calculating paired mapping quality + * aln: added an option to search for suboptimal hits even if the best is a repeat. + This option will make sampe MUCH SLOWER. 
+ * sampe: set isize as zero if mapped to two different chr + * update manual (unfinished) + +------------------------------------------------------------------------ +r733 | lh3 | 2009-01-01 11:01:20 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-28 + * fixed a bug in calculating paired mapping quality + +------------------------------------------------------------------------ +r732 | lh3 | 2009-01-01 09:27:46 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + A /branches/prog/bwa/khash.h (from /branches/prog/sclib/khash/khash.h:675) + M /branches/prog/bwa/main.c + + * bwa-0.2.0-27 + * accelerate sampe by storing visited large intervals + +------------------------------------------------------------------------ +r731 | lh3 | 2009-01-01 06:51:21 -0500 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-26 + * remove the optimization codes + +------------------------------------------------------------------------ +r730 | lh3 | 2009-01-01 06:48:59 -0500 (Thu, 01 Jan 2009) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-25 + * accelerate OCC calculation by ~7%. However, it seems not worth doing + this by complicating the code. I will change back later. 
+ +------------------------------------------------------------------------ +r729 | lh3 | 2008-12-31 16:43:56 -0500 (Wed, 31 Dec 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-24 + * change command "sai2sam_pe" to "sampe" + * print usage for sampe command + * in sampe: change default max_occ to 1000 + * fixed a few compiling warnings in bntseq.c + +------------------------------------------------------------------------ +r728 | lh3 | 2008-12-27 07:14:59 -0500 (Sat, 27 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-22 + * mating information can be printed to SAM + +------------------------------------------------------------------------ +r727 | lh3 | 2008-12-26 18:10:59 -0500 (Fri, 26 Dec 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-21 + * implement pairing (still UNFINISHED) + * output all reads even if full of N + +------------------------------------------------------------------------ +r726 | lh3 | 2008-12-26 13:31:27 -0500 (Fri, 26 Dec 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.2.0-20 + * remove "-t" from aln cmd + * code clean up: move some functions in bwt2fmv.c to other source files + * added sai2sam_pe cmd: *UNFINISHED* + +------------------------------------------------------------------------ +r725 | lh3 | 2008-12-26 07:04:11 -0500 (Fri, 26 Dec 2008) | 3 lines +Changed 
paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwase.c + A /branches/prog/bwa/bwaseqio.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/kseq.h + A /branches/prog/bwa/ksort.h (from /branches/prog/sclib/ksort/ksort.h:712) + A /branches/prog/bwa/kvec.h (from /branches/prog/sclib/kvec/kvec.h:537) + M /branches/prog/bwa/main.c + + * bwa-0.2.0-19 + * considerable code cleanup; no actual changes + +------------------------------------------------------------------------ +r724 | lh3 | 2008-12-25 11:32:11 -0500 (Thu, 25 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-18 + * generate SAM output + +------------------------------------------------------------------------ +r723 | lh3 | 2008-12-25 10:48:31 -0500 (Thu, 25 Dec 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.2.0-17 + * remove bwtsw2 related codes + * separate searching for SA interval from generating alignments + +------------------------------------------------------------------------ +r722 | lh3 | 2008-12-25 08:57:13 -0500 (Thu, 25 Dec 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + D /branches/prog/bwa/bwt_lite.c + D /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtgap.c + D /branches/prog/bwa/bwtsw2.h + D /branches/prog/bwa/bwtsw2_aux.c + D /branches/prog/bwa/bwtsw2_core.c + D /branches/prog/bwa/bwtsw2_main.c + D /branches/prog/bwa/khash.h + D /branches/prog/bwa/ksort.h + D /branches/prog/bwa/kvec.h + M /branches/prog/bwa/main.c + + * added interface to "aln -t" + * remove bwtsw2 related codes + +------------------------------------------------------------------------ +r666 | lh3 | 2008-11-18 18:34:29 -0500 (Tue, 18 Nov 
2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-16 + * allow to set max mismatches based on read length, but I do not know + whether this really works + +------------------------------------------------------------------------ +r665 | lh3 | 2008-11-18 08:34:03 -0500 (Tue, 18 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-15 + * fixed a bug in sequence parser. + +------------------------------------------------------------------------ +r612 | lh3 | 2008-10-28 06:50:53 -0400 (Tue, 28 Oct 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/utils.c + + * bwa-0.2.0-14 + * fixed a bug caused by the change of the FASTA/Q parser + +------------------------------------------------------------------------ +r611 | lh3 | 2008-10-28 06:24:56 -0400 (Tue, 28 Oct 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/kseq.h + D /branches/prog/bwa/seq.c + D /branches/prog/bwa/seq.h + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +replace seq.* with kseq.h + +------------------------------------------------------------------------ +r610 | lh3 | 2008-10-27 13:00:04 -0400 (Mon, 27 Oct 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-13 + * make bwtsw2 output sub-optimal hits. 
not completed + +------------------------------------------------------------------------ +r609 | lh3 | 2008-10-24 16:52:00 -0400 (Fri, 24 Oct 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/kvec.h + +little... + +------------------------------------------------------------------------ +r532 | lh3 | 2008-09-19 05:28:45 -0400 (Fri, 19 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/khash.h + +improve interface of khash + +------------------------------------------------------------------------ +r531 | lh3 | 2008-09-18 06:52:59 -0400 (Thu, 18 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +improve minor things, which make bwtsw2 slower, but should miss less true hits + +------------------------------------------------------------------------ +r530 | lh3 | 2008-09-17 18:19:26 -0400 (Wed, 17 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * fixed a bug in calculating ->D + * enforce band-width checking + +------------------------------------------------------------------------ +r529 | lh3 | 2008-09-17 18:06:49 -0400 (Wed, 17 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +delete a line of code that is never visited + +------------------------------------------------------------------------ +r528 | lh3 | 2008-09-17 17:58:51 -0400 (Wed, 17 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +a bit code clean up + +------------------------------------------------------------------------ +r527 | lh3 | 2008-09-17 10:55:45 -0400 (Wed, 17 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-12 + * max-depth can be set, although it does not help the speed at all + +------------------------------------------------------------------------ +r526 | lh3 | 2008-09-16 17:59:36 -0400 (Tue, 16 Sep 2008) | 2 lines +Changed 
paths: + M /branches/prog/bwa/bwtsw2_core.c + +cut_tail after remove duplicate + +------------------------------------------------------------------------ +r525 | lh3 | 2008-09-16 17:56:11 -0400 (Tue, 16 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + M /branches/prog/bwa/main.c + + * bwa-0.2.0-11 + * improved cut_tail() + +------------------------------------------------------------------------ +r524 | lh3 | 2008-09-15 16:53:22 -0400 (Mon, 15 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-10 + * fixed a bug in cut_tail() + +------------------------------------------------------------------------ +r518 | lh3 | 2008-09-15 04:35:59 -0400 (Mon, 15 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +a bit code clean up + +------------------------------------------------------------------------ +r517 | lh3 | 2008-09-14 18:18:11 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +improve speed (<1%) + +------------------------------------------------------------------------ +r516 | lh3 | 2008-09-14 18:08:55 -0400 (Sun, 14 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * fixed two potential bugs, although I have not seen their effects + * improve speed a bit (<2%) + +------------------------------------------------------------------------ +r515 | lh3 | 2008-09-14 17:26:49 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + +nothing, really + +------------------------------------------------------------------------ +r514 | lh3 | 2008-09-14 17:10:13 -0400 (Sun, 14 Sep 2008) | 2 lines +Changed paths: + M 
/branches/prog/bwa/bwtsw2_core.c + +disable X-drop, which has to be reimplemented in the current algorithm + +------------------------------------------------------------------------ +r513 | lh3 | 2008-09-14 16:49:42 -0400 (Sun, 14 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + + * temporarily disable cut_tail() + * calculate SA in bwt_lite.c + * fixed a bug in reversing the sequence + +------------------------------------------------------------------------ +r512 | lh3 | 2008-09-13 17:35:40 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + A /branches/prog/bwa/ksort.h + +n-best method + +------------------------------------------------------------------------ +r507 | lh3 | 2008-09-13 09:06:54 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + +give correct result again + +------------------------------------------------------------------------ +r506 | lh3 | 2008-09-13 08:12:07 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +I think I know the reason. It needs more work... 
+ +------------------------------------------------------------------------ +r505 | lh3 | 2008-09-13 06:20:43 -0400 (Sat, 13 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + +fixed another bug, but still have + +------------------------------------------------------------------------ +r504 | lh3 | 2008-09-12 18:13:37 -0400 (Fri, 12 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +fixed another bug + +------------------------------------------------------------------------ +r503 | lh3 | 2008-09-12 17:15:56 -0400 (Fri, 12 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/khash.h + + * do not segfault, but the result is WRONG! + * prepare to remove bsw2_connectivity_check() + +------------------------------------------------------------------------ +r502 | lh3 | 2008-09-12 15:52:41 -0400 (Fri, 12 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/kvec.h + +more revisions + +------------------------------------------------------------------------ +r501 | lh3 | 2008-09-11 18:06:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +further simplify code with kvec.h + +------------------------------------------------------------------------ +r500 | lh3 | 2008-09-11 17:42:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +part of revisions... 
have not finished + +------------------------------------------------------------------------ +r499 | lh3 | 2008-09-11 17:24:15 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + A /branches/prog/bwa/kvec.h + +prepare for abrupt change + +------------------------------------------------------------------------ +r496 | lh3 | 2008-09-11 10:34:38 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +fixed a bug; now "bwtsw2 -d" is useless + +------------------------------------------------------------------------ +r495 | lh3 | 2008-09-11 09:22:03 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/simple_dp.c + M /branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + +improve speed a little bit + +------------------------------------------------------------------------ +r494 | lh3 | 2008-09-11 08:28:08 -0400 (Thu, 11 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +remove debug codes + +------------------------------------------------------------------------ +r493 | lh3 | 2008-09-11 07:49:53 -0400 (Thu, 11 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * improve the speed a little bit (<5%) + * prepare to remove BSW_DEBUG + +------------------------------------------------------------------------ +r492 | lh3 | 2008-09-11 06:15:56 -0400 (Thu, 11 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-9 + * support reverse strand + * fixed a bug that causes missing hits + +------------------------------------------------------------------------ +r491 | lh3 | 2008-09-11 05:46:16 -0400 (Thu, 11 Sep 2008) | 3 lines 
+Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-8 + * better progress report + +------------------------------------------------------------------------ +r490 | lh3 | 2008-09-10 17:04:49 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-7 + * avoid some missing hits + * add maximum depth + +------------------------------------------------------------------------ +r489 | lh3 | 2008-09-10 11:51:13 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-6 + * bwtsw2 works although on the forward strand only for now + * better progress information + +------------------------------------------------------------------------ +r488 | lh3 | 2008-09-10 10:21:53 -0400 (Wed, 10 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + + * implement memory pool + * avoid some rehashing + +------------------------------------------------------------------------ +r487 | lh3 | 2008-09-10 09:23:38 -0400 (Wed, 10 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_main.c + + * fixed a memory leak + * prepare to implement mempool + +------------------------------------------------------------------------ +r486 | lh3 | 2008-09-10 09:10:09 -0400 (Wed, 10 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/khash.h + + * add X-dropoff + * remove duplicated results + * switch to simple stack + +------------------------------------------------------------------------ +r485 | lh3 | 2008-09-10 06:31:20 -0400 (Wed, 10 Sep 2008) | 3 lines 
+Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + + * check whether t-node has been visited + * prepare to remove two-level stack + +------------------------------------------------------------------------ +r484 | lh3 | 2008-09-10 05:00:57 -0400 (Wed, 10 Sep 2008) | 2 lines +Changed paths: + A /branches/prog/bwa/khash.h + +khash library + +------------------------------------------------------------------------ +r483 | lh3 | 2008-09-10 04:22:53 -0400 (Wed, 10 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +add inline + +------------------------------------------------------------------------ +r482 | lh3 | 2008-09-09 16:34:57 -0400 (Tue, 09 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + +improve speed + +------------------------------------------------------------------------ +r481 | lh3 | 2008-09-09 13:13:00 -0400 (Tue, 09 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_core.c + +Use a 128bit hash table to keep all (tk,tl,qk,ql). This is slow. Just +keep a copy in case I may need this in future. 
+ + +------------------------------------------------------------------------ +r480 | lh3 | 2008-09-09 12:53:32 -0400 (Tue, 09 Sep 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_core.c + + * no principal modification + +------------------------------------------------------------------------ +r479 | lh3 | 2008-09-09 11:01:45 -0400 (Tue, 09 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2_core.c + + * fixed a bug which may cause duplicated matching + * accelerate the speed a bit, although using hash in avoiding duplications + slows the speed down in the end + +------------------------------------------------------------------------ +r474 | lh3 | 2008-09-03 17:22:57 -0400 (Wed, 03 Sep 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtsw2.h + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-5 + * indel seems to work on toy example + * add band + +------------------------------------------------------------------------ +r469 | lh3 | 2008-09-01 09:18:45 -0400 (Mon, 01 Sep 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/bwtsw2_core.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/is.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/simple_dp.c + + * bwa-0.2.0-4 + * updated bwtsw2, which seems to work properly on toy examples + +------------------------------------------------------------------------ +r447 | lh3 | 2008-08-27 10:05:09 -0400 (Wed, 27 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M 
/branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-3 + * tune for longer gaps, but it does not really work with kilo-bp gaps... + +------------------------------------------------------------------------ +r446 | lh3 | 2008-08-26 13:30:41 -0400 (Tue, 26 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-2 + * changed the way to extend long deletions. Now use max_del_occ. + +------------------------------------------------------------------------ +r445 | lh3 | 2008-08-26 13:05:58 -0400 (Tue, 26 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt_lite.c + M /branches/prog/bwa/bwt_lite.h + +updated from bwtsw2_lite + +------------------------------------------------------------------------ +r436 | lh3 | 2008-08-23 12:28:44 -0400 (Sat, 23 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt_lite.c + A /branches/prog/bwa/bwt_lite.h + A /branches/prog/bwa/bwtsw2.h + A /branches/prog/bwa/bwtsw2_core.c + A /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + + * bwa-0.2.0-1 + * add bwt_lite: a light-weighted version of bwt (NOT TESTED!) + * add core codes for bwtsw2: NOT TESTED!!! 
+ +------------------------------------------------------------------------ +r427 | lh3 | 2008-08-15 05:38:12 -0400 (Fri, 15 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + +Release bwa-0.2.0 + +------------------------------------------------------------------------ +r426 | lh3 | 2008-08-14 11:26:19 -0400 (Thu, 14 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + + * bwa-0.1.6-7 + * change default seed length to 31 + * add incomplete support to color sequences (not tested yet!) + +------------------------------------------------------------------------ +r425 | lh3 | 2008-08-14 06:23:11 -0400 (Thu, 14 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-6 + * change default seed length to 33bp + +------------------------------------------------------------------------ +r424 | lh3 | 2008-08-14 05:55:33 -0400 (Thu, 14 Aug 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-5 + * fixed a bug that may miss true alignments. this bug exists in most + early versions. + * fixed a bug that yields wrong coordinates for reads mapped on the forward + strands with gaps. + +------------------------------------------------------------------------ +r423 | lh3 | 2008-08-14 04:07:28 -0400 (Thu, 14 Aug 2008) | 2 lines +Changed paths: + D /branches/prog/bwa/Makefile.div + +useless + +------------------------------------------------------------------------ +r422 | lh3 | 2008-08-13 19:21:14 -0400 (Wed, 13 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-4 + * fixed one bug + * there is another one... 
+ +------------------------------------------------------------------------ +r421 | lh3 | 2008-08-13 18:23:33 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + + * bwa-0.1.6-3 + * almost there, but not quite right + +------------------------------------------------------------------------ +r419 | lh3 | 2008-08-13 17:27:02 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * improve the seeding method + * prepare to load two BWTs into memory. A BIG change! + +------------------------------------------------------------------------ +r418 | lh3 | 2008-08-13 10:56:54 -0400 (Wed, 13 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * added seeding + * unfinished yet + +------------------------------------------------------------------------ +r413 | lh3 | 2008-08-08 11:48:35 -0400 (Fri, 08 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.6 + +------------------------------------------------------------------------ +r410 | lh3 | 2008-08-06 15:48:22 -0400 (Wed, 06 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/simple_dp.c + +sw: output alignment score + +------------------------------------------------------------------------ +r407 | lh3 | 2008-08-04 10:01:20 -0400 (Mon, 04 Aug 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + A /branches/prog/bwa/simple_dp.c + M 
/branches/prog/bwa/stdaln.c + M /branches/prog/bwa/stdaln.h + + * bwa-0.1.5-3 + * added a simple interface to SW/NW alignment + * stdaln-0.9.8 (see header for more details) + +------------------------------------------------------------------------ +r406 | lh3 | 2008-08-01 19:21:59 -0400 (Fri, 01 Aug 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + A /branches/prog/bwa/stdaln.c + A /branches/prog/bwa/stdaln.h + + * bwa-0.1.5-2 + * give accurate gap positions + +------------------------------------------------------------------------ +r405 | lh3 | 2008-08-01 19:06:19 -0400 (Fri, 01 Aug 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + +unfinished, but I am tired... + +------------------------------------------------------------------------ +r401 | lh3 | 2008-07-30 05:59:24 -0400 (Wed, 30 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + + * bwa-0.1.5-1 + * fixed a potential bug which may produce an alignment in N regions, + although extremely rare. 
+ +------------------------------------------------------------------------ +r399 | lh3 | 2008-07-27 11:41:52 -0400 (Sun, 27 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.5 + +------------------------------------------------------------------------ +r398 | lh3 | 2008-07-25 12:14:47 -0400 (Fri, 25 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r397 | lh3 | 2008-07-25 09:58:56 -0400 (Fri, 25 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * + +------------------------------------------------------------------------ +r396 | lh3 | 2008-07-25 06:42:01 -0400 (Fri, 25 Jul 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.4-4 + * add timer for debugging + +------------------------------------------------------------------------ +r395 | lh3 | 2008-07-24 05:46:21 -0400 (Thu, 24 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.4-3 + * fixed a bug in the previous code + * this version gives identical result to bwa-0.1.4, just 10% faster + +------------------------------------------------------------------------ +r394 | lh3 | 2008-07-24 05:18:53 -0400 (Thu, 24 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.1.4-2 + * further improve the speed + * The result is slightly different from bwa-0.1.4 now. I need to check... 
+ +------------------------------------------------------------------------ +r393 | lh3 | 2008-07-23 12:04:16 -0400 (Wed, 23 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + +comments only + +------------------------------------------------------------------------ +r392 | lh3 | 2008-07-23 10:34:03 -0400 (Wed, 23 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + +further improve the speed in Occ functions + +------------------------------------------------------------------------ +r386 | lh3 | 2008-07-22 10:03:54 -0400 (Tue, 22 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/main.c + +Release bwa-0.1.4 + +------------------------------------------------------------------------ +r385 | lh3 | 2008-07-22 09:44:50 -0400 (Tue, 22 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + +update documentation and ChangeLog + +------------------------------------------------------------------------ +r384 | lh3 | 2008-07-22 08:50:03 -0400 (Tue, 22 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.3-2 + * fixed the bug in the last modification + * now the alignment should be more clearly defined + +------------------------------------------------------------------------ +r383 | lh3 | 2008-07-21 18:32:21 -0400 (Mon, 21 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.3-1 + * this is a buggy verion! + * i will fix the bug tomorrow. It is late... 
+ +------------------------------------------------------------------------ +r381 | lh3 | 2008-07-21 06:45:32 -0400 (Mon, 21 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.3 + +------------------------------------------------------------------------ +r380 | lh3 | 2008-07-21 06:07:43 -0400 (Mon, 21 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-3 + * improve the speed for gcc on Intel Mac OS X, but not really on icc on Linux + * aln: more command-line options + +------------------------------------------------------------------------ +r373 | lh3 | 2008-07-17 09:09:46 -0400 (Thu, 17 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-2 + * further improve the speed + * this version gives exactly the same result as bwa-0.1.2 + +------------------------------------------------------------------------ +r372 | lh3 | 2008-07-17 07:51:08 -0400 (Thu, 17 Jul 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/main.c + + * bwa-0.1.2-1 + * speed up by about 5% + +------------------------------------------------------------------------ +r370 | lh3 | 2008-07-17 05:12:00 -0400 (Thu, 17 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.2 + +------------------------------------------------------------------------ +r368 | lh3 | 2008-07-16 08:51:25 -0400 (Wed, 16 Jul 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + D /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M 
/branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + D /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-9 + * some code cleanup + * remove 1away and top2 + +------------------------------------------------------------------------ +r367 | lh3 | 2008-07-16 08:24:34 -0400 (Wed, 16 Jul 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/is.c + +Yuta Mori's implementation of IS algorithm. + +------------------------------------------------------------------------ +r365 | lh3 | 2008-07-16 06:58:04 -0400 (Wed, 16 Jul 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwtgap.h + M /branches/prog/bwa/main.c + + * bwa-0.1.1-8 + * improve gapped alignment + * this version will miss more gapped alignments, but the speed is much faster + * prepare to remove top2 and 1away algorithms + * prepare to add SAIS algorithm for bwt construction + +------------------------------------------------------------------------ +r358 | lh3 | 2008-06-09 06:03:04 -0400 (Mon, 09 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-7 + * change END_SKIP from 3 to 5, but still gaps may be wrongly added + * change default '-g' from 5 to 3 + +------------------------------------------------------------------------ +r357 | lh3 | 2008-06-09 05:18:36 -0400 (Mon, 09 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-6 + * fix a bug in nested stack + +------------------------------------------------------------------------ +r356 | lh3 | 2008-06-08 18:43:13 -0400 (Sun, 08 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + A /branches/prog/bwa/bwtgap.h + M 
/branches/prog/bwa/main.c + + * bwa-0.1.1-5 + * replace heap with nested stacks + * there are still obvious bugs... + +------------------------------------------------------------------------ +r355 | lh3 | 2008-06-08 17:13:44 -0400 (Sun, 08 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * bwa-0.1.1-4 + * add interface to affine gap alignment + * there are obvious bugs and I will fix them later + +------------------------------------------------------------------------ +r354 | lh3 | 2008-06-08 15:39:05 -0400 (Sun, 08 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-3 + * affine gap seems to work, at least partially + +------------------------------------------------------------------------ +r353 | lh3 | 2008-06-08 09:27:18 -0400 (Sun, 08 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + A /branches/prog/bwa/bwtgap.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-2 + * initial gapped alignment. 
not work at the moment + +------------------------------------------------------------------------ +r352 | lh3 | 2008-06-06 04:37:34 -0400 (Fri, 06 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.1-1 + * ungap: remove a useless variable in top2_entry_t + +------------------------------------------------------------------------ +r348 | lh3 | 2008-06-03 09:04:12 -0400 (Tue, 03 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + A /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.1.1 + +------------------------------------------------------------------------ +r347 | lh3 | 2008-06-03 05:45:08 -0400 (Tue, 03 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +update documentation + +------------------------------------------------------------------------ +r346 | lh3 | 2008-06-02 18:59:50 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + A /branches/prog/bwa/ChangeLog + A /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-11 + * improve approximating mapping qualities + * add documentation + * add ChangeLog + +------------------------------------------------------------------------ +r345 | lh3 | 2008-06-02 16:04:39 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-10 + * output a random position for repetitive reads + +------------------------------------------------------------------------ +r344 | lh3 | 2008-06-02 15:03:54 -0400 (Mon, 02 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/pac2bwt.c + + * bwa-0.1.0-9 + * fix memory leaks + * fix a potential bug in converting to the real coordinate + 
+------------------------------------------------------------------------ +r343 | lh3 | 2008-06-02 13:44:51 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-8 + * fix a bug about strand + * update Makefile.div + * change top2b as the default method + +------------------------------------------------------------------------ +r342 | lh3 | 2008-06-02 11:23:26 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-7 + * use bwt_2occ() and bwt_2occ4() in other functions + +------------------------------------------------------------------------ +r341 | lh3 | 2008-06-02 09:31:39 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-6 + * fix a bug for missing hits + +------------------------------------------------------------------------ +r340 | lh3 | 2008-06-02 09:10:18 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-5 + * accelerate comparisons in heap, a bit + +------------------------------------------------------------------------ +r339 | lh3 | 2008-06-02 08:41:31 -0400 (Mon, 02 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-4 + * avoid marginal repeated calculation in occ + +------------------------------------------------------------------------ +r338 | lh3 | 2008-06-02 06:46:51 -0400 (Mon, 02 Jun 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M 
/branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-3 + * fix a bug caused by previous change + * fix a bug in heap + * order the heap by more criteria + +------------------------------------------------------------------------ +r337 | lh3 | 2008-06-01 19:11:15 -0400 (Sun, 01 Jun 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + + * bwa-0.1.0-2 + * also sort sa range in heapsort, in attempt to improve cache performance. + Unfortunately, it does not work well at all. + +------------------------------------------------------------------------ +r336 | lh3 | 2008-06-01 17:45:23 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/main.c + + * 0.1.0-1 + * fix a bug in calculating the real coordinate + +------------------------------------------------------------------------ +r335 | lh3 | 2008-06-01 16:03:09 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + +nothing, really + +------------------------------------------------------------------------ +r334 | lh3 | 2008-06-01 15:59:13 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/Makefile.div + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/pac2bwt.c + +use IS algorithm by default + +------------------------------------------------------------------------ +r333 | lh3 | 2008-06-01 15:05:15 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/is.c + M /branches/prog/bwa/pac2bwt.c + + * a bit code clean up in is.c + * add IS algorithm for constructing BWT, albeit slower + +------------------------------------------------------------------------ +r332 | lh3 | 2008-06-01 13:23:08 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + A 
/branches/prog/bwa/is.c + +IS linear-time algorithm for constructing SA/BWT + +------------------------------------------------------------------------ +r331 | lh3 | 2008-06-01 10:35:26 -0400 (Sun, 01 Jun 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/bwtindex.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * fix a bug in generating .pac + * index in one go + +------------------------------------------------------------------------ +r330 | lh3 | 2008-06-01 09:17:05 -0400 (Sun, 01 Jun 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwttop2.c + +real coordinates can be ouput + +------------------------------------------------------------------------ +r329 | lh3 | 2008-05-31 19:21:02 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwttop2.c + +add top2e which is similar to 1away + +------------------------------------------------------------------------ +r328 | lh3 | 2008-05-31 18:46:12 -0400 (Sat, 31 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwttop2.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * unified cmd-line interface for ungapped alignment + * add two alternatives to top2 algorithm + +------------------------------------------------------------------------ +r327 | lh3 | 2008-05-31 18:14:46 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +add cmd-line interface to alntop2 + +------------------------------------------------------------------------ +r326 | lh3 | 2008-05-31 17:59:31 -0400 (Sat, 
31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + A /branches/prog/bwa/bwttop2.c + +top2 algorithm seems to work. I need to change interface, though + +------------------------------------------------------------------------ +r325 | lh3 | 2008-05-31 15:11:49 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt1away.c + +change the variable in the structure + +------------------------------------------------------------------------ +r324 | lh3 | 2008-05-31 14:52:13 -0400 (Sat, 31 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt1away.c + +set a slightly better bound on the maximum allowed mismatches + +------------------------------------------------------------------------ +r323 | lh3 | 2008-05-30 18:40:21 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + + * output time statistics + +------------------------------------------------------------------------ +r322 | lh3 | 2008-05-30 17:58:25 -0400 (Fri, 30 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt1away.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + + * presumably better way to make use of prefix. But for the moment I do + not know whether it is correct or not. 
+ * a bit code clean up: separate alignment part + +------------------------------------------------------------------------ +r321 | lh3 | 2008-05-30 13:57:43 -0400 (Fri, 30 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt_gen/Makefile + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * a bit code clean up + * put bwt_gen in bwa + +------------------------------------------------------------------------ +r320 | lh3 | 2008-05-30 11:40:11 -0400 (Fri, 30 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + + * improve cmd-line interface + * fix a bug in loading .sa + * change default sa interval to 32 + +------------------------------------------------------------------------ +r319 | lh3 | 2008-05-30 10:31:37 -0400 (Fri, 30 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtaln.c + + * fix memory leak (I know that. 
Just a bit lazy) + * change to another method to do 1-away alignment + +------------------------------------------------------------------------ +r318 | lh3 | 2008-05-30 09:21:49 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +best unique match is partially finished + +------------------------------------------------------------------------ +r317 | lh3 | 2008-05-30 06:33:28 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + +remove "ungapped" command and related codes + +------------------------------------------------------------------------ +r316 | lh3 | 2008-05-30 06:05:20 -0400 (Fri, 30 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + +change variable name thick to width + +------------------------------------------------------------------------ +r315 | lh3 | 2008-05-29 19:06:13 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + +revised algorithm for ungapped alignment. the old one can still be used. 
+ +------------------------------------------------------------------------ +r314 | lh3 | 2008-05-29 16:36:11 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/pac2bwt.c + + * make commands more independent, but ungapped does not work at the moment + +------------------------------------------------------------------------ +r313 | lh3 | 2008-05-29 15:56:14 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt_gen/bwt_gen.c + +little... + +------------------------------------------------------------------------ +r312 | lh3 | 2008-05-29 15:54:01 -0400 (Thu, 29 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt_gen/bwt_gen.c + M /branches/prog/bwa/bwt_gen/bwt_gen.h + + * add CopyRight information from the original codes + * do not dump .fmv files + +------------------------------------------------------------------------ +r311 | lh3 | 2008-05-29 15:44:36 -0400 (Thu, 29 May 2008) | 2 lines +Changed paths: + A /branches/prog/bwa/bwt_gen + A /branches/prog/bwa/bwt_gen/Makefile + A /branches/prog/bwa/bwt_gen/QSufSort.c + A /branches/prog/bwa/bwt_gen/QSufSort.h + A /branches/prog/bwa/bwt_gen/bwt_gen.c + A /branches/prog/bwa/bwt_gen/bwt_gen.h + +codes from BWT-SW, for building BWT from packed file + +------------------------------------------------------------------------ +r310 | lh3 | 2008-05-28 17:03:35 -0400 (Wed, 28 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * change OCC_INTERVAL to 0x40, which makes bwa twice as fast. 
+ * write Occ file as ".occ" as it is using a different interval from + .fmv, the BWT-SW correspondance of .occ + +------------------------------------------------------------------------ +r309 | lh3 | 2008-05-28 11:39:37 -0400 (Wed, 28 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + +fix a bug + +------------------------------------------------------------------------ +r308 | lh3 | 2008-05-28 09:56:16 -0400 (Wed, 28 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + +add heuristics to improve the speed, but I have not tested whether the +results are correct or not. + + +------------------------------------------------------------------------ +r307 | lh3 | 2008-05-28 06:31:34 -0400 (Wed, 28 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * make ungapped alignment basically works... + * but it is very slow in comparison to others... + * also I need to improve the interface... + * a lot of things to keep me busy today... + +------------------------------------------------------------------------ +r306 | lh3 | 2008-05-27 18:41:27 -0400 (Tue, 27 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtaln.c + + * remove recursion + * fixed a bug in bwt_occ() + +------------------------------------------------------------------------ +r305 | lh3 | 2008-05-27 16:59:44 -0400 (Tue, 27 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwtaln.c + + * bwa now tells whether a sequenced can be mapped with maximum allowed + mismatches. ONLY ungapped. + * this is a recursive version. I will remove recursion later. 
+ + +------------------------------------------------------------------------ +r304 | lh3 | 2008-05-27 09:12:17 -0400 (Tue, 27 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + A /branches/prog/bwa/bwtaln.c + A /branches/prog/bwa/bwtaln.h + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/utils.c + + * load .sa and .fmv files + * exact alignment now works + +------------------------------------------------------------------------ +r303 | lh3 | 2008-05-27 06:33:38 -0400 (Tue, 27 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + +add xassert and fix a bug + +------------------------------------------------------------------------ +r302 | lh3 | 2008-05-27 06:23:20 -0400 (Tue, 27 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwtio.c + A /branches/prog/bwa/utils.c + A /branches/prog/bwa/utils.h + +improve error message and error handling + +------------------------------------------------------------------------ +r301 | lh3 | 2008-05-27 05:37:51 -0400 (Tue, 27 May 2008) | 4 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + A /branches/prog/bwa/bwtio.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + + * move I/O codes to bwtio.c + * SA can be dumped and interestingly, it is identical to BWTSW + * now, .fmv is still different from BWTSW + +------------------------------------------------------------------------ +r299 | lh3 | 2008-05-26 18:07:44 -0400 (Mon, 26 May 2008) | 2 lines +Changed paths: + M /branches/prog/bwa/Makefile + M 
/branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + +generate/retrieve SA and Occ + +------------------------------------------------------------------------ +r298 | lh3 | 2008-05-26 13:16:49 -0400 (Mon, 26 May 2008) | 3 lines +Changed paths: + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/bwt.c + M /branches/prog/bwa/bwt.h + M /branches/prog/bwa/bwt2fmv.c + + * retrieve occ value at any position + * move bwt_cal_occ() to bwt.c + +------------------------------------------------------------------------ +r297 | lh3 | 2008-05-25 17:43:58 -0400 (Sun, 25 May 2008) | 6 lines +Changed paths: + M /branches/prog/bwa/Makefile + A /branches/prog/bwa/bwt.c + A /branches/prog/bwa/bwt.h + A /branches/prog/bwa/bwt2fmv.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + M /branches/prog/bwa/pac2bwt.c + + * add bwt2fmv. It works to some extend. However, I do not understand + the purpose of some weird codes in BWT-SW. As a consequence, bwt2fmv + could generate a file almost identical, but not exactly identical, to + the .fmv file from BWT-SW. + + +------------------------------------------------------------------------ +r296 | lh3 | 2008-05-24 18:35:02 -0400 (Sat, 24 May 2008) | 5 lines +Changed paths: + M /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + M /branches/prog/bwa/bntseq.h + M /branches/prog/bwa/main.c + M /branches/prog/bwa/main.h + A /branches/prog/bwa/pac2bwt.c + +Burrows-Wheeler Transform now works. At least on one example, the +current code generates the same BWT as BWT-SW. Kind of magical, I would +say. 
:) + + +------------------------------------------------------------------------ +r295 | lh3 | 2008-05-24 11:25:31 -0400 (Sat, 24 May 2008) | 3 lines +Changed paths: + A /branches/prog/bwa/Makefile + M /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/main.c + A /branches/prog/bwa/main.h + + * add Makefile and main.* + * improve interface to fa2bns, a bit + +------------------------------------------------------------------------ +r293 | lh3 | 2008-05-24 10:57:03 -0400 (Sat, 24 May 2008) | 3 lines +Changed paths: + A /branches/prog/bwa + A /branches/prog/bwa/bntseq.c + A /branches/prog/bwa/bntseq.h + A /branches/prog/bwa/seq.c + A /branches/prog/bwa/seq.h + + * Burrow-Wheeler Alignment + * initial codes + +------------------------------------------------------------------------ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..53f241d --- /dev/null +++ b/Makefile @@ -0,0 +1,55 @@ +CC= gcc +CXX= g++ +CFLAGS= -g -Wall -O2 +CXXFLAGS= $(CFLAGS) +DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 +OBJS= utils.o bwt.o bwtio.o bwtaln.o bwtgap.o is.o \ + bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ + bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ + bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ + bwtsw2_chain.o bamlite.o +PROG= bwa +INCLUDES= +LIBS= -lm -lz -lpthread -Lbwt_gen -lbwtgen +SUBDIRS= . 
bwt_gen + +.SUFFIXES:.c .o .cc + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ +.cc.o: + $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +lib-recur all-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" $$target || exit 1; \ + cd $$wdir; \ + done; + +lib: + +bwa:lib-recur $(OBJS) main.o + $(CC) $(CFLAGS) $(DFLAGS) $(OBJS) main.o -o $@ $(LIBS) + +bwt.o:bwt.h +bwtio.o:bwt.h +bwtaln.o:bwt.h bwtaln.h kseq.h +bwt1away.o:bwt.h bwtaln.h +bwt2fmv.o:bwt.h +bntseq.o:bntseq.h +bwtgap.o:bwtgap.h bwtaln.h bwt.h + +bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h +bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h +bwtsw2_main.o:bwtsw2.h + +cleanlocal: + rm -f gmon.out *.o a.out $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..027beec --- /dev/null +++ b/NEWS @@ -0,0 +1,528 @@ +Beta Release Candidate 0.5.9rc1 (10 December, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwasw: + + * Output unmapped reads. + + * For a repetitive read, choose a random hit instead of a fixed + one. This is not well tested. + +Notable changes in bwa-short: + + * Fixed a bug in the SW scoring system, which may lead to unexpected + gaps towards the end of a read. + + * Fixed a bug which invalidates the randomness of repetitive reads. + + * Fixed a rare memory leak. + + * Allowed to specify the read group at the command line. + + * Take name-grouped BAM files as input. + +Changes to this release are usually safe in that they do not interfere +with the key functionality. However, the release has only been tested on +small samples instead of on large-scale real data. If anything weird +happens, please report the bugs to the bio-bwa-help mailing list. 
+ +(0.5.9rc1: 10 December 2010, r1561) + + + +Beta Release 0.5.8 (8 June, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwasw: + + * Fixed an issue of missing alignments. This should happen rarely and + only when the contig/read alignment is multi-part. Very rarely, bwasw + may still miss a segment in a multi-part alignment. This is difficult + to fix, although possible. + +Notable changes in bwa-short: + + * Discard the SW alignment when the best single-end alignment is much + better. Such a SW alignment may be caused by structural variations and + forcing it to be aligned leads to false alignment. This fix has not + been tested thoroughly. It would be great to receive more user + feedback on this issue. + + * Fixed a typo/bug in sampe which leads to unnecessarily large memory + usage in some cases. + + * Further reduced the chance of reporting `weird pairing'. + +(0.5.8: 8 June 2010, r1442) + + + +Beta Release 0.5.7 (1 March, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release only has an effect on paired-end data with fat insert-size +distribution. Users are still recommended to update as the new release +improves the robustness to poor data. + + * The fix for `weird pairing' was not working in version 0.5.6, pointed + out by Carol Scott. It should work now. + + * Optionally output to a normal file rather than to stdout (by Tim + Fennel). + +(0.5.7: 1 March 2010, r1310) + + + +Beta Release 0.5.6 (10 February, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwa-short: + + * Report multiple hits in the SAM format at a new tag XA encoded as: + (chr,pos,CIGAR,NM;)*. By default, if a paired or single-end read has + 4 or fewer hits, they will all be reported; if a read in an anomalous + pair has 11 or fewer hits, all of them will be reported. + + * Perform Smith-Waterman alignment also for anomalous read pairs when + both ends have quality higher than 17. This reduces false positives + for some SV discovery algorithms. 
+ + * Do not report "weird pairing" when the insert size distribution is + too fat or has a mean close to zero. + + * If a read is bridging two adjacent chromosomes, flag it as unmapped. + + * Fixed a small but long existing memory leak in paired-end mapping. + + * Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly + parsed by solid2fastq.pl; b) truncated quality string is resolved; c) + SOLiD read mapped to the reverse strand is complemented. + + * Bwa now calculates skewness and kurtosis of the insert size + distribution. + + * Deploy a Bayesian method to estimate the maximum distance for a read + pair considered to be paired properly. The method is proposed by + Gerton Lunter, but bwa only implements a simplified version. + + * Export more functions for Java bindings, by Matt Hanna (See: + http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings) + + * Abstract bwa CIGAR for further extension, by Rodrigo Goya. + +(0.5.6: 10 February 2010, r1303) + + + +Beta Release 0.5.5 (10 November, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a bug fix release: + + * Fixed a serious bug/typo in aln which does not occur given short + reads, but will lead to segfault for >500bp reads. Of course, the aln + command is not recommended for reads longer than 200bp, but this is a + bug anyway. + + * Fixed a minor bug/typo which leads to incorrect single-end mapping + quality when one end is moved to meet the mate-pair requirement. + + * Fixed a bug in samse for mapping in the color space. This bug is + caused by quality filtration added since 0.5.1. + +(0.5.5: 10 November 2009, r1273) + + + +Beta Release 0.5.4 (9 October, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since this version, the default seed length used in the "aln" command is +changed to 32. + +Notable changes in bwa-short: + + * Added a new tag "XC:i" which gives the length of clipped reads. 
+ + * In sampe, skip alignments in case of a bug in the Smith-Waterman + alignment module. + + * In sampe, fixed a bug in pairing when the read sequence is identical + to its reverse complement. + + * In sampe, optionally preload the entire FM-index into memory to + reduce disk operations. + +Notable changes in dBWT-SW/BWA-SW: + + * Changed name dBWT-SW to BWA-SW. + + * Optionally use "hard clipping" in the SAM output. + +(0.5.4: 9 October 2009, r1245) + + + +Beta Release 0.5.3 (15 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fixed a critical bug in bwa-short: reads mapped to the reverse strand +are not complemented. + +(0.5.3: 15 September 2009, r1225) + + + +Beta Release 0.5.2 (13 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in bwa-short: + + * Optionally trim reads before alignment. See the manual page on `aln + -q' for detailed description. + + * Fixed a bug in calculating the NM tag for a gapped alignment. + + * Fixed a bug given a mixture of reads with some longer than the seed + length and some shorter. + + * Print SAM header. + +Notable changes in dBWT-SW: + + * Changed the default value of -T to 30. As a result, the accuracy is a + little higher for short reads at the cost of speed. + +(0.5.2: 13 September 2009, r1223) + + + +Beta Release 0.5.1 (2 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in the short read alignment component: + + * Fixed a bug in samse: do not write mate coordinates. + +Notable changes in dBWT-SW: + + * Randomly choose one alignment if the read is a repetitive. + + * Fixed a flaw when a read is mapped across two adjacent reference + sequences. However, wrong alignment reports may still occur rarely in + this case. + + * Changed the default band width to 50. The speed is slower due to this + change. + + * Improved the mapping quality a little given long query sequences. 
+ +(0.5.1: 2 September 2009, r1209) + + + +Beta Release 0.5.0 (20 August, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release implements a novel algorithm, dBWT-SW, specifically +designed for long reads. It is 10-50 times faster than SSAHA2, depending +on the characteristics of the input data, and achieves comparable +alignment accuracy while allowing chimera detection. In comparison to +BLAT, dBWT-SW is several times faster and much more accurate especially +when the error rate is high. Please read the manual page for more +information. + +The dBWT-SW algorithm is kind of developed for future sequencing +technologies which produce much longer reads with a little higher error +rate. It is still at its early development stage. Some features are +missing and it may be buggy although I have evaluated on several +simulated and real data sets. But following the "release early" +paradigm, I would like the users to try it first. + +Other notable changes in BWA are: + + * Fixed a rare bug in the Smith-Waterman alignment module. + + * Fixed a rare bug about the wrong alignment coordinate when a read is + poorly aligned. + + * Fixed a bug in generating the "mate-unmap" SAM tag when both ends in + a pair are unmapped. + +(0.5.0: 20 August 2009, r1200) + + + +Beta Release 0.4.9 (19 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Interestingly, the integer overflow bug claimed to be fixed in 0.4.7 has +not in fact. Now I have fixed the bug. Sorry for this and thank Quan +Long for pointing out the bug (again). + +(0.4.9: 19 May 2009, r1075) + + + +Beta Release 0.4.8 (18 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One change to "aln -R". Now by default, if there are no more than `-R' +equally best hits, bwa will search for suboptimal hits. This change +affects the ability in finding SNPs in segmental duplications. + +I have not tested this option thoroughly, but this simple change is less +likely to cause new bugs. Hope I am right. 
+ +(0.4.8: 18 May 2009, r1073) + + + +Beta Release 0.4.7 (12 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Output SM (single-end mapping quality) and AM (smaller mapping + quality among the two ends) tag from sam output. + + * Improved the functionality of stdsw. + + * Made the XN tag more accurate. + + * Fixed a very rare segfault caused by integer overflow. + + * Improve the insert size estimation. + + * Fixed compiling errors for some Linux systems. + +(0.4.7: 12 May 2009, r1066) + + + +Beta Release 0.4.6 (9 March, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release improves the SOLiD support. First, a script for converting +SOLiD raw data is provided. This script is adapted from solid2fastq.pl +in the MAQ package. Second, a nucleotide reference file can be directly +used with `bwa index'. Third, SOLiD paired-end support is +completed. Fourth, color-space reads will be converted to nucleotides +when SAM output is generated. Color errors are corrected in this +process. Please note that like MAQ, BWA cannot make use of the primer +base and the first color. + +In addition, the calculation of mapping quality is also improved a +little bit, although end-users may barely observe the difference. + +(0.4.6: 9 March 2009, r915) + + + +Beta Release 0.4.5 (18 February, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Not much happened, but I think it would be good to let the users use the +latest version. + +Notable changes (Thank Bob Handsaker for catching the two bugs): + + * Improved boundary check. Previous versions may still give incorrect + alignment coordinates in rare cases. + + * Fixed a bug in SW alignment when no residue matches. This only + affects the `sampe' command. + + * Robustly estimate insert size without setting the maximum on the + command line. Since this release `sampe -a' only has an effect if + there are not enough good pairs to infer the insert size + distribution. 
+ + * Reduced false PE alignments a little bit by using the inferred insert + size distribution. This fix may be more important for long insert + size libraries. + +(0.4.5: 18 February 2009, r829) + + + +Beta Release 0.4.4 (15 February, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is mainly a bug fix release. Notable changes are: + + * Imposed boundary check for extracting subsequence from the + genome. Previously this caused memory problems in rare cases. + + * Fixed a bug in failing to find whether an alignment overlaps with + N on the genome. + + * Changed MD tag to meet the latest SAM specification. + +(0.4.4: 15 February 2009, r815) + + + +Beta Release 0.4.3 (22 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Treat an ambiguous base N as a mismatch. Previous versions will not + map reads containing any N. + + * Automatically choose the maximum allowed number of differences. This + is important when reads of different lengths are mixed together. + + * Print mate coordinate if only one end is unmapped. + + * Generate MD tag. This tag encodes the mismatching positions and the + reference bases at these positions. Deletions from the reference will + also be printed. + + * Optionally dump multiple hits from samse, in another concise format + rather than SAM. + + * Optionally disable iterative search. This is VERY SLOOOOW, though. + + * Fixed a bug in generating SAM. + +(0.4.3: 22 January 2009, r787) + + + +Beta Release 0.4.2 (9 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Aaron Quinlan found a bug in the indexer: the bwa indexer segfaults if +there are no comment texts in the FASTA header. This is a critical +bug. Nothing else was changed. + +(0.4.2: 9 January 2009, r769) + + + +Beta Release 0.4.1 (7 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +I am sorry for the quick updates these days. I like to set a milestone +for BWA and this release seems to be. 
For paired end reads, BWA also +does Smith-Waterman alignment for an unmapped read whose mate can be +mapped confidently. With this strategy BWA achieves similar accuracy to +maq. Benchmark is also updated accordingly. + +(0.4.1: 7 January 2009, r760) + + + +Beta Release 0.4.0 (6 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In comparison to the release two days ago, this release is mainly tuned +for performance with some tricks I learnt from Bowtie. However, as the +indexing format has also been changed, I have to increase the version +number to 0.4.0 to emphasize that *DATABASE MUST BE RE-INDEXED* with +`bwa index'. + + * Improved the speed by about 20%. + + * Added multi-threading to `bwa aln'. + +(0.4.0: 6 January 2009, r756) + + + +Beta Release 0.3.0 (4 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Added paired-end support by separating SA calculation and alignment + output. + + * Added SAM output. + + * Added evaluation to the documentation. + +(0.3.0: 4 January 2009, r741) + + + +Beta Release 0.2.0 (15 August, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Take the subsequence at the 5'-end as seed. Seeding strategy greatly + improves the speed for long reads, at the cost of missing a few true + hits that contain many differences in the seed. Seeding also increases + the memory by 800MB. + + * Fixed a bug which may miss some gapped alignments. Fixing the bug + also slows the speed a little. + +(0.2.0: 15 August 2008, r428) + + + +Beta Release 0.1.6 (08 August, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Give accurate CIGAR string. + + * Add a simple interface to SW/NW alignment + +(0.1.6: 08 August 2008, r414) + + + +Beta Release 0.1.5 (27 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Improve the speed. This version is expected to give the same results. 
+ +(0.1.5: 27 July 2008, r400) + + + +Beta Release 0.1.4 (22 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + * Fixed a bug which may cause missing gapped alignments. + + * More clearly define what alignments can be found by BWA (See + manual). Now BWA runs a little slower because it will visit more + potential gapped alignments. + + * A bit of code clean-up. + +(0.1.4: 22 July 2008, r387) + + + +Beta Release 0.1.3 (21 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Improve the speed with some tricks on retrieving occurrences. The results +should be exactly the same as that of 0.1.2. + +(0.1.3: 21 July 2008, r382) + + + +Beta Release 0.1.2 (17 July, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Support gapped alignment. Code for ungapped alignment has been removed. + +(0.1.2: 17 July 2008, r371) + + + +Beta Release 0.1.1 (03 June, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is the first release of BWA, Burrows-Wheeler Alignment tool. Please +read the man page for more information about this software. 
+ +(0.1.1: 03 June 2008, r349) + + + diff --git a/bamlite.c b/bamlite.c new file mode 100644 index 0000000..5aad392 --- /dev/null +++ b/bamlite.c @@ -0,0 +1,155 @@ +#include +#include +#include +#include +#include "bamlite.h" + +/********************* + * from bam_endian.c * + *********************/ + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} +static inline void *bam_swap_endian_2p(void *x) +{ + *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); + return x; +} +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} +static inline void *bam_swap_endian_4p(void *x) +{ + *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); + return x; +} +static inline uint64_t bam_swap_endian_8(uint64_t v) +{ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} +static inline void *bam_swap_endian_8p(void *x) +{ + *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); + return x; +} + +/************** + * from bam.c * + **************/ + +int bam_is_be; + +bam_header_t *bam_header_init() +{ + bam_is_be = bam_is_big_endian(); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); +} + +void bam_header_destroy(bam_header_t *header) +{ + int32_t i; + if (header == 0) return; + if (header->target_name) { + for (i = 0; i < header->n_targets; ++i) + free(header->target_name[i]); + free(header->target_name); + free(header->target_len); + } + free(header->text); + free(header); +} + +bam_header_t *bam_header_read(bamFile fp) +{ + bam_header_t *header; + char buf[4]; + int magic_len; + int32_t i = 1, name_len; + // read "BAM1" + magic_len = 
bam_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { + fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); + return 0; + } + header = bam_header_init(); + // read plain text and the number of reference sequences + bam_read(fp, &header->l_text, 4); + if (bam_is_be) bam_swap_endian_4p(&header->l_text); + header->text = (char*)calloc(header->l_text + 1, 1); + bam_read(fp, header->text, header->l_text); + bam_read(fp, &header->n_targets, 4); + if (bam_is_be) bam_swap_endian_4p(&header->n_targets); + // read reference sequence names and lengths + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); + for (i = 0; i != header->n_targets; ++i) { + bam_read(fp, &name_len, 4); + if (bam_is_be) bam_swap_endian_4p(&name_len); + header->target_name[i] = (char*)calloc(name_len, 1); + bam_read(fp, header->target_name[i], name_len); + bam_read(fp, &header->target_len[i], 4); + if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); + } + return header; +} + +static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint8_t *s; + uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); + while (s < data + data_len) { + uint8_t type; + s += 2; // skip key + type = toupper(*s); ++s; // skip type + if (type == 'C' || type == 'A') ++s; + else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } + else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } + else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + } +} + +int bam_read1(bamFile fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + + if ((ret = bam_read(fp, &block_len, 4)) != 4) { + if (ret == 0) 
return -1; // normal end-of-file + else return -2; // truncated + } + if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3; + if (bam_is_be) { + bam_swap_endian_4p(&block_len); + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->data_len = block_len - sizeof(bam1_core_t); + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; + b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + return 4 + block_len; +} diff --git a/bamlite.h b/bamlite.h new file mode 100644 index 0000000..167fa44 --- /dev/null +++ b/bamlite.h @@ -0,0 +1,94 @@ +#ifndef BAMLITE_H_ +#define BAMLITE_H_ + +#include +#include + +typedef gzFile bamFile; +#define bam_open(fn, mode) gzopen(fn, mode) +#define bam_dopen(fd, mode) gzdopen(fd, mode) +#define bam_close(fp) gzclose(fp) +#define bam_read(fp, buf, size) gzread(fp, buf, size) + +typedef struct { + int32_t n_targets; + char **target_name; + uint32_t *target_len; + size_t l_text, n_text; + char *text; +} bam_header_t; + +#define BAM_FPAIRED 1 +#define BAM_FPROPER_PAIR 2 +#define BAM_FUNMAP 4 +#define BAM_FMUNMAP 8 +#define BAM_FREVERSE 16 +#define BAM_FMREVERSE 32 +#define BAM_FREAD1 64 +#define BAM_FREAD2 128 +#define BAM_FSECONDARY 256 +#define BAM_FQCFAIL 512 +#define BAM_FDUP 1024 + +#define BAM_CIGAR_SHIFT 4 +#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) + +#define BAM_CMATCH 0 +#define BAM_CINS 1 +#define BAM_CDEL 2 +#define BAM_CREF_SKIP 3 +#define BAM_CSOFT_CLIP 4 +#define BAM_CHARD_CLIP 5 +#define BAM_CPAD 6 + +typedef struct { + int32_t tid; + 
int32_t pos; + uint32_t bin:16, qual:8, l_qname:8; + uint32_t flag:16, n_cigar:16; + int32_t l_qseq; + int32_t mtid; + int32_t mpos; + int32_t isize; +} bam1_core_t; + +typedef struct { + bam1_core_t core; + int l_aux, data_len, m_data; + uint8_t *data; +} bam1_t; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) +#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) +#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) +#define bam1_qname(b) ((char*)((b)->data)) +#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) +#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) + +#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) +#define bam_destroy1(b) do { \ + if (b) { free((b)->data); free(b); } \ + } while (0) + +extern int bam_is_be; + +#ifdef __cplusplus +extern "C" { +#endif + + bam_header_t *bam_header_init(void); + void bam_header_destroy(bam_header_t *header); + bam_header_t *bam_header_read(bamFile fp); + int bam_read1(bamFile fp, bam1_t *b); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bntseq.c b/bntseq.c new file mode 100644 index 0000000..86888c1 --- /dev/null +++ b/bntseq.c @@ -0,0 +1,303 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include "bntseq.h" +#include "main.h" +#include "utils.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +unsigned char nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +void bns_dump(const bntseq_t *bns, const char *prefix) +{ + char str[1024]; + FILE *fp; + int i; + { // dump .ann + strcpy(str, prefix); strcat(str, ".ann"); + fp = xopen(str, "w"); + fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); + for (i = 0; i != bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + fprintf(fp, "%d %s", p->gi, p->name); + if (p->anno[0]) fprintf(fp, " %s\n", p->anno); + else fprintf(fp, "\n"); + fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); + } + fclose(fp); + } + { // dump .amb + strcpy(str, prefix); strcat(str, ".amb"); + fp = xopen(str, "w"); + fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); + for (i = 0; i != bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); + } + fclose(fp); + } +} + +bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* 
pac_filename) +{ + char str[1024]; + FILE *fp; + bntseq_t *bns; + long long xx; + int i; + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + { // read .ann + fp = xopen(ann_filename, "r"); + fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); + bns->l_pac = xx; + bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); + for (i = 0; i < bns->n_seqs; ++i) { + bntann1_t *p = bns->anns + i; + char *q = str; + int c; + // read gi and sequence name + fscanf(fp, "%u%s", &p->gi, str); + p->name = strdup(str); + // read fasta comments + while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + *q = 0; + if (q - str > 1) p->anno = strdup(str + 1); // skip leading space + else p->anno = strdup(""); + // read the rest + fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); + p->offset = xx; + } + fclose(fp); + } + { // read .amb + int64_t l_pac; + int32_t n_seqs; + fp = xopen(amb_filename, "r"); + fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); + l_pac = xx; + xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); + bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); + for (i = 0; i < bns->n_holes; ++i) { + bntamb1_t *p = bns->ambs + i; + fscanf(fp, "%lld%d%s", &xx, &p->len, str); + p->offset = xx; + p->amb = str[0]; + } + fclose(fp); + } + { // open .pac + bns->fp_pac = xopen(pac_filename, "rb"); + } + return bns; +} + +bntseq_t *bns_restore(const char *prefix) +{ + char ann_filename[1024], amb_filename[1024], pac_filename[1024]; + strcat(strcpy(ann_filename, prefix), ".ann"); + strcat(strcpy(amb_filename, prefix), ".amb"); + strcat(strcpy(pac_filename, prefix), ".pac"); + return bns_restore_core(ann_filename, amb_filename, pac_filename); +} + +void bns_destroy(bntseq_t *bns) +{ + if (bns == 0) return; + else { + int i; + if (bns->fp_pac) fclose(bns->fp_pac); + free(bns->ambs); + for (i = 0; i < bns->n_seqs; ++i) { + free(bns->anns[i].name); + free(bns->anns[i].anno); + } + free(bns->anns); + free(bns); + } +} + +void 
bns_fasta2bntseq(gzFile fp_fa, const char *prefix) +{ + kseq_t *seq; + char name[1024]; + bntseq_t *bns; + bntamb1_t *q; + int l_buf; + unsigned char buf[0x10000]; + int32_t m_seqs, m_holes, l, i; + FILE *fp; + + // initialization + seq = kseq_init(fp_fa); + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + bns->seed = 11; // fixed seed for random generator + srand48(bns->seed); + m_seqs = m_holes = 8; + bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); + bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); + q = bns->ambs; + l_buf = 0; + strcpy(name, prefix); strcat(name, ".pac"); + fp = xopen(name, "wb"); + memset(buf, 0, 0x10000); + // read sequences + while ((l = kseq_read(seq)) >= 0) { + bntann1_t *p; + int lasts; + if (bns->n_seqs == m_seqs) { + m_seqs <<= 1; + bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t)); + } + p = bns->anns + bns->n_seqs; + p->name = strdup((char*)seq->name.s); + p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); + p->gi = 0; p->len = l; + p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; + p->n_ambs = 0; + for (i = 0, lasts = 0; i < l; ++i) { + int c = nst_nt4_table[(int)seq->seq.s[i]]; + if (c >= 4) { // N + if (lasts == seq->seq.s[i]) { // contiguous N + ++q->len; + } else { + if (bns->n_holes == m_holes) { + m_holes <<= 1; + bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t)); + } + q = bns->ambs + bns->n_holes; + q->len = 1; + q->offset = p->offset + i; + q->amb = seq->seq.s[i]; + ++p->n_ambs; + ++bns->n_holes; + } + } + lasts = seq->seq.s[i]; + { // fill buffer + if (c >= 4) c = lrand48()&0x3; + if (l_buf == 0x40000) { + fwrite(buf, 1, 0x10000, fp); + memset(buf, 0, 0x10000); + l_buf = 0; + } + buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1); + ++l_buf; + } + } + ++bns->n_seqs; + bns->l_pac += seq->seq.l; + } + xassert(bns->l_pac, "zero length sequence."); + { // finalize .pac file + ubyte_t ct; + fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp); + // the following codes make the pac file size always (l_pac/4+1+1) + if (bns->l_pac % 4 == 0) { + ct = 0; + fwrite(&ct, 1, 1, fp); + } + ct = bns->l_pac % 4; + fwrite(&ct, 1, 1, fp); + // close .pac file + fclose(fp); + } + bns_dump(bns, prefix); + bns_destroy(bns); + kseq_destroy(seq); +} + +int bwa_fa2pac(int argc, char *argv[]) +{ + gzFile fp; + if (argc < 2) { + fprintf(stderr, "Usage: bwa fa2pac []\n"); + return 1; + } + fp = xzopen(argv[1], "r"); + bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]); + gzclose(fp); + return 0; +} + +int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq) +{ + int left, mid, right, nn; + if (pac_coor >= bns->l_pac) + err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac); + // binary search for the sequence ID. Note that this is a bit different from the following one... 
+ left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { + mid = (left + right) >> 1; + if (pac_coor >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pac_coor < bns->anns[mid+1].offset) break; + left = mid + 1; + } else right = mid; + } + *real_seq = mid; + // binary search for holes + left = 0; right = bns->n_holes; nn = 0; + while (left < right) { + int64_t mid = (left + right) >> 1; + if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; + else if (pac_coor + len <= bns->ambs[mid].offset) right = mid; + else { // overlap + if (pac_coor >= bns->ambs[mid].offset) { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? + bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len; + } else { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? + bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor); + } + break; + } + } + return nn; +} diff --git a/bntseq.h b/bntseq.h new file mode 100644 index 0000000..21b831e --- /dev/null +++ b/bntseq.h @@ -0,0 +1,80 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef BWT_BNTSEQ_H +#define BWT_BNTSEQ_H + +#include +#include + +#ifndef BWA_UBYTE +#define BWA_UBYTE +typedef uint8_t ubyte_t; +#endif + +typedef struct { + int64_t offset; + int32_t len; + int32_t n_ambs; + uint32_t gi; + char *name, *anno; +} bntann1_t; + +typedef struct { + int64_t offset; + int32_t len; + char amb; +} bntamb1_t; + +typedef struct { + int64_t l_pac; + int32_t n_seqs; + uint32_t seed; + bntann1_t *anns; // n_seqs elements + int32_t n_holes; + bntamb1_t *ambs; // n_holes elements + FILE *fp_pac; +} bntseq_t; + +extern unsigned char nst_nt4_table[256]; + +#ifdef __cplusplus +extern "C" { +#endif + + void bns_dump(const bntseq_t *bns, const char *prefix); + bntseq_t *bns_restore(const char *prefix); + bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); + void bns_destroy(bntseq_t *bns); + void bns_fasta2bntseq(gzFile fp_fa, const char *prefix); + int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwa.1 b/bwa.1 new file mode 100644 index 0000000..52e999e --- /dev/null +++ b/bwa.1 @@ -0,0 +1,550 @@ +.TH bwa 1 "10 December 2010" "bwa-0.5.9rc1" "Bioinformatics tools" +.SH NAME +.PP +bwa - Burrows-Wheeler Alignment Tool +.SH SYNOPSIS +.PP +bwa index -a bwtsw database.fasta +.PP +bwa aln database.fasta short_read.fastq > aln_sa.sai +.PP +bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam +.PP +bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam +.PP +bwa bwasw database.fasta long_read.fastq > aln.sam + +.SH DESCRIPTION +.PP +BWA is a fast light-weighted tool that aligns 
relatively short sequences +(queries) to a sequence database (target), such as the human reference +genome. It implements two different algorithms, both based on +Burrows-Wheeler Transform (BWT). The first algorithm is designed for +short queries up to ~200bp with low error rate (<3%). It does gapped +global alignment w.r.t. queries, supports paired-end reads, and is one +of the fastest short read alignment algorithms to date while also +visiting suboptimal hits. The second algorithm, BWA-SW, is designed for +long reads with more errors. It performs heuristic Smith-Waterman-like +alignment to find high-scoring local hits (and thus chimera). On +low-error short queries, BWA-SW is slower and less accurate than the +first algorithm, but on long queries, it is better. +.PP +For both algorithms, the database file in the FASTA format must be +first indexed with the +.B `index' +command, which typically takes a few hours. The first algorithm is +implemented via the +.B `aln' +command, which finds the suffix array (SA) coordinates of good hits of +each individual read, and the +.B `samse/sampe' +command, which converts SA coordinates to chromosomal coordinates and +pairs reads (for `sampe'). The second algorithm is invoked by the +.B `bwasw' +command. It works for single-end reads only. + +.SH COMMANDS AND OPTIONS +.TP +.B index +bwa index [-p prefix] [-a algoType] [-c] + +Index database sequences in the FASTA format. + +.B OPTIONS: +.RS +.TP 10 +.B -c +Build color-space index. The input FASTA should be in nucleotide space. +.TP +.B -p STR +Prefix of the output database [same as db filename] +.TP +.B -a STR +Algorithm for constructing BWT index. Available options are: +.RS +.TP +.B is +IS linear-time algorithm for constructing suffix array. It requires +5.37N memory where N is the size of the database. IS is moderately fast, +but does not work with databases larger than 2GB. IS is the default +algorithm due to its simplicity.
The current codes for IS algorithm are +reimplemented by Yuta Mori. +.TP +.B bwtsw +Algorithm implemented in BWT-SW. This method works with the whole human +genome, but it does not work with database smaller than 10MB and it is +usually slower than IS. +.RE +.RE + +.TP +.B aln +bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i +nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc] +[-O gapOsc] [-E gapEsc] [-q trimQual] > + + +Find the SA coordinates of the input reads. Maximum +.I maxSeedDiff +differences are allowed in the first +.I seedLen +subsequence and maximum +.I maxDiff +differences are allowed in the whole sequence. + +.B OPTIONS: +.RS +.TP 10 +.B -n NUM +Maximum edit distance if the value is INT, or the fraction of missing +alignments given 2% uniform base error rate if FLOAT. In the latter +case, the maximum edit distance is automatically chosen for different +read lengths. [0.04] +.TP +.B -o INT +Maximum number of gap opens [1] +.TP +.B -e INT +Maximum number of gap extensions, -1 for k-difference mode (disallowing +long gaps) [-1] +.TP +.B -d INT +Disallow a long deletion within INT bp towards the 3'-end [16] +.TP +.B -i INT +Disallow an indel within INT bp towards the ends [5] +.TP +.B -l INT +Take the first INT subsequence as seed. If INT is larger than the query +sequence, seeding will be disabled. For long reads, this option is +typically ranged from 25 to 35 for `-k 2'. [inf] +.TP +.B -k INT +Maximum edit distance in the seed [2] +.TP +.B -t INT +Number of threads (multi-threading mode) [1] +.TP +.B -M INT +Mismatch penalty. BWA will not search for suboptimal hits with a score +lower than (bestScore-misMsc). [3] +.TP +.B -O INT +Gap open penalty [11] +.TP +.B -E INT +Gap extension penalty [4] +.TP +.B -R INT +Proceed with suboptimal alignments if there are no more than INT equally +best hits. This option only affects paired-end mapping. 
Increasing this +threshold helps to improve the pairing accuracy at the cost of speed, +especially for short reads (~32bp). +.TP +.B -c +Reverse query but not complement it, which is required for alignment in +the color space. +.TP +.B -N +Disable iterative search. All hits with no more than +.I maxDiff +differences will be found. This mode is much slower than the default. +.TP +.B -q INT +Parameter for read trimming. BWA trims a read down to +argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l 1.sai + bwa aln ref.fa -b2 reads.bam > 2.sai + bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam +.TP +.B -0 +When +.B -b +is specified, only use single-end reads in mapping. +.TP +.B -1 +When +.B -b +is specified, only use the first read in a read pair in mapping (skip +single-end reads and the second reads). +.TP +.B -2 +When +.B -b +is specified, only use the second read in a read pair in mapping. +.B +.RE + +.TP +.B samse +bwa samse [-n maxOcc] > + +Generate alignments in the SAM format given single-end reads. Repetitive +hits will be randomly chosen. + +.B OPTIONS: +.RS +.TP 10 +.BI -n \ INT +Maximum number of alignments to output in the XA tag for reads paired +properly. If a read has more than INT hits, the XA tag will not be +written. [3] +.TP +.BI -r \ STR +Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] +.RE + +.TP +.B sampe +bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis] +[-P] > + +Generate alignments in the SAM format given paired-end reads. Repetitive +read pairs will be placed randomly. + +.B OPTIONS: +.RS +.TP 8 +.BI -a \ INT +Maximum insert size for a read pair to be considered being mapped +properly. Since 0.4.5, this option is only used when there are not +enough good alignment to infer the distribution of insert sizes. [500] +.TP +.BI -o \ INT +Maximum occurrences of a read for pairing. A read with more occurrneces +will be treated as a single-end read. Reducing this parameter helps +faster pairing. 
[100000] +.TP +.B -P +Load the entire FM-index into memory to reduce disk operations +(base-space reads only). With this option, at least 1.25N bytes of +memory are required, where N is the length of the genome. +.TP +.BI -n \ INT +Maximum number of alignments to output in the XA tag for reads paired +properly. If a read has more than INT hits, the XA tag will not be +written. [3] +.TP +.BI -N \ INT +Maximum number of alignments to output in the XA tag for disconcordant +read pairs (excluding singletons). If a read has more than INT hits, the +XA tag will not be written. [10] +.TP +.BI -r \ STR +Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] +.RE + +.TP +.B bwasw +bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t +nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N +nHspRev] [-c thresCoef] + +Align query sequences in the file. + +.B OPTIONS: +.RS +.TP 10 +.B -a INT +Score of a match [1] +.TP +.B -b INT +Mismatch penalty [3] +.TP +.B -q INT +Gap open penalty [5] +.TP +.B -r INT +Gap extension penalty. The penalty for a contiguous gap of size k is +q+k*r. [2] +.TP +.B -t INT +Number of threads in the multi-threading mode [1] +.TP +.B -w INT +Band width in the banded alignment [33] +.TP +.B -T INT +Minimum score threshold divided by a [37] +.TP +.B -c FLOAT +Coefficient for threshold adjustment according to query length. Given an +l-long query, the threshold for a hit to be retained is +a*max{T,c*log(l)}. [5.5] +.TP +.B -z INT +Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1] +.TP +.B -s INT +Maximum SA interval size for initiating a seed. Higher -s increases +accuracy at the cost of speed. [3] +.TP +.B -N INT +Minimum number of seeds supporting the resultant alignment to skip +reverse alignment. [5] +.RE + +.SH SAM ALIGNMENT FORMAT +.PP +The output of the +.B `aln' +command is binary and designed for BWA use only. 
BWA outputs the final +alignment in the SAM (Sequence Alignment/Map) format. Each line consists +of: + +.TS +center box; +cb | cb | cb +n | l | l . +Col Field Description +_ +1 QNAME Query (pair) NAME +2 FLAG bitwise FLAG +3 RNAME Reference sequence NAME +4 POS 1-based leftmost POSition/coordinate of clipped sequence +5 MAPQ MAPping Quality (Phred-scaled) +6 CIGAR extended CIGAR string +7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) +8 MPOS 1-based Mate POSition +9 ISIZE Inferred insert SIZE +10 SEQ query SEQuence on the same strand as the reference +11 QUAL query QUALity (ASCII-33 gives the Phred base quality) +12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE +.TE + +.PP +Each bit in the FLAG field is defined as: + +.TS +center box; +cb | cb | cb +c | l | l . +Chr Flag Description +_ +p 0x0001 the read is paired in sequencing +P 0x0002 the read is mapped in a proper pair +u 0x0004 the query sequence itself is unmapped +U 0x0008 the mate is unmapped +r 0x0010 strand of the query (1 for reverse) +R 0x0020 strand of the mate +1 0x0040 the read is the first read in a pair +2 0x0080 the read is the second read in a pair +s 0x0100 the alignment is not primary +f 0x0200 QC failure +d 0x0400 optical or PCR duplicate +.TE + +.PP +Please check the Samtools website for the format +specification and the tools for post-processing the alignment. + +BWA generates the following optional fields. Tags starting with `X' are +specific to BWA. + +.TS +center box; +cb | cb +cB | l .
+Tag Meaning +_ +NM Edit distance +MD Mismatching positions/bases +AS Alignment score +_ +X0 Number of best hits +X1 Number of suboptimal hits found by BWA +XN Number of ambiguous bases in the reference +XM Number of mismatches in the alignment +XO Number of gap opens +XG Number of gap extensions +XT Type: Unique/Repeat/N/Mate-sw +XA Alternative hits; format: (chr,pos,CIGAR,NM;)* +_ +XS Suboptimal alignment score +XF Support from forward/reverse alignment +XE Number of supporting seeds +.TE + +.PP +Note that XO and XG are generated by BWT search while the CIGAR string +by Smith-Waterman alignment. These two tags may be inconsistent with the +CIGAR string. This is not a bug. + +.SH NOTES ON SHORT-READ ALIGNMENT +.SS Alignment Accuracy +.PP +When seeding is disabled, BWA guarantees to find an alignment +containing maximum +.I maxDiff +differences including +.I maxGapO +gap opens which do not occur within +.I nIndelEnd +bp towards either end of the query. Longer gaps may be found if +.I maxGapE +is positive, but it is not guaranteed to find all hits. When seeding is +enabled, BWA further requires that the first +.I seedLen +subsequence contains no more than +.I maxSeedDiff +differences. +.PP +When gapped alignment is disabled, BWA is expected to generate the same +alignment as Eland, the Illumina alignment program. However, as BWA +changes `N' in the database sequence to random nucleotides, hits to these +random sequences will also be counted. As a consequence, BWA may mark a +unique hit as a repeat, if the random sequences happen to be identical +to the sequences which should be unique in the database. This random +behaviour will be avoided in future releases. +.PP +By default, if the best hit is not so repetitive (controlled by -R), BWA +also finds all hits containing one more mismatch; otherwise, BWA finds all +equally best hits only. Base quality is NOT considered in evaluating +hits. In paired-end alignment, BWA pairs all hits it found.
It further +performs Smith-Waterman alignment for unmapped reads with mates mapped +to rescue mapped mates, and for high-quality anomalous pairs to fix +potential alignment errors. + +.SS Estimating Insert Size Distribution +.PP +BWA estimates the insert size distribution per 256*1024 read pairs. It +first collects pairs of reads with both ends mapped with a single-end +quality 20 or higher and then calculates median (Q2), lower and higher +quartile (Q1 and Q3). It estimates the mean and the variance of the +insert size distribution from pairs whose insert sizes are within +interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair +considered to be properly paired (SAM flag 0x2) is calculated by solving +equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the +standard error of the insert size distribution, L is the length of the +genome, p0 is prior of anomalous pair and Phi() is the standard +cumulative distribution function. For mapping Illumina short-insert +reads to the human genome, x is about 6-7 sigma away from the +mean. Quartiles, mean, variance and x will be printed to the standard +error output. + +.SS Memory Requirement +.PP +With bwtsw algorithm, 2.5GB memory is required for indexing the complete +human genome sequences. For short reads, the +.B `aln' +command uses ~2.3GB memory and the +.B `sampe' +command uses ~3.5GB. + +.SS Speed +.PP +Indexing the human genome sequences takes 3 hours with bwtsw +algorithm. Indexing smaller genomes with IS or divsufsort algorithms is +several times faster, but requires more memory. +.PP +Speed of alignment is largely determined by the error rate of the query +sequences (r). Firstly, BWA runs much faster for near perfect hits than +for hits with many differences, and it stops searching for a hit with +l+2 differences if a l-difference hit is found. 
This means BWA will be +very slow if r is high because in this case BWA has to visit hits with +many differences and looking for these hits is expensive. Secondly, the +alignment algorithm behind makes the speed sensitive to [k log(N)/m], +where k is the maximum allowed differences, N the size of database and m +the length of a query. In practice, we choose k w.r.t. r and therefore r +is the leading factor. I would not recommend using BWA on data with +r>0.02. +.PP +Pairing is slower for shorter reads. This is mainly because shorter +reads have more spurious hits and converting SA coordinates to +chromosomal coordinates is very costly. +.PP +In a practical experiment, BWA is able to map 2 million 32bp reads to a +bacterial genome in several minutes, map the same number of reads to +human X chromosome in 8-15 minutes and to the human genome in 15-25 +minutes. This result implies that the speed of BWA is insensitive to the +size of database and therefore BWA is more efficient when the database +is sufficiently large. On smaller genomes, hash based algorithms are +usually much faster. + +.SH NOTES ON LONG-READ ALIGNMENT +.PP +Command +.B `bwasw' +is designed for long-read alignment. The algorithm behind, BWA-SW, is +similar to BWT-SW, but does not guarantee to find all local hits due to +the heuristic acceleration. It tends to be faster and more accurate if +the resultant alignment is supported by more seeds, and therefore +BWA-SW usually performs better on long queries than on short ones. + +On 350-1000bp reads, BWA-SW is several to tens of times faster than the +existing programs. Its accuracy is comparable to SSAHA2, more accurate +than BLAT. Like BLAT, BWA-SW also finds chimera which may pose a +challenge to SSAHA2. On 10-100kbp queries where chimera detection is +important, BWA-SW is over 10X faster than BLAT while being more +sensitive. + +BWA-SW can also be used to align ~100bp reads, but it is slower than +the short-read algorithm.
Its sensitivity and accuracy is lower than +SSAHA2 especially when the sequencing error rate is above 2%. This is +the trade-off of the 30X speed up in comparison to SSAHA2's -454 mode. + +.SH SEE ALSO +BWA website , Samtools website + + +.SH AUTHOR +Heng Li at the Sanger Institute wrote the key source codes and +integrated the following codes for BWT construction: bwtsw +, implemented by Chi-Kwong Wong at +the University of Hong Kong and IS + originally proposed by Nong Ge + at the Sun Yat-Sen University and +implemented by Yuta Mori. + +.SH LICENSE AND CITATION +.PP +The full BWA package is distributed under GPLv3 as it uses source codes +from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS +libraries are distributed under the MIT license. +.PP +If you use the short-read alignment component, please cite the following +paper: +.PP +Li H. and Durbin R. (2009) Fast and accurate short read alignment with +Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. [PMID: 19451168] +.PP +If you use the long-read component (BWA-SW), please cite: +.PP +Li H. and Durbin R. (2010) Fast and accurate long-read alignment with +Burrows-Wheeler transform. Bioinformatics. [PMID: 20080505] + +.SH HISTORY +BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW +and mimics its binary file formats; BWA-SW resembles BWT-SW in several +ways. The initial idea about BWT-based alignment also came from the +group who developed BWT-SW. At the same time, BWA is different enough +from BWT-SW. The short-read alignment algorithm bears no similarity to +Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it +introduces heuristics that can hardly be applied to the original +algorithm. In all, BWA does not guarantee to find all local hits as what +BWT-SW is designed to do, but it is much faster than BWT-SW on both +short and long query sequences. 
+ +I started to write the first piece of codes on 24 May 2008 and got the +initial stable version on 02 June 2008. During this period, I was +acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper, +was collaborating with Beijing Genomics Institute on SOAP2, the successor +to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in +November 2008. According to the SourceForge download page, the third +BWT-based short read aligner, bowtie, was first released in August +2008. At the time of writing this manual, at least three more BWT-based +short-read aligners are being implemented. + +The BWA-SW algorithm is a new component of BWA. It was conceived in +November 2008 and implemented ten months later. diff --git a/bwape.c b/bwape.c new file mode 100644 index 0000000..a127461 --- /dev/null +++ b/bwape.c @@ -0,0 +1,795 @@ +#include +#include +#include +#include +#include +#include +#include "bwtaln.h" +#include "kvec.h" +#include "bntseq.h" +#include "utils.h" +#include "stdaln.h" + +typedef struct { + int n; + bwtint_t *a; +} poslist_t; + +typedef struct { + double avg, std, ap_prior; + bwtint_t low, high, high_bayesian; +} isize_info_t; + +#include "khash.h" +KHASH_MAP_INIT_INT64(64, poslist_t) + +#include "ksort.h" +KSORT_INIT_GENERIC(uint64_t) + +typedef struct { + kvec_t(uint64_t) arr; + kvec_t(uint64_t) pos[2]; + kvec_t(bwt_aln1_t) aln[2]; +} pe_data_t; + +#define MIN_HASH_WIDTH 1000 + +extern int g_log_n[256]; // in bwase.c +static kh_64_t *g_hash; + +void bwase_initialize(); +void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); +int bwa_approx_mapQ(const bwa_seq_t *p, int mm); +void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); +bntseq_t *bwa_open_nt(const char 
*prefix); +void bwa_print_sam_SQ(const bntseq_t *bns); + +pe_opt_t *bwa_init_pe_opt() +{ + pe_opt_t *po; + po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); + po->max_isize = 500; + po->force_isize = 0; + po->max_occ = 100000; + po->n_multi = 3; + po->N_multi = 10; + po->type = BWA_PET_STD; + po->is_sw = 1; + po->ap_prior = 1e-5; + return po; +} + +static inline uint64_t hash_64(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return key; +} +/* +static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); +{ + const double a = 0.140012; + double b, c; + b = log(x * (2 - x)); + c = 2./M_PI/a + b / 2.; + return sqrt(sqrt(c * c - b / a) - c); +} +*/ + +// for normal distribution, this is about 3std +#define OUTLIER_BOUND 2.0 + +static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L) +{ + uint64_t x, *isizes, n_ap = 0; + int n, i, tot, p25, p75, p50, max_len = 1, tmp; + double skewness = 0.0, kurtosis = 0.0, y; + + ii->avg = ii->std = -1.0; + ii->low = ii->high = ii->high_bayesian = 0; + isizes = (uint64_t*)calloc(n_seqs, 8); + for (i = 0, tot = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) { + x = (p[0]->pos < p[1]->pos)? 
p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos; + if (x < 100000) isizes[tot++] = x; + } + if (p[0]->len > max_len) max_len = p[0]->len; + if (p[1]->len > max_len) max_len = p[1]->len; + } + if (tot < 20) { + fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n"); + free(isizes); + return -1; + } + ks_introsort(uint64_t, tot, isizes); + p25 = isizes[(int)(tot*0.25 + 0.5)]; + p50 = isizes[(int)(tot*0.50 + 0.5)]; + p75 = isizes[(int)(tot*0.75 + 0.5)]; + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned + ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + for (i = 0, x = n = 0; i < tot; ++i) + if (isizes[i] >= ii->low && isizes[i] <= ii->high) + ++n, x += isizes[i]; + ii->avg = (double)x / n; + for (i = 0; i < tot; ++i) { + if (isizes[i] >= ii->low && isizes[i] <= ii->high) { + double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg); + ii->std += tmp; + skewness += tmp * (isizes[i] - ii->avg); + kurtosis += tmp * tmp; + } + } + kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3; + ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large + skewness = skewness / n / (ii->std * ii->std * ii->std); + for (y = 1.0; y < 10.0; y += 0.01) + if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; + ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); + for (i = 0; i < tot; ++i) + if (isizes[i] > ii->high_bayesian) ++n_ap; + ii->ap_prior = .01 * (n_ap + .01) / tot; + if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior; + free(isizes); + fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75); + if (isnan(ii->std) || p75 > 100000) { + ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0; + fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n"); + return -1; + } + for (y = 1.0; y < 10.0; y += 0.01) + if (.5 * erfc(y / 
M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; + ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); + fprintf(stderr, "[infer_isize] low and high boundaries: %d and %d for estimating avg and std\n", ii->low, ii->high); + fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std); + fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior); + fprintf(stderr, "[infer_isize] inferred maximum insert size: %d (%.2lf sigma)\n", ii->high_bayesian, y); + return 0; +} + +static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii) +{ + int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; + uint64_t last_pos[2][2], o_pos[2], subo_score, o_score; + max_len = p[0]->full_len; + if (max_len < p[1]->full_len) max_len = p[1]->full_len; + if (low_bound < max_len) low_bound = max_len; + + // here v>=u. When ii is set, we check insert size with ii; otherwise with opt->max_isize +#define __pairing_aux(u,v) do { \ + bwtint_t l = ((v)>>32) + p[(v)&1]->len - ((u)>>32); \ + if ((u) != (uint64_t)-1 && (v)>>32 > (u)>>32 && l >= max_len \ + && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \ + { \ + uint64_t s = d->aln[(v)&1].a[(uint32_t)(v)>>1].score + d->aln[(u)&1].a[(uint32_t)(u)>>1].score; \ + s *= 10; \ + if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \ + s = s<<32 | (uint32_t)hash_64((u)>>32<<32 | (v)>>32); \ + if (s>>32 == o_score>>32) ++o_n; \ + else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ + else ++subo_n; \ + if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u)&1] = (u), o_pos[(v)&1] = (v); \ + else if (s < subo_score) subo_score = s; \ + } \ + } while (0) + +#define __pairing_aux2(q, w) do { \ + const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \ + (q)->extra_flag |= SAM_FPP; 
\ + if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \ + (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \ + (q)->score = r->score; \ + (q)->pos = (w)>>32; \ + if ((q)->mapQ > 0) ++cnt_chg; \ + } \ + } while (0) + + o_score = subo_score = (uint64_t)-1; + o_n = subo_n = 0; + ks_introsort(uint64_t, d->arr.n, d->arr.a); + for (j = 0; j < 2; ++j) last_pos[j][0] = last_pos[j][1] = (uint64_t)-1; + if (opt->type == BWA_PET_STD) { + for (i = 0; i < d->arr.n; ++i) { + uint64_t x = d->arr.a[i]; + int strand = d->aln[x&1].a[(uint32_t)x>>1].a; + if (strand == 1) { // reverse strand, then check + int y = 1 - (x&1); + __pairing_aux(last_pos[y][1], x); + __pairing_aux(last_pos[y][0], x); + } else { // forward strand, then push + last_pos[x&1][0] = last_pos[x&1][1]; + last_pos[x&1][1] = x; + } + } + } else if (opt->type == BWA_PET_SOLID) { + for (i = 0; i < d->arr.n; ++i) { + uint64_t x = d->arr.a[i]; + int strand = d->aln[x&1].a[(uint32_t)x>>1].a; + if ((strand^x)&1) { // push + int y = 1 - (x&1); + __pairing_aux(last_pos[y][1], x); + __pairing_aux(last_pos[y][0], x); + } else { // check + last_pos[x&1][0] = last_pos[x&1][1]; + last_pos[x&1][1] = x; + } + } + } else { + fprintf(stderr, "[paring] not implemented yet!\n"); + exit(1); + } + // set pairing + //fprintf(stderr, "[%d, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); + if (o_score != (uint64_t)-1) { + int mapQ_p = 0; // this is the maximum mapping quality when one end is moved + int rr[2]; + //fprintf(stderr, "%d, %d\n", o_n, subo_n); + if (o_n == 1) { + if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair + else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair + else { + int n = subo_n > 255? 
255 : subo_n; + mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n]; + if (mapQ_p < 0) mapQ_p = 0; + } + } + rr[0] = d->aln[o_pos[0]&1].a[(uint32_t)o_pos[0]>>1].a; + rr[1] = d->aln[o_pos[1]&1].a[(uint32_t)o_pos[1]>>1].a; + if ((p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) && (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1])) { // both ends not moved + if (p[0]->mapQ > 0 && p[1]->mapQ > 0) { + int mapQ = p[0]->mapQ + p[1]->mapQ; + if (mapQ > 60) mapQ = 60; + p[0]->mapQ = p[1]->mapQ = mapQ; + } else { + if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ; + if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? mapQ_p + 7 : p[0]->mapQ; + } + } else if (p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) { // [1] moved + p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ; + if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p; + } else if (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1]) { // [0] moved + p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ; + if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p; + } else { // both ends moved + p[0]->seQ = p[1]->seQ = 0; + mapQ_p -= 20; + if (mapQ_p < 0) mapQ_p = 0; + p[0]->mapQ = p[1]->mapQ = mapQ_p; + } + __pairing_aux2(p[0], o_pos[0]); + __pairing_aux2(p[1], o_pos[1]); + } + return cnt_chg; +} + +typedef struct { + kvec_t(bwt_aln1_t) aln; +} aln_buf_t; + +int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, + const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii) +{ + int i, j, cnt_chg = 0; + char str[1024]; + bwt_t *bwt[2]; + pe_data_t *d; + aln_buf_t *buf[2]; + + d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); + buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + + if (_bwt[0] == 0) { // load forward SA + strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, 
bwt[0]); + strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); + } else bwt[0] = _bwt[0], bwt[1] = _bwt[1]; + + // SE + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + for (j = 0; j < 2; ++j) { + int n_aln; + p[j] = seqs[j] + i; + p[j]->n_multi = 0; + p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2); + fread(&n_aln, 4, 1, fp_sa[j]); + if (n_aln > kv_max(d->aln[j])) + kv_resize(bwt_aln1_t, d->aln[j], n_aln); + d->aln[j].n = n_aln; + fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); + kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] + // generate SE alignment and mapping quality + bwa_aln2seq(n_aln, d->aln[j].a, p[j]); + if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) { + int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; + p[j]->pos = p[j]->strand? bwt_sa(bwt[0], p[j]->sa) + : bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len); + p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); + } + } + } + + // infer isize + infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len); + if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii; + if (opt->force_isize) { + fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__); + ii->low = ii->high = 0; ii->avg = ii->std = -1.0; + } + + // PE + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + for (j = 0; j < 2; ++j) { + p[j] = seqs[j] + i; + kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln); + } + if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) + && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) + { // only when both ends mapped + uint64_t x; + int j, k, n_occ[2]; + for (j = 0; j < 2; ++j) { + n_occ[j] = 0; + for (k = 0; k < d->aln[j].n; ++k) + n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1; + } + if (n_occ[0] > opt->max_occ || n_occ[1] > 
opt->max_occ) continue; + d->arr.n = 0; + for (j = 0; j < 2; ++j) { + for (k = 0; k < d->aln[j].n; ++k) { + bwt_aln1_t *r = d->aln[j].a + k; + bwtint_t l; + if (r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table + uint64_t key = (uint64_t)r->k<<32 | r->l; + int ret; + khint_t iter = kh_put(64, g_hash, key, &ret); + if (ret) { // not in the hash table; ret must equal 1 as we never remove elements + poslist_t *z = &kh_val(g_hash, iter); + z->n = r->l - r->k + 1; + z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); + for (l = r->k; l <= r->l; ++l) + z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len); + } + for (l = 0; l < kh_val(g_hash, iter).n; ++l) { + x = kh_val(g_hash, iter).a[l]; + x = x<<32 | k<<1 | j; + kv_push(uint64_t, d->arr, x); + } + } else { // then calculate on the fly + for (l = r->k; l <= r->l; ++l) { + x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len); + x = x<<32 | k<<1 | j; + kv_push(uint64_t, d->arr, x); + } + } + } + } + cnt_chg += pairing(p, d, opt, gopt->s_mm, ii); + } + + if (opt->N_multi || opt->n_multi) { + for (j = 0; j < 2; ++j) { + if (p[j]->type != BWA_TYPE_NO_MATCH) { + int k; + if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { + bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi); + } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); + for (k = 0; k < p[j]->n_multi; ++k) { + bwt_multi1_t *q = p[j]->multi + k; + q->pos = q->strand? 
bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len); + } + } + } + } + } + + // free + for (i = 0; i < n_seqs; ++i) { + kv_destroy(buf[0][i].aln); + kv_destroy(buf[1][i].aln); + } + free(buf[0]); free(buf[1]); + if (_bwt[0] == 0) { + bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + } + kv_destroy(d->arr); + kv_destroy(d->pos[0]); kv_destroy(d->pos[1]); + kv_destroy(d->aln[0]); kv_destroy(d->aln[1]); + free(d); + return cnt_chg; +} + +#define SW_MIN_MATCH_LEN 20 +#define SW_MIN_MAPQ 17 + +// cnt = n_mm<<16 | n_gapo<<8 | n_gape +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, + int *n_cigar, uint32_t *_cnt) +{ + bwa_cigar_t *cigar = 0; + ubyte_t *ref_seq; + bwtint_t k, x, y, l; + int path_len, ret; + AlnParam ap = aln_param_bwa; + path_t *path, *p; + + // check whether there are too many N's + if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; + for (k = 0, x = 0; k < len; ++k) + if (seq[k] >= 4) ++x; + if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; + + // get reference subsequence + ref_seq = (ubyte_t*)calloc(reglen, 1); + for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + path = (path_t*)calloc(l+len, sizeof(path_t)); + + // do alignment + ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0); + if (ret < 0) { + free(path); free(cigar); free(ref_seq); *n_cigar = 0; + return 0; + } + cigar = bwa_aln_path2cigar(path, path_len, n_cigar); + + // check whether the alignment is good enough + for (k = 0, x = y = 0; k < *n_cigar; ++k) { + bwa_cigar_t c = cigar[k]; + if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c); + else if (__cigar_op(c) == FROM_D) x += __cigar_len(c); + else y += __cigar_len(c); + } + if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough + free(path); free(cigar); free(ref_seq); + *n_cigar = 0; + return 0; 
+ } + + { // update cigar and coordinate; + int start, end; + p = path + path_len - 1; + *beg += (p->i? p->i : 1) - 1; + start = (p->j? p->j : 1) - 1; + end = path->j; + cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); + if (start) { + memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); + cigar[0] = __cigar_create(3, start); + ++(*n_cigar); + } + if (end < len) { + /*cigar[*n_cigar] = 3<<14 | (len - end);*/ + cigar[*n_cigar] = __cigar_create(3, (len - end)); + ++(*n_cigar); + } + } + + { // set *cnt + int n_mm, n_gapo, n_gape; + n_mm = n_gapo = n_gape = 0; + p = path + path_len - 1; + x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; + for (k = 0; k < *n_cigar; ++k) { + bwa_cigar_t c = cigar[k]; + if (__cigar_op(c) == FROM_M) { + for (l = 0; l < (__cigar_len(c)); ++l) + if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm; + x += __cigar_len(c), y += __cigar_len(c); + } else if (__cigar_op(c) == FROM_D) { + x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; + } else if (__cigar_op(c) == FROM_I) { + y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; + } + } + *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; + } + + free(ref_seq); free(path); + return cigar; +} + +ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii) +{ + ubyte_t *pacseq; + int i; + uint64_t n_tot[2], n_mapped[2]; + + // load reference sequence + if (_pacseq == 0) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + rewind(bns->fp_pac); + fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + } else pacseq = (ubyte_t*)_pacseq; + if (!popt->is_sw || ii->avg < 0.0) return pacseq; + + // perform mate alignment + n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && 
(p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ + int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2]; + int64_t beg[2], end[2]; + bwa_cigar_t *cigar[2]; + uint32_t cnt[2]; + + /* In the following, _pref points to the reference read + * which must be aligned; _pmate points to its mate which is + * considered to be modified. */ + +#define __set_rght_coor(_a, _b, _pref, _pmate) do { \ + (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \ + (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ + if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \ + if ((_b) > bns->l_pac) (_b) = bns->l_pac; \ + } while (0) + +#define __set_left_coor(_a, _b, _pref, _pmate) do { \ + (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \ + (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ + if ((_a) < 0) (_a) = 0; \ + if ((_b) > _pref->pos) (_b) = _pref->pos; \ + } while (0) + +#define __set_fixed(_pref, _pmate, _beg, _cnt) do { \ + _pmate->type = BWA_TYPE_MATESW; \ + _pmate->pos = _beg; \ + _pmate->seQ = _pref->seQ; \ + _pmate->strand = (popt->type == BWA_PET_STD)? 1 - _pref->strand : _pref->strand; \ + _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \ + _pmate->extra_flag |= SAM_FPP; \ + _pref->extra_flag |= SAM_FPP; \ + } while (0) + + mq_adjust[0] = mq_adjust[1] = 255; // not effective + is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 
1 : 0; + + ++n_tot[is_singleton]; + cigar[0] = cigar[1] = 0; + n_cigar[0] = n_cigar[1] = 0; + if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered + for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified + ubyte_t *seq; + if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip + if (popt->type == BWA_PET_STD) { + if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate + __set_rght_coor(beg[k], end[k], p[1-k], p[k]); + seq = p[k]->rseq; + } else { // then the mate is on forward stand and has smaller coordinate + __set_left_coor(beg[k], end[k], p[1-k], p[k]); + seq = p[k]->seq; + seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly + } + } else { // BWA_PET_SOLID + if (p[1-k]->strand == 0) { // R3-F3 pairing + if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 + else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 + seq = p[k]->rseq; + seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed + } else { // F3-R3 pairing + if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 + else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 + seq = p[k]->seq; + } + } + // perform SW alignment + cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); + if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k] + int s_old, clip = 0, s_new; + if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]); + if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]); + s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499); + s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. 
+ .499); + s_old += -4.343 * log(ii->ap_prior / bns->l_pac); + s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma + if (s_old < s_new) { // reject SW alignment + mq_adjust[k] = s_new - s_old; + free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0; + } else mq_adjust[k] = s_old - s_new; + } + // now revserse sequence back such that p[*]->seq looks untouched + if (popt->type == BWA_PET_STD) { + if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0); + } else { + if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0); + } + } + k = -1; // no read to be changed + if (cigar[0] && cigar[1]) { + k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed + mapQ = abs(p[1]->mapQ - p[0]->mapQ); + } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ; + else if (cigar[1]) k = 1, mapQ = p[0]->mapQ; + if (k >= 0 && p[k]->pos != beg[k]) { + ++n_mapped[is_singleton]; + { // recalculate mapping quality + int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8; + if (tmp <= 0) tmp = 1; + if (mapQ > tmp) mapQ = tmp; + p[k]->mapQ = p[1-k]->mapQ = mapQ; + p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? 
p[1-k]->seQ : mapQ; + if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k]; + if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k]; + } + // update CIGAR + free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0; + p[k]->n_cigar = n_cigar[k]; + // update the rest of information + __set_fixed(p[1-k], p[k], beg[k], cnt[k]); + } + free(cigar[0]); free(cigar[1]); + } + } + fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n", + (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ); + fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n", + (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ); + return pacseq; +} + +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) +{ + extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); + int i, j, n_seqs, tot_seqs = 0; + bwa_seq_t *seqs[2]; + bwa_seqio_t *ks[2]; + clock_t t; + bntseq_t *bns, *ntbns = 0; + FILE *fp_sa[2]; + gap_opt_t opt; + khint_t iter; + isize_info_t last_ii; // this is for the last batch of reads + char str[1024]; + bwt_t *bwt[2]; + uint8_t *pac; + + // initialization + bwase_initialize(); // initialize g_log_n[] in bwase.c + pac = 0; bwt[0] = bwt[1] = 0; + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); + bns = bns_restore(prefix); + srand48(bns->seed); + fp_sa[0] = xopen(fn_sa[0], "r"); + fp_sa[1] = xopen(fn_sa[1], "r"); + g_hash = kh_init(64); + last_ii.avg = -1.0; + + fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); + ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); + fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! 
+ ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); + if (!(opt.mode & BWA_MODE_COMPREAD)) { + popt->type = BWA_PET_SOLID; + ntbns = bwa_open_nt(prefix); + } else { // for Illumina alignment only + if (popt->is_preload) { + strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]); + strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); + pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + rewind(bns->fp_pac); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + } + } + + // core loop + bwa_print_sam_SQ(bns); + while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { + int cnt_chg; + isize_info_t ii; + ubyte_t *pacseq; + + seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual); + tot_seqs += n_seqs; + t = clock(); + + fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n"); + cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); + fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg); + + fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n"); + pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii); + fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); + for (j = 0; j < 2; ++j) + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + if (pac == 0) free(pacseq); + + fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... 
"); + for (i = 0; i < n_seqs; ++i) { + bwa_print_sam1(bns, seqs[0] + i, seqs[1] + i, opt.mode, opt.max_top2); + bwa_print_sam1(bns, seqs[1] + i, seqs[0] + i, opt.mode, opt.max_top2); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + for (j = 0; j < 2; ++j) + bwa_free_read_seq(n_seqs, seqs[j]); + fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs); + last_ii = ii; + } + + // destroy + bns_destroy(bns); + if (ntbns) bns_destroy(ntbns); + for (i = 0; i < 2; ++i) { + bwa_seq_close(ks[i]); + fclose(fp_sa[i]); + } + for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) + if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); + kh_destroy(64, g_hash); + if (pac) { + free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + } +} + +int bwa_sai2sam_pe(int argc, char *argv[]) +{ + extern char *bwa_rg_line, *bwa_rg_id; + extern int bwa_set_rg(const char *s); + int c; + pe_opt_t *popt; + popt = bwa_init_pe_opt(); + while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { + switch (c) { + case 'r': + if (bwa_set_rg(optarg) < 0) { + fprintf(stderr, "[%s] malformated @RG line\n", __func__); + return 1; + } + break; + case 'a': popt->max_isize = atoi(optarg); break; + case 'o': popt->max_occ = atoi(optarg); break; + case 's': popt->is_sw = 0; break; + case 'P': popt->is_preload = 1; break; + case 'n': popt->n_multi = atoi(optarg); break; + case 'N': popt->N_multi = atoi(optarg); break; + case 'c': popt->ap_prior = atof(optarg); break; + case 'f': freopen(optarg, "w", stdout); break; + case 'A': popt->force_isize = 1; break; + default: return 1; + } + } + + if (optind + 5 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa sampe [options] \n\n"); + fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize); + fprintf(stderr, " -o INT maximum occurrences for one end [%d]\n", popt->max_occ); + fprintf(stderr, " -n INT maximum hits to output for paired reads 
[%d]\n", popt->n_multi); + fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi); + fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior); + fprintf(stderr, " -f FILE sam file to output results to [stdout]\n"); + fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); + fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); + fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); + fprintf(stderr, "Notes: 1. For SOLiD reads, corresponds R3 reads and to F3.\n"); + fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); + fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); + fprintf(stderr, "\n"); + return 1; + } + bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt); + free(bwa_rg_line); free(bwa_rg_id); + free(popt); + return 0; +} diff --git a/bwase.c b/bwase.c new file mode 100644 index 0000000..937aacf --- /dev/null +++ b/bwase.c @@ -0,0 +1,677 @@ +#include +#include +#include +#include +#include +#include +#include "stdaln.h" +#include "bwase.h" +#include "bwtaln.h" +#include "bntseq.h" +#include "utils.h" +#include "kstring.h" + +int g_log_n[256]; +char *bwa_rg_line, *bwa_rg_id; + +void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) +{ + int i, cnt, best; + if (n_aln == 0) { + s->type = BWA_TYPE_NO_MATCH; + s->c1 = s->c2 = 0; + return; + } + + if (set_main) { + best = aln[0].score; + for (i = cnt = 0; i < n_aln; ++i) { + const bwt_aln1_t *p = aln + i; + if (p->score > best) break; + if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { + s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a; + s->score = p->score; + s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); + } 
+ cnt += p->l - p->k + 1; + } + s->c1 = cnt; + for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1; + s->c2 = cnt - s->c1; + s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; + } + + if (n_multi) { + int k, rest, n_occ, z = 0; + for (k = n_occ = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + n_occ += q->l - q->k + 1; + } + if (s->multi) free(s->multi); + if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them + s->multi = 0; s->n_multi = 0; + return; + } + /* The following code is more flexible than what is required + * here. In principle, due to the requirement above, we can + * simply output all hits, but the following samples "rest" + * number of random hits. */ + rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa + s->multi = calloc(rest, sizeof(bwt_multi1_t)); + for (k = 0; k < n_aln; ++k) { + const bwt_aln1_t *q = aln + k; + if (q->l - q->k + 1 <= rest) { + bwtint_t l; + for (l = q->k; l <= q->l; ++l) { + s->multi[z].pos = l; + s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].mm = q->n_mm; + s->multi[z++].strand = q->a; + } + rest -= q->l - q->k + 1; + } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. + int j, i, k; + for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) { + double p = 1.0, x = drand48(); + while (x < p) p -= p * j / (i--); + s->multi[z].pos = q->l - i; + s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].mm = q->n_mm; + s->multi[z++].strand = q->a; + } + rest = 0; + break; + } + } + s->n_multi = z; + for (k = z = 0; k < s->n_multi; ++k) + if (s->multi[k].pos != s->sa) + s->multi[z++] = s->multi[k]; + s->n_multi = z < n_multi? 
z : n_multi; + } +} + +void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) +{ + bwa_aln2seq_core(n_aln, aln, s, 1, 0); +} + +int bwa_approx_mapQ(const bwa_seq_t *p, int mm) +{ + int n; + if (p->c1 == 0) return 23; + if (p->c1 > 1) return 0; + if (p->n_mm == mm) return 25; + if (p->c2 == 0) return 37; + n = (p->c2 >= 255)? 255 : p->c2; + return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; +} + +/** + * Derive the actual position in the read from the given suffix array + * coordinates. Note that the position will be approximate based on + * whether indels appear in the read and whether calculations are + * performed from the start or end of the read. + */ +void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr) +{ + int max_diff; + if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; + max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; + if (seq->strand) { // reverse strand only + seq->pos = bwt_sa(forward_bwt, seq->sa); + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + } else { // forward strand only + /* NB: For gapped alignment, p->pos may not be correct, which + * will be fixed in refine_gapped_core(). This line also + * determines the way "x" is calculated in + * refine_gapped_core() when (ext < 0 && is_end == 0). 
*/ + seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len); + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + } +} + +void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) +{ + int i, j; + char str[1024]; + bwt_t *bwt; + // load forward SA + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); + for (i = 0; i != n_seqs; ++i) { + if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr); + for (j = 0; j < seqs[i].n_multi; ++j) { + bwt_multi1_t *p = seqs[i].multi + j; + if (p->strand) p->pos = bwt_sa(bwt, p->pos); + } + } + bwt_destroy(bwt); + // load reverse BWT and SA + strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt); + for (i = 0; i != n_seqs; ++i) { + if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr); + for (j = 0; j < seqs[i].n_multi; ++j) { + bwt_multi1_t *p = seqs[i].multi + j; + if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len); + } + } + bwt_destroy(bwt); +} + +/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on + * forward strand. This happens when p->pos is calculated by + * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct + * coordinate. This happens only for color-converted alignment. */ +static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, + int ext, int *n_cigar, int is_end_correct) +{ + bwa_cigar_t *cigar = 0; + ubyte_t *ref_seq; + int l = 0, path_len, ref_len; + AlnParam ap = aln_param_bwa; + path_t *path; + int64_t k, __pos = *_pos > l_pac? 
(int64_t)((int32_t)*_pos) : *_pos; + + ref_len = len + abs(ext); + if (ext > 0) { + ref_seq = (ubyte_t*)calloc(ref_len, 1); + for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + } else { + int64_t x = __pos + (is_end_correct? len : ref_len); + ref_seq = (ubyte_t*)calloc(ref_len, 1); + for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) + ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + } + path = (path_t*)calloc(l+len, sizeof(path_t)); + + aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); + cigar = bwa_aln_path2cigar(path, path_len, n_cigar); + + if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand + for (l = k = 0; k < *n_cigar; ++k) { + if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); + else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); + } + __pos += l; + } + + if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end + __pos += __cigar_len(cigar[0]); + for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; + --(*n_cigar); + } + if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end + + // change "I" at either end of the read to S. just in case. This should rarely happen... 
+ if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1]))); + if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); + + *_pos = (bwtint_t)__pos; + free(ref_seq); free(path); + return cigar; +} + +char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, + bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm) +{ + bwtint_t x, y; + int z, u, c, nm = 0; + str->l = 0; // reset + x = pos; y = 0; + if (cigar) { + int k, l; + for (k = u = 0; k < n_cigar; ++k) { + l = __cigar_len(cigar[k]); + if (__cigar_op(cigar[k]) == FROM_M) { + for (z = 0; z < l && x+z < l_pac; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + x += l; y += l; +/* } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { */ + } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) { + y += l; + if (__cigar_op(cigar[k]) == FROM_I) nm += l; + } else if (__cigar_op(cigar[k]) == FROM_D) { + ksprintf(str, "%d", u); + kputc('^', str); + for (z = 0; z < l && x+z < l_pac; ++z) + kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str); + u = 0; + x += l; nm += l; + } + } + } else { // no gaps + for (z = u = 0; z < (bwtint_t)len; ++z) { + c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { + ksprintf(str, "%d", u); + kputc("ACGTN"[c], str); + ++nm; + u = 0; + } else ++u; + } + } + ksprintf(str, "%d", u); + *_nm = nm; + return strdup(str->s); +} + +void bwa_correct_trimmed(bwa_seq_t *s) +{ + if (s->len == s->full_len) return; + if (s->strand == 0) { // forward + if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S + s->cigar[s->n_cigar-1] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, 
sizeof(bwa_cigar_t)); + s->cigar[0] = __cigar_create(0, s->len); + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + } + s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len)); + } + } else { // reverse + if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S + s->cigar[0] += s->full_len - s->len; + } else { + if (s->cigar == 0) { + s->n_cigar = 2; + s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar[1] = __cigar_create(0, s->len); + } else { + ++s->n_cigar; + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t)); + } + s->cigar[0] = __cigar_create(3, (s->full_len - s->len)); + } + } + s->len = s->full_len; +} + +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +{ + ubyte_t *pacseq, *ntpac = 0; + int i, j; + kstring_t *str; + + if (ntbns) { // in color space + ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); + rewind(ntbns->fp_pac); + fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); + } + + if (!_pacseq) { + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); + rewind(bns->fp_pac); + fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + } else pacseq = _pacseq; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! + for (j = 0; j < s->n_multi; ++j) { + bwt_multi1_t *q = s->multi + j; + int n_cigar; + if (q->gap == 0) continue; + q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, + (q->strand? 1 : -1) * q->gap, &n_cigar, 1); + q->n_cigar = n_cigar; + } + if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; + s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, + (s->strand? 
1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); + } + + if (ntbns) { // in color space + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + bwa_cs2nt_core(s, bns->l_pac, ntpac); + for (j = 0; j < s->n_multi; ++j) { + bwt_multi1_t *q = s->multi + j; + int n_cigar; + if (q->gap == 0) continue; + free(q->cigar); + q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, + (q->strand? 1 : -1) * q->gap, &n_cigar, 0); + q->n_cigar = n_cigar; + } + if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again + free(s->cigar); + s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, + (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); + } + } + } + + // generate MD tag + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *s = seqs + i; + if (s->type != BWA_TYPE_NO_MATCH) { + int nm; + s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, + bns->l_pac, ntbns? 
ntpac : pacseq, str, &nm); + s->nm = nm; + } + } + free(str->s); free(str); + + // correct for trimmed reads + if (!ntbns) // trimming is only enabled for Illumina reads + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + + if (!_pacseq) free(pacseq); + free(ntpac); +} + +int64_t pos_end(const bwa_seq_t *p) +{ + if (p->cigar) { + int j; + int64_t x = p->pos; + for (j = 0; j != p->n_cigar; ++j) { + int op = __cigar_op(p->cigar[j]); + if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); + } + return x; + } else return p->pos + p->len; +} + +int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end() +{ + if (p->cigar) { + int j; + int64_t x = p->pos; + for (j = 0; j != p->n_cigar; ++j) { + int op = __cigar_op(p->cigar[j]); + if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); + } + return x; + } else return p->pos + len; +} + +static int64_t pos_5(const bwa_seq_t *p) +{ + if (p->type != BWA_TYPE_NO_MATCH) + return p->strand? pos_end(p) : p->pos; + return -1; +} + +void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) +{ + int j; + if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { + int seqid, nn, am = 0, flag = p->extra_flag; + char XT; + + if (p->type == BWA_TYPE_NO_MATCH) { + p->pos = mate->pos; + p->strand = mate->strand; + flag |= SAM_FSU; + j = 1; + } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment + + // get seqid + nn = bns_coor_pac2real(bns, p->pos, j, &seqid); + if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len) + flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences + + // update flag and print it + if (p->strand) flag |= SAM_FSR; + if (mate) { + if (mate->type != BWA_TYPE_NO_MATCH) { + if (mate->strand) flag |= SAM_FMR; + } else flag |= SAM_FMU; + } + printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); + 
printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); + + // print CIGAR + if (p->cigar) { + for (j = 0; j != p->n_cigar; ++j) + printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]); + } else if (p->type == BWA_TYPE_NO_MATCH) printf("*"); + else printf("%dM", p->len); + + // print mate coordinate + if (mate && mate->type != BWA_TYPE_NO_MATCH) { + int m_seqid, m_is_N; + long long isize; + am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality + // redundant calculation here, but should not matter too much + m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid); + printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); + isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; + if (p->type == BWA_TYPE_NO_MATCH) isize = 0; + printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); + } else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); + else printf("\t*\t0\t0\t"); + + // print sequence and quality + if (p->strand == 0) + for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]); + else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]); + putchar('\t'); + if (p->qual) { + if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality + printf("%s", p->qual); + } else printf("*"); + + if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); + if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); + if (p->type != BWA_TYPE_NO_MATCH) { + int i; + // calculate XT tag + XT = "NURM"[p->type]; + if (nn > 10) XT = 'N'; + // print tags + printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? 
"NM" : "CM", p->nm); + if (nn) printf("\tXN:i:%d", nn); + if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); + if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment + printf("\tX0:i:%d", p->c1); + if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2); + } + printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); + if (p->md) printf("\tMD:Z:%s", p->md); + // print multiple hits + if (p->n_multi) { + printf("\tXA:Z:"); + for (i = 0; i < p->n_multi; ++i) { + bwt_multi1_t *q = p->multi + i; + int k; + j = pos_end_multi(q, p->len) - q->pos; + nn = bns_coor_pac2real(bns, q->pos, j, &seqid); + printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', + (int)(q->pos - bns->anns[seqid].offset + 1)); + if (q->cigar) { + for (k = 0; k < q->n_cigar; ++k) + printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]); + } else printf("%dM", p->len); + printf(",%d;", q->gap + q->mm); + } + } + } + putchar('\n'); + } else { // this read has no match + ubyte_t *s = p->strand? 
p->rseq : p->seq; + int flag = p->extra_flag | SAM_FSU; + if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; + printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); + for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); + putchar('\t'); + if (p->qual) { + if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality + printf("%s", p->qual); + } else printf("*"); + if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); + putchar('\n'); + } +} + +bntseq_t *bwa_open_nt(const char *prefix) +{ + bntseq_t *ntbns; + char *str; + str = (char*)calloc(strlen(prefix) + 10, 1); + strcat(strcpy(str, prefix), ".nt"); + ntbns = bns_restore(str); + free(str); + return ntbns; +} + +void bwa_print_sam_SQ(const bntseq_t *bns) +{ + int i; + for (i = 0; i < bns->n_seqs; ++i) + printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (bwa_rg_line) printf("%s\n", bwa_rg_line); +} + +void bwase_initialize() +{ + int i; + for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); +} + +char *bwa_escape(char *s) +{ + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + ++p; + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; + } + *q = '\0'; + return s; +} + +int bwa_set_rg(const char *s) +{ + char *p, *q, *r; + if (strstr(s, "@RG") != s) return -1; + if (bwa_rg_line) free(bwa_rg_line); + if (bwa_rg_id) free(bwa_rg_id); + bwa_rg_line = strdup(s); + bwa_rg_id = 0; + bwa_escape(bwa_rg_line); + p = strstr(bwa_rg_line, "\tID:"); + if (p == 0) return -1; + p += 4; + for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + bwa_rg_id = calloc(q - p + 1, 1); + for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + return 0; +} + +void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) +{ + extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); + int i, n_seqs, 
tot_seqs = 0, m_aln; + bwt_aln1_t *aln = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bntseq_t *bns, *ntbns = 0; + FILE *fp_sa; + gap_opt_t opt; + + // initialization + bwase_initialize(); + bns = bns_restore(prefix); + srand48(bns->seed); + fp_sa = xopen(fn_sa, "r"); + + m_aln = 0; + fread(&opt, sizeof(gap_opt_t), 1, fp_sa); + if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac + ntbns = bwa_open_nt(prefix); + bwa_print_sam_SQ(bns); + // set ks + ks = bwa_open_reads(opt.mode, fn_fa); + // core loop + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + // read alignment + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + int n_aln; + fread(&n_aln, 4, 1, fp_sa); + if (n_aln > m_aln) { + m_aln = n_aln; + aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); + } + fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); + bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); + } + + fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... "); + bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); + bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + fprintf(stderr, "[bwa_aln_core] print alignments... 
"); + for (i = 0; i < n_seqs; ++i) + bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwa_seq_close(ks); + if (ntbns) bns_destroy(ntbns); + bns_destroy(bns); + fclose(fp_sa); + free(aln); +} + +int bwa_sai2sam_se(int argc, char *argv[]) +{ + int c, n_occ = 3; + while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { + switch (c) { + case 'h': break; + case 'r': + if (bwa_set_rg(optarg) < 0) { + fprintf(stderr, "[%s] malformated @RG line\n", __func__); + return 1; + } + break; + case 'n': n_occ = atoi(optarg); break; + case 'f': freopen(optarg, "w", stdout); break; + default: return 1; + } + } + + if (optind + 3 > argc) { + fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); + return 1; + } + bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ); + free(bwa_rg_line); free(bwa_rg_id); + return 0; +} diff --git a/bwase.h b/bwase.h new file mode 100644 index 0000000..28ba224 --- /dev/null +++ b/bwase.h @@ -0,0 +1,27 @@ +#ifndef BWASE_H +#define BWASE_H + +#include "bntseq.h" +#include "bwt.h" +#include "bwtaln.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // Initialize mapping tables in the bwa single-end mapper. + void bwase_initialize(); + // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. + void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr); + // Refine the approximate position of the sequence to an actual placement for the sequence. + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); + // Backfill certain alignment properties mainly centering around number of matches. 
+ void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + // Calculate the end position of a read given a certain sequence. + int64_t pos_end(const bwa_seq_t *p); + +#ifdef __cplusplus +} +#endif + +#endif // BWASE_H diff --git a/bwaseqio.c b/bwaseqio.c new file mode 100644 index 0000000..07a3082 --- /dev/null +++ b/bwaseqio.c @@ -0,0 +1,198 @@ +#include +#include "bwtaln.h" +#include "utils.h" +#include "bamlite.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +extern unsigned char nst_nt4_table[256]; +static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +struct __bwa_seqio_t { + // for BAM input + int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE + bamFile fp; + // for fastq input + kseq_t *ks; +}; + +bwa_seqio_t *bwa_bam_open(const char *fn, int which) +{ + bwa_seqio_t *bs; + bam_header_t *h; + bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + bs->is_bam = 1; + bs->which = which; + bs->fp = bam_open(fn, "r"); + h = bam_header_read(bs->fp); + bam_header_destroy(h); + return bs; +} + +bwa_seqio_t *bwa_seq_open(const char *fn) +{ + gzFile fp; + bwa_seqio_t *bs; + bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + fp = xzopen(fn, "r"); + bs->ks = kseq_init(fp); + return bs; +} + +void bwa_seq_close(bwa_seqio_t *bs) +{ + if (bs == 0) return; + if (bs->is_bam) bam_close(bs->fp); + else { + gzclose(bs->ks->f->f); + kseq_destroy(bs->ks); + } + free(bs); +} + +void seq_reverse(int len, ubyte_t *seq, int is_comp) +{ + int i; + if (is_comp) { + for (i = 0; i < len>>1; ++i) { + char tmp = seq[len-1-i]; + if (tmp < 4) tmp = 3 - tmp; + seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; + seq[i] = tmp; + } + if (len&1) seq[i] = (seq[i] >= 4)? 
seq[i] : 3 - seq[i]; + } else { + for (i = 0; i < len>>1; ++i) { + char tmp = seq[len-1-i]; + seq[len-1-i] = seq[i]; seq[i] = tmp; + } + } +} + +int bwa_trim_read(int trim_qual, bwa_seq_t *p) +{ + int s = 0, l, max = 0, max_l = p->len - 1; + if (trim_qual < 1 || p->qual == 0) return 0; + for (l = p->len - 1; l >= BWA_MIN_RDLEN - 1; --l) { + s += trim_qual - (p->qual[l] - 33); + if (s < 0) break; + if (s > max) { + max = s; max_l = l; + } + } + p->clip_len = p->len = max_l + 1; + return p->full_len - p->len; +} + +static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) +{ + bwa_seq_t *seqs, *p; + int n_seqs, l, i; + long n_trimmed = 0, n_tot = 0; + bam1_t *b; + + b = bam_init1(); + n_seqs = 0; + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + while (bam_read1(bs->fp, b) >= 0) { + uint8_t *s, *q; + int go = 0; + if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; + if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1; + if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1; + if (go == 0) continue; + l = b->core.l_qseq; + p = &seqs[n_seqs++]; + p->tid = -1; // no assigned to a thread + p->qual = 0; + p->full_len = p->clip_len = p->len = l; + n_tot += p->full_len; + s = bam1_seq(b); q = bam1_qual(b); + p->seq = (ubyte_t*)calloc(p->len + 1, 1); + p->qual = (ubyte_t*)calloc(p->len + 1, 1); + for (i = 0; i != p->full_len; ++i) { + p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; + p->qual[i] = q[i] + 33 < 126? 
q[i] + 33 : 126; + } + if (bam1_strand(b)) { // then reverse + seq_reverse(p->len, p->seq, 1); + seq_reverse(p->len, p->qual, 0); + } + if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); + p->rseq = (ubyte_t*)calloc(p->full_len, 1); + memcpy(p->rseq, p->seq, p->len); + seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() + seq_reverse(p->len, p->rseq, is_comp); + p->name = strdup((const char*)bam1_qname(b)); + if (n_seqs == n_needed) break; + } + *n = n_seqs; + if (n_seqs && trim_qual >= 1) + fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); + if (n_seqs == 0) { + free(seqs); + bam_destroy1(b); + return 0; + } + bam_destroy1(b); + return seqs; +} + + bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) +{ + bwa_seq_t *seqs, *p; + kseq_t *seq = bs->ks; + int n_seqs, l, i; + long n_trimmed = 0, n_tot = 0; + + if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); + n_seqs = 0; + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + while ((l = kseq_read(seq)) >= 0) { + p = &seqs[n_seqs++]; + p->tid = -1; // no assigned to a thread + p->qual = 0; + p->full_len = p->clip_len = p->len = l; + n_tot += p->full_len; + p->seq = (ubyte_t*)calloc(p->len, 1); + for (i = 0; i != p->full_len; ++i) + p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; + if (seq->qual.l) { // copy quality + p->qual = (ubyte_t*)strdup((char*)seq->qual.s); + if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); + } + p->rseq = (ubyte_t*)calloc(p->full_len, 1); + memcpy(p->rseq, p->seq, p->len); + seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() + seq_reverse(p->len, p->rseq, is_comp); + p->name = strdup((const char*)seq->name.s); + { // trim /[12]$ + int t = strlen(p->name); + if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; + } + if (n_seqs == 
n_needed) break; + } + *n = n_seqs; + if (n_seqs && trim_qual >= 1) + fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); + if (n_seqs == 0) { + free(seqs); + return 0; + } + return seqs; +} + +void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) +{ + int i, j; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + for (j = 0; j < p->n_multi; ++j) + if (p->multi[j].cigar) free(p->multi[j].cigar); + free(p->name); + free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); + free(p->cigar); + } + free(seqs); +} diff --git a/bwt.c b/bwt.c new file mode 100644 index 0000000..10b838a --- /dev/null +++ b/bwt.c @@ -0,0 +1,250 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include "utils.h" +#include "bwt.h" + +void bwt_gen_cnt_table(bwt_t *bwt) +{ + int i, j; + for (i = 0; i != 256; ++i) { + uint32_t x = 0; + for (j = 0; j != 4; ++j) + x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); + bwt->cnt_table[i] = x; + } +} + +// bwt->bwt and bwt->occ must be precalculated +void bwt_cal_sa(bwt_t *bwt, int intv) +{ + bwtint_t isa, sa, i; // S(isa) = sa + + xassert(bwt->bwt, "bwt_t::bwt is not initialized."); + + if (bwt->sa) free(bwt->sa); + bwt->sa_intv = intv; + bwt->n_sa = (bwt->seq_len + intv) / intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + // calculate SA value + isa = 0; sa = bwt->seq_len; + for (i = 0; i < bwt->seq_len; ++i) { + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + --sa; + isa = bwt_invPsi(bwt, isa); + } + if (isa % intv == 0) bwt->sa[isa/intv] = sa; + bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len +} + +bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) +{ + bwtint_t sa = 0; + while (k % bwt->sa_intv != 0) { + ++sa; + k = bwt_invPsi(bwt, k); + } + /* without setting bwt->sa[0] = -1, the following line should be + changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ + return sa + bwt->sa[k/bwt->sa_intv]; +} + +static inline int __occ_aux(uint64_t y, int c) +{ + // reduce nucleotide counting to bits counting + y = ((c&2)? y : ~y) >> 1 & ((c&1)? 
y : ~y) & 0x5555555555555555ull; + // count the number of 1s in y + y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); + return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; +} + +inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) +{ + bwtint_t n, l, j; + uint32_t *p; + + if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; + if (k == (bwtint_t)(-1)) return 0; + if (k >= bwt->primary) --k; // because $ is not in bwt + + // retrieve Occ at k/OCC_INTERVAL + n = (p = bwt_occ_intv(bwt, k))[c]; + p += 4; // jump to the start of the first BWT cell + + // calculate Occ up to the last k/32 + j = k >> 5 << 5; + for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) + n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + + // calculate Occ + n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); + if (c == 0) n -= ~k&31; // corrected for the masked bits + + return n; +} + +// an analogy to bwt_occ() but more efficient, requiring k <= l +inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) +{ + bwtint_t _k, _l; + if (k == l) { + *ok = *ol = bwt_occ(bwt, k, c); + return; + } + _k = (k >= bwt->primary)? k-1 : k; + _l = (l >= bwt->primary)? 
l-1 : l; + if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + *ok = bwt_occ(bwt, k, c); + *ol = bwt_occ(bwt, l, c); + } else { + bwtint_t m, n, i, j; + uint32_t *p; + if (k >= bwt->primary) --k; + if (l >= bwt->primary) --l; + n = (p = bwt_occ_intv(bwt, k))[c]; + p += 4; + // calculate *ok + j = k >> 5 << 5; + for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2) + n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + m = n; + n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); + if (c == 0) n -= ~k&31; // corrected for the masked bits + *ok = n; + // calculate *ol + j = l >> 5 << 5; + for (; i < j; i += 32, p += 2) + m += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c); + if (c == 0) m -= ~l&31; // corrected for the masked bits + *ol = m; + } +} + +#define __occ_aux4(bwt, b) \ + ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ + + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) + +inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) +{ + bwtint_t l, j, x; + uint32_t *p; + if (k == (bwtint_t)(-1)) { + memset(cnt, 0, 4 * sizeof(bwtint_t)); + return; + } + if (k >= bwt->primary) --k; // because $ is not in bwt + p = bwt_occ_intv(bwt, k); + memcpy(cnt, p, 16); + p += 4; + j = k >> 4 << 4; + for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) + x += __occ_aux4(bwt, *p); + x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; +} + +// an analogy to bwt_occ4() but more efficient, requiring k <= l +inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) +{ + bwtint_t _k, _l; + if (k == l) { + bwt_occ4(bwt, k, cntk); + memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); + return; + } + _k = (k >= bwt->primary)? k-1 : k; + _l = (l >= bwt->primary)? 
l-1 : l; + if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + bwt_occ4(bwt, k, cntk); + bwt_occ4(bwt, l, cntl); + } else { + bwtint_t i, j, x, y; + uint32_t *p; + int cl[4]; + if (k >= bwt->primary) --k; // because $ is not in bwt + if (l >= bwt->primary) --l; + cl[0] = cl[1] = cl[2] = cl[3] = 0; + p = bwt_occ_intv(bwt, k); + memcpy(cntk, p, 4 * sizeof(bwtint_t)); + p += 4; + // prepare cntk[] + j = k >> 4 << 4; + for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p) + x += __occ_aux4(bwt, *p); + y = x; + x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + // calculate cntl[] and finalize cntk[] + j = l >> 4 << 4; + for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); + y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); + memcpy(cntl, cntk, 16); + cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; + cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; + } +} + +int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end) +{ + bwtint_t k, l, ok, ol; + int i; + k = 0; l = bwt->seq_len; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) break; // no match + } + if (k > l) return 0; // no match + if (sa_begin) *sa_begin = k; + if (sa_end) *sa_end = l; + return l - k + 1; +} + +int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0) +{ + int i; + bwtint_t k, l, ok, ol; + k = *k0; l = *l0; + for (i = len - 1; i >= 0; --i) { + ubyte_t c = str[i]; + if (c > 3) return 0; // there is an N here. 
no match + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; + if (k > l) return 0; // no match + } + *k0 = k; *l0 = l; + return l - k + 1; +} diff --git a/bwt.h b/bwt.h new file mode 100644 index 0000000..4aef38d --- /dev/null +++ b/bwt.h @@ -0,0 +1,105 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* Contact: Heng Li */ + +#ifndef BWA_BWT_H +#define BWA_BWT_H + +#include + +// requirement: (OCC_INTERVAL%16 == 0) +#define OCC_INTERVAL 0x80 + +#ifndef BWA_UBYTE +#define BWA_UBYTE +typedef unsigned char ubyte_t; +#endif +typedef uint32_t bwtint_t; + +typedef struct { + bwtint_t primary; // S^{-1}(0), or the primary index of BWT + bwtint_t L2[5]; // C(), cumulative count + bwtint_t seq_len; // sequence length + bwtint_t bwt_size; // size of bwt, about seq_len/4 + uint32_t *bwt; // BWT + // occurance array, separated to two parts + uint32_t cnt_table[256]; + // suffix array + int sa_intv; + bwtint_t n_sa; + bwtint_t *sa; +} bwt_t; + +#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL*12 + 4 + (k)%OCC_INTERVAL/16]) + +/* retrieve a character from the $-removed BWT string. Note that + * bwt_t::bwt is not exactly the BWT string and therefore this macro is + * called bwt_B0 instead of bwt_B */ +#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) + +#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL*12) + +// inverse Psi function +#define bwt_invPsi(bwt, k) \ + (((k) == (bwt)->primary)? 0 : \ + ((k) < (bwt)->primary)? 
\ + (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ + : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) + +#ifdef __cplusplus +extern "C" { +#endif + + void bwt_dump_bwt(const char *fn, const bwt_t *bwt); + void bwt_dump_sa(const char *fn, const bwt_t *bwt); + + bwt_t *bwt_restore_bwt(const char *fn); + void bwt_restore_sa(const char *fn, bwt_t *bwt); + + void bwt_destroy(bwt_t *bwt); + + void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW + void bwt_cal_sa(bwt_t *bwt, int intv); + + void bwt_bwtupdate_core(bwt_t *bwt); + + inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); + inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); + bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k); + + // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values + void bwt_gen_cnt_table(bwt_t *bwt); + inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); + inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); + + int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); + int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwt_gen/Makefile b/bwt_gen/Makefile new file mode 100644 index 0000000..131b1c9 --- /dev/null +++ b/bwt_gen/Makefile @@ -0,0 +1,23 @@ +CC= gcc +CFLAGS= -g -Wall -O2 -m64 # comment out `-m64' for 32-bit compilation +DFLAGS= -D_FILE_OFFSET_BITS=64 +OBJS= bwt_gen.o QSufSort.o +INCLUDES= +VERSION= 0.1.0 +LIBS= +SUBDIRS= + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +lib:libbwtgen.a + +libbwtgen.a:$(OBJS) + $(AR) -cru $@ $(OBJS) + +cleanlocal: + rm -f gmon.out *.o a.out $(PROG) *~ *.a + +clean:cleanlocal diff --git a/bwt_gen/QSufSort.c b/bwt_gen/QSufSort.c new file mode 100644 index 
0000000..5bf35de --- /dev/null +++ b/bwt_gen/QSufSort.c @@ -0,0 +1,496 @@ +/* QSufSort.c + + Original source from qsufsort.c + + Copyright 1999, N. Jesper Larsson, all rights reserved. + + This file contains an implementation of the algorithm presented in "Faster + Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko + Sadakane (sada@is.s.u-tokyo.ac.jp). + + This software may be used freely for any purpose. However, when distributed, + the original source must be clearly stated, and, when the source code is + distributed, the copyright notice must be retained and any alterations in + the code must be clearly marked. No warranty is given regarding the quality + of this software. + + Modified by Wong Chi-Kwong, 2004 + + Changes summary: - Used long variable and function names + - Removed global variables + - Replace pointer references with array references + - Used insertion sort in place of selection sort and increased insertion sort threshold + - Reconstructing suffix array from inverse becomes an option + - Add handling where end-of-text symbol is not necessary < all characters + - Removed codes for supporting alphabet size > number of characters + + No warrenty is given regarding the quality of the modifications. 
+ +*/ + + +#include +#include +#include +#include "bwt_gen.h" +#include "QSufSort.h" + +// Static functions +static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, + const int highestPos, const int numSortedChar); +static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos, + const int highestPos, const int numSortedChar); +static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, + const int highestPos, const int numSortedChar); +static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize); +static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, + const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated); + +// from MiscUtilities.c +static unsigned int leadingZero(const unsigned int input) { + + unsigned int l; + const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + if (input & 0xFFFF0000) { + if (input & 0xFF000000) { + l = leadingZero8bit[input >> 24]; + } else { + l = 8 + leadingZero8bit[input >> 16]; + } + } else { + if (input & 0x0000FF00) { + l = 16 + leadingZero8bit[input >> 8]; + } else { + l = 24 + leadingZero8bit[input]; + } + } + return l; + +} + +/* Makes suffix array p of x. x becomes inverse of p. p and x are both of size + n+1. Contents of x[0...n-1] are integers in the range l...k-1. 
Original + contents of x[n] is disregarded, the n-th symbol being regarded as + end-of-string smaller than all other symbols.*/ +void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, + const int smallestInputSymbol, const int skipTransform) { + + int i, j; + int s, negatedSortedGroupLength; + int numSymbolAggregated; + int maxNumInputSymbol; + int numSortedPos = 1; + int newAlphabetSize; + + maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; + + if (!skipTransform) { + /* bucketing possible*/ + newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, + numChar, &numSymbolAggregated); + QSufSortBucketSort(V, I, numChar, newAlphabetSize); + I[0] = -1; + V[numChar] = 0; + numSortedPos = numSymbolAggregated; + } + + while ((int)(I[0]) >= -(int)numChar) { + i = 0; + negatedSortedGroupLength = 0; + do { + s = I[i]; + if (s < 0) { + i -= s; /* skip over sorted group.*/ + negatedSortedGroupLength += s; + } else { + if (negatedSortedGroupLength) { + I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */ + negatedSortedGroupLength = 0; + } + j = V[s] + 1; + QSufSortSortSplit(V, I, i, j - 1, numSortedPos); + i = j; + } + } while (i <= numChar); + if (negatedSortedGroupLength) { + /* array ends with a sorted group.*/ + I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/ + } + numSortedPos *= 2; /* double sorted-depth.*/ + } + +} + +void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) { + + int i; + for (i=0; i<=numChar; i++) { + I[V[i]] = i + 1; + } + +} + +/* Sorting routine called for each unsorted group. Sorts the array of integers + (suffix numbers) of length n starting at p. 
The algorithm is a ternary-split + quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", + Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This + function is based on Program 7.*/ +static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, + const int highestPos, const int numSortedChar) { + + int a, b, c, d; + int l, m; + int f, v, s, t; + int tmp; + int numItem; + + #ifdef DEBUG + if (lowestPos > highestPos) { + fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n"); + exit(1); + } + #endif + + numItem = highestPos - lowestPos + 1; + + if (numItem <= INSERT_SORT_NUM_ITEM) { + QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar); + return; + } + + v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar); + + a = b = lowestPos; + c = d = highestPos; + + while (TRUE) { + while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { + if (f == v) { + swap(I[a], I[b], tmp); + a++; + } + b++; + } + while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) { + if (f == v) { + swap(I[c], I[d], tmp); + d--; + } + c--; + } + if (b > c) { + break; + } + swap(I[b], I[c], tmp); + b++; + c--; + } + + s = a - lowestPos; + t = b - a; + s = min(s, t); + for (l = lowestPos, m = b - s; m < b; l++, m++) { + swap(I[l], I[m], tmp); + } + + s = d - c; + t = highestPos - d; + s = min(s, t); + for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) { + swap(I[l], I[m], tmp); + } + + s = b - a; + t = d - c; + if (s > 0) { + QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar); + } + + // Update group number for equal portion + a = lowestPos + s; + b = highestPos - t; + if (a == b) { + // Sorted group + V[I[a]] = a; + I[a] = -1; + } else { + // Unsorted group + for (c=a; c<=b; c++) { + V[I[c]] = b; + } + } + + if (t > 0) { + QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar); + } + +} + +/* Algorithm by Bentley & McIlroy.*/ +static int 
QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos, + const int highestPos, const int numSortedChar) { + + int m; + int keyl, keym, keyn; + int key1, key2, key3; + int s; + int numItem; + + #ifdef DEBUG + if (lowestPos > highestPos) { + fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n"); + exit(1); + } + #endif + + numItem = highestPos - lowestPos + 1; + + #ifdef DEBUG + if (numItem <= INSERT_SORT_NUM_ITEM) { + fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n"); + exit(1); + } + #endif + + m = lowestPos + numItem / 2; + + s = numItem / 8; + key1 = KEY(V, I, lowestPos, numSortedChar); + key2 = KEY(V, I, lowestPos+s, numSortedChar); + key3 = KEY(V, I, lowestPos+2*s, numSortedChar); + keyl = med3(key1, key2, key3); + key1 = KEY(V, I, m-s, numSortedChar); + key2 = KEY(V, I, m, numSortedChar); + key3 = KEY(V, I, m+s, numSortedChar); + keym = med3(key1, key2, key3); + key1 = KEY(V, I, highestPos-2*s, numSortedChar); + key2 = KEY(V, I, highestPos-s, numSortedChar); + key3 = KEY(V, I, highestPos, numSortedChar); + keyn = med3(key1, key2, key3); + + return med3(keyl, keym, keyn); + + +} + +/* Quadratic sorting method to use for small subarrays. 
*/ +static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, + const int highestPos, const int numSortedChar) { + + int i, j; + int tmpKey, tmpPos; + int numItem; + int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; + int negativeSortedLength; + int groupNum; + + #ifdef DEBUG + if (lowestPos > highestPos) { + fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n"); + exit(1); + } + #endif + + numItem = highestPos - lowestPos + 1; + + #ifdef DEBUG + if (numItem > INSERT_SORT_NUM_ITEM) { + fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n"); + exit(1); + } + #endif + + for (i=0; i0 && key[j-1] > tmpKey; j--) { + key[j] = key[j-1]; + pos[j] = pos[j-1]; + } + key[j] = tmpKey; + pos[j] = tmpPos; + } + + negativeSortedLength = -1; + + i = numItem - 1; + groupNum = highestPos; + while (i > 0) { + I[i+lowestPos] = pos[i]; + V[I[i+lowestPos]] = groupNum; + if (key[i-1] == key[i]) { + negativeSortedLength = 0; + } else { + if (negativeSortedLength < 0) { + I[i+lowestPos] = negativeSortedLength; + } + groupNum = i + lowestPos - 1; + negativeSortedLength--; + } + i--; + } + + I[lowestPos] = pos[0]; + V[I[lowestPos]] = groupNum; + if (negativeSortedLength < 0) { + I[lowestPos] = negativeSortedLength; + } + +} + +/* Bucketsort for first iteration. + + Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear + at least once. x[n] is 0. (This is the corresponding output of transform.) k + must be at most n+1. p is array of size n+1 whose contents are disregarded. 
+ + Output: x is V and p is I after the initial sorting stage of the refined + suffix sorting algorithm.*/ + +static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize) { + + int i, c; + int d; + int groupNum; + int currentIndex; + + // mark linked list empty + for (i=0; i0; i--) { + c = I[i-1]; + d = (int)(V[c]); + groupNum = currentIndex; + V[c] = groupNum; + if (d >= 0) { + I[currentIndex] = c; + while (d >= 0) { + c = d; + d = V[c]; + V[c] = groupNum; + currentIndex--; + I[currentIndex] = c; + } + } else { + // sorted group + I[currentIndex] = -1; + } + currentIndex--; + } + +} + +/* Transforms the alphabet of x by attempting to aggregate several symbols into + one, while preserving the suffix order of x. The alphabet may also be + compacted, so that x on output comprises all integers of the new alphabet + with no skipped numbers. + + Input: x is an array of size n+1 whose first n elements are positive + integers in the range l...k-1. p is array of size n+1, used for temporary + storage. q controls aggregation and compaction by defining the maximum value + for any symbol during transformation: q must be at least k-l; if q<=n, + compaction is guaranteed; if k-l>n, compaction is never done; if q is + INT_MAX, the maximum number of symbols are aggregated into one. + + Output: Returns an integer j in the range 1...q representing the size of the + new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is + set to the number of old symbols grouped into one. 
Only x[n] is 0.*/ +static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, + const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) { + + int c, i, j; + int a; // numSymbolAggregated + int mask; + int minSymbolInChunk = 0, maxSymbolInChunk = 0; + int newAlphabetSize; + int maxNumInputSymbol, maxNumBit, maxSymbol; + + maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; + + maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol); + maxSymbol = INT_MAX >> maxNumBit; + + c = maxNumInputSymbol; + for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { + minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1); + maxSymbolInChunk = c; + c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol; + } + + mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ + V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ + + #ifdef DEBUG + // Section of code for maxSymbolInChunk > numChar removed! + if (maxSymbolInChunk > numChar) { + fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n"); + exit(1); + } + #endif + + /* bucketing possible, compact alphabet.*/ + for (i=0; i<=maxSymbolInChunk; i++) { + I[i] = 0; /* zero transformation table.*/ + } + c = minSymbolInChunk; + for (i=a; i<=numChar; i++) { + I[c] = 1; /* mark used chunk symbol.*/ + c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/ + } + for (i=1; i number of characters + + No warranty is given regarding the quality of the modifications. 
+ +*/ + +#ifndef __QSUFSORT_H__ +#define __QSUFSORT_H__ + +#define KEY(V, I, p, h) ( V[ I[p] + h ] ) +#define INSERT_SORT_NUM_ITEM 16 + +void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, + const int smallestInputSymbol, const int skipTransform); +void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar); + + +#endif diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c new file mode 100644 index 0000000..d208a81 --- /dev/null +++ b/bwt_gen/bwt_gen.c @@ -0,0 +1,1547 @@ +/* + + BWTConstruct.c BWT-Index Construction + + This module constructs BWT and auxiliary data structures. + + Copyright (C) 2004, Wong Chi Kwong. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +*/ + +#include +#include +#include +#include "bwt_gen.h" +#include "QSufSort.h" + +static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, + unsigned int lastByteLength) +{ + if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) { + fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n"); + exit(1); + } + return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; +} + +static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue) +{ + unsigned int i; + for (i=0; i>= 2; + } + } + +} +// for BWTIncCreate() +static unsigned int BWTOccValueMajorSizeInWord(const unsigned int numChar) +{ + unsigned int numOfOccValue; + unsigned int numOfOccIntervalPerMajor; + numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; + return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; +} +// for BWTIncCreate() +static unsigned int BWTOccValueMinorSizeInWord(const unsigned int numChar) +{ + unsigned int numOfOccValue; + numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; +} +// for BWTIncCreate() +static unsigned int BWTResidentSizeInWord(const unsigned int numChar) { + + unsigned int numCharRoundUpToOccInterval; + + // The $ in BWT at the position of inverseSa0 is not encoded + numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; + + return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; + +} + +static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) +{ + unsigned int maxBuildSize; + + if (bwtInc->bwt->textLength == 0) { + // initial build + // Minus 2 because n+1 entries of seq and 
rank needed for n char + maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD) + / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD; + if (bwtInc->initialMaxBuildSize > 0) { + bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); + } else { + bwtInc->buildSize = maxBuildSize; + } + } else { + // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char + // Minus numberOfIterationDone because bwt slightly shift to left in each iteration + maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 + - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) + / 3; + if (maxBuildSize < CHAR_PER_WORD) { + fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); + exit(1); + } + if (bwtInc->incMaxBuildSize > 0) { + bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); + } else { + bwtInc->buildSize = maxBuildSize; + } + if (bwtInc->buildSize < CHAR_PER_WORD) { + bwtInc->buildSize = CHAR_PER_WORD; + } + } + + if (bwtInc->buildSize < CHAR_PER_WORD) { + fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); + exit(1); + } + + bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; + + bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1); + bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1); + +} + +// for ceilLog2() +unsigned int leadingZero(const unsigned int input) +{ + unsigned int l; + const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + + if (input & 0xFFFF0000) { + if (input & 0xFF000000) { + l = leadingZero8bit[input >> 24]; + } else { + l = 8 + leadingZero8bit[input >> 16]; + } + } else { + if (input & 0x0000FF00) { + l = 16 + leadingZero8bit[input >> 8]; + } else { + l = 24 + leadingZero8bit[input]; + } + } + return l; + +} +// for BitPerBytePackedChar() +static unsigned int ceilLog2(const unsigned int input) +{ + if (input <= 1) return 0; + return BITS_IN_WORD - leadingZero(input - 1); + +} +// for ConvertBytePackedToWordPacked() +static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) +{ + unsigned int bitPerChar; + bitPerChar = ceilLog2(alphabetSize); + // Return the largest number of bit that does not affect packing efficiency + if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) + bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); + return bitPerChar; +} +// for ConvertBytePackedToWordPacked() +static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) +{ + return ceilLog2(alphabetSize); +} + +static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, + const unsigned int textLength) +{ + unsigned int i, j, k; + unsigned int c; + unsigned int bitPerBytePackedChar; + unsigned int bitPerWordPackedChar; + unsigned int charPerWord; + unsigned int charPerByte; + unsigned int bytePerIteration; + unsigned int byteProcessed = 0; + unsigned int wordProcessed = 0; + unsigned int mask, shift; + + unsigned int buffer[BITS_IN_WORD]; + + bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); + bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); + charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; + charPerWord = BITS_IN_WORD / bitPerWordPackedChar; + + bytePerIteration = 
charPerWord / charPerByte; + mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); + shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; + + while ((wordProcessed + 1) * charPerWord < textLength) { + + k = 0; + for (i=0; i> bitPerWordPackedChar * i; + } + output[wordProcessed] = c; + wordProcessed++; + + } + + k = 0; + for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { + c = (unsigned int)input[byteProcessed] << shift; + for (j=0; j> bitPerWordPackedChar * i; + } + output[wordProcessed] = c; +} + +BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) +{ + BWT *bwt; + + bwt = (BWT*)calloc(1, sizeof(BWT)); + + bwt->textLength = 0; + bwt->inverseSa = 0; + + bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*)); + initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); + + bwt->bwtSizeInWord = 0; + bwt->saValueOnBoundary = NULL; + + // Generate decode tables + if (decodeTable == NULL) { + bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); + GenerateDNAOccCountTable(bwt->decodeTable); + } else { + bwt->decodeTable = decodeTable; + } + + bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); + bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int)); + + bwt->occSizeInWord = 0; + bwt->occValue = NULL; + + bwt->saInterval = ALL_ONE_MASK; + bwt->saValueSize = 0; + bwt->saValue = NULL; + + bwt->inverseSaInterval = ALL_ONE_MASK; + bwt->inverseSaSize = 0; + bwt->inverseSa = NULL; + + return bwt; +} + +BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, + const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) +{ + BWTInc *bwtInc; + unsigned int i; + + if (targetNBit == 0) { + fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n"); + exit(1); + } + + bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); + 
bwtInc->numberOfIterationDone = 0; + bwtInc->bwt = BWTCreate(textLength, NULL); + bwtInc->initialMaxBuildSize = initialMaxBuildSize; + bwtInc->incMaxBuildSize = incMaxBuildSize; + bwtInc->targetNBit = targetNBit; + bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int)); + initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + + // Build frequently accessed data + bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); + for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; + } + + bwtInc->targetTextLength = textLength; + bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); + if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { + fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); + exit(1); + } + bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); + + return bwtInc; + +} +// for BWTIncConstruct() +static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank, + unsigned int* __restrict cumulativeCount, const unsigned int numChar) +{ + unsigned int i, j; + unsigned int c, t; + unsigned int packedMask; + unsigned int rankIndex; + unsigned int lastWord, numCharInLastWord; + + lastWord = (numChar - 1) / CHAR_PER_WORD; + numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; + + packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); + rankIndex = numChar - 1; + + t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); + for (i=0; i>= BIT_PER_CHAR; + } + + for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 + t = packedText[i]; + for (j=0; j>= BIT_PER_CHAR; + } + } + + // Convert occurrence to cumulativeCount + cumulativeCount[2] += cumulativeCount[1]; + cumulativeCount[3] += cumulativeCount[2]; + cumulativeCount[4] += 
cumulativeCount[3]; +} + + +static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, + unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, + 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, + 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, + 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; + + unsigned int iteration, wordToCount, charToCount; + unsigned int i, j, c; + unsigned int sum; + + occCount[0] = 0; + occCount[1] = 0; + occCount[2] = 0; + occCount[3] = 0; + + iteration = index / 256; + wordToCount = (index - iteration * 256) / 16; + charToCount = index - iteration * 256 - wordToCount * 16; + + for (i=0; i> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + dna++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + occCount[0] += sum & 0x000000FF; sum >>= 8; + occCount[1] += sum & 0x000000FF; sum >>= 8; + occCount[2] += sum & 0x000000FF; sum >>= 8; + occCount[3] += sum; + } else { + // only some or all of the 3 bits are on + // in reality, only one of the four cases are possible + if (sum == 0x00000100) { + occCount[0] += 256; + } else if (sum == 0x00010000) { + occCount[1] += 256; + } else if (sum == 0x01000000) { + occCount[2] += 256; + } else if (sum == 0x00000000) { + occCount[3] += 256; + } else { + fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); + exit(1); + } + } + + } + + sum = 0; + for (j=0; j> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + dna++; + } + + if (charToCount > 0) { + c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + occCount[0] += sum & 0x000000FF; sum >>= 8; + occCount[1] += sum & 0x000000FF; sum >>= 8; + occCount[2] += sum & 0x000000FF; sum >>= 8; + occCount[3] += sum; +} + 
+static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar, + const unsigned int *cumulativeCount, const unsigned int *packedShift) { + + unsigned int i, c, r; + unsigned int previousRank, currentRank; + unsigned int wordIndex, charIndex; + unsigned int inverseSa0; + + inverseSa0 = previousRank = relativeRank[0]; + + for (i=1; i<=numChar; i++) { + currentRank = relativeRank[i]; + // previousRank > cumulativeCount[c] because $ is one of the char + c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) + + (previousRank > cumulativeCount[3]); + // set bwt for currentRank + if (c > 0) { + // c <> 'a' + r = currentRank; + if (r > inverseSa0) { + // - 1 because $ at inverseSa0 is not encoded + r--; + } + wordIndex = r / CHAR_PER_WORD; + charIndex = r - wordIndex * CHAR_PER_WORD; + bwt[wordIndex] |= c << packedShift[charIndex]; + } + previousRank = currentRank; + } +} + +static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, + const unsigned int character) +{ + unsigned int occIndexMajor; + + occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; + + if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { + return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); + + } else { + return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + + (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); + } +} + + +static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, + const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, + 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, + 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, + 0xFFFFFF00, 
0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; + + unsigned int wordToCount, charToCount; + unsigned int i, c; + unsigned int sum = 0; + + wordToCount = index / 16; + charToCount = index - wordToCount * 16; + + for (i=0; i> 16]; + sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; + } + + if (charToCount > 0) { + c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + return (sum >> (character * 8)) & 0x000000FF; + +} + +static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, + const unsigned int* dnaDecodeTable) +{ + static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, + 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, + 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, + 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; + + unsigned int wordToCount, charToCount; + unsigned int i, c; + unsigned int sum = 0; + + wordToCount = index / 16; + charToCount = index - wordToCount * 16; + + dna -= wordToCount + 1; + + if (charToCount > 0) { + c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; + sum += dnaDecodeTable[c >> 16]; + sum += dnaDecodeTable[c & 0xFFFF]; + sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess + } + + for (i=0; i> 16]; + sum += dnaDecodeTable[*dna & 0x0000FFFF]; + } + + return (sum >> (character * 8)) & 0x000000FF; + +} + +unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) { + + unsigned int occValue; + unsigned int occExplicitIndex, occIndex; + + // $ is supposed to be positioned at inverseSa0 but it is not encoded + // therefore index is subtracted by 1 for adjustment + if (index > bwt->inverseSa0) { + index--; + } + + occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // 
Bidirectional encoding + occIndex = occExplicitIndex * OCC_INTERVAL; + occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); + + if (occIndex == index) { + return occValue; + } + + if (occIndex < index) { + return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); + } else { + return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); + } + +} + +static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq, + const unsigned int *packedText, const unsigned int numChar, + const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration) +{ + unsigned int saIndex; + unsigned int lastWord; + unsigned int packedMask; + unsigned int i, j; + unsigned int c, t; + unsigned int rankIndex; + unsigned int shift; + unsigned int seqIndexFromStart[ALPHABET_SIZE]; + unsigned int seqIndexFromEnd[ALPHABET_SIZE]; + + for (i=0; i> shift; + saIndex = bwt->inverseSa0; + rankIndex = numChar - 1; + + lastWord = numChar / CHAR_PER_WORD; + for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 + t = packedText[i]; + for (j=0; jcumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; + // A counting sort using the first character of suffix is done here + // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 + if (saIndex > bwt->inverseSa0) { + seq[seqIndexFromEnd[c]] = rankIndex; + absoluteRank[seqIndexFromEnd[c]] = saIndex; + seqIndexFromEnd[c]--; + } else { + seq[seqIndexFromStart[c]] = rankIndex; + absoluteRank[seqIndexFromStart[c]] = saIndex; + seqIndexFromStart[c]++; + } + rankIndex--; + t >>= BIT_PER_CHAR; + } + } + + absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters + seq[seqIndexFromStart[firstCharInLastIteration]] = 
numChar; + + return seqIndexFromStart[firstCharInLastIteration]; +} + +static void BWTIncSortKey(unsigned int* __restrict key, unsigned int* __restrict seq, const unsigned int numItem) +{ + #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD + + int lowIndex, highIndex, midIndex; + int lowPartitionIndex, highPartitionIndex; + int lowStack[32], highStack[32]; + int stackDepth; + int i, j; + unsigned int tempSeq, tempKey; + int numberOfEqualKey; + + if (numItem < 2) return; + + stackDepth = 0; + + lowIndex = 0; + highIndex = numItem - 1; + + for (;;) { + + for (;;) { + + // Sort small array of data + if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays + for (i=lowIndex+1; i<=highIndex; i++) { + tempSeq = seq[i]; + tempKey = key[i]; + for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { + seq[j] = seq[j-1]; + key[j] = key[j-1]; + } + if (j != i) { + seq[j] = tempSeq; + key[j] = tempKey; + } + } + break; + } + + // Choose pivot as median of the lowest, middle, and highest data; sort the three data + + midIndex = average(lowIndex, highIndex); + if (key[lowIndex] > key[midIndex]) { + tempSeq = seq[lowIndex]; + tempKey = key[lowIndex]; + seq[lowIndex] = seq[midIndex]; + key[lowIndex] = key[midIndex]; + seq[midIndex] = tempSeq; + key[midIndex] = tempKey; + } + if (key[lowIndex] > key[highIndex]) { + tempSeq = seq[lowIndex]; + tempKey = key[lowIndex]; + seq[lowIndex] = seq[highIndex]; + key[lowIndex] = key[highIndex]; + seq[highIndex] = tempSeq; + key[highIndex] = tempKey; + } + if (key[midIndex] > key[highIndex]) { + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[highIndex]; + key[midIndex] = key[highIndex]; + seq[highIndex] = tempSeq; + key[highIndex] = tempKey; + } + + // Partition data + + numberOfEqualKey = 0; + + lowPartitionIndex = lowIndex + 1; + highPartitionIndex = highIndex - 1; + + for (;;) { + 
while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= key[midIndex]) { + numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); + lowPartitionIndex++; + } + while (lowPartitionIndex < highPartitionIndex) { + if (key[midIndex] >= key[highPartitionIndex]) { + numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); + break; + } + highPartitionIndex--; + } + if (lowPartitionIndex >= highPartitionIndex) { + break; + } + tempSeq = seq[lowPartitionIndex]; + tempKey = key[lowPartitionIndex]; + seq[lowPartitionIndex] = seq[highPartitionIndex]; + key[lowPartitionIndex] = key[highPartitionIndex]; + seq[highPartitionIndex] = tempSeq; + key[highPartitionIndex] = tempKey; + if (highPartitionIndex == midIndex) { + // partition key has been moved + midIndex = lowPartitionIndex; + } + lowPartitionIndex++; + highPartitionIndex--; + } + + // Adjust the partition index + highPartitionIndex = lowPartitionIndex; + lowPartitionIndex--; + + // move the partition key to end of low partition + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[lowPartitionIndex]; + key[midIndex] = key[lowPartitionIndex]; + seq[lowPartitionIndex] = tempSeq; + key[lowPartitionIndex] = tempKey; + + if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { + + // Many keys = partition key; separate the equal key data from the lower partition + + midIndex = lowIndex; + + for (;;) { + while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { + midIndex++; + } + while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { + lowPartitionIndex--; + } + if (midIndex >= lowPartitionIndex) { + break; + } + tempSeq = seq[midIndex]; + tempKey = key[midIndex]; + seq[midIndex] = seq[lowPartitionIndex - 1]; + key[midIndex] = key[lowPartitionIndex - 1]; + seq[lowPartitionIndex - 1] = tempSeq; + key[lowPartitionIndex - 1] = tempKey; + midIndex++; + lowPartitionIndex--; 
+ } + + } + + if (lowPartitionIndex - lowIndex > highIndex - highPartitionIndex) { + // put the larger partition to stack + lowStack[stackDepth] = lowIndex; + highStack[stackDepth] = lowPartitionIndex - 1; + stackDepth++; + // sort the smaller partition first + lowIndex = highPartitionIndex; + } else { + // put the larger partition to stack + lowStack[stackDepth] = highPartitionIndex; + highStack[stackDepth] = highIndex; + stackDepth++; + // sort the smaller partition first + if (lowPartitionIndex > lowIndex) { + highIndex = lowPartitionIndex - 1; + } else { + // all keys in the partition equals to the partition key + break; + } + } + continue; + } + + // Pop a range from stack + if (stackDepth > 0) { + stackDepth--; + lowIndex = lowStack[stackDepth]; + highIndex = highStack[stackDepth]; + continue; + } else return; + } +} + + +static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigned int* __restrict seq, + unsigned int* __restrict relativeRank, const unsigned int numItem, + unsigned int oldInverseSa0, const unsigned int *cumulativeCount) +{ + unsigned int i, c; + unsigned int s, r; + unsigned int lastRank, lastIndex; + unsigned int oldInverseSa0RelativeRank = 0; + unsigned int freq; + + lastIndex = numItem; + lastRank = sortedRank[numItem]; + if (lastRank > oldInverseSa0) { + sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt + } + s = seq[numItem]; + relativeRank[s] = numItem; + if (lastRank == oldInverseSa0) { + oldInverseSa0RelativeRank = numItem; + oldInverseSa0++; // so that this segment of code is not run again + lastRank++; // so that oldInverseSa0 become a sorted group with 1 item + } + + c = ALPHABET_SIZE - 1; + freq = cumulativeCount[c]; + + for (i=numItem; i--;) { // from numItem - 1 to 0 + r = sortedRank[i]; + if (r > oldInverseSa0) { + sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt + } + s = seq[i]; + if (i < freq) { + if (lastIndex >= freq) { + lastRank++; // to trigger the group 
across alphabet boundary to be split + } + c--; + freq = cumulativeCount[c]; + } + if (r == lastRank) { + relativeRank[s] = lastIndex; + } else { + if (i == lastIndex - 1) { + if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) { + seq[lastIndex] = seq[lastIndex + 1] - 1; + } else { + seq[lastIndex] = (unsigned int)-1; + } + } + lastIndex = i; + lastRank = r; + relativeRank[s] = i; + if (r == oldInverseSa0) { + oldInverseSa0RelativeRank = i; + oldInverseSa0++; // so that this segment of code is not run again + lastRank++; // so that oldInverseSa0 become a sorted group with 1 item + } + } + } + +} + +static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar, + const unsigned int *cumulativeCount) +{ + unsigned int i, c; + unsigned int previousRank, currentRank; + + previousRank = relativeRank[0]; + + for (i=1; i<=numChar; i++) { + currentRank = relativeRank[i]; + c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) + + (previousRank >= cumulativeCount[3]); + seq[currentRank] = c; + previousRank = currentRank; + } +} + +static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, + unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt) +{ + unsigned int bitsInWordMinusBitPerChar; + unsigned int leftShift, rightShift; + unsigned int o; + unsigned int oIndex, iIndex, mIndex; + unsigned int mWord, mChar, oWord, oChar; + unsigned int numInsert; + + bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; + + oIndex = 0; + iIndex = 0; + mIndex = 0; + + mWord = 0; + mChar = 0; + + mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration + + while (oIndex < numOldBwt) { + + // copy from insertBwt + while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { + if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 
+ mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); + mIndex++; + mChar++; + if (mChar == CHAR_PER_WORD) { + mChar = 0; + mWord++; + mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary + } + } + iIndex++; + } + + // Copy from oldBwt to mergedBwt + if (iIndex <= numInsertBwt) { + o = sortedRank[iIndex]; + } else { + o = numOldBwt; + } + numInsert = o - oIndex; + + oWord = oIndex / CHAR_PER_WORD; + oChar = oIndex - oWord * CHAR_PER_WORD; + if (oChar > mChar) { + leftShift = (oChar - mChar) * BIT_PER_CHAR; + rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; + mergedBwt[mWord] = mergedBwt[mWord] + | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) + | (oldBwt[oWord+1] >> rightShift); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); + oIndex += CHAR_PER_WORD; + } + } else if (oChar < mChar) { + rightShift = (mChar - oChar) * BIT_PER_CHAR; + leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; + mergedBwt[mWord] = mergedBwt[mWord] + | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); + oIndex += CHAR_PER_WORD; + } + } else { // oChar == mChar + mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); + oIndex += min(numInsert, CHAR_PER_WORD - mChar); + while (o > oIndex) { + oWord++; + mWord++; + mergedBwt[mWord] = oldBwt[oWord]; + oIndex += CHAR_PER_WORD; + } + } + oIndex = o; + mIndex += numInsert; + + // Clear the trailing garbage in mergedBwt + mWord = mIndex / CHAR_PER_WORD; + mChar = mIndex - mWord * CHAR_PER_WORD; + if (mChar == 0) { + mergedBwt[mWord] = 0; + } else { + mergedBwt[mWord] = 
truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); + } + + } + + // copy from insertBwt + while (iIndex <= numInsertBwt) { + if (sortedRank[iIndex] != 0) { + mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); + mIndex++; + mChar++; + if (mChar == CHAR_PER_WORD) { + mChar = 0; + mWord++; + mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary + } + } + iIndex++; + } +} + +void BWTClearTrailingBwtCode(BWT *bwt) +{ + unsigned int bwtResidentSizeInWord; + unsigned int wordIndex, offset; + unsigned int i; + + bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); + + wordIndex = bwt->textLength / CHAR_PER_WORD; + offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; + if (offset > 0) { + bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); + } else { + if (wordIndex < bwtResidentSizeInWord) { + bwt->bwtCode[wordIndex] = 0; + } + } + + for (i=wordIndex+1; ibwtCode[i] = 0; + } +} + + +void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, + unsigned int* __restrict occValueMajor, + const unsigned int textLength, const unsigned int* decodeTable) +{ + unsigned int numberOfOccValueMajor, numberOfOccValue; + unsigned int wordBetweenOccValue; + unsigned int numberOfOccIntervalPerMajor; + unsigned int c; + unsigned int i, j; + unsigned int occMajorIndex; + unsigned int occIndex, bwtIndex; + unsigned int sum; + unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; + + wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; + + // Calculate occValue + // [lh3] by default: OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256 + numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding + numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; + numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / 
numberOfOccIntervalPerMajor; + + tempOccValue0[0] = 0; + tempOccValue0[1] = 0; + tempOccValue0[2] = 0; + tempOccValue0[3] = 0; + occValueMajor[0] = 0; + occValueMajor[1] = 0; + occValueMajor[2] = 0; + occValueMajor[3] = 0; + + occIndex = 0; + bwtIndex = 0; + for (occMajorIndex=1; occMajorIndex> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + tempOccValue0[0] = tempOccValue1[0]; + tempOccValue0[1] = tempOccValue1[1]; + tempOccValue0[2] = tempOccValue1[2]; + tempOccValue0[3] = tempOccValue1[3]; + sum = 0; + + occIndex++; + + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue0[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue0[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue0[2] += 256; + } else { + tempOccValue0[3] += 256; + } + } + } + + occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; + occValueMajor[occMajorIndex * 4 + 1] = 
occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; + occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; + occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; + tempOccValue0[0] = 0; + tempOccValue0[1] = 0; + tempOccValue0[2] = 0; + tempOccValue0[3] = 0; + + } + + while (occIndex < (numberOfOccValue-1)/2) { + sum = 0; + tempOccValue1[0] = tempOccValue0[0]; + tempOccValue1[1] = tempOccValue0[1]; + tempOccValue1[2] = tempOccValue0[2]; + tempOccValue1[3] = tempOccValue0[3]; + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + tempOccValue0[0] = tempOccValue1[0]; + tempOccValue0[1] = tempOccValue1[1]; + tempOccValue0[2] = tempOccValue1[2]; + tempOccValue0[3] = tempOccValue1[3]; + sum = 0; + occIndex++; + + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue0[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue0[0] += 256; + } else if (sum == 
0x00010000) { + tempOccValue0[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue0[2] += 256; + } else { + tempOccValue0[3] += 256; + } + } + } + + sum = 0; + tempOccValue1[0] = tempOccValue0[0]; + tempOccValue1[1] = tempOccValue0[1]; + tempOccValue1[2] = tempOccValue0[2]; + tempOccValue1[3] = tempOccValue0[3]; + + if (occIndex * 2 < numberOfOccValue - 1) { + for (j=0; j> 16]; + sum += decodeTable[c & 0x0000FFFF]; + bwtIndex++; + } + if (!DNA_OCC_SUM_EXCEPTION(sum)) { + tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; + tempOccValue1[3] += sum; + } else { + if (sum == 0x00000100) { + tempOccValue1[0] += 256; + } else if (sum == 0x00010000) { + tempOccValue1[1] += 256; + } else if (sum == 0x01000000) { + tempOccValue1[2] += 256; + } else { + tempOccValue1[3] += 256; + } + } + } + + occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; + occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; + occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; + occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; + +} + +static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) +{ + unsigned int i; + unsigned int mergedBwtSizeInWord, mergedOccSizeInWord; + unsigned int firstCharInThisIteration; + + unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt; + unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; + + #ifdef DEBUG + if (numChar > bwtInc->buildSize) { + fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n"); + exit(1); + } + #endif + + mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); + mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); + + initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + + if (bwtInc->bwt->textLength == 0) 
{ // Initial build + + // Set address + seq = bwtInc->workingMemory; + relativeRank = seq + bwtInc->buildSize + 1; + mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place + + BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); + + firstCharInThisIteration = relativeRank[0]; + relativeRank[numChar] = 0; + + // Sort suffix + QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE); + newInverseSa0 = relativeRank[0]; + + // Clear BWT area + initializeVAL(insertBwt, mergedBwtSizeInWord, 0); + + // Build BWT + BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); + + // so that the cumulativeCount is not deducted + bwtInc->firstCharInLastIteration = ALPHABET_SIZE; + + } else { // Incremental build + // Set address + sortedRank = bwtInc->workingMemory; + seq = sortedRank + bwtInc->buildSize + 1; + insertBwt = seq; + relativeRank = seq + bwtInc->buildSize + 1; + + // Store the first character of this iteration + firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); + + // Count occurrence of input text + ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); + // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration + bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; + bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; + bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; + bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; + + // Get rank of new suffix among processed suffix + // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 
by inverseSa0 + inverseSa0 as 1 group + oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, + numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); + + // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) + for (i=0; icumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || + bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { + BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); + } else { + if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { + BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); + } + if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { + BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); + } + } + } + + // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt + // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) + BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); +#ifdef DEBUG + if (relativeRank[numChar] != oldInverseSa0RelativeRank) { + fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n"); + exit(1); + } +#endif + + // Sort suffix + QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE); + + newInverseSa0RelativeRank = 
relativeRank[0]; + newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; + + sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt + + // Build BWT + BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); + + // Merge BWT + mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord + - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR; + // minus numberOfIteration * occInterval to create a buffer for merging + BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); + + } + + // Build auxiliary structure and update info and pointers in BWT + bwtInc->bwt->textLength += numChar; + bwtInc->bwt->bwtCode = mergedBwt; + bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; + bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; + if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) { + fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n"); + exit(1); + } + + bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; + + BWTClearTrailingBwtCode(bwtInc->bwt); + BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, + bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); + + bwtInc->bwt->inverseSa0 = newInverseSa0; + + bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); + bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); + bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); + bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); + + bwtInc->firstCharInLastIteration = firstCharInThisIteration; + + // Set build size and text address for the next build + 
BWTIncSetBuildSizeAndTextAddr(bwtInc); + bwtInc->numberOfIterationDone++; + +} + +BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit, + const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) +{ + + FILE *packedFile; + unsigned int packedFileLen; + unsigned int totalTextLength; + unsigned int textToLoad, textSizeInByte; + unsigned int processedTextLength; + unsigned char lastByteLength; + + BWTInc *bwtInc; + + packedFile = (FILE*)fopen(inputFileName, "rb"); + + if (packedFile == NULL) { + fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); + exit(1); + } + + fseek(packedFile, -1, SEEK_END); + packedFileLen = ftell(packedFile); + if ((int)packedFileLen < 0) { + fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n"); + exit(1); + } + fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); + totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); + + bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize); + + BWTIncSetBuildSizeAndTextAddr(bwtInc); + + if (bwtInc->buildSize > totalTextLength) { + textToLoad = totalTextLength; + } else { + textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); + } + textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte + + fseek(packedFile, -2, SEEK_CUR); + fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); + fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR); + + ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); + BWTIncConstruct(bwtInc, textToLoad); + + processedTextLength = textToLoad; + + while (processedTextLength < totalTextLength) { + textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; + if (textToLoad > totalTextLength - 
processedTextLength) { + textToLoad = totalTextLength - processedTextLength; + } + textSizeInByte = textToLoad / CHAR_PER_BYTE; + fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); + fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); + BWTIncConstruct(bwtInc, textToLoad); + processedTextLength += textToLoad; + if (bwtInc->numberOfIterationDone % 10 == 0) { + printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n", + bwtInc->numberOfIterationDone, processedTextLength); + } + } + return bwtInc; +} + +void BWTFree(BWT *bwt) +{ + if (bwt == 0) return; + free(bwt->cumulativeFreq); + free(bwt->bwtCode); + free(bwt->occValue); + free(bwt->occValueMajor); + free(bwt->saValue); + free(bwt->inverseSa); + free(bwt->decodeTable); + free(bwt->saIndexRange); + free(bwt->saValueOnBoundary); + free(bwt); +} + +void BWTIncFree(BWTInc *bwtInc) +{ + if (bwtInc == 0) return; + free(bwtInc->bwt); + free(bwtInc->workingMemory); + free(bwtInc); +} + +static unsigned int BWTFileSizeInWord(const unsigned int numChar) +{ + // The $ in BWT at the position of inverseSa0 is not encoded + return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; +} + +void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) +{ + FILE *bwtFile; +/* FILE *occValueFile; */ + unsigned int bwtLength; + + bwtFile = (FILE*)fopen(bwtFileName, "wb"); + if (bwtFile == NULL) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); + exit(1); + } + + fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile); + fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile); + bwtLength = BWTFileSizeInWord(bwt->textLength); + fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); + fclose(bwtFile); +/* + occValueFile = 
(FILE*)fopen(occValueFileName, "wb"); + if (occValueFile == NULL) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n"); + exit(1); + } + + fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile); + fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile); + fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile); + fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile); + fclose(occValueFile); +*/ +} + +void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) +{ + BWTInc *bwtInc; + bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000); + printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); + BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); + BWTIncFree(bwtInc); +} + +int bwt_bwtgen_main(int argc, char *argv[]) +{ + if (argc < 3) { + fprintf(stderr, "Usage: bwtgen \n"); + return 1; + } + bwt_bwtgen(argv[1], argv[2]); + return 0; +} + +#ifdef MAIN_BWT_GEN + +int main(int argc, char *argv[]) +{ + return bwt_bwtgen_main(argc, argv); +} + +#endif diff --git a/bwt_gen/bwt_gen.h b/bwt_gen/bwt_gen.h new file mode 100644 index 0000000..d6cc1ef --- /dev/null +++ b/bwt_gen/bwt_gen.h @@ -0,0 +1,105 @@ +/* + + BWTConstruct.h BWT-Index Construction + + This module constructs BWT and auxiliary data structures. + + Copyright (C) 2004, Wong Chi Kwong. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +*/ + +#ifndef BWT_GEN_H +#define BWT_GEN_H + +#define ALPHABET_SIZE 4 +#define BIT_PER_CHAR 2 +#define CHAR_PER_WORD 16 +#define CHAR_PER_BYTE 4 + +#define BITS_IN_WORD 32 +#define BITS_IN_BYTE 8 +#define BYTES_IN_WORD 4 + +#define ALL_ONE_MASK 0xFFFFFFFF +#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 + +#define BITS_PER_OCC_VALUE 16 +#define OCC_VALUE_PER_WORD 2 +#define OCC_INTERVAL 256 +#define OCC_INTERVAL_MAJOR 65536 + +#define TRUE 1 +#define FALSE 0 + +#define BWTINC_INSERT_SORT_NUM_ITEM 7 + +#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) +#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) +#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) +#define med3(a, b, c) ( ac ? b : a>c ? c : a)) +#define swap(a, b, t); t = a; a = b; b = t; +#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) +#define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) +#define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) + +typedef struct SaIndexRange { + unsigned int startSaIndex; + unsigned int endSaIndex; +} SaIndexRange; + +typedef struct BWT { + unsigned int textLength; // length of the text + unsigned int saInterval; // interval between two SA values stored explicitly + unsigned int inverseSaInterval; // interval between two inverse SA stored explicitly + unsigned int inverseSa0; // SA-1[0] + unsigned int *cumulativeFreq; // cumulative frequency + unsigned int *bwtCode; // BWT code + unsigned int *occValue; // Occurrence values stored explicitly + unsigned int *occValueMajor; // Occurrence values stored explicitly + unsigned int *saValue; // SA values stored explicitly + unsigned int *inverseSa; // Inverse SA stored explicitly + 
SaIndexRange *saIndexRange; // SA index range + int saIndexRangeNumOfChar; // Number of characters indexed in SA index range + unsigned int *saValueOnBoundary; // Pre-calculated frequently referred data + unsigned int *decodeTable; // For decoding BWT by table lookup + unsigned int decodeTableGenerated; // == TRUE if decode table is generated on load and will be freed + unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated + unsigned int occSizeInWord; // Temporary variable to hold the memory allocated + unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated + unsigned int saValueSize; // Temporary variable to hold the memory allocated + unsigned int inverseSaSize; // Temporary variable to hold the memory allocated + unsigned int saIndexRangeSize; // Temporary variable to hold the memory allocated +} BWT; + +typedef struct BWTInc { + BWT *bwt; + unsigned int numberOfIterationDone; + unsigned int *cumulativeCountInCurrentBuild; + unsigned int availableWord; + unsigned int targetTextLength; + float targetNBit; + unsigned int buildSize; + unsigned int initialMaxBuildSize; + unsigned int incMaxBuildSize; + unsigned int firstCharInLastIteration; + unsigned int *workingMemory; + unsigned int *packedText; + unsigned char *textBuffer; + unsigned int *packedShift; +} BWTInc; + +#endif diff --git a/bwt_lite.c b/bwt_lite.c new file mode 100644 index 0000000..dd411e1 --- /dev/null +++ b/bwt_lite.c @@ -0,0 +1,94 @@ +#include +#include +#include +#include "bwt_lite.h" + +int is_sa(const uint8_t *T, uint32_t *SA, int n); +int is_bwt(uint8_t *T, int n); + +bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) +{ + bwtl_t *b; + int i; + b = (bwtl_t*)calloc(1, sizeof(bwtl_t)); + b->seq_len = len; + + { // calculate b->bwt + uint8_t *s; + b->sa = (uint32_t*)calloc(len + 1, 4); + is_sa(seq, b->sa, len); + s = (uint8_t*)calloc(len + 1, 1); + for (i = 0; i <= len; ++i) { + if (b->sa[i] == 0) b->primary = i; + else s[i] = seq[b->sa[i] - 
1]; + } + for (i = b->primary; i < len; ++i) s[i] = s[i + 1]; + b->bwt_size = (len + 15) / 16; + b->bwt = (uint32_t*)calloc(b->bwt_size, 4); + for (i = 0; i < len; ++i) + b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1); + free(s); + } + { // calculate b->occ + uint32_t c[4]; + b->n_occ = (len + 15) / 16 * 4; + b->occ = (uint32_t*)calloc(b->n_occ, 4); + memset(c, 0, 16); + for (i = 0; i < len; ++i) { + if (i % 16 == 0) + memcpy(b->occ + (i/16) * 4, c, 16); + ++c[bwtl_B0(b, i)]; + } + memcpy(b->L2+1, c, 16); + for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1]; + } + { // generate cnt_table + for (i = 0; i != 256; ++i) { + u_int32_t j, x = 0; + for (j = 0; j != 4; ++j) + x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); + b->cnt_table[i] = x; + } + } + return b; +} +inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) +{ + uint32_t n, b; + if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; + if (k == (uint32_t)(-1)) return 0; + if (k >= bwt->primary) --k; // because $ is not in bwt + n = bwt->occ[k/16<<2|c]; + b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1); + n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff; + if (c == 0) n -= 15 - (k&15); // corrected for the masked bits + return n; +} +inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) +{ + uint32_t x, b; + if (k == (uint32_t)(-1)) { + memset(cnt, 0, 16); + return; + } + if (k >= bwt->primary) --k; // because $ is not in bwt + memcpy(cnt, bwt->occ + (k>>4<<2), 16); + b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1); + x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] + + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]; + x -= 15 - (k&15); + cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; +} +inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) +{ + bwtl_occ4(bwt, k, cntk); + 
bwtl_occ4(bwt, l, cntl); +} +void bwtl_destroy(bwtl_t *bwt) +{ + if (bwt) { + free(bwt->occ); free(bwt->bwt); free(bwt->sa); + free(bwt); + } +} diff --git a/bwt_lite.h b/bwt_lite.h new file mode 100644 index 0000000..0096b93 --- /dev/null +++ b/bwt_lite.h @@ -0,0 +1,29 @@ +#ifndef BWT_LITE_H_ +#define BWT_LITE_H_ + +#include + +typedef struct { + uint32_t seq_len, bwt_size, n_occ; + uint32_t primary; + uint32_t *bwt, *occ, *sa, L2[5]; + uint32_t cnt_table[256]; +} bwtl_t; + +#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +#ifdef __cplusplus +extern "C" { +#endif + + bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq); + inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); + inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); + inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); + void bwtl_destroy(bwtl_t *bwt); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwtaln.c b/bwtaln.c new file mode 100644 index 0000000..6d7b8f4 --- /dev/null +++ b/bwtaln.c @@ -0,0 +1,339 @@ +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "bwtaln.h" +#include "bwtgap.h" +#include "utils.h" + +#ifdef HAVE_PTHREAD +#define THREAD_BLOCK_SIZE 1024 +#include +static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER; +#endif + +gap_opt_t *gap_init_opt() +{ + gap_opt_t *o; + o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); + /* IMPORTANT: s_mm*10 should be about the average base error + rate. Voilating this requirement will break pairing! 
*/ + o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; + o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; + o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; + o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; + o->seed_len = 32; o->max_seed_diff = 2; + o->fnr = 0.04; + o->n_threads = 1; + o->max_top2 = 30; + o->trim_qual = 0; + return o; +} + +int bwa_cal_maxdiff(int l, double err, double thres) +{ + double elambda = exp(-l * err); + double sum, y = 1.0; + int k, x = 1; + for (k = 1, sum = elambda; k < 1000; ++k) { + y *= l * err; + x *= k; + sum += elambda * y / x; + if (1.0 - sum < thres) return k; + } + return 2; +} + +// width must be filled as zero +static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width) +{ + bwtint_t k, l, ok, ol; + int i, bid; + bid = 0; + k = 0; l = rbwt->seq_len; + for (i = 0; i < len; ++i) { + ubyte_t c = str[i]; + if (c < 4) { + bwt_2occ(rbwt, k - 1, l, c, &ok, &ol); + k = rbwt->L2[c] + ok + 1; + l = rbwt->L2[c] + ol; + } + if (k > l || c > 3) { // then restart + k = 0; + l = rbwt->seq_len; + ++bid; + } + width[i].w = l - k + 1; + width[i].bid = bid; + } + width[len].w = 0; + width[len].bid = ++bid; + return bid; +} + +void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) +{ + int i, max_l = 0, max_len; + gap_stack_t *stack; + bwt_width_t *w[2], *seed_w[2]; + const ubyte_t *seq[2]; + gap_opt_t local_opt = *opt; + + // initiate priority stack + for (i = max_len = 0; i != n_seqs; ++i) + if (seqs[i].len > max_len) max_len = seqs[i].len; + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); + if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; + stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); + + seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, 
sizeof(bwt_width_t)); + w[0] = w[1] = 0; + for (i = 0; i != n_seqs; ++i) { + bwa_seq_t *p = seqs + i; +#ifdef HAVE_PTHREAD + if (opt->n_threads > 1) { + pthread_mutex_lock(&g_seq_lock); + if (p->tid < 0) { // unassigned + int j; + for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j) + seqs[j].tid = tid; + } else if (p->tid != tid) { + pthread_mutex_unlock(&g_seq_lock); + continue; + } + pthread_mutex_unlock(&g_seq_lock); + } +#endif + p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; + seq[0] = p->seq; seq[1] = p->rseq; + if (max_l < p->len) { + max_l = p->len; + w[0] = (bwt_width_t*)realloc(w[0], (max_l + 1) * sizeof(bwt_width_t)); + w[1] = (bwt_width_t*)realloc(w[1], (max_l + 1) * sizeof(bwt_width_t)); + memset(w[0], 0, (max_l + 1) * sizeof(bwt_width_t)); + memset(w[1], 0, (max_l + 1) * sizeof(bwt_width_t)); + } + bwt_cal_width(bwt[0], p->len, seq[0], w[0]); + bwt_cal_width(bwt[1], p->len, seq[1], w[1]); + if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); + local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; + if (p->len > opt->seed_len) { + bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]); + bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]); + } + // core function + p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 
0 : seed_w, &local_opt, &p->n_aln, stack); + // store the alignment + free(p->name); free(p->seq); free(p->rseq); free(p->qual); + p->name = 0; p->seq = p->rseq = p->qual = 0; + } + free(seed_w[0]); free(seed_w[1]); + free(w[0]); free(w[1]); + gap_destroy_stack(stack); +} + +#ifdef HAVE_PTHREAD +typedef struct { + int tid; + bwt_t *bwt[2]; + int n_seqs; + bwa_seq_t *seqs; + const gap_opt_t *opt; +} thread_aux_t; + +static void *worker(void *data) +{ + thread_aux_t *d = (thread_aux_t*)data; + bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); + return 0; +} +#endif + +bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) +{ + bwa_seqio_t *ks; + if (mode & BWA_MODE_BAM) { // open BAM + int which = 0; + if (mode & BWA_MODE_BAM_SE) which |= 4; + if (mode & BWA_MODE_BAM_READ1) which |= 1; + if (mode & BWA_MODE_BAM_READ2) which |= 2; + if (which == 0) which = 7; // then read all reads + ks = bwa_bam_open(fn_fa, which); + } else ks = bwa_seq_open(fn_fa); + return ks; +} + +void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) +{ + int i, n_seqs, tot_seqs = 0; + bwa_seq_t *seqs; + bwa_seqio_t *ks; + clock_t t; + bwt_t *bwt[2]; + + // initialization + ks = bwa_open_reads(opt->mode, fn_fa); + + { // load BWT + char *str = (char*)calloc(strlen(prefix) + 10, 1); + strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); + free(str); + } + + // core loop + fwrite(opt, sizeof(gap_opt_t), 1, stdout); + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) { + tot_seqs += n_seqs; + t = clock(); + + fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... 
"); + +#ifdef HAVE_PTHREAD + if (opt->n_threads <= 1) { // no multi-threading at all + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1]; + data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; + pthread_create(&tid[j], &attr, worker, data + j); + } + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + free(data); free(tid); + } +#else + bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); +#endif + + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + t = clock(); + fprintf(stderr, "[bwa_aln_core] write to the disk... "); + for (i = 0; i < n_seqs; ++i) { + bwa_seq_t *p = seqs + i; + fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + } + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); + + bwa_free_read_seq(n_seqs, seqs); + fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); + } + + // destroy + bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + bwa_seq_close(ks); +} + +int bwa_aln(int argc, char *argv[]) +{ + int c, opte = -1; + gap_opt_t *opt; + + opt = gap_init_opt(); + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012")) >= 0) { + switch (c) { + case 'n': + if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; + else opt->max_diff = atoi(optarg), opt->fnr = -1.0; + break; + case 'o': opt->max_gapo = atoi(optarg); break; + case 'e': opte = atoi(optarg); break; + case 'M': opt->s_mm = atoi(optarg); break; + case 'O': opt->s_gapo = atoi(optarg); break; + case 'E': 
opt->s_gape = atoi(optarg); break; + case 'd': opt->max_del_occ = atoi(optarg); break; + case 'i': opt->indel_end_skip = atoi(optarg); break; + case 'l': opt->seed_len = atoi(optarg); break; + case 'k': opt->max_seed_diff = atoi(optarg); break; + case 'm': opt->max_entries = atoi(optarg); break; + case 't': opt->n_threads = atoi(optarg); break; + case 'L': opt->mode |= BWA_MODE_LOGGAP; break; + case 'R': opt->max_top2 = atoi(optarg); break; + case 'q': opt->trim_qual = atoi(optarg); break; + case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; + case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; + case 'f': freopen(optarg, "wb", stdout); break; + case 'b': opt->mode |= BWA_MODE_BAM; break; + case '0': opt->mode |= BWA_MODE_BAM_SE; break; + case '1': opt->mode |= BWA_MODE_BAM_READ1; break; + case '2': opt->mode |= BWA_MODE_BAM_READ2; break; + default: return 1; + } + } + if (opte > 0) { + opt->max_gape = opte; + opt->mode &= ~BWA_MODE_GAPE; + } + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa aln [options] \n\n"); + fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", + BWA_AVG_ERR, opt->fnr); + fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); + fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); + fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); + fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); + fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); + fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); + fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); + 
fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); + fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); + fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); + fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); + fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); + fprintf(stderr, " -c input sequences are in the color space\n"); + fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); + fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); + fprintf(stderr, " -b the input read file is in the BAM format\n"); + fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); + fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); + fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, "\n"); + return 1; + } + if (opt->fnr > 0.0) { + int i, k; + for (i = 17, k = 0; i <= 250; ++i) { + int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); + k = l; + } + } + bwa_aln_core(argv[optind], argv[optind+1], opt); + free(opt); + return 0; +} + +/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, +__cigar_op and __cigar_len while keeping stdaln stand alone */ +bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar) +{ + uint32_t *cigar32; + bwa_cigar_t *cigar; + int i; + cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); + cigar = (bwa_cigar_t*)cigar32; + for (i = 0; i < *n_cigar; ++i) + cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); + return cigar; +} + diff --git a/bwtaln.h b/bwtaln.h new file mode 100644 index 0000000..0331b56 --- /dev/null +++ b/bwtaln.h @@ -0,0 +1,147 @@ +#ifndef BWTALN_H +#define BWTALN_H + +#include +#include "bwt.h" + +#define 
BWA_TYPE_NO_MATCH 0 +#define BWA_TYPE_UNIQUE 1 +#define BWA_TYPE_REPEAT 2 +#define BWA_TYPE_MATESW 3 + +#define SAM_FPD 1 // paired +#define SAM_FPP 2 // properly paired +#define SAM_FSU 4 // self-unmapped +#define SAM_FMU 8 // mate-unmapped +#define SAM_FSR 16 // self on the reverse strand +#define SAM_FMR 32 // mate on the reverse strand +#define SAM_FR1 64 // this is read one +#define SAM_FR2 128 // this is read two +#define SAM_FSC 256 // secondary alignment + +#define BWA_AVG_ERR 0.02 +#define BWA_MIN_RDLEN 35 // for read trimming + +#ifndef bns_pac +#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) +#endif + +typedef struct { + bwtint_t w; + int bid; +} bwt_width_t; + +typedef struct { + uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1; + bwtint_t k, l; + int score; +} bwt_aln1_t; + +typedef uint16_t bwa_cigar_t; +/* rgoya: If changing order of bytes, beware of operations like: + * s->cigar[0] += s->full_len - s->len; + */ +#define CIGAR_OP_SHIFT 14 +#define CIGAR_LN_MASK 0x3fff + +#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT) +#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK) +#define __cigar_create(__op, __len) ((__op)< +#include +#include +#include "bwtgap.h" +#include "bwtaln.h" + +#define STATE_M 0 +#define STATE_I 1 +#define STATE_D 2 + +#define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) + +gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +{ + int i; + gap_stack_t *stack; + stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); + stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt); + stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); + for (i = 0; i != stack->n_stacks; ++i) { + gap_stack1_t *p = stack->stacks + i; + p->m_entries = 4; + p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t)); + } + return stack; +} + +void gap_destroy_stack(gap_stack_t *stack) +{ + int i; + for (i = 0; i != stack->n_stacks; ++i) 
free(stack->stacks[i].stack); + free(stack->stacks); + free(stack); +} + +static void gap_reset_stack(gap_stack_t *stack) +{ + int i; + for (i = 0; i != stack->n_stacks; ++i) + stack->stacks[i].n_entries = 0; + stack->best = stack->n_stacks; + stack->n_entries = 0; +} + +static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, + int state, int is_diff, const gap_opt_t *opt) +{ + int score; + gap_entry_t *p; + gap_stack1_t *q; + score = aln_score(n_mm, n_gapo, n_gape, opt); + q = stack->stacks + score; + if (q->n_entries == q->m_entries) { + q->m_entries <<= 1; + q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); + } + p = q->stack + q->n_entries; + p->info = (u_int32_t)score<<21 | a<<20 | i; p->k = k; p->l = l; + p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; + if (is_diff) p->last_diff_pos = i; + ++(q->n_entries); + ++(stack->n_entries); + if (stack->best > score) stack->best = score; +} + +static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e) +{ + gap_stack1_t *q; + q = stack->stacks + stack->best; + *e = q->stack[q->n_entries - 1]; + --(q->n_entries); + --(stack->n_entries); + if (q->n_entries == 0 && stack->n_entries) { // reset best + int i; + for (i = stack->best + 1; i < stack->n_stacks; ++i) + if (stack->stacks[i].n_entries != 0) break; + stack->best = i; + } else if (stack->n_entries == 0) stack->best = stack->n_stacks; +} + +static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w) +{ + int i, j; + for (i = j = 0; i < last_diff_pos; ++i) { + if (w[i].w > x) w[i].w -= x; + else if (w[i].w == x) { + w[i].bid = 1; + w[i].w = max - (++j); + } // else should not happen + } +} + +static inline int int_log2(uint32_t v) +{ + int c = 0; + if (v & 0xffff0000u) { v >>= 16; c |= 16; } + if (v & 0xff00) { v >>= 8; c |= 8; } + if (v & 0xf0) { v >>= 4; c |= 4; } + if (v & 0xc) { v >>= 2; c |= 2; } + if (v & 
0x2) c |= 1; + return c; +} + +bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2], + bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) +{ + int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); + int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; + int best_cnt = 0; + int max_entries = 0, j, _j, n_aln, m_aln; + bwt_aln1_t *aln; + + m_aln = 4; n_aln = 0; + aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); + + // check whether there are too many N + for (j = _j = 0; j < len; ++j) + if (seq[0][j] > 3) ++_j; + if (_j > max_diff) { + *_n_aln = n_aln; + return aln; + } + + //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); + gap_reset_stack(stack); // reset stack + gap_push(stack, 0, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt); + gap_push(stack, 1, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt); + + while (stack->n_entries) { + gap_entry_t e; + int a, i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; + bwtint_t k, l, cnt_k[4], cnt_l[4], occ; + const bwt_t *bwt; + const ubyte_t *str; + const bwt_width_t *seed_width = 0; + bwt_width_t *width; + + if (max_entries < stack->n_entries) max_entries = stack->n_entries; + if (stack->n_entries > opt->max_entries) break; + gap_pop(stack, &e); // get the best entry + k = e.k; l = e.l; // SA interval + a = e.info>>20&1; i = e.info&0xffff; // strand, length + if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed + + m = max_diff - (e.n_mm + e.n_gapo); + if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; + if (m < 0) continue; + bwt = bwts[1-a]; str = seq[a]; width = w[a]; + if (seed_w) { // apply seeding + seed_width = seed_w[a]; + m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); + if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; + } + 
//printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); + if (i > 0 && m < width[i-1].bid) continue; + + // check whether a hit is found + hit_found = 0; + if (i == 0) hit_found = 1; + else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed + if (bwt_match_exact_alt(bwt, i, str, &k, &l)) hit_found = 1; + else continue; // no hit, skip + } + + if (hit_found) { // action for found hits + int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt); + int do_add = 1; + //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l); + if (n_aln == 0) { + best_score = score; + best_diff = e.n_mm + e.n_gapo; + if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape; + if (!(opt->mode & BWA_MODE_NONSTOP)) + max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour + } + if (score == best_score) best_cnt += l - k + 1; + else if (best_cnt > opt->max_top2) break; // top2b behaviour + if (e.n_gapo) { // check whether the hit has been found. 
this may happen when a gap occurs in a tandem repeat + for (j = 0; j != n_aln; ++j) + if (aln[j].k == k && aln[j].l == l) break; + if (j < n_aln) do_add = 0; + } + if (do_add) { // append + bwt_aln1_t *p; + gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); + if (n_aln == m_aln) { + m_aln <<= 1; + aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); + memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); + } + p = aln + n_aln; + p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a; + p->k = k; p->l = l; + p->score = score; + ++n_aln; + } + continue; + } + + --i; + bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values + occ = l - k + 1; + // test whether diff is allowed + allow_diff = allow_M = 1; + if (i > 0) { + int ii = i - (len - opt->seed_len); + if (width[i-1].bid > m-1) allow_diff = 0; + else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; + if (seed_w && ii > 0) { + if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; + else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 + && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; + } + } + // indels + tmp = (opt->mode & BWA_MODE_LOGGAP)? 
int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape; + if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) { + if (e.state == STATE_M) { // gap open + if (e.n_gapo < opt->max_gapo) { // gap open is allowed + // insertion + gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); + // deletion + for (j = 0; j != 4; ++j) { + k = bwt->L2[j] + cnt_k[j] + 1; + l = bwt->L2[j] + cnt_l[j]; + if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); + } + } + } else if (e.state == STATE_I) { // extention of an insertion + if (e.n_gape < opt->max_gape) // gap extention is allowed + gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); + } else if (e.state == STATE_D) { // extention of a deletion + if (e.n_gape < opt->max_gape) { // gap extention is allowed + if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { + for (j = 0; j != 4; ++j) { + k = bwt->L2[j] + cnt_k[j] + 1; + l = bwt->L2[j] + cnt_l[j]; + if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); + } + } + } + } + } + // mismatches + if (allow_diff && allow_M) { // mismatch is allowed + for (j = 1; j <= 4; ++j) { + int c = (str[i] + j) & 3; + int is_mm = (j != 4 || str[i] > 3); + k = bwt->L2[c] + cnt_k[c] + 1; + l = bwt->L2[c] + cnt_l[c]; + if (k <= l) gap_push(stack, a, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); + } + } else if (str[i] < 4) { // try exact match only + int c = str[i] & 3; + k = bwt->L2[c] + cnt_k[c] + 1; + l = bwt->L2[c] + cnt_l[c]; + if (k <= l) gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); + } + } + + *_n_aln = n_aln; + //fprintf(stderr, "max_entries = %d\n", max_entries); + return aln; +} diff --git a/bwtgap.h b/bwtgap.h new file mode 100644 index 0000000..fc910bc --- /dev/null +++ b/bwtgap.h @@ -0,0 +1,38 @@ +#ifndef BWTGAP_H_ +#define BWTGAP_H_ + +#include "bwt.h" +#include 
"bwtaln.h" + +typedef struct { // recursion stack + u_int32_t info; // score<<21 | a<<20 | i + u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; + bwtint_t k, l; // (k,l) is the SA region of [i,n-1] + int last_diff_pos; +} gap_entry_t; + +typedef struct { + int n_entries, m_entries; + gap_entry_t *stack; +} gap_stack1_t; + +typedef struct { + int n_stacks, best, n_entries; + gap_stack1_t *stacks; +} gap_stack_t; + +#ifdef __cplusplus +extern "C" { +#endif + + gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); + void gap_destroy_stack(gap_stack_t *stack); + bwt_aln1_t *bwt_match_gap(bwt_t *const bwt[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2], + bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); + void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwtindex.c b/bwtindex.c new file mode 100644 index 0000000..68792f7 --- /dev/null +++ b/bwtindex.c @@ -0,0 +1,186 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include +#include "bntseq.h" +#include "bwt.h" +#include "main.h" +#include "utils.h" + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is); +void bwa_pac_rev_core(const char *fn, const char *fn_rev); + +int bwa_index(int argc, char *argv[]) +{ + char *prefix = 0, *str, *str2, *str3; + int c, algo_type = 3, is_color = 0; + clock_t t; + + while ((c = getopt(argc, argv, "ca:p:")) >= 0) { + switch (c) { + case 'a': + if (strcmp(optarg, "div") == 0) algo_type = 1; + else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; + else if (strcmp(optarg, "is") == 0) algo_type = 3; + else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); + break; + case 'p': prefix = strdup(optarg); break; + case 'c': is_color = 1; break; + default: return 1; + } + } + + if (optind + 1 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa index [-a bwtsw|div|is] [-c] \n\n"); + fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [is]\n"); + fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); + fprintf(stderr, " -c build color-space index\n\n"); + fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); + fprintf(stderr, " `-a div' do not work not for long genomes. 
Please choose `-a'\n"); + fprintf(stderr, " according to the length of the genome.\n\n"); + return 1; + } + if (prefix == 0) prefix = strdup(argv[optind]); + str = (char*)calloc(strlen(prefix) + 10, 1); + str2 = (char*)calloc(strlen(prefix) + 10, 1); + str3 = (char*)calloc(strlen(prefix) + 10, 1); + + if (is_color == 0) { // nucleotide indexing + gzFile fp = xzopen(argv[optind], "r"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack FASTA... "); + bns_fasta2bntseq(fp, prefix); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + gzclose(fp); + } else { // color indexing + gzFile fp = xzopen(argv[optind], "r"); + strcat(strcpy(str, prefix), ".nt"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); + bns_fasta2bntseq(fp, str); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + gzclose(fp); + { + char *tmp_argv[3]; + tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; + t = clock(); + fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); + bwa_pac2cspac(3, tmp_argv); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + } + { + strcpy(str, prefix); strcat(str, ".pac"); + strcpy(str2, prefix); strcat(str2, ".rpac"); + t = clock(); + fprintf(stderr, "[bwa_index] Reverse the packed sequence... 
"); + bwa_pac_rev_core(str, str2); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + strcpy(str, prefix); strcat(str, ".pac"); + strcpy(str2, prefix); strcat(str2, ".bwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); + if (algo_type == 2) bwt_bwtgen(str, str2); + else if (algo_type == 1 || algo_type == 3) { + bwt_t *bwt; + bwt = bwt_pac2bwt(str, algo_type == 3); + bwt_dump_bwt(str2, bwt); + bwt_destroy(bwt); + } + fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + strcpy(str, prefix); strcat(str, ".rpac"); + strcpy(str2, prefix); strcat(str2, ".rbwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct BWT for the reverse packed sequence...\n"); + if (algo_type == 2) bwt_bwtgen(str, str2); + else if (algo_type == 1 || algo_type == 3) { + bwt_t *bwt; + bwt = bwt_pac2bwt(str, algo_type == 3); + bwt_dump_bwt(str2, bwt); + bwt_destroy(bwt); + } + fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".bwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Update BWT... "); + bwt = bwt_restore_bwt(str); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(str, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".rbwt"); + t = clock(); + fprintf(stderr, "[bwa_index] Update reverse BWT... "); + bwt = bwt_restore_bwt(str); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(str, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".bwt"); + strcpy(str3, prefix); strcat(str3, ".sa"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... 
"); + bwt = bwt_restore_bwt(str); + bwt_cal_sa(bwt, 32); + bwt_dump_sa(str3, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + { + bwt_t *bwt; + strcpy(str, prefix); strcat(str, ".rbwt"); + strcpy(str3, prefix); strcat(str3, ".rsa"); + t = clock(); + fprintf(stderr, "[bwa_index] Construct SA from reverse BWT and Occ... "); + bwt = bwt_restore_bwt(str); + bwt_cal_sa(bwt, 32); + bwt_dump_sa(str3, bwt); + bwt_destroy(bwt); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + } + free(str3); free(str2); free(str); free(prefix); + return 0; +} diff --git a/bwtio.c b/bwtio.c new file mode 100644 index 0000000..c5ffcae --- /dev/null +++ b/bwtio.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include "bwt.h" +#include "utils.h" + +void bwt_dump_bwt(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(bwt->bwt, sizeof(bwtint_t), bwt->bwt_size, fp); + fclose(fp); +} + +void bwt_dump_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +void bwt_restore_sa(const char *fn, bwt_t *bwt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + + fp = xopen(fn, "rb"); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); + fread(skipped, sizeof(bwtint_t), 4, fp); // skip + fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + + bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; + 
bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa[0] = -1; + + fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +bwt_t *bwt_restore_bwt(const char *fn) +{ + bwt_t *bwt; + FILE *fp; + + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + fp = xopen(fn, "rb"); + fseek(fp, 0, SEEK_END); + bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); + fseek(fp, 0, SEEK_SET); + fread(&bwt->primary, sizeof(bwtint_t), 1, fp); + fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fread(bwt->bwt, 4, bwt->bwt_size, fp); + bwt->seq_len = bwt->L2[4]; + fclose(fp); + bwt_gen_cnt_table(bwt); + + return bwt; +} + +void bwt_destroy(bwt_t *bwt) +{ + if (bwt == 0) return; + free(bwt->sa); free(bwt->bwt); + free(bwt); +} diff --git a/bwtmisc.c b/bwtmisc.c new file mode 100644 index 0000000..1082065 --- /dev/null +++ b/bwtmisc.c @@ -0,0 +1,267 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include "bntseq.h" +#include "utils.h" +#include "main.h" +#include "bwt.h" + +#ifdef _DIVBWT +#include "divsufsort.h" +#endif + +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) +{ + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + fseek(fp, -1, SEEK_END); + pac_len = ftell(fp); + fread(&c, 1, 1, fp); + fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 
0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + fread(buf2, 1, pac_size, fp); + fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) +{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) +{ + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * 4; // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += 4; + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) +{ + bwt_t *bwt; + if 
(argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +void bwa_pac_rev_core(const char *fn, const char *fn_rev) +{ + int64_t seq_len, i; + bwtint_t pac_len, j; + ubyte_t *bufin, *bufout, ct; + FILE *fp; + seq_len = bwa_seq_len(fn); + pac_len = (seq_len >> 2) + 1; + bufin = (ubyte_t*)calloc(pac_len, 1); + bufout = (ubyte_t*)calloc(pac_len, 1); + fp = xopen(fn, "rb"); + fread(bufin, 1, pac_len, fp); + fclose(fp); + for (i = seq_len - 1, j = 0; i >= 0; --i) { + int c = bufin[i>>2] >> ((~i&3)<<1) & 3; + bwtint_t j = seq_len - 1 - i; + bufout[j>>2] |= c << ((~j&3)<<1); + } + free(bufin); + fp = xopen(fn_rev, "wb"); + fwrite(bufout, 1, pac_len, fp); + ct = seq_len % 4; + fwrite(&ct, 1, 1, fp); + fclose(fp); + free(bufout); +} + +int bwa_pac_rev(int argc, char *argv[]) +{ + if (argc < 3) { + fprintf(stderr, "Usage: bwa pac_rev \n"); + return 1; + } + bwa_pac_rev_core(argv[1], argv[2]); + return 0; +} + +const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; + +/* this function is not memory efficient, but this will make life easier + Ideally we should also change .amb files as one 'N' in the nucleotide + sequence leads to two ambiguous colors. I may do this later... 
*/ +uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) +{ + uint8_t *pac, *cspac; + bwtint_t i; + int c1, c2; + pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); + cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + rewind(bns->fp_pac); + c1 = pac[0]>>6; cspac[0] = c1<<6; + for (i = 1; i < bns->l_pac; ++i) { + c2 = pac[i>>2] >> (~i&3)*2 & 3; + cspac[i>>2] |= nst_color_space_table[(1< \n"); + return 1; + } + bns = bns_restore(argv[1]); + cspac = bwa_pac2cspac_core(bns); + bns_dump(bns, argv[2]); + // now write cspac + str = (char*)calloc(strlen(argv[2]) + 5, 1); + strcat(strcpy(str, argv[2]), ".pac"); + fp = xopen(str, "wb"); + fwrite(cspac, 1, bns->l_pac/4 + 1, fp); + ct = bns->l_pac % 4; + fwrite(&ct, 1, 1, fp); + fclose(fp); + bns_destroy(bns); + free(cspac); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} diff --git a/bwtsw2.h b/bwtsw2.h new file mode 100644 index 0000000..d5dbe71 --- /dev/null +++ b/bwtsw2.h @@ -0,0 +1,51 @@ +#ifndef LH3_BWTSW2_H +#define LH3_BWTSW2_H + +#include +#include "bntseq.h" +#include "bwt_lite.h" +#include "bwt.h" + +typedef struct { + int a, b, q, r, t, qr, bw; + int z, is, t_seeds, hard_clip; + float yita, mask_level, coef; + int n_threads, chunk_size; +} bsw2opt_t; + +typedef struct { + uint32_t k, l, flag:18, n_seeds:14; + int len, G, G2; + int beg, end; +} bsw2hit_t; + +typedef struct { + int n, max; + bsw2hit_t *hits; + int *n_cigar; + uint32_t **cigar; +} bwtsw2_t; + +typedef struct { + void *stack; + int max_l; + uint8_t *aln_mem; +} bsw2global_t; + +#ifdef __cplusplus 
+extern "C" { +#endif + + bsw2opt_t *bsw2_init_opt(); + bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); + void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn); + void bsw2_destroy(bwtsw2_t *b); + + bsw2global_t *bsw2_global_init(); + void bsw2_global_destroy(bsw2global_t *_pool); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c new file mode 100644 index 0000000..c8915cc --- /dev/null +++ b/bwtsw2_aux.c @@ -0,0 +1,650 @@ +#include +#include +#include +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifdef HAVE_PTHREAD +#include +#endif +#include "bntseq.h" +#include "bwt_lite.h" +#include "utils.h" +#include "bwtsw2.h" +#include "stdaln.h" +#include "kstring.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +#include "ksort.h" +#define __left_lt(a, b) ((a).end > (b).end) +KSORT_INIT(hit, bsw2hit_t, __left_lt) + +extern unsigned char nst_nt4_table[256]; + +unsigned char nt_comp_table[256] = { + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', + 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', + 'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n', + 'n','n','y','s', 'a','n','b','w', 'x','r','n}; + +extern int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS); +extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); + +bsw2opt_t *bsw2_init_opt() +{ + bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); + o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; + o->bw = 50; + o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; + o->mask_level = 0.50f; o->yita = 5.5f; o->coef 
= 5.5f; + o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; + return o; +} + +void bsw2_destroy(bwtsw2_t *b) +{ + int i; + if (b == 0) return; + if (b->cigar) + for (i = 0; i < b->n; ++i) free(b->cigar[i]); + free(b->cigar); free(b->n_cigar); free(b->hits); + free(b); +} + +#define __gen_ap(par, opt) do { \ + int i; \ + for (i = 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \ + for (i = 0; i < 4; ++i) (par).matrix[i*5+i] = (opt)->a; \ + (par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \ + (par).gap_end = (opt)->r; \ + (par).row = 5; (par).band_width = opt->bw; \ + } while (0) + +#define __rpac(pac, l, i) (pac[(l-i-1)>>2] >> (~(l-i-1)&3)*2 & 0x3) + +void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem) +{ + int i, matrix[25]; + bwtint_t k; + uint8_t *target = 0, *query; + AlnParam par; + + par.matrix = matrix; + __gen_ap(par, opt); + query = calloc(lq, 1); + // sort according to the descending order of query end + ks_introsort(hit, b->n, b->hits); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + // reverse _query + for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i]; + // core loop + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; + int score, j; + path_t path; + p->n_seeds = 1; + if (p->l || p->k == 0) continue; + for (j = score = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) { + if (q->n_seeds < (1<<14) - 2) ++q->n_seeds; + ++score; + } + } + if (score) continue; + if (lt > p->k) lt = p->k; + if (is_rev) { + for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! + target[j++] = __rpac(pac, l_pac, k); + } else { + for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! 
+ target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; + } + lt = j; + score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); + if (score > p->G) { // extensible + p->G = score; + p->len += path.i; + p->beg -= path.j; + p->k -= path.i; + } + } + free(query); free(target); +} + +void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem) +{ + int i, matrix[25]; + uint32_t k; + uint8_t *target; + AlnParam par; + + par.matrix = matrix; + __gen_ap(par, opt); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; + int j, score; + path_t path; + if (p->l) continue; + if (is_rev) { + for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) + target[j++] = __rpac(pac, l_pac, k); + } else { + for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) + target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; + } + lt = j; + score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); +// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); + if (score >= p->G) { + p->G = score; + p->len = path.i; + p->end = path.j + p->beg; + } + } + free(target); +} + +/* generate CIGAR array(s) in b->cigar[] */ +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b) +{ + uint8_t *target; + int i, matrix[25]; + AlnParam par; + path_t *path; + + par.matrix = matrix; + __gen_ap(par, opt); + i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length + target = calloc(i, 1); + path = calloc(i + lq, sizeof(path_t)); + // memory clean up for b + if (b->n < b->max) { + b->max = b->n; + b->hits = realloc(b->hits, b->n * sizeof(bsw2hit_t)); + } + if (b->cigar) free(b->cigar); + if (b->n_cigar) free(b->n_cigar); + b->cigar = 
(uint32_t**)calloc(b->max, sizeof(void*)); + b->n_cigar = (int*)calloc(b->max, sizeof(int)); + // generate CIGAR + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + uint8_t *query; + uint32_t k; + int score, path_len, beg, end; + if (p->l) continue; + beg = (p->flag & 0x10)? lq - p->end : p->beg; + end = (p->flag & 0x10)? lq - p->beg : p->end; + query = seq[(p->flag & 0x10)? 1 : 0] + beg; + for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here + target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; + score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); + b->cigar[i] = aln_path2cigar32(path, path_len, &b->n_cigar[i]); + if (beg != 0 || end < lq) { // write soft clipping + b->cigar[i] = realloc(b->cigar[i], 4 * (b->n_cigar[i] + 2)); + if (beg != 0) { + memmove(b->cigar[i] + 1, b->cigar[i], b->n_cigar[i] * 4); + b->cigar[i][0] = beg<<4 | 4; + ++b->n_cigar[i]; + } + if (end < lq) { + b->cigar[i][b->n_cigar[i]] = (lq - end)<<4 | 4; + ++b->n_cigar[i]; + } + } + } + free(target); free(path); +} + +/* this is for the debugging purpose only */ +void bsw2_debug_hits(const bwtsw2_t *b) +{ + int i; + printf("# raw hits: %d\n", b->n); + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + if (p->l == 0) + printf("%d, %d, %d, %u, %u\n", p->G, p->beg, p->end, p->k, p->l); + } +} + +static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) +{ + int i; + if (b[0]->n + b[1]->n > b[0]->max) { + b[0]->max = b[0]->n + b[1]->n; + b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); + } + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = b[0]->hits + b[0]->n + i; + *p = b[1]->hits[i]; + if (is_reverse) { + int x = p->beg; + p->beg = l - p->end; + p->end = l - x; + p->flag |= 0x10; + } + } + b[0]->n += b[1]->n; + bsw2_destroy(b[1]); + b[1] = 0; +} +/* seq[0] is the forward sequence and seq[1] is the reverse complement. 
*/ +static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, + int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool) +{ + extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); + bwtsw2_t *b[2], **bb[2]; + int k; + for (k = 0; k < 2; ++k) { + bwtl_t *query = bwtl_seq2bwtl(l, seq[k]); + bb[k] = bsw2_core(opt, query, target, pool); + bwtl_destroy(query); + } + b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" + bsw2_chain_filter(opt, l, b); + for (k = 0; k < 2; ++k) { + bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); + merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here + bsw2_resolve_duphits(0, bb[k][0], 0); + bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); + b[k] = bb[k][0]; + free(bb[k]); + } + merge_hits(b, l, 1); // again, b[1] is merged to b[0] + bsw2_resolve_query_overlaps(b[0], opt->mask_level); + return b[0]; +} + +/* set ->flag to records the origin of the hit (to forward bwt or reverse bwt) */ +static void flag_fr(bwtsw2_t *b[2]) +{ + int i, j; + for (i = 0; i < b[0]->n; ++i) { + bsw2hit_t *p = b[0]->hits + i; + p->flag |= 0x10000; + } + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = b[1]->hits + i; + p->flag |= 0x20000; + } + for (i = 0; i < b[0]->n; ++i) { + bsw2hit_t *p = b[0]->hits + i; + for (j = 0; j < b[1]->n; ++j) { + bsw2hit_t *q = b[1]->hits + i; + if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) { + q->flag |= 0x30000; p->flag |= 0x30000; + break; + } + } + } +} + +typedef struct { + int l, tid; + char *name, *seq, *qual, *sam; +} bsw2seq1_t; + +typedef struct { + int n, max; + bsw2seq1_t *seq; +} bsw2seq_t; + +#ifdef HAVE_PTHREAD +static pthread_mutex_t g_dbwtsw_lock = PTHREAD_MUTEX_INITIALIZER; +#endif + +static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) +{ + // 
FIXME: this routine does not work if the query bridge three reference sequences + int32_t coor, refl, lq; + int x, y, i, seqid; + bns_coor_pac2real(bns, p->k, p->len, &seqid); + coor = p->k - bns->anns[seqid].offset; + refl = bns->anns[seqid].len; + x = coor; y = 0; + // test if the alignment goes beyond the boundary + for (i = 0; i < n_cigar; ++i) { + int op = cigar[i]&0xf, ln = cigar[i]>>4; + if (op == 1 || op == 4 || op == 5) y += ln; + else if (op == 2) x += ln; + else x += ln, y += ln; + } + lq = y; // length of the query sequence + if (x > refl) { // then fix it + int j, nc, mq[2], nlen[2]; + uint32_t *cn, kk = 0; + nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0; + cn = calloc(n_cigar + 3, 4); + x = coor; y = 0; + for (i = j = 0; i < n_cigar; ++i) { + int op = cigar[i]&0xf, ln = cigar[i]>>4; + if (op == 4 || op == 5 || op == 1) { // ins or clipping + y += ln; + cn[j++] = cigar[i]; + } else if (op == 2) { // del + if (x + ln >= refl && nc == 0) { + cn[j++] = (uint32_t)(lq - y)<<4 | 4; + nc = j; + cn[j++] = (uint32_t)y<<4 | 4; + kk = p->k + (x + ln - refl); + nlen[0] = x - coor; + nlen[1] = p->len - nlen[0] - ln; + } else cn[j++] = cigar[i]; + x += ln; + } else if (op == 0) { // match + if (x + ln >= refl && nc == 0) { + // FIXME: not consider a special case where a split right between M and I + cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M + cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S + nc = j; + mq[0] += refl - x; + cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4; + if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0; + mq[1] += x + ln - refl; + kk = bns->anns[seqid].offset + refl; + nlen[0] = refl - coor; + nlen[1] = p->len - nlen[0]; + } else { + cn[j++] = cigar[i]; + mq[nc?1:0] += ln; + } + x += ln; y += ln; + } + } + if (mq[0] > mq[1]) { // then take the first alignment + n_cigar = nc; + memcpy(cigar, cn, 4 * nc); + p->len = nlen[0]; + } else { + p->k = kk; p->len = nlen[1]; + n_cigar = j - nc; + memcpy(cigar, cn + nc, 4 * (j - 
nc)); + } + free(cn); + } + return n_cigar; +} + +/* generate SAM lines for a sequence in ks with alignment stored in + * b. ks->name and ks->seq will be freed and set to NULL in the end. */ +static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b) +{ + int i, k; + kstring_t str; + memset(&str, 0, sizeof(kstring_t)); + if (b == 0 || b->n == 0) { // no hits + ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name); + for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str); + if (ks->qual) { + kputc('\t', &str); + for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str); + } else kputs("\t*", &str); + kputc('\n', &str); + } + for (i = 0; b && i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int32_t seqid = -1, coor = -1; + int j, qual, nn = 0; + int beg, end; + if (p->l == 0) { + b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]); + nn = bns_coor_pac2real(bns, p->k, p->len, &seqid); + coor = p->k - bns->anns[seqid].offset; + } + ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10); + ksprintf(&str, "\t%s\t%d", seqid>=0? bns->anns[seqid].name : "*", coor + 1); + if (p->l == 0) { + { // estimate mapping quality + float c = 1.0; + int subo = p->G2 > opt->t? p->G2 : opt->t; + if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; + if (p->n_seeds < 2) c *= .2; + qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); + if (qual > 250) qual = 250; + if (p->flag&1) qual = 0; + } + ksprintf(&str, "\t%d\t", qual); + for (k = 0; k < b->n_cigar[i]; ++k) + ksprintf(&str, "%d%c", b->cigar[i][k]>>4, (opt->hard_clip? 
"MIDNHHP" : "MIDNSHP")[b->cigar[i][k]&0xf]); + } else ksprintf(&str, "\t0\t*"); + ksprintf(&str, "\t*\t0\t0\t"); + beg = 0; end = ks->l; + if (opt->hard_clip) { + if ((b->cigar[i][0]&0xf) == 4) beg += b->cigar[i][0]>>4; + if ((b->cigar[i][b->n_cigar[i]-1]&0xf) == 4) end -= b->cigar[i][b->n_cigar[i]-1]>>4; + } + for (j = beg; j < end; ++j) { + if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str); + else kputc(ks->seq[j], &str); + } + if (ks->qual) { + kputc('\t', &str); + for (j = beg; j < end; ++j) { + if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str); + else kputc(ks->qual[j], &str); + } + } else ksprintf(&str, "\t*"); + ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn); + if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); + kputc('\n', &str); + } + ks->sam = str.s; + free(ks->seq); ks->seq = 0; + free(ks->qual); ks->qual = 0; + free(ks->name); ks->name = 0; +} + +/* Core routine to align reads in _seq. 
It is separated from + * process_seqs() to realize multi-threading */ +static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2]) +{ + int x; + bsw2opt_t opt = *_opt; + bsw2global_t *pool = bsw2_global_init(); + for (x = 0; x < _seq->n; ++x) { + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2], *rseq[2]; + int i, l, k; + bwtsw2_t *b[2]; + l = p->l; + +#ifdef HAVE_PTHREAD + if (_opt->n_threads > 1) { + pthread_mutex_lock(&g_dbwtsw_lock); + if (p->tid < 0) p->tid = tid; + else if (p->tid != tid) { + pthread_mutex_unlock(&g_dbwtsw_lock); + continue; + } // in pinciple else should not happen + pthread_mutex_unlock(&g_dbwtsw_lock); + } +#endif + + // set opt->t + opt.t = _opt->t; + if (opt.t < log(l) * opt.coef) opt.t = (int)(log(l) * opt.coef + .499); + if (pool->max_l < l) { // then enlarge working space for aln_extend_core() + int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; + pool->max_l = l; + pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); + } + // set opt->bw + opt.bw = _opt->bw; + k = (l * opt.a - 2 * opt.q) / (2 * opt.r + opt.a); + i = (l * opt.a - opt.a - opt.t) / opt.r; + if (k > i) k = i; + if (k < 1) k = 1; // I do not know if k==0 causes troubles + opt.bw = _opt->bw < k? 
_opt->bw : k; + // set seq[2] and rseq[2] + seq[0] = calloc(l * 4, 1); + seq[1] = seq[0] + l; + rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l; + // convert sequences to 2-bit representation + for (i = k = 0; i < l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled + seq[0][i] = c; + seq[1][l-1-i] = 3 - c; + rseq[0][l-1-i] = c; + rseq[1][i] = 3 - c; + } + if (l - k < opt.t) { // too few unambiguous bases + print_hits(bns, &opt, p, 0); + free(seq[0]); continue; + } + // alignment + b[0] = bsw2_aln1_core(&opt, bns, pac, target[0], l, seq, 0, pool); + for (k = 0; k < b[0]->n; ++k) + if (b[0]->hits[k].n_seeds < opt.t_seeds) break; + if (k < b[0]->n) { + b[1] = bsw2_aln1_core(&opt, bns, pac, target[1], l, rseq, 1, pool); + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = b[1]->hits + i; + int x = p->beg; + p->beg = l - p->end; + p->end = l - x; + if (p->l == 0) p->k = bns->l_pac - (p->k + p->len); + } + flag_fr(b); + merge_hits(b, l, 0); + bsw2_resolve_duphits(0, b[0], 0); + bsw2_resolve_query_overlaps(b[0], opt.mask_level); + } else b[1] = 0; + // generate CIGAR and print SAM + gen_cigar(&opt, l, seq, pac, b[0]); + print_hits(bns, &opt, p, b[0]); + // free + free(seq[0]); + bsw2_destroy(b[0]); + } + bsw2_global_destroy(pool); +} + +#ifdef HAVE_PTHREAD +typedef struct { + int tid; + bsw2seq_t *_seq; + const bsw2opt_t *_opt; + const bntseq_t *bns; + uint8_t *pac; + bwt_t *target[2]; +} thread_aux_t; + +/* another interface to bsw2_aln_core() to facilitate pthread_create() */ +static void *worker(void *data) +{ + thread_aux_t *p = (thread_aux_t*)data; + bsw2_aln_core(p->tid, p->_seq, p->_opt, p->bns, p->pac, p->target); + return 0; +} +#endif + +/* process sequences stored in _seq, generate SAM lines for these + * sequences and reset _seq afterwards. 
*/ +static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2]) +{ + int i; + +#ifdef HAVE_PTHREAD + if (opt->n_threads <= 1) { + bsw2_aln_core(0, _seq, opt, bns, pac, target); + } else { + pthread_t *tid; + pthread_attr_t attr; + thread_aux_t *data; + int j; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (j = 0; j < opt->n_threads; ++j) { + thread_aux_t *p = data + j; + p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; + p->pac = pac; p->target[0] = target[0]; p->target[1] = target[1]; + pthread_create(&tid[j], &attr, worker, p); + } + for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + free(data); free(tid); + } +#else + bsw2_aln_core(0, _seq, opt, bns, pac, target); +#endif + + // print and reset + for (i = 0; i < _seq->n; ++i) { + bsw2seq1_t *p = _seq->seq + i; + if (p->sam) printf("%s", p->sam); + free(p->name); free(p->seq); free(p->qual); free(p->sam); + p->tid = -1; p->l = 0; + p->name = p->seq = p->qual = p->sam = 0; + } + fflush(stdout); + _seq->n = 0; +} + +void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn) +{ + gzFile fp; + kseq_t *ks; + int l, size = 0; + uint8_t *pac; + bsw2seq_t *_seq; + + pac = calloc(bns->l_pac/4+1, 1); + if (pac == 0) { + fprintf(stderr, "[bsw2_aln] insufficient memory!\n"); + return; + } + for (l = 0; l < bns->n_seqs; ++l) + printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + fp = xzopen(fn, "r"); + ks = kseq_init(fp); + _seq = calloc(1, sizeof(bsw2seq_t)); + while ((l = kseq_read(ks)) >= 0) { + bsw2seq1_t *p; + if (_seq->n == _seq->max) { + _seq->max = _seq->max? 
_seq->max<<1 : 1024; + _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); + } + p = &_seq->seq[_seq->n++]; + p->tid = -1; + p->l = l; + p->name = strdup(ks->name.s); + p->seq = strdup(ks->seq.s); + p->qual = ks->qual.l? strdup(ks->qual.s) : 0; + p->sam = 0; + size += l; + if (size > opt->chunk_size) { + fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size); + process_seqs(_seq, opt, bns, pac, target); + size = 0; + } + } + fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size); + process_seqs(_seq, opt, bns, pac, target); + free(_seq->seq); free(_seq); + kseq_destroy(ks); + gzclose(fp); + free(pac); +} diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c new file mode 100644 index 0000000..c734657 --- /dev/null +++ b/bwtsw2_chain.c @@ -0,0 +1,107 @@ +#include +#include "bwtsw2.h" + +typedef struct { + uint32_t tbeg, tend; + int qbeg, qend; + uint32_t flag:1, idx:31; + int chain; // also reuse as a counter +} hsaip_t; + +#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg) + +#include "ksort.h" +KSORT_INIT(hsaip, hsaip_t, _hsaip_lt) + +static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain) +{ + int j, k, m = 0; + ks_introsort(hsaip, n, z); + for (j = 0; j < n; ++j) { + hsaip_t *p = z + j; + for (k = m - 1; k >= 0; --k) { + hsaip_t *q = chain + k; + int x = p->qbeg - q->qbeg; // always positive + int y = p->tbeg - q->tbeg; + if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) { + if (p->qend > q->qend) q->qend = p->qend; + if (p->tend > q->tend) q->tend = p->tend; + ++q->chain; + p->chain = shift + k; + break; + } + } + if (k < 0) { + chain[m] = *p; + chain[m].chain = 1; + chain[m].idx = p->chain = shift + m; + ++m; + } + } + return m; +} + +void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) +{ + hsaip_t *z[2], *chain[2]; + int i, j, k, n[2], m[2]; + char *flag; + // initialization + n[0] = b[0]->n; n[1] = b[1]->n; + z[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); + z[1] = z[0] + 
n[0]; + chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); + for (k = j = 0; k < 2; ++k) { + for (i = 0; i < b[k]->n; ++i) { + bsw2hit_t *p = b[k]->hits + i; + hsaip_t *q = z[k] + i; + q->flag = k; q->idx = i; + q->tbeg = p->k; q->tend = p->k + p->len; + q->chain = -1; + q->qbeg = p->beg; q->qend = p->end; + } + } + // chaining + m[0] = chaining(opt, 0, n[0], z[0], chain[0]); + chain[1] = chain[0] + m[0]; + m[1] = chaining(opt, m[0], n[1], z[1], chain[1]); + // change query coordinate on the reverse strand + for (k = 0; k < m[1]; ++k) { + hsaip_t *p = chain[1] + k; + int tmp = p->qbeg; + p->qbeg = len - p->qend; p->qend = len - tmp; + } + // filtering + flag = calloc(m[0] + m[1], 1); + ks_introsort(hsaip, m[0] + m[1], chain[0]); + for (k = 1; k < m[0] + m[1]; ++k) { + hsaip_t *p = chain[0] + k; + for (j = 0; j < k; ++j) { + hsaip_t *q = chain[0] + j; + if (flag[q->idx]) continue; + if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) { + flag[p->idx] = 1; + break; + } + } + } + for (k = 0; k < n[0] + n[1]; ++k) { + hsaip_t *p = z[0] + k; + if (flag[p->chain]) + b[p->flag]->hits[p->idx].G = 0; + } + free(flag); + // squeeze out filtered elements in b[2] + for (k = 0; k < 2; ++k) { + for (j = i = 0; j < n[k]; ++j) { + bsw2hit_t *p = b[k]->hits + j; + if (p->G) { + if (i != j) b[k]->hits[i++] = *p; + else ++i; + } + } + b[k]->n = i; + } + // free + free(z[0]); free(chain[0]); +} diff --git a/bwtsw2_core.c b/bwtsw2_core.c new file mode 100644 index 0000000..03360a3 --- /dev/null +++ b/bwtsw2_core.c @@ -0,0 +1,594 @@ +#include +#include +#include +#include +#include +#include "bwt_lite.h" +#include "bwtsw2.h" +#include "bwt.h" +#include "kvec.h" + +#include "khash.h" +KHASH_MAP_INIT_INT64(64, uint64_t) + +#define MINUS_INF -0x3fffffff +#define MASK_LEVEL 0.90f + +struct __mempool_t; +static void mp_destroy(struct __mempool_t*); +typedef struct { + uint32_t qk, ql; + int I, D, G; + uint32_t pj:2, qlen:30; + int tlen; + int ppos, upos; + int cpos[4]; +} 
bsw2cell_t; + +#include "ksort.h" +KSORT_INIT_GENERIC(int) +#define __hitG_lt(a, b) ((a).G > (b).G) +KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) + +static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; + +typedef struct { + int n, max; + uint32_t tk, tl; + bsw2cell_t *array; +} bsw2entry_t, *bsw2entry_p; + +/* --- BEGIN: Stack operations --- */ +typedef struct { + int n_pending; + kvec_t(bsw2entry_p) stack0, pending; + struct __mempool_t *pool; +} bsw2stack_t; + +#define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0) +static void stack_destroy(bsw2stack_t *s) { mp_destroy(s->pool); kv_destroy(s->stack0); kv_destroy(s->pending); free(s); } +inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e) { kv_push(bsw2entry_p, s->stack0, e); } +inline static bsw2entry_p stack_pop(bsw2stack_t *s) +{ + assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0)); + return kv_pop(s->stack0); +} +/* --- END: Stack operations --- */ + +/* --- BEGIN: memory pool --- */ +typedef struct __mempool_t { + int cnt; // if cnt!=0, then there must be memory leak + kvec_t(bsw2entry_p) pool; +} mempool_t; +inline static bsw2entry_p mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t)); + else return kv_pop(mp->pool); +} +inline static void mp_free(mempool_t *mp, bsw2entry_p e) +{ + --mp->cnt; e->n = 0; + kv_push(bsw2entry_p, mp->pool, e); +} +static void mp_destroy(struct __mempool_t *mp) +{ + int i; + for (i = 0; i != kv_size(mp->pool); ++i) { + free(kv_A(mp->pool, i)->array); + free(kv_A(mp->pool, i)); + } + kv_destroy(mp->pool); + free(mp); +} +/* --- END: memory pool --- */ + +/* --- BEGIN: utilities --- */ +static khash_t(64) *bsw2_connectivity(const bwtl_t *b) +{ + khash_t(64) *h; + uint32_t k, l, cntk[4], cntl[4]; + uint64_t x; + khiter_t iter; + int j, ret; + kvec_t(uint64_t) stack; + + kv_init(stack); + h = kh_init(64); + kh_resize(64, h, 
b->seq_len * 4); + x = b->seq_len; + kv_push(uint64_t, stack, x); + while (kv_size(stack)) { + x = kv_pop(stack); + k = x>>32; l = (uint32_t)x; + bwtl_2occ4(b, k-1, l, cntk, cntl); + for (j = 0; j != 4; ++j) { + k = b->L2[j] + cntk[j] + 1; + l = b->L2[j] + cntl[j]; + if (k > l) continue; + x = (uint64_t)k << 32 | l; + iter = kh_put(64, h, x, &ret); + if (ret) { // if not present + kh_value(h, iter) = 1; + kv_push(uint64_t, stack, x); + } else ++kh_value(h, iter); + } + } + kv_destroy(stack); + //fprintf(stderr, "[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h)); + return h; +} +// pick up top T matches at a node +static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux) +{ + int i, *a, n, x; + if (u->n <= T) return; + if (aux->max < u->n) { + aux->max = u->n; + aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t)); + } + a = (int*)aux->array; + for (i = n = 0; i != u->n; ++i) + if (u->array[i].ql && u->array[i].G > 0) + a[n++] = -u->array[i].G; + if (n <= T) return; + x = -ks_ksmall(int, n, a, T); + n = 0; + for (i = 0; i < u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->G == x) ++n; + if (p->G < x || (p->G == x && n >= T)) { + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1; + } + } +} +// remove duplicated cells +static inline void remove_duplicate(bsw2entry_t *u, khash_t(64) *hash) +{ + int i, ret, j; + khiter_t k; + uint64_t key; + kh_clear(64, hash); + for (i = 0; i != u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->ql == 0) continue; + key = (uint64_t)p->qk << 32 | p->ql; + k = kh_put(64, hash, key, &ret); + j = -1; + if (ret == 0) { + if ((uint32_t)kh_value(hash, k) >= p->G) j = i; + else { + j = kh_value(hash, k)>>32; + kh_value(hash, k) = (uint64_t)i<<32 | p->G; + } + } else kh_value(hash, k) = (uint64_t)i<<32 | p->G; + if (j >= 0) { + p = u->array + j; + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; + } + } +} +// merge two entries 
+static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b) +{ + int i; + if (u->n + v->n >= u->max) { + u->max = u->n + v->n; + u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t)); + } + for (i = 0; i != v->n; ++i) { + bsw2cell_t *p = v->array + i; + if (p->ppos >= 0) p->ppos += u->n; + if (p->cpos[0] >= 0) p->cpos[0] += u->n; + if (p->cpos[1] >= 0) p->cpos[1] += u->n; + if (p->cpos[2] >= 0) p->cpos[2] += u->n; + if (p->cpos[3] >= 0) p->cpos[3] += u->n; + } + memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t)); + u->n += v->n; +} + +static inline bsw2cell_t *push_array_p(bsw2entry_t *e) +{ + if (e->n == e->max) { + e->max = e->max? e->max<<1 : 256; + e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max); + } + return e->array + e->n; +} + +static inline double time_elapse(const struct rusage *curr, const struct rusage *last) +{ + long t1 = (curr->ru_utime.tv_sec - last->ru_utime.tv_sec) + (curr->ru_stime.tv_sec - last->ru_stime.tv_sec); + long t2 = (curr->ru_utime.tv_usec - last->ru_utime.tv_usec) + (curr->ru_stime.tv_usec - last->ru_stime.tv_usec); + return (double)t1 + t2 * 1e-6; +} +/* --- END: utilities --- */ + +/* --- BEGIN: processing partial hits --- */ +static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u) +{ + int i; + uint32_t k; + for (i = 0; i < u->n; ++i) { + bsw2cell_t *p = u->array + i; + if (p->G < thres) continue; + for (k = u->tk; k <= u->tl; ++k) { + int beg, end; + bsw2hit_t *q = 0; + beg = bwt->sa[k]; end = beg + p->tlen; + if (p->G > hits[beg*2].G) { + hits[beg*2+1] = hits[beg*2]; + q = hits + beg * 2; + } else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1; + if (q) { + q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G; + q->beg = beg; q->end = end; q->G2 = q->k == q->l? 
0 : q->G; + q->flag = q->n_seeds = 0; + } + } + } +} +/* "narrow hits" are node-to-node hits that have a high score and + * are not so repetitive (|SA interval|<=IS). */ +static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS) +{ + int i; + for (i = 0; i < u->n; ++i) { + bsw2hit_t *q; + bsw2cell_t *p = u->array + i; + if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit + if (b1->max == b1->n) { + b1->max = b1->max? b1->max<<1 : 4; + b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t)); + } + q = &b1->hits[b1->n++]; + q->k = p->qk; q->l = p->ql; + q->len = p->qlen; + q->G = p->G; q->G2 = 0; + q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen; + q->flag = 0; + // delete p + p->qk = p->ql = 0; p->G = 0; + if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; + } + } +} +/* after this, "narrow SA hits" will be expanded and the coordinates + * will be obtained and stored in b->hits[*].k. */ +int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS) +{ + int i, j, n; + if (b->n == 0) return 0; + if (bwt) { // convert to chromosomal coordinates if suitable + int old_n = b->n; + bsw2hit_t *old_hits = b->hits; + for (i = n = 0; i < b->n; ++i) { + bsw2hit_t *p = old_hits + i; + if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; + else if (p->G > 0) ++n; + } + b->n = b->max = n; + b->hits = calloc(b->max, sizeof(bsw2hit_t)); + for (i = j = 0; i < old_n; ++i) { + bsw2hit_t *p = old_hits + i; + if (p->l - p->k + 1 <= IS) { + bwtint_t k; + for (k = p->k; k <= p->l; ++k) { + b->hits[j] = *p; + b->hits[j].k = bwt_sa(bwt, k); + b->hits[j].l = 0; + ++j; + } + } else if (p->G > 0) { + b->hits[j] = *p; + b->hits[j].k = bwt_sa(bwt, p->k); + b->hits[j].l = 0; + b->hits[j].flag |= 1; + ++j; + } + } + free(old_hits); + } + ks_introsort(hitG, b->n, b->hits); + for (i = 1; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + if (p->G == 0) break; + for (j = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + int compatible = 1; + if (q->G 
== 0) continue; + if (p->l == 0 && q->l == 0) { + int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); + if (qol < 0) qol = 0; + if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) { + int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) + - (int64_t)(p->k > q->k? p->k : q->k); + if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL) + compatible = 0; + } + } + if (!compatible) { + p->G = 0; + break; + } + } + } + n = i; + for (i = j = 0; i < n; ++i) { + if (b->hits[i].G == 0) continue; + if (i != j) b->hits[j++] = b->hits[i]; + else ++j; + } + b->n = j; + return b->n; +} + +int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level) +{ + int i, j, n; + if (b->n == 0) return 0; + ks_introsort(hitG, b->n, b->hits); + { // choose a random one + int G0 = b->hits[0].G; + for (i = 1; i < b->n; ++i) + if (b->hits[i].G != G0) break; + j = (int)(i * drand48()); + if (j) { + bsw2hit_t tmp; + tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp; + } + } + for (i = 1; i < b->n; ++i) { + bsw2hit_t *p = b->hits + i; + int all_compatible = 1; + if (p->G == 0) break; + for (j = 0; j < i; ++j) { + bsw2hit_t *q = b->hits + j; + int64_t tol = 0; + int qol, compatible = 0; + float fol; + if (q->G == 0) continue; + qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); + if (qol < 0) qol = 0; + if (p->l == 0 && q->l == 0) { + tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) + - (p->k > q->k? p->k : q->k); + if (tol < 0) tol = 0; + } + fol = (float)qol / (p->end - p->beg < q->end - q->beg? 
p->end - p->beg : q->end - q->beg); + if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1; + if (!compatible) { + if (q->G2 < p->G) q->G2 = p->G; + all_compatible = 0; + } + } + if (!all_compatible) p->G = 0; + } + n = i; + for (i = j = 0; i < n; ++i) { + if (b->hits[i].G == 0) continue; + if (i != j) b->hits[j++] = b->hits[i]; + else ++j; + } + b->n = j; + return j; +} +/* --- END: processing partial hits --- */ + +/* --- BEGIN: global mem pool --- */ +bsw2global_t *bsw2_global_init() +{ + bsw2global_t *pool; + bsw2stack_t *stack; + pool = calloc(1, sizeof(bsw2global_t)); + stack = calloc(1, sizeof(bsw2stack_t)); + stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t)); + pool->stack = (void*)stack; + return pool; +} + +void bsw2_global_destroy(bsw2global_t *pool) +{ + stack_destroy((bsw2stack_t*)pool->stack); + free(pool->aln_mem); + free(pool); +} +/* --- END: global mem pool --- */ + +static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4]) +{ + int G = c[3]? c[3]->G + match_score : MINUS_INF; + if (c[1]) { + c[0]->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr; + if (c[0]->I > G) G = c[0]->I; + } else c[0]->I = MINUS_INF; + if (c[2]) { + c[0]->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr; + if (c[0]->D > G) G = c[0]->D; + } else c[0]->D = MINUS_INF; + return(c[0]->G = G); +} + +static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s) +{ + bsw2entry_t *u; + bsw2cell_t *x; + + u = mp_alloc(s->pool); + u->tk = 0; u->tl = target->seq_len; + x = push_array_p(u); + *x = g_default_cell; + x->G = 0; x->qk = 0; x->ql = query->seq_len; + u->n++; + stack_push0(s, u); +} +/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) 
*/ +bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) +{ + bsw2stack_t *stack = (bsw2stack_t*)pool->stack; + bwtsw2_t *b, *b1, **b_ret; + int i, j, score_mat[16], *heap, heap_size, n_tot = 0; + struct rusage curr, last; + khash_t(64) *rhash, *chash; + + // initialize connectivity hash (chash) + chash = bsw2_connectivity(target); + // calculate score matrix + for (i = 0; i != 4; ++i) + for (j = 0; j != 4; ++j) + score_mat[i<<2|j] = (i == j)? opt->a : -opt->b; + // initialize other variables + rhash = kh_init(64); + init_bwtsw2(target, query, stack); + heap_size = opt->z; + heap = calloc(heap_size, sizeof(int)); + // initialize the return struct + b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); + b->n = b->max = target->seq_len * 2; + b->hits = calloc(b->max, sizeof(bsw2hit_t)); + b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); + b_ret = calloc(2, sizeof(void*)); + b_ret[0] = b; b_ret[1] = b1; + // initialize timer + getrusage(0, &last); + // the main loop: traversal of the DAG + while (!stack_isempty(stack)) { + int old_n, tj; + bsw2entry_t *v; + uint32_t k, l, tcntk[4], tcntl[4]; + + v = stack_pop(stack); old_n = v->n; + n_tot += v->n; + + for (i = 0; i < v->n; ++i) { // test max depth and band width + bsw2cell_t *p = v->array + i; + if (p->ql == 0) continue; + if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) { + p->qk = p->ql = 0; + if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5; + } + } + + // get Occ for the DAG + bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl); + for (tj = 0; tj != 4; ++tj) { // descend to the children + uint32_t qcntk[4], qcntl[4]; + int qj, *curr_score_mat = score_mat + tj * 4; + khiter_t iter; + bsw2entry_t *u; + + k = target->L2[tj] + tcntk[tj] + 1; + l = target->L2[tj] + tcntl[tj]; + if (k > l) continue; + // update counter + iter = kh_get(64, chash, (uint64_t)k<<32 | l); + --kh_value(chash, iter); + // initialization + u = mp_alloc(stack->pool); + u->tk = k; 
u->tl = l; + memset(heap, 0, sizeof(int) * opt->z); + // loop through all the nodes in v + for (i = 0; i < v->n; ++i) { + bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G + int is_added = 0; + if (p->ql == 0) continue; // deleted node + c[0] = x = push_array_p(u); + x->G = MINUS_INF; + p->upos = x->upos = -1; + if (p->ppos >= 0) { // parent has been visited + c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0; + c[3] = v->array + p->ppos; c[2] = p; + if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x + x->ppos = v->array[p->ppos].upos; // the parent pos in u + p->upos = u->n++; // the current pos in u + if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u + is_added = 1; + } + } else { + x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr; + if (x->D > 0) { + x->G = x->D; + x->I = MINUS_INF; x->ppos = -1; + p->upos = u->n++; + is_added = 1; + } + } + if (is_added) { // x has been added to u->array. 
fill the remaining variables + x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; + x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1; + if (x->G > -heap[0]) { + heap[0] = -x->G; + ks_heapadjust(int, 0, heap_size, heap); + } + } + if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v + if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) { + bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl); + for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie + if (p->cpos[qj] != -1) continue; // this node will be visited later + k = query->L2[qj] + qcntk[qj] + 1; + l = query->L2[qj] + qcntl[qj]; + if (k > l) { p->cpos[qj] = -2; continue; } + x = push_array_p(v); + p = v->array + i; // p may not point to the correct position after realloc + x->G = x->I = x->D = MINUS_INF; + x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen; + x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; + p->cpos[qj] = v->n++; + } // ~for(qj) + } // ~if(p->cpos[]) + } // ~if + } // ~for(i) + if (u->n) save_hits(target, opt->t, b->hits, u); + { // push u to the stack (or to the pending array) + uint32_t cnt, pos; + cnt = (uint32_t)kh_value(chash, iter); + pos = kh_value(chash, iter)>>32; + if (pos) { // something in the pending array, then merge + bsw2entry_t *w = kv_A(stack->pending, pos-1); + if (u->n) { + if (w->n < u->n) { // swap + w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w; + } + merge_entry(opt, w, u, b); + } + if (cnt == 0) { // move from pending to stack0 + remove_duplicate(w, rhash); + save_narrow_hits(target, w, b1, opt->t, opt->is); + cut_tail(w, opt->z, u); + stack_push0(stack, w); + kv_A(stack->pending, pos-1) = 0; + --stack->n_pending; + } + mp_free(stack->pool, u); + } else if (cnt) { // the first time + if (u->n) { // push to the pending queue + ++stack->n_pending; + kv_push(bsw2entry_p, stack->pending, u); + 
kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt; + } else mp_free(stack->pool, u); + } else { // cnt == 0, then push to the stack + bsw2entry_t *w = mp_alloc(stack->pool); + save_narrow_hits(target, u, b1, opt->t, opt->is); + cut_tail(u, opt->z, w); + mp_free(stack->pool, w); + stack_push0(stack, u); + } + } + } // ~for(tj) + mp_free(stack->pool, v); + } // while(top) + getrusage(0, &curr); + bsw2_resolve_duphits(query, b, opt->is); + bsw2_resolve_duphits(query, b1, opt->is); + //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); + // free + free(heap); + kh_destroy(64, rhash); + kh_destroy(64, chash); + stack->pending.n = stack->stack0.n = 0; + return b_ret; +} diff --git a/bwtsw2_main.c b/bwtsw2_main.c new file mode 100644 index 0000000..a31800b --- /dev/null +++ b/bwtsw2_main.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include "bwt.h" +#include "bwtsw2.h" + +int bwa_bwtsw2(int argc, char *argv[]) +{ + bsw2opt_t *opt; + bwt_t *target[2]; + char buf[1024]; + bntseq_t *bns; + int c; + + opt = bsw2_init_opt(); + srand48(11); + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:y:s:c:N:Hf:")) >= 0) { + switch (c) { + case 'q': opt->q = atoi(optarg); break; + case 'r': opt->r = atoi(optarg); break; + case 'a': opt->a = atoi(optarg); break; + case 'b': opt->b = atoi(optarg); break; + case 'w': opt->bw = atoi(optarg); break; + case 'T': opt->t = atoi(optarg); break; + case 't': opt->n_threads = atoi(optarg); break; + case 'z': opt->z = atoi(optarg); break; + case 'y': opt->yita = atof(optarg); break; + case 's': opt->is = atoi(optarg); break; + case 'm': opt->mask_level = atof(optarg); break; + case 'c': opt->coef = atof(optarg); break; + case 'N': opt->t_seeds = atoi(optarg); break; + case 'H': opt->hard_clip = 1; break; + case 'f': freopen(optarg, "w", stdout); + } + } + opt->qr = opt->q + opt->r; + + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa bwasw 
[options] \n\n"); + fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); + fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); + fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); + fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); +// fprintf(stderr, " -y FLOAT error recurrence coef. (4..16) [%.1f]\n", opt->yita); + fprintf(stderr, "\n"); + fprintf(stderr, " -t INT nmber of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -s INT size of a chunk of reads [%d]\n", opt->chunk_size); + fprintf(stderr, "\n"); + fprintf(stderr, " -w INT band width [%d]\n", opt->bw); + fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); + fprintf(stderr, "\n"); + fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); + fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); + fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); + fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds); + fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); + fprintf(stderr, " -H in SAM output, use hard clipping rather than soft\n"); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, "\n"); + + { + double c, theta, eps, delta; + c = opt->a / log(opt->yita); + theta = exp(-opt->b / c) / opt->yita; + eps = exp(-opt->q / c); + delta = exp(-opt->r / c); + fprintf(stderr, "mismatch: %lf, gap_open: %lf, gap_ext: %lf\n\n", + theta, eps, delta); + } + return 1; + } + + // adjust opt for opt->a + opt->t *= opt->a; + opt->coef *= opt->a; + + strcpy(buf, argv[optind]); target[0] = bwt_restore_bwt(strcat(buf, ".bwt")); + strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target[0]); + strcpy(buf, argv[optind]); target[1] = bwt_restore_bwt(strcat(buf, ".rbwt")); + strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".rsa"), target[1]); + bns = bns_restore(argv[optind]); + + bsw2_aln(opt, bns, 
target, argv[optind+1]); + + bns_destroy(bns); + bwt_destroy(target[0]); bwt_destroy(target[1]); + free(opt); + + return 0; +} diff --git a/cs2nt.c b/cs2nt.c new file mode 100644 index 0000000..dfbce60 --- /dev/null +++ b/cs2nt.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include "bwtaln.h" +#include "stdaln.h" + +/* + Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we + decode as ATTGAC(RBGOG), there are one color change and one nt change; + if we decode as ATTAAC(RBRBG), there are two color changes. + + In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM + as the penalty; otherwise, we will use color quality as the + penalty. This means we always prefer two consistent color changes over + a nt change, but if a color has high quality, we may prefer one nt + change. + + In the above example, the penalties of the two types of decoding are + q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first; + otherwise the second. Note that no matter what we choose, the fourth + base will get a low nt quality. 
+ */ + +#define COLOR_MM 19 +#define NUCL_MM 25 + +static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 }; + +/* + {A,C,G,T,N} -> {0,1,2,3,4} + nt_ref[0..size]: nucleotide reference: 0/1/2/3/4 + cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N + nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned) + btarray[0..4*size]: backtrack array (working space) + */ +void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray) +{ + int h[8], curr, last; + int x, y, xmin, hmin, k; + + // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last + + // recursion: initial value + if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2); + else { + for (x = 0; x != 4; ++x) h[x] = NUCL_MM; + h[nt_ref[0]] = 0; + } + // recursion: main loop + curr = 1; last = 0; + for (k = 1; k <= size; ++k) { + for (x = 0; x != 4; ++x) { + int min = 0x7fffffff, ymin = 0; + for (y = 0; y != 4; ++y) { + int s = h[last<<2|y]; + if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<= 0; --k) + nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]]; +} +/* + nt_read[0..size]: nucleotide read sequence: 0/1/2/3 + cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N + tarray[0..size*2-1]: temporary array + */ +uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray) +{ + int k, c1, c2; + uint8_t *t2array = tarray + size; + // get the color sequence of nt_read + c1 = nt_read[0]; + for (k = 1; k <= size; ++k) { + c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case + tarray[k-1] = (c1 >= 4 || c2 >= 4)? 
4 : nst_ntnt2cs_table[1<>6 && tarray[k] == cs_read[k]>>6) { + q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10; + } else if (tarray[k-1] == cs_read[k-1]>>6) { + q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f); + } else if (tarray[k] == cs_read[k]>>6) { + q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f); + } // else, q = 0 + if (q < 0) q = 0; + if (q > 60) q = 60; + t2array[k] = nt_read[k]<<6 | q; + if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0; + } + return t2array + 1; // of size-2 +} + +// this function will be called when p->seq has been reversed by refine_gapped() +void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac) +{ + uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read; + int i, len; + uint8_t *seq; + + // set temporary arrays + if (p->type == BWA_TYPE_NO_MATCH) return; + len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space + ta = (uint8_t*)malloc(len * 7); + nt_ref = ta; + cs_read = nt_ref + len; + nt_read = cs_read + len; + btarray = nt_read + len; + tarray = nt_read + len; + +#define __gen_csbase(_cs, _i, _seq) do { \ + int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \ + if (q > 60) q = 60; \ + if (_seq[_i] > 3) q = 63; \ + (_cs) = _seq[_i]<<6 | q; \ + } while (0) + + // generate len, nt_ref[] and cs_read + seq = p->strand? p->rseq : p->seq; + nt_ref[0] = p->pos? 
bns_pac(pac, p->pos-1) : 4; + if (p->cigar == 0) { // no gap or clipping + len = p->len; + for (i = 0; i < p->len; ++i) { + __gen_csbase(cs_read[i], i, seq); + nt_ref[i+1] = bns_pac(pac, p->pos + i); + } + } else { + int k, z; + bwtint_t x, y; + x = p->pos; y = 0; + for (k = z = 0; k < p->n_cigar; ++k) { + int l = __cigar_len(p->cigar[k]); + if (__cigar_op(p->cigar[k]) == FROM_M) { + for (i = 0; i < l; ++i, ++x, ++y) { + __gen_csbase(cs_read[z], y, seq); + nt_ref[z+1] = bns_pac(pac, x); + ++z; + } + } else if (__cigar_op(p->cigar[k]) == FROM_I) { + for (i = 0; i < l; ++i, ++y) { + __gen_csbase(cs_read[z], y, seq); + nt_ref[z+1] = 4; + ++z; + } + } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l; + else x += l; + } + len = z; + } + + cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray); + new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray); + + // update p + p->len = p->full_len = len - 1; + for (i = 0; i < p->len; ++i) { + if ((new_nt_read[i]&0x3f) == 63) { + p->qual[i] = 33; seq[i] = 4; + } else { + p->qual[i] = (new_nt_read[i]&0x3f) + 33; + seq[i] = new_nt_read[i]>>6; + } + } + p->qual[p->len] = seq[p->len] = 0; + if (p->strand) { + memcpy(p->seq, seq, p->len); + seq_reverse(p->len, p->seq, 1); + seq_reverse(p->len, p->qual, 0); + } else { + memcpy(p->rseq, seq, p->len); + seq_reverse(p->len, p->rseq, 1); + } + free(ta); +} diff --git a/is.c b/is.c new file mode 100644 index 0000000..9e50faf --- /dev/null +++ b/is.c @@ -0,0 +1,218 @@ +/* + * sais.c for sais-lite + * Copyright (c) 2008 Yuta Mori All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +typedef unsigned char ubyte_t; +#define chr(i) (cs == sizeof(int) ? 
((const int *)T)[i]:((const unsigned char *)T)[i]) + +/* find the start or end of each bucket */ +static void getCounts(const unsigned char *T, int *C, int n, int k, int cs) +{ + int i; + for (i = 0; i < k; ++i) C[i] = 0; + for (i = 0; i < n; ++i) ++C[chr(i)]; +} +static void getBuckets(const int *C, int *B, int k, int end) +{ + int i, sum = 0; + if (end) { + for (i = 0; i < k; ++i) { + sum += C[i]; + B[i] = sum; + } + } else { + for (i = 0; i < k; ++i) { + sum += C[i]; + B[i] = sum - C[i]; + } + } +} + +/* compute SA */ +static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs) +{ + int *b, i, j; + int c0, c1; + /* compute SAl */ + if (C == B) getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 0); /* find starts of buckets */ + j = n - 1; + b = SA + B[c1 = chr(j)]; + *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; + for (i = 0; i < n; ++i) { + j = SA[i], SA[i] = ~j; + if (0 < j) { + --j; + if ((c0 = chr(j)) != c1) { + B[c1] = b - SA; + b = SA + B[c1 = c0]; + } + *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; + } + } + /* compute SAs */ + if (C == B) getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) { + if (0 < (j = SA[i])) { + --j; + if ((c0 = chr(j)) != c1) { + B[c1] = b - SA; + b = SA + B[c1 = c0]; + } + *--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j; + } else SA[i] = ~j; + } +} + +/* + * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working + * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet + */ +static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs) +{ + int *C, *B, *RA; + int i, j, c, m, p, q, plen, qlen, name; + int c0, c1; + int diff; + + /* stage 1: reduce the problem by at least 1/2 sort all the + * S-substrings */ + if (k <= fs) { + C = SA + n; + B = (k <= (fs - k)) ? 
C + k : C; + } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; + getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = 0; i < n; ++i) SA[i] = 0; + for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) SA[--B[c1]] = i + 1, c = 0; + } + induceSA(T, SA, C, B, n, k, cs); + if (fs < k) free(C); + /* compact all the sorted substrings into the first m items of SA + * 2*m must be not larger than n (proveable) */ + for (i = 0, m = 0; i < n; ++i) { + p = SA[i]; + if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) { + for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j); + if ((j < n) && (c0 < c1)) SA[m++] = p; + } + } + for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */ + /* store the length of all substrings */ + for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) { + SA[m + ((i + 1) >> 1)] = j - i - 1; + j = i + 1; + c = 0; + } + } + /* find the lexicographic names of all substrings */ + for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { + p = SA[i], plen = SA[m + (p >> 1)], diff = 1; + if (plen == qlen) { + for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++); + if (j == plen) diff = 0; + } + if (diff != 0) ++name, q = p, qlen = plen; + SA[m + (p >> 1)] = name; + } + + /* stage 2: solve the reduced problem recurse if names are not yet + * unique */ + if (name < m) { + RA = SA + n + fs - m; + for (i = n - 1, j = m - 1; m <= i; --i) { + if (SA[i] != 0) RA[j--] = SA[i] - 1; + } + if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2; + for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { + if ((c0 = chr(i)) < (c1 + c)) c = 1; + else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */ + } + for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */ + } + /* stage 3: induce the result for the 
original problem */ + if (k <= fs) { + C = SA + n; + B = (k <= (fs - k)) ? C + k : C; + } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; + /* put all left-most S characters into their buckets */ + getCounts(T, C, n, k, cs); + getBuckets(C, B, k, 1); /* find ends of buckets */ + for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */ + for (i = m - 1; 0 <= i; --i) { + j = SA[i], SA[i] = 0; + SA[--B[chr(j)]] = j; + } + induceSA(T, SA, C, B, n, k, cs); + if (fs < k) free(C); + return 0; +} + +/** + * Constructs the suffix array of a given string. + * @param T[0..n-1] The input string. + * @param SA[0..n] The output array of suffixes. + * @param n The length of the given string. + * @return 0 if no error occurred + */ +int is_sa(const ubyte_t *T, int *SA, int n) +{ + if ((T == NULL) || (SA == NULL) || (n < 0)) return -1; + SA[0] = n; + if (n <= 1) { + if (n == 1) SA[1] = 0; + return 0; + } + return sais_main(T, SA+1, 0, n, 256, 1); +} + +/** + * Constructs the burrows-wheeler transformed string of a given string. + * @param T[0..n-1] The input string. + * @param n The length of the given string. + * @return The primary index if no error occurred, -1 or -2 otherwise. 
+ */ +int is_bwt(ubyte_t *T, int n) +{ + int *SA, i, primary = 0; + SA = (int*)calloc(n+1, sizeof(int)); + is_sa(T, SA, n); + + for (i = 0; i <= n; ++i) { + if (SA[i] == 0) primary = i; + else SA[i] = T[SA[i] - 1]; + } + for (i = 0; i < primary; ++i) T[i] = SA[i]; + for (; i < n; ++i) T[i] = SA[i + 1]; + free(SA); + return primary; +} diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..de6be6d --- /dev/null +++ b/khash.h @@ -0,0 +1,506 @@ +/* The MIT License + + Copyright (c) 2008, 2009 by attractor + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
+ + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.4" + +#include +#include +#include + +/* compipler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define inline __inline +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + static inline kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + static inline void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + static 
inline void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i = k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + 
__ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return 
x; \ + } \ + static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. 
+ @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*!
@function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More convenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*!
@function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/kseq.h b/kseq.h new file mode 100644 index 0000000..ad8937c --- /dev/null +++ b/kseq.h @@ -0,0 +1,208 @@ +/* The MIT License + + Copyright (c) 2008, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software.
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) return -1; \ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { \ + if (dret) *dret = 0; \ + str->l = 0; \ + if (ks->begin 
>= ks->end && ks->is_eof) return -1; \ + for (;;) { \ + int i; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end < __bufsize) ks->is_eof = 1; \ + if (ks->end == 0) break; \ + } else break; \ + } \ + if (delimiter) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } \ + if (str->m - str->l < i - ks->begin + 1) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + str->s[str->l] = '\0'; \ + return str->l; \ + } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define __KSEQ_BASIC(type_t) \ + static inline kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + static inline void kseq_rewind(kseq_t *ks) \ + { \ + ks->last_char = 0; \ + ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ + } \ + static inline void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + */ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + 
seq->last_char = c; \ + } /* the first header char has been read */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (isgraph(c)) { /* printable non-space character */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l++] = (char)c; \ + } \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* we should not stop here */ \ + while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ + if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ + seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT(type_t, __read) \ + KSTREAM_INIT(type_t, __read, 4096) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(type_t) \ + __KSEQ_READ + +#endif diff --git a/ksort.h b/ksort.h new file mode 100644 index 0000000..52812e1 --- /dev/null +++ b/ksort.h @@ -0,0 +1,269 @@ +/* The MIT License + + Copyright (c) 2008, 
by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + 2008-11-16 (0.1.4): + + * Fixed a bug in introsort() that happens in rare cases. + + 2008-11-05 (0.1.3): + + * Fixed a bug in introsort() for complex comparisons. + + * Fixed a bug in mergesort(). The previous version is not stable. + + 2008-09-15 (0.1.2): + + * Accelerated introsort. On my Mac (not on another Linux machine), + my implementation is as fast as std::sort on random input. + + * Added combsort and in introsort, switch to combsort if the + recursion is too deep. 
+ + 2008-09-13 (0.1.1): + + * Added k-small algorithm + + 2008-09-05 (0.1.0): + + * Initial version + +*/ + +#ifndef AC_KSORT_H +#define AC_KSORT_H + +#include +#include + +typedef struct { + void *left, *right; + int depth; +} ks_isort_stack_t; + +#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } + +#define KSORT_INIT(name, type_t, __sort_lt) \ + void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ + { \ + type_t *a2[2], *a, *b; \ + int curr, shift; \ + \ + a2[0] = array; \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ + ks_heapadjust_##name(i, lsize, l); \ + } \ + void ks_heapsort_##name(size_t lsize, type_t l[]) \ + { \ + size_t i; \ + for (i = lsize - 1; i > 0; --i) { \ + type_t tmp; \ + tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ + } \ + } \ + inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + { \ + type_t *i, *j, swap_tmp; \ + for (i = s + 1; i < t; ++i) \ + for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ + swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ + } \ + } \ + void ks_combsort_##name(size_t n, type_t a[]) \ + { \ + const double shrink_factor = 1.2473309501039786540366528676643; \ + int do_swap; \ + size_t gap = n; \ + type_t tmp, *i, *j; \ + do { \ + if (gap > 2) { \ + gap = (size_t)(gap / shrink_factor); \ + if (gap == 9 || gap == 10) gap = 11; \ + } \ + do_swap = 0; \ + for (i = a; i < a + n - gap; ++i) { \ + j = i + gap; \ + if (__sort_lt(*j, *i)) { \ + tmp = *i; *i = *j; *j = tmp; \ + do_swap = 1; \ + } \ + } \ + } while (do_swap || gap > 2); \ + if (gap != 1) __ks_insertsort_##name(a, a + n); \ + } \ + void ks_introsort_##name(size_t n, type_t a[]) \ + { \ + int d; \ + ks_isort_stack_t *top, *stack; \ + type_t rp, swap_tmp; \ + type_t *s, *t, *i, *j, *k; \ + \ + if (n < 1) return; \ + else if (n == 2) { \ + if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 
+ return; \ + } \ + for (d = 2; 1ul<>1) + 1; \ + if (__sort_lt(*k, *i)) { \ + if (__sort_lt(*k, *j)) k = j; \ + } else k = __sort_lt(*j, *i)? i : j; \ + rp = *k; \ + if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ + for (;;) { \ + do ++i; while (__sort_lt(*i, rp)); \ + do --j; while (i <= j && __sort_lt(rp, *j)); \ + if (j <= i) break; \ + swap_tmp = *i; *i = *j; *j = swap_tmp; \ + } \ + swap_tmp = *i; *i = *t; *t = swap_tmp; \ + if (i-s > t-i) { \ + if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ + s = t-i > 16? i+1 : t; \ + } else { \ + if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ + t = i-s > 16? i-1 : s; \ + } \ + } else { \ + if (top == stack) { \ + free(stack); \ + __ks_insertsort_##name(a, a+n); \ + return; \ + } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ + } \ + } \ + } \ + /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ + /* 0 <= kk < n */ \ + type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ + { \ + type_t *low, *high, *k, *ll, *hh, *mid; \ + low = arr; high = arr + n - 1; k = arr + kk; \ + for (;;) { \ + if (high <= low) return *k; \ + if (high == low + 1) { \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + return *k; \ + } \ + mid = low + (high - low) / 2; \ + if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ + if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ + if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ + KSORT_SWAP(type_t, *mid, *(low+1)); \ + ll = low + 1; hh = high; \ + for (;;) { \ + do ++ll; while (__sort_lt(*ll, *low)); \ + do --hh; while (__sort_lt(*low, *hh)); \ + if (hh < ll) break; \ + KSORT_SWAP(type_t, *ll, *hh); \ + } \ + KSORT_SWAP(type_t, *low, *hh); \ + if (hh <= k) low = ll; \ + if (hh >= k) high = hh - 1; \ + } \ + } + +#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) +#define ks_introsort(name, n, a) 
ks_introsort_##name(n, a) +#define ks_combsort(name, n, a) ks_combsort_##name(n, a) +#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) +#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) +#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) +#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) + +#define ks_lt_generic(a, b) ((a) < (b)) +#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) + +typedef const char *ksstr_t; + +#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) +#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) + +#endif diff --git a/kstring.c b/kstring.c new file mode 100644 index 0000000..de06552 --- /dev/null +++ b/kstring.c @@ -0,0 +1,35 @@ +#include +#include +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +#ifdef KSTRING_MAIN +#include +int main() +{ + kstring_t *s; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + ksprintf(s, "abcdefg: %d", 100); + printf("%s\n", s->s); + free(s); + return 0; +} +#endif diff --git a/kstring.h b/kstring.h new file mode 100644 index 0000000..398901f --- /dev/null +++ b/kstring.h @@ -0,0 +1,46 @@ +#ifndef KSTRING_H +#define KSTRING_H + +#include +#include + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +static inline int kputs(const char *p, kstring_t *s) +{ + int l = strlen(p); + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, 
s->m); + } + strcpy(s->s + s->l, p); + s->l += l; + return l; +} + +static inline int kputc(int c, kstring_t *s) +{ + if (s->l + 1 >= s->m) { + s->m = s->l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + s->s[s->l++] = c; + s->s[s->l] = 0; + return c; +} + +int ksprintf(kstring_t *s, const char *fmt, ...); + +#endif diff --git a/kvec.h b/kvec.h new file mode 100644 index 0000000..57204d6 --- /dev/null +++ b/kvec.h @@ -0,0 +1,90 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. 
+ +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) + +#define kv_copy(type, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, v, x) do { \ + if ((v).n == (v).m) { \ + (v).m = (v).m? (v).m<<1 : 2; \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + } \ + (v).a[(v).n++] = (x); \ + } while (0) + +#define kv_pushp(type, v) (((v).n == (v).m)? \ + ((v).m = ((v).m? (v).m<<1 : 2), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : 0), ((v).a + ((v).n++)) + +#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ + ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : (v).n <= (size_t)(i)? 
(v).n = (i) \ + : 0), (v).a[(i)] + +#endif diff --git a/main.c b/main.c new file mode 100644 index 0000000..f99255b --- /dev/null +++ b/main.c @@ -0,0 +1,58 @@ +#include +#include +#include "main.h" + +#ifndef PACKAGE_VERSION +#define PACKAGE_VERSION "0.5.9rc1 (r1561)" +#endif + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Contact: Heng Li \n\n"); + fprintf(stderr, "Usage: bwa [options]\n\n"); + fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " aln gapped/ungapped alignment\n"); + fprintf(stderr, " samse generate alignment (single ended)\n"); + fprintf(stderr, " sampe generate alignment (paired ended)\n"); + fprintf(stderr, " bwasw BWA-SW for long queries\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); + fprintf(stderr, " pac2bwt generate BWT from PAC\n"); + fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); + fprintf(stderr, " bwtupdate update .bwt to the new format\n"); + fprintf(stderr, " pac_rev generate reverse PAC\n"); + fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); + fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); + fprintf(stderr, " stdsw standard SW/NW alignment\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) return usage(); + if (strcmp(argv[1], "fa2pac") == 0) return bwa_fa2pac(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwt") == 0) return bwa_pac2bwt(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwtgen") == 0) return bwt_bwtgen_main(argc-1, argv+1); + else if (strcmp(argv[1], "bwtupdate") == 0) return bwa_bwtupdate(argc-1, argv+1); + else if (strcmp(argv[1], "pac_rev") == 0) return bwa_pac_rev(argc-1, argv+1); + else if (strcmp(argv[1], "bwt2sa") == 0) return bwa_bwt2sa(argc-1, argv+1); + else if 
(strcmp(argv[1], "index") == 0) return bwa_index(argc-1, argv+1); + else if (strcmp(argv[1], "aln") == 0) return bwa_aln(argc-1, argv+1); + else if (strcmp(argv[1], "sw") == 0) return bwa_stdsw(argc-1, argv+1); + else if (strcmp(argv[1], "samse") == 0) return bwa_sai2sam_se(argc-1, argv+1); + else if (strcmp(argv[1], "sampe") == 0) return bwa_sai2sam_pe(argc-1, argv+1); + else if (strcmp(argv[1], "pac2cspac") == 0) return bwa_pac2cspac(argc-1, argv+1); + else if (strcmp(argv[1], "stdsw") == 0) return bwa_stdsw(argc-1, argv+1); + else if (strcmp(argv[1], "bwtsw2") == 0) return bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "dbwtsw") == 0) return bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "bwasw") == 0) return bwa_bwtsw2(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); + return 1; + } + return 0; +} diff --git a/main.h b/main.h new file mode 100644 index 0000000..5e7697a --- /dev/null +++ b/main.h @@ -0,0 +1,29 @@ +#ifndef BWA_MAIN_H +#define BWA_MAIN_H + +#ifdef __cplusplus +extern "C" { +#endif + + int bwa_fa2pac(int argc, char *argv[]); + int bwa_pac_rev(int argc, char *argv[]); + int bwa_pac2cspac(int argc, char *argv[]); + int bwa_pac2bwt(int argc, char *argv[]); + int bwa_bwtupdate(int argc, char *argv[]); + int bwa_bwt2sa(int argc, char *argv[]); + int bwa_index(int argc, char *argv[]); + int bwa_aln(int argc, char *argv[]); + int bwt_bwtgen_main(int argc, char *argv[]); + + int bwa_sai2sam_se(int argc, char *argv[]); + int bwa_sai2sam_pe(int argc, char *argv[]); + + int bwa_stdsw(int argc, char *argv[]); + + int bwa_bwtsw2(int argc, char *argv[]); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qualfa2fq.pl b/qualfa2fq.pl new file mode 100755 index 0000000..31e1974 --- /dev/null +++ b/qualfa2fq.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +die("Usage: qualfa2fq.pl \n") if (@ARGV != 2); + +my ($fhs, $fhq, $q); +open($fhs, ($ARGV[0] =~ /\.gz$/)? 
"gzip -dc $ARGV[0] |" : $ARGV[0]) || die; +open($fhq, ($ARGV[1] =~ /\.gz$/)? "gzip -dc $ARGV[1] |" : $ARGV[1]) || die; + +$/ = ">"; <$fhs>; <$fhq>; $/ = "\n"; +while (<$fhs>) { + $q = <$fhq>; + print "\@$_"; + $/ = ">"; + $_ = <$fhs>; $q = <$fhq>; + chomp; chomp($q); + $q =~ s/\s*(\d+)\s*/chr($1+33)/eg; + print $_, "+\n"; + for (my $i = 0; $i < length($q); $i += 60) { + print substr($q, $i, 60), "\n"; + } + $/ = "\n"; +} + +close($fhs); close($fhq); diff --git a/simple_dp.c b/simple_dp.c new file mode 100644 index 0000000..7c078c2 --- /dev/null +++ b/simple_dp.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include +#include +#include "stdaln.h" +#include "utils.h" + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +typedef struct { + int l; + unsigned char *s; + char *n; +} seq1_t; + +typedef struct { + int n_seqs, m_seqs; + seq1_t *seqs; +} seqs_t; + +unsigned char aln_rev_table[256] = { + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', + 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', + 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N', + 'N','N','y','s', 'a','N','b','w', 'x','r','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', + 'N','N','N','N', 
'N','N','N','N', 'N','N','N','N', 'N','N','N','N' +}; + +static int g_is_global = 0, g_thres = 1, g_strand = 0, g_aa = 0; +static AlnParam g_aln_param; + +static void revseq(int len, uint8_t *seq) +{ + int i; + for (i = 0; i < len>>1; ++i) { + uint8_t tmp = aln_rev_table[seq[len-1-i]]; + seq[len-1-i] = aln_rev_table[seq[i]]; + seq[i] = tmp; + } + if (len&1) seq[i] = aln_rev_table[seq[i]]; +} + +static seqs_t *load_seqs(const char *fn) +{ + seqs_t *s; + seq1_t *p; + gzFile fp; + int l; + kseq_t *seq; + + fp = xzopen(fn, "r"); + seq = kseq_init(fp); + s = (seqs_t*)calloc(1, sizeof(seqs_t)); + s->m_seqs = 256; + s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); + while ((l = kseq_read(seq)) >= 0) { + if (s->n_seqs == s->m_seqs) { + s->m_seqs <<= 1; + s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); + } + p = s->seqs + (s->n_seqs++); + p->l = seq->seq.l; + p->s = (unsigned char*)malloc(p->l + 1); + memcpy(p->s, seq->seq.s, p->l); + p->s[p->l] = 0; + p->n = strdup((const char*)seq->name.s); + } + kseq_destroy(seq); + gzclose(fp); + fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); + return s; +} + +static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand) +{ + int i; + for (i = 0; i < ss->n_seqs; ++i) { + AlnAln *aa; + seq1_t *p = ss->seqs + i; + g_aln_param.band_width = l + p->l; + aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l); + if (aa->score >= g_thres || g_is_global) { + printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand, + aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo); + // NB: I put the short sequence as the first sequence in SW, an insertion to + // the reference becomes a deletion from the short sequence. Therefore, I use + // "MDI" here rather than "MID", and print ->out2 first rather than ->out1. 
+ for (i = 0; i != aa->n_cigar; ++i) + printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]); + printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1); + } + aln_free_AlnAln(aa); + } +} + +static void aln_seqs(const seqs_t *ss, const char *fn) +{ + gzFile fp; + kseq_t *seq; + int l; + + fp = xzopen(fn, "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+'); + if (g_strand&2) { + revseq(l, (uint8_t*)seq->seq.s); + aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-'); + } + } + kseq_destroy(seq); + gzclose(fp); +} + +int bwa_stdsw(int argc, char *argv[]) +{ + int c; + seqs_t *ss; + + while ((c = getopt(argc, argv, "gT:frp")) >= 0) { + switch (c) { + case 'g': g_is_global = 1; break; + case 'T': g_thres = atoi(optarg); break; + case 'f': g_strand |= 1; break; + case 'r': g_strand |= 2; break; + case 'p': g_aa = 1; break; + } + } + if (g_strand == 0) g_strand = 3; + if (g_aa) g_strand = 1; + if (optind + 1 >= argc) { + fprintf(stderr, "\nUsage: bwa stdsw [options] \n\n"); + fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres); + fprintf(stderr, " -p protein alignment (suppressing -r)\n"); + fprintf(stderr, " -f forward strand only\n"); + fprintf(stderr, " -r reverse strand only\n"); + fprintf(stderr, " -g global alignment\n\n"); + fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n"); + fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n"); + fprintf(stderr, " sequence.\n\n"); + return 1; + } + g_aln_param = g_aa? 
aln_param_aa2aa : aln_param_blast; + g_aln_param.gap_end = 0; + ss = load_seqs(argv[optind]); + aln_seqs(ss, argv[optind+1]); + return 0; +} diff --git a/solid2fastq.pl b/solid2fastq.pl new file mode 100755 index 0000000..c60ad81 --- /dev/null +++ b/solid2fastq.pl @@ -0,0 +1,111 @@ +#!/usr/bin/perl -w + +# Author: lh3 +# Note: Ideally, this script should be written in C. It is a bit slow at present. +# Also note that this script is different from the one contained in MAQ. + +use strict; +use warnings; +use Getopt::Std; + +my %opts; +my $version = '0.1.4'; +my $usage = qq{ +Usage: solid2fastq.pl + +Note: is the string showed in the `# Title:' line of a + ".csfasta" read file. Then F3.csfasta is read sequence + file and F3_QV.qual is the quality file. If + R3.csfasta is present, this script assumes reads are + paired; otherwise reads will be regarded as single-end. + + The read name will be :panel_x_y/[12] with `1' for R3 + tag and `2' for F3. Usually you may want to use short + to save diskspace. Long also causes troubles to maq. + +}; + +getopts('', \%opts); +die($usage) if (@ARGV != 2); +my ($title, $pre) = @ARGV; +my (@fhr, @fhw); +my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual'); +my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0; +if ($is_paired) { # paired end + for (0 .. 
3) { + my $fn = "$title$fn_suff[$_]"; + $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); + open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); + } + open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo + open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die; + open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; + my (@df, @dr); + @df = &read1(1); @dr = &read1(2); + while (@df && @dr) { + if ($df[0] eq $dr[0]) { # mate pair + print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; + @df = &read1(1); @dr = &read1(2); + } else { + if ($df[0] le $dr[0]) { + print {$fhw[2]} $df[1]; + @df = &read1(1); + } else { + print {$fhw[2]} $dr[1]; + @dr = &read1(2); + } + } + } + if (@df) { + print {$fhw[2]} $df[1]; + while (@df = &read1(1, $fhr[0], $fhr[1])) { + print {$fhw[2]} $df[1]; + } + } + if (@dr) { + print {$fhw[2]} $dr[1]; + while (@dr = &read1(2, $fhr[2], $fhr[3])) { + print {$fhw[2]} $dr[1]; + } + } + close($fhr[$_]) for (0 .. $#fhr); + close($fhw[$_]) for (0 .. $#fhw); +} else { # single end + for (0 .. 1) { + my $fn = "$title$fn_suff[$_]"; + $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); + open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); + } + open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; + my @df; + while (@df = &read1(1, $fhr[0], $fhr[1])) { + print {$fhw[2]} $df[1]; + } + close($fhr[$_]) for (0 .. $#fhr); + close($fhw[2]); +} + +sub read1 { + my $i = shift(@_); + my $j = ($i-1)<<1; + my ($key, $seq); + my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); + while (<$fhs>) { + my $t = <$fhq>; + if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { + $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines + die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t); + my $name = "$pre:$1_$2_$3/$i"; + $_ = substr(<$fhs>, 2); + tr/0123./ACGTN/; + my $s = $_; + $_ = <$fhq>; + s/-1\b/0/eg; + s/^(\d+)\s*//; + s/(\d+)\s*/chr($1+33)/eg; + $seq = qq/\@$name\n$s+\n$_\n/; + last; + } + } + return defined($seq)? 
($key, $seq) : (); +} diff --git a/stdaln.c b/stdaln.c new file mode 100644 index 0000000..7b55b2e --- /dev/null +++ b/stdaln.c @@ -0,0 +1,1072 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#include +#include +#include +#include +#include "stdaln.h" + +/* char -> 17 (=16+1) nucleotides */ +unsigned char aln_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, + 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, + 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, + 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; +char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-"; + +/* char -> 5 (=4+1) nucleotides */ +unsigned char aln_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; +char *aln_nt4_rev_table = "AGCTN-"; + +/* 
char -> 22 (=20+1+1) amino acids */ +unsigned char aln_aa_table[256] = { + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, + 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, + 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, + 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, + 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21 +}; +char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-"; + /* 01234567890123456789012 */ + +/* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. 
*/ +unsigned char aln_trans_table_eu[66] = { + 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9, + 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19, + 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10, + 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22 +}; +char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX"; + /* 01234567890123456789012345678901234567890123456789012345678901234 */ +int aln_sm_blosum62[] = { +/* A R N D C Q E G H I L K M F P S T W Y V * X */ + 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, + -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, + -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, + -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, + 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, + -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, + -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, + 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, + -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, + -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, + -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, + -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, + -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, + -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, + -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, + 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, + 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, + -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, + -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, + 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, + -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, + 
0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 +}; + +int aln_sm_blosum45[] = { +/* A R N D C Q E G H I L K M F P S T W Y V * X */ + 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0, + -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1, + -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1, + -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1, + -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2, + -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1, + -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1, + 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1, + -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1, + -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1, + -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1, + -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1, + -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1, + -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1, + -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1, + 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0, + 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0, + -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2, + -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1, + 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1, + -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5, + 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1 +}; + +int aln_sm_nt[] = { +/* X A G R C M S V T W K D Y H B N */ + -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, + -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0, + -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0, + -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0, + -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0, + -2, 1,-2,-1, 1, 
1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0, + -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0, + -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, + -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0, + -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0, + -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0, + -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0, + -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +int aln_sm_read[] = { +/* X A G R C M S V T W K D Y H B N */ + -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, + -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0, + -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0, + -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, + -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0, + -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, + -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0, + -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, + -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0, + -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0, + -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0, + -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0, + -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +int aln_sm_hs[] = { +/* A G C T N */ + 91, -31,-114,-123, -44, + -31, 100,-125,-114, -42, + -123,-125, 100, -31, -42, + -114,-114, -31, 91, -42, + -44, -42, -42, -42, -43 +}; + +int aln_sm_maq[] = { + 11, -19, -19, -19, -13, + -19, 11, -19, -19, -13, + -19, -19, 11, -19, -13, + -19, -19, -19, 11, -13, + -13, -13, -13, -13, -13 +}; + +int aln_sm_blast[] = { + 1, -3, -3, -3, -2, + -3, 1, -3, -3, -2, + -3, -3, 1, -3, -2, + -3, -3, -3, 1, -2, + -2, -2, -2, -2, 
-2 +}; + +/********************/ +/* START OF align.c */ +/********************/ + +AlnParam aln_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; +AlnParam aln_param_bwa = { 26, 9, 5, aln_sm_maq, 5, 50 }; +AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 }; +AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 }; +AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; + +AlnAln *aln_init_AlnAln() +{ + AlnAln *aa; + aa = (AlnAln*)malloc(sizeof(AlnAln)); + aa->path = 0; + aa->out1 = aa->out2 = aa->outm = 0; + aa->path_len = 0; + return aa; +} +void aln_free_AlnAln(AlnAln *aa) +{ + free(aa->path); free(aa->cigar32); + free(aa->out1); free(aa->out2); free(aa->outm); + free(aa); +} + +/***************************/ +/* START OF common_align.c */ +/***************************/ + +#define LOCAL_OVERFLOW_THRESHOLD 32000 +#define LOCAL_OVERFLOW_REDUCE 16000 +#define NT_LOCAL_SCORE int +#define NT_LOCAL_SHIFT 16 +#define NT_LOCAL_MASK 0xffff + +#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; + +#define set_M(MM, cur, p, sc) \ +{ \ + if ((p)->M >= (p)->I) { \ + if ((p)->M >= (p)->D) { \ + (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } else { \ + if ((p)->I > (p)->D) { \ + (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } \ +} +#define set_I(II, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_ext; \ + } \ +} +#define set_end_I(II, cur, p) \ +{ \ + if (gap_end >= 0) { \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_end; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_end; \ + } \ + } else set_I(II, cur, p); \ +} +#define set_D(DD, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - 
gap_ext; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_ext; \ + } \ +} +#define set_end_D(DD, cur, p) \ +{ \ + if (gap_end >= 0) { \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - gap_end; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_end; \ + } \ + } else set_D(DD, cur, p); \ +} + +typedef struct +{ + unsigned char Mt:3, It:2, Dt:2; +} dpcell_t; + +typedef struct +{ + int M, I, D; +} dpscore_t; + +/* build score profile for accelerating alignment, in theory */ +void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array) +{ + int *tmp, *tmp2, i, k; + for (i = 0; i != row; ++i) { + tmp = score_matrix + i * row; + tmp2 = s_array[i]; + for (k = 0; k != len; ++k) + tmp2[k] = tmp[seq[k]]; + } +} +/*************************** + * banded global alignment * + ***************************/ +int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len) +{ + register int i, j; + dpcell_t **dpcell, *q; + dpscore_t *curr, *last, *s; + path_t *p; + int b1, b2, tmp_end; + int *mat, end, max; + unsigned char type, ctype; + + int gap_open, gap_ext, gap_end, b; + int *score_matrix, N_MATRIX_ROW; + + /* initialize some align-related parameters. just for compatibility */ + gap_open = ap->gap_open; + gap_ext = ap->gap_ext; + gap_end = ap->gap_end; + b = ap->band_width; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + + if (len1 == 0 || len2 == 0) { + *path_len = 0; + return 0; + } + /* calculate b1 and b2 */ + if (len1 > len2) { + b1 = len1 - len2 + b; + b2 = b; + } else { + b1 = b; + b2 = len2 - len1 + b; + } + if (b1 > len1) b1 = len1; + if (b2 > len2) b2 = len2; + --seq1; --seq2; + + /* allocate memory */ + end = (b1 + b2 <= len1)? 
(b1 + b2 + 1) : (len1 + 1); + dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); + for (j = 0; j <= len2; ++j) + dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] -= j - b2; + curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + + /* set first row */ + SET_INF(*curr); curr->M = 0; + for (i = 1, s = curr + 1; i < b1; ++i, ++s) { + SET_INF(*s); + set_end_D(s->D, dpcell[0] + i, s - 1); + } + s = curr; curr = last; last = s; + + /* core dynamic programming, part 1 */ + tmp_end = (b2 < len2)? b2 : len2 - 1; + for (j = 1; j <= tmp_end; ++j) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + /* last row for part 1, use set_end_D() instead of set_D() */ + if (j == len2 && b2 != len2 - 1) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! 
*/ + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_end_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + ++j; + } + + /* core dynamic programming, part 2 */ + for (; j <= len2 - b2 + 1; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + end = j + b1 - 1; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + + /* core dynamic programming, part 3 */ + for (; j < len2; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + /* last row */ + if (j == len2) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + + /* backtrace */ + i = len1; j = len2; + q = dpcell[j] + i; + s = last + len1; + max = s->M; type = q->Mt; ctype = FROM_M; + if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } + if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } + + p = path; + p->ctype = ctype; p->i = i; 
p->j = j; /* bug fixed 040408 */ + ++p; + do { + switch (ctype) { + case FROM_M: --i; --j; break; + case FROM_I: --j; break; + case FROM_D: --i; break; + } + q = dpcell[j] + i; + ctype = type; + switch (type) { + case FROM_M: type = q->Mt; break; + case FROM_I: type = q->It; break; + case FROM_D: type = q->Dt; break; + } + p->ctype = ctype; p->i = i; p->j = j; + ++p; + } while (i || j); + *path_len = p - path - 1; + + /* free memory */ + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] += j - b2; + for (j = 0; j <= len2; ++j) + free(dpcell[j]); + free(dpcell); + free(curr); free(last); + + return max; +} +/************************************************* + * local alignment combined with banded strategy * + *************************************************/ +int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int _thres, int *_subo) +{ + register NT_LOCAL_SCORE *s; + register int i; + int q, r, qr, tmp_len, qr_shift; + int **s_array, *score_array; + int e, f; + int is_overflow, of_base; + NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h; + int j, start_i, start_j, end_i, end_j; + path_t *p; + int score_f, score_r, score_g; + int start, end, max_score; + int thres, *suba, *ss; + + int gap_open, gap_ext, b; + int *score_matrix, N_MATRIX_ROW; + + /* initialize some align-related parameters. just for compatibility */ + gap_open = ap->gap_open; + gap_ext = ap->gap_ext; + b = ap->band_width; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + thres = _thres > 0? 
_thres : -_thres; + + if (len1 == 0 || len2 == 0) return -1; + + /* allocate memory */ + suba = (int*)malloc(sizeof(int) * (len2 + 1)); + eh = (NT_LOCAL_SCORE*)malloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); + s_array = (int**)malloc(sizeof(int*) * N_MATRIX_ROW); + for (i = 0; i != N_MATRIX_ROW; ++i) + s_array[i] = (int*)malloc(sizeof(int) * len1); + /* initialization */ + aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); + q = gap_open; + r = gap_ext; + qr = q + r; + qr_shift = (qr+1) << NT_LOCAL_SHIFT; + tmp_len = len1 + 1; + start_i = start_j = end_i = end_j = 0; + for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i) + if (max_score < score_matrix[i]) max_score = score_matrix[i]; + /* convert the coordinate */ + --seq1; --seq2; + for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; + + /* forward dynamic programming */ + for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0; + score_f = 0; + is_overflow = of_base = 0; + suba[0] = 0; + for (j = 1, ss = suba + 1; j <= len2; ++j, ++ss) { + int subo = 0; + last_h = f = 0; + score_array = s_array[seq2[j]]; + if (is_overflow) { /* adjust eh[] array if overflow occurs. */ + /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed. + * If it is too large, this block will be excuted frequently and therefore + * slow down the whole program. + * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the + * number of assignments because it sets some cells to zero when overflow + * happens. 
*/ + int tmp, tmp2; + score_f -= LOCAL_OVERFLOW_REDUCE; + of_base += LOCAL_OVERFLOW_REDUCE; + is_overflow = 0; + for (i = 1, s = eh; i <= tmp_len; ++i, ++s) { + tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; + if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; + else tmp2 -= LOCAL_OVERFLOW_REDUCE; + if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; + else tmp -= LOCAL_OVERFLOW_REDUCE; + *s = (tmp << NT_LOCAL_SHIFT) | tmp2; + } + } + for (i = 1, s = eh; i != tmp_len; ++i, ++s) { + /* prepare for calculate current h */ + curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; + if (curr_h < 0) curr_h = 0; + if (last_h > 0) { /* initialize f */ + f = (f > last_h - q)? f - r : last_h - qr; + if (curr_h < f) curr_h = f; + } + if (*(s+1) >= qr_shift) { /* initialize e */ + curr_last_h = *(s+1) >> NT_LOCAL_SHIFT; + e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; + if (curr_h < e) curr_h = e; + *s = (last_h << NT_LOCAL_SHIFT) | e; + } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ + last_h = curr_h; + if (subo < curr_h) subo = curr_h; + if (score_f < curr_h) { + score_f = curr_h; end_i = i; end_j = j; + if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; + } + } + *s = last_h << NT_LOCAL_SHIFT; + *ss = subo + of_base; + } + score_f += of_base; + + if (score_f < thres) { /* no matching residue at all, 090218 */ + *path_len = 0; + goto end_func; + } + if (path == 0) goto end_func; /* skip path-filling */ + + /* reverse dynamic programming */ + for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0; + if (end_i == 0 || end_j == 0) goto end_func; /* no local match */ + score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]]; + is_overflow = of_base = 0; + start_i = end_i; start_j = end_j; + eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */ + start = end_i - 1; + end = end_i - 3; + if (end <= 0) end = 0; + + /* second pass DP can be done in a band, speed will thus be 
enhanced */ + for (j = end_j - 1; j != 0; --j) { + last_h = f = 0; + score_array = s_array[seq2[j]]; + if (is_overflow) { /* adjust eh[] array if overflow occurs. */ + int tmp, tmp2; + score_r -= LOCAL_OVERFLOW_REDUCE; + of_base += LOCAL_OVERFLOW_REDUCE; + is_overflow = 0; + for (i = start, s = eh + start + 1; i >= end; --i, --s) { + tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; + if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; + else tmp2 -= LOCAL_OVERFLOW_REDUCE; + if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; + else tmp -= LOCAL_OVERFLOW_REDUCE; + *s = (tmp << NT_LOCAL_SHIFT) | tmp2; + } + } + for (i = start, s = eh + start + 1; i != end; --i, --s) { + /* prepare for calculate current h */ + curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; + if (curr_h < 0) curr_h = 0; + if (last_h > 0) { /* initialize f */ + f = (f > last_h - q)? f - r : last_h - qr; + if (curr_h < f) curr_h = f; + } + curr_last_h = *(s-1) >> NT_LOCAL_SHIFT; + e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; + if (e < 0) e = 0; + if (curr_h < e) curr_h = e; + *s = (last_h << NT_LOCAL_SHIFT) | e; + last_h = curr_h; + if (score_r < curr_h) { + score_r = curr_h; start_i = i; start_j = j; + if (score_r + of_base - qr == score_f) { + j = 1; break; + } + if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; + } + } + *s = last_h << NT_LOCAL_SHIFT; + /* recalculate start and end, the boundaries of the band */ + if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start; + if (start <= 0) start = 0; + end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1; + if (end <= 0) end = 0; + } + + if (_subo) { + int tmp2 = 0, tmp = (int)(start_j - .33 * (end_j - start_j) + .499); + for (j = 1; j <= tmp; ++j) + if (tmp2 < suba[j]) tmp2 = suba[j]; + tmp = (int)(end_j + .33 * (end_j - start_j) + .499); + for (j = tmp; j <= len2; ++j) + if (tmp2 < suba[j]) tmp2 = suba[j]; + *_subo = tmp2; + } + + if (path_len == 0) { + path[0].i = start_i; 
path[0].j = start_j; + path[1].i = end_i; path[1].j = end_j; + goto end_func; + } + + score_r += of_base; + score_r -= qr; + +#ifdef DEBUG + /* this seems not a bug */ + if (score_f != score_r) + fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r); +#endif + + if (_thres > 0) { /* call global alignment to fill the path */ + score_g = 0; + j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j; + ++j; /* j is the maximum band_width */ + for (i = ap->band_width;; i <<= 1) { + AlnParam ap_real = *ap; + ap_real.gap_end = -1; + ap_real.band_width = i; + score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j, + end_j - start_j + 1, &ap_real, path, path_len); + if (score_g == score_r || score_f == score_g) break; + if (i > j) break; + } + if (score_r > score_g && score_f > score_g) { + fprintf(stderr, "[aln_local_core] Potential bug: (%d,%d) > %d\n", score_f, score_r, score_g); + score_f = score_r = -1; + } else score_f = score_g; + + /* convert coordinate */ + for (p = path + *path_len - 1; p >= path; --p) { + p->i += start_i - 1; + p->j += start_j - 1; + } + } else { /* just store the start and end */ + *path_len = 2; + path[1].i = start_i; path[1].j = start_j; + path->i = end_i; path->j = end_j; + } + +end_func: + /* free */ + free(eh); free(suba); + for (i = 0; i != N_MATRIX_ROW; ++i) { + ++s_array[i]; + free(s_array[i]); + } + free(s_array); + return score_f; +} +AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, + int type, int thres, int len1, int len2) +{ + unsigned char *seq11, *seq22; + int score; + int i, j, l; + path_t *p; + char *out1, *out2, *outm; + AlnAln *aa; + + if (len1 < 0) len1 = strlen(seq1); + if (len2 < 0) len2 = strlen(seq2); + + aa = aln_init_AlnAln(); + seq11 = (unsigned char*)malloc(sizeof(unsigned char) * len1); + seq22 = (unsigned char*)malloc(sizeof(unsigned char) * len2); + aa->path = (path_t*)malloc(sizeof(path_t) * 
(len1 + len2 + 1)); + + if (ap->row < 10) { /* 4-nucleotide alignment */ + for (i = 0; i < len1; ++i) + seq11[i] = aln_nt4_table[(int)seq1[i]]; + for (j = 0; j < len2; ++j) + seq22[j] = aln_nt4_table[(int)seq2[j]]; + } else if (ap->row < 20) { /* 16-nucleotide alignment */ + for (i = 0; i < len1; ++i) + seq11[i] = aln_nt16_table[(int)seq1[i]]; + for (j = 0; j < len2; ++j) + seq22[j] = aln_nt16_table[(int)seq2[j]]; + } else { /* amino acids */ + for (i = 0; i < len1; ++i) + seq11[i] = aln_aa_table[(int)seq1[i]]; + for (j = 0; j < len2; ++j) + seq22[j] = aln_aa_table[(int)seq2[j]]; + } + + if (type == ALN_TYPE_GLOBAL) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len); + else if (type == ALN_TYPE_LOCAL) score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, thres, &aa->subo); + else if (type == ALN_TYPE_EXTEND) score = aln_extend_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, 1, 0); + else { + free(seq11); free(seq22); free(aa->path); + aln_free_AlnAln(aa); + return 0; + } + aa->score = score; + + if (thres > 0) { + out1 = aa->out1 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); + out2 = aa->out2 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); + outm = aa->outm = (char*)malloc(sizeof(char) * (aa->path_len + 1)); + + --seq1; --seq2; + --seq11; --seq22; + + p = aa->path + aa->path_len - 1; + + for (l = 0; p >= aa->path; --p, ++l) { + switch (p->ctype) { + case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j]; + outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' '; + break; + case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break; + case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break; + } + } + out1[l] = out2[l] = outm[l] = '\0'; + ++seq11; ++seq22; + } + + free(seq11); + free(seq22); + + p = aa->path + aa->path_len - 1; + aa->start1 = p->i? p->i : 1; + aa->end1 = aa->path->i; + aa->start2 = p->j? 
p->j : 1; + aa->end2 = aa->path->j; + aa->cigar32 = aln_path2cigar32(aa->path, aa->path_len, &aa->n_cigar); + + return aa; +} +AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int thres) +{ + return aln_stdaln_aux(seq1, seq2, ap, type, thres, -1, -1); +} + +/* for backward compatibility */ +uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) +{ + uint32_t *cigar32; + uint16_t *cigar; + int i; + cigar32 = aln_path2cigar32(path, path_len, n_cigar); + cigar = (uint16_t*)cigar32; + for (i = 0; i < *n_cigar; ++i) + cigar[i] = (cigar32[i]&0xf)<<14 | (cigar32[i]>>4&0x3fff); + return cigar; +} + +/* newly added functions (2009-07-21) */ + +int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int G0, uint8_t *_mem) +{ + int q, r, qr, tmp_len; + int32_t **s_array, *score_array; + int is_overflow, of_base; + uint32_t *eh; + int i, j, end_i, end_j; + int score, start, end; + int *score_matrix, N_MATRIX_ROW; + uint8_t *mem, *_p; + + /* initialize some align-related parameters. just for compatibility */ + q = ap->gap_open; + r = ap->gap_ext; + qr = q + r; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + + if (len1 == 0 || len2 == 0) return -1; + + /* allocate memory */ + mem = _mem? 
_mem : calloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); + _p = mem; + eh = (uint32_t*)_p, _p += 4 * (len1 + 2); + s_array = calloc(N_MATRIX_ROW, sizeof(void*)); + for (i = 0; i != N_MATRIX_ROW; ++i) + s_array[i] = (int32_t*)_p, _p += 4 * len1; + /* initialization */ + aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); + tmp_len = len1 + 1; + start = 1; end = 2; + end_i = end_j = 0; + score = 0; + is_overflow = of_base = 0; + /* convert the coordinate */ + --seq1; --seq2; + for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; + + /* dynamic programming */ + memset(eh, 0, 4 * (len1 + 2)); + eh[1] = (uint32_t)G0<<16; + for (j = 1; j <= len2; ++j) { + int _start, _end; + int h1 = 0, f = 0; + score_array = s_array[seq2[j]]; + /* set start and end */ + _start = j - ap->band_width; + if (_start < 1) _start = 1; + if (_start > start) start = _start; + _end = j + ap->band_width; + if (_end > len1 + 1) _end = len1 + 1; + if (_end < end) end = _end; + if (start == end) break; + /* adjust eh[] array if overflow occurs. */ + if (is_overflow) { + int tmp, tmp2; + score -= LOCAL_OVERFLOW_REDUCE; + of_base += LOCAL_OVERFLOW_REDUCE; + is_overflow = 0; + for (i = start; i <= end; ++i) { + uint32_t *s = &eh[i]; + tmp = *s >> 16; tmp2 = *s & 0xffff; + if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; + else tmp2 -= LOCAL_OVERFLOW_REDUCE; + if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; + else tmp -= LOCAL_OVERFLOW_REDUCE; + *s = (tmp << 16) | tmp2; + } + } + _start = _end = 0; + /* the inner loop */ + for (i = start; i < end; ++i) { + /* At the beginning of each cycle: + eh[i] -> h[j-1,i-1]<<16 | e[j,i] + f -> f[j,i] + h1 -> h[j,i-1] + */ + uint32_t *s = &eh[i]; + int h = (int)(*s >> 16); + int e = *s & 0xffff; /* this is e[j,i] */ + *s = (uint32_t)h1 << 16; /* eh[i] now stores h[j,i-1]<<16 */ + h += h? score_array[i] : 0; /* this is left_core() specific */ + /* calculate h[j,i]; don't need to test 0, as {e,f}>=0 */ + h = h > e? h : e; + h = h > f? 
h : f; /* h now is h[j,i] */ + h1 = h; + if (h > 0) { + if (_start == 0) _start = i; + _end = i; + if (score < h) { + score = h; end_i = i; end_j = j; + if (score > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; + } + } + /* calculate e[j+1,i] and f[j,i+1] */ + h -= qr; + h = h > 0? h : 0; + e -= r; + e = e > h? e : h; + f -= r; + f = f > h? f : h; + *s |= e; + } + eh[end] = h1 << 16; + /* recalculate start and end, the boundaries of the band */ + if (_end <= 0) break; /* no cell in this row has a positive score */ + start = _start; + end = _end + 3; + } + + score += of_base - 1; + if (score <= 0) { + if (path_len) *path_len = 0; + goto end_left_func; + } + + if (path == 0) goto end_left_func; + + if (path_len == 0) { + path[0].i = end_i; path[0].j = end_j; + goto end_left_func; + } + + { /* call global alignment to fill the path */ + int score_g = 0; + j = (end_i - 1 > end_j - 1)? end_i - 1 : end_j - 1; + ++j; /* j is the maximum band_width */ + for (i = ap->band_width;; i <<= 1) { + AlnParam ap_real = *ap; + ap_real.gap_end = -1; + ap_real.band_width = i; + score_g = aln_global_core(seq1 + 1, end_i, seq2 + 1, end_j, &ap_real, path, path_len); + if (score == score_g) break; + if (i > j) break; + } + if (score > score_g) + fprintf(stderr, "[aln_left_core] no suitable bandwidth: %d < %d\n", score_g, score); + score = score_g; + } + +end_left_func: + /* free */ + free(s_array); + if (!_mem) free(mem); + return score; +} + +uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar) +{ + int i, n; + uint32_t *cigar; + unsigned char last_type; + + if (path_len == 0 || path == 0) { + *n_cigar = 0; + return 0; + } + + last_type = path->ctype; + for (i = n = 1; i < path_len; ++i) { + if (last_type != path[i].ctype) ++n; + last_type = path[i].ctype; + } + *n_cigar = n; + cigar = (uint32_t*)malloc(*n_cigar * 4); + + cigar[0] = 1u << 4 | path[path_len-1].ctype; + last_type = path[path_len-1].ctype; + for (i = path_len - 2, n = 0; i >= 0; --i) { + if (path[i].ctype 
== last_type) cigar[n] += 1u << 4; + else { + cigar[++n] = 1u << 4 | path[i].ctype; + last_type = path[i].ctype; + } + } + + return cigar; +} + +#ifdef STDALN_MAIN +int main() +{ + AlnAln *aln_local, *aln_global, *aln_left; + int i; + + aln_local = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCgCTCGGCTAGCTGT", &aln_param_blast, 0, 1); + aln_global = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 1, 1); +// aln_left = aln_stdaln( "GATGCACTGCATACGGCTCGCCTAGATCA", "GATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 2, 1); + aln_left = aln_stdaln("CACCTTCGACTCACGTCTCATTCTCGGAGTCGAGTGGACGGTCCCTCATACACGAACAGGTTC", + "CACCTTCGACTTTCACCTCTCATTCTCGGACTCGAGTGGACGGTCCCTCATCCAAGAACAGGGTCTGTGAAA", &aln_param_blast, 2, 1); + + printf(">%d,%d\t%d,%d\n", aln_local->start1, aln_local->end1, aln_local->start2, aln_local->end2); + printf("%s\n%s\n%s\n", aln_local->out1, aln_local->outm, aln_local->out2); + + printf(">%d,%d\t%d,%d\t", aln_global->start1, aln_global->end1, aln_global->start2, aln_global->end2); + for (i = 0; i != aln_global->n_cigar; ++i) + printf("%d%c", aln_global->cigar32[i]>>4, "MID"[aln_global->cigar32[i]&0xf]); + printf("\n%s\n%s\n%s\n", aln_global->out1, aln_global->outm, aln_global->out2); + + printf(">%d\t%d,%d\t%d,%d\t", aln_left->score, aln_left->start1, aln_left->end1, aln_left->start2, aln_left->end2); + for (i = 0; i != aln_left->n_cigar; ++i) + printf("%d%c", aln_left->cigar32[i]>>4, "MID"[aln_left->cigar32[i]&0xf]); + printf("\n%s\n%s\n%s\n", aln_left->out1, aln_left->outm, aln_left->out2); + + aln_free_AlnAln(aln_local); + aln_free_AlnAln(aln_global); + aln_free_AlnAln(aln_left); + return 0; +} +#endif diff --git a/stdaln.h b/stdaln.h new file mode 100644 index 0000000..f0048b3 --- /dev/null +++ b/stdaln.h @@ -0,0 +1,162 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, by Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this 
software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + 2009-07-23, 0.10.0 + + - Use 32-bit to store CIGAR + + - Report suboptimal aligments + + - Implemented half-fixed-half-open DP + + 2009-04-26, 0.9.10 + + - Allow to set a threshold for local alignment + + 2009-02-18, 0.9.9 + + - Fixed a bug when no residue matches + + 2008-08-04, 0.9.8 + + - Fixed the wrong declaration of aln_stdaln_aux() + + - Avoid 0 coordinate for global alignment + + 2008-08-01, 0.9.7 + + - Change gap_end penalty to 5 in aln_param_bwa + + - Add function to convert path_t to the CIGAR format + + 2008-08-01, 0.9.6 + + - The first gap now costs (gap_open+gap_ext), instead of + gap_open. Scoring systems are modified accordingly. + + - Gap end is now correctly handled. Previously it is not correct. + + - Change license to MIT. 
+ + */ + +#ifndef LH3_STDALN_H_ +#define LH3_STDALN_H_ + + +#define STDALN_VERSION 0.11.0 + +#include + +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + +#define ALN_TYPE_LOCAL 0 +#define ALN_TYPE_GLOBAL 1 +#define ALN_TYPE_EXTEND 2 + +/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */ +#define MINOR_INF -1073741823 + +typedef struct +{ + int gap_open; + int gap_ext; + int gap_end; + + int *matrix; + int row; + int band_width; +} AlnParam; + +typedef struct +{ + int i, j; + unsigned char ctype; +} path_t; + +typedef struct +{ + path_t *path; /* for advanced users... :-) */ + int path_len; /* for advanced users... :-) */ + int start1, end1; /* start and end of the first sequence, coordinations are 1-based */ + int start2, end2; /* start and end of the second sequence, coordinations are 1-based */ + int score, subo; /* score */ + + char *out1, *out2; /* print them, and then you will know */ + char *outm; + + int n_cigar; + uint32_t *cigar32; +} AlnAln; + +#ifdef __cplusplus +extern "C" { +#endif + + AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, + int type, int do_align, int len1, int len2); + AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align); + void aln_free_AlnAln(AlnAln *aa); + + int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len); + int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int _thres, int *_subo); + int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, + path_t *path, int *path_len, int G0, uint8_t *_mem); + uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar); + uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar); + +#ifdef __cplusplus +} +#endif + 
+/******************** + * global variables * + ********************/ + +extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */ +extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ +extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */ +extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */ +extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */ + +/* common nucleotide score matrix for 16 bases */ +extern int aln_sm_nt[], aln_sm_bwa[]; + +/* BLOSUM62 and BLOSUM45 */ +extern int aln_sm_blosum62[], aln_sm_blosum45[]; + +/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */ +extern int aln_sm_read[]; + +/* human-mouse score matrix for 4 bases */ +extern int aln_sm_hs[]; + +#endif diff --git a/utils.c b/utils.c new file mode 100644 index 0000000..203f057 --- /dev/null +++ b/utils.c @@ -0,0 +1,72 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#include +#include +#include +#include +#include +#include "utils.h" + +FILE *err_xopen_core(const char *func, const char *fn, const char *mode) +{ + FILE *fp = 0; + if (strcmp(fn, "-") == 0) + return (strstr(mode, "r"))? stdin : stdout; + if ((fp = fopen(fn, mode)) == 0) { + fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); + abort(); + } + return fp; +} +gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) +{ + gzFile fp; + if (strcmp(fn, "-") == 0) + return gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + if ((fp = gzopen(fn, mode)) == 0) { + fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); + abort(); + } + return fp; +} +void err_fatal(const char *header, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "[%s] ", header); + vfprintf(stderr, fmt, args); + fprintf(stderr, " Abort!\n"); + va_end(args); + abort(); +} + +void err_fatal_simple_core(const char *func, const char *msg) +{ + fprintf(stderr, "[%s] %s Abort!\n", func, msg); + abort(); +} diff --git a/utils.h b/utils.h new file mode 100644 index 0000000..d6ac07c --- /dev/null +++ b/utils.h @@ -0,0 +1,52 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + +#ifndef LH3_UTILS_H +#define LH3_UTILS_H + +#include +#include + +#define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) +#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) +#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) +#define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) + +#ifdef __cplusplus +extern "C" { +#endif + + void err_fatal(const char *header, const char *fmt, ...); + void err_fatal_simple_core(const char *func, const char *msg); + FILE *err_xopen_core(const char *func, const char *fn, const char *mode); + gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); + +#ifdef __cplusplus +} +#endif + +#endif From 5e308847309d75f5424f2478c611d721195ffa83 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 20:54:10 -0500 Subject: [PATCH 003/498] Update to the latest modfication 0.5.9rc1-2. 
Update ChangeLog --- ChangeLog | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ bwa.1 | 2 +- bwape.c | 4 +++- bwase.c | 5 ++++- bwtaln.c | 2 +- bwtsw2_aux.c | 2 +- bwtsw2_main.c | 3 ++- main.c | 7 ++++++- utils.c | 10 ++++++++++ utils.h | 2 ++ 10 files changed, 79 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index 779a31a..403e61f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,52 @@ +------------------------------------------------------------------------ +r1605 | lh3 | 2010-12-29 20:20:20 -0500 (Wed, 29 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwtsw2_aux.c + M /branches/prog/bwa/main.c + + * bwa-0.5.9rc1-2 (r1605) + * fixed a typo/bug in bwasw + +------------------------------------------------------------------------ +r1587 | lh3 | 2010-12-21 18:48:30 -0500 (Tue, 21 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + +a typo in the manual + +------------------------------------------------------------------------ +r1586 | lh3 | 2010-12-21 18:47:48 -0500 (Tue, 21 Dec 2010) | 3 lines +Changed paths: + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + M /branches/prog/bwa/bwtaln.c + M /branches/prog/bwa/bwtsw2_main.c + M /branches/prog/bwa/main.c + M /branches/prog/bwa/utils.c + M /branches/prog/bwa/utils.h + + * bwa-0.5.9rc1-1 (r1586) + * a few patches by John + +------------------------------------------------------------------------ +r1562 | lh3 | 2010-12-10 01:02:06 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/bwape.c + M /branches/prog/bwa/bwase.c + +documentation on specifying @RG + +------------------------------------------------------------------------ +r1561 | lh3 | 2010-12-10 00:45:40 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /branches/prog/bwa/ChangeLog + M /branches/prog/bwa/NEWS + M /branches/prog/bwa/bwa.1 + M /branches/prog/bwa/main.c + +Release bwa-0.5.9rc1 (r1561) + 
------------------------------------------------------------------------ r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines Changed paths: diff --git a/bwa.1 b/bwa.1 index 52e999e..71fb80a 100644 --- a/bwa.1 +++ b/bwa.1 @@ -40,7 +40,7 @@ each individual read, and the .B `samse/sampe' command, which converts SA coordinates to chromosomal coordinate and pairs reads (for `sampe'). The second algorithm is invoked by the -.B `dbtwsw' +.B `bwasw' command. It works for single-end reads only. .SH COMMANDS AND OPTIONS diff --git a/bwape.c b/bwape.c index a127461..65de3e2 100644 --- a/bwape.c +++ b/bwape.c @@ -45,6 +45,7 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); bntseq_t *bwa_open_nt(const char *prefix); void bwa_print_sam_SQ(const bntseq_t *bns); +void bwa_print_sam_PG(); pe_opt_t *bwa_init_pe_opt() { @@ -689,6 +690,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // core loop bwa_print_sam_SQ(bns); + bwa_print_sam_PG(); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; @@ -763,7 +765,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) case 'n': popt->n_multi = atoi(optarg); break; case 'N': popt->N_multi = atoi(optarg); break; case 'c': popt->ap_prior = atof(optarg); break; - case 'f': freopen(optarg, "w", stdout); break; + case 'f': xreopen(optarg, "w", stdout); break; case 'A': popt->force_isize = 1; break; default: return 1; } diff --git a/bwase.c b/bwase.c index 937aacf..8ddbcc4 100644 --- a/bwase.c +++ b/bwase.c @@ -14,6 +14,8 @@ int g_log_n[256]; char *bwa_rg_line, *bwa_rg_id; +void bwa_print_sam_PG(); + void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) { int i, cnt, best; @@ -604,6 +606,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f 
if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac ntbns = bwa_open_nt(prefix); bwa_print_sam_SQ(bns); + bwa_print_sam_PG(); // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop @@ -662,7 +665,7 @@ int bwa_sai2sam_se(int argc, char *argv[]) } break; case 'n': n_occ = atoi(optarg); break; - case 'f': freopen(optarg, "w", stdout); break; + case 'f': xreopen(optarg, "w", stdout); break; default: return 1; } } diff --git a/bwtaln.c b/bwtaln.c index 6d7b8f4..de63676 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -268,7 +268,7 @@ int bwa_aln(int argc, char *argv[]) case 'q': opt->trim_qual = atoi(optarg); break; case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; - case 'f': freopen(optarg, "wb", stdout); break; + case 'f': xreopen(optarg, "wb", stdout); break; case 'b': opt->mode |= BWA_MODE_BAM; break; case '0': opt->mode |= BWA_MODE_BAM_SE; break; case '1': opt->mode |= BWA_MODE_BAM_READ1; break; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index c8915cc..96d0d0a 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -290,7 +290,7 @@ static void flag_fr(bwtsw2_t *b[2]) for (i = 0; i < b[0]->n; ++i) { bsw2hit_t *p = b[0]->hits + i; for (j = 0; j < b[1]->n; ++j) { - bsw2hit_t *q = b[1]->hits + i; + bsw2hit_t *q = b[1]->hits + j; if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) { q->flag |= 0x30000; p->flag |= 0x30000; break; diff --git a/bwtsw2_main.c b/bwtsw2_main.c index a31800b..e601381 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -5,6 +5,7 @@ #include #include "bwt.h" #include "bwtsw2.h" +#include "utils.h" int bwa_bwtsw2(int argc, char *argv[]) { @@ -32,7 +33,7 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'c': opt->coef = atof(optarg); break; case 'N': opt->t_seeds = atoi(optarg); break; case 'H': opt->hard_clip = 1; break; - case 'f': freopen(optarg, "w", stdout); + case 'f': xreopen(optarg, "w", stdout); break; } } opt->qr = 
opt->q + opt->r; diff --git a/main.c b/main.c index f99255b..daafa76 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1 (r1561)" +#define PACKAGE_VERSION "0.5.9rc1-2 (r1605)" #endif static int usage() @@ -31,6 +31,11 @@ static int usage() return 1; } +void bwa_print_sam_PG() +{ + printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION); +} + int main(int argc, char *argv[]) { if (argc < 2) return usage(); diff --git a/utils.c b/utils.c index 203f057..89693e0 100644 --- a/utils.c +++ b/utils.c @@ -43,6 +43,16 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode) } return fp; } +FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) +{ + if (freopen(fn, mode, fp) == 0) { + fprintf(stderr, "[%s] fail to open file '%s': ", func, fn); + perror(NULL); + fprintf(stderr, "Abort!\n"); + abort(); + } + return fp; +} gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) { gzFile fp; diff --git a/utils.h b/utils.h index d6ac07c..31d6086 100644 --- a/utils.h +++ b/utils.h @@ -33,6 +33,7 @@ #define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) +#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) #define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) @@ -43,6 +44,7 @@ extern "C" { void err_fatal(const char *header, const char *fmt, ...); void err_fatal_simple_core(const char *func, const char *msg); FILE *err_xopen_core(const char *func, const char *fn, const char *mode); + FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); #ifdef __cplusplus From fbedf6370b58d075df06afc2817045a3ba4a7828 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 
21:12:42 -0500 Subject: [PATCH 004/498] Added .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8862c08 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.[oa] From 94122ad6ee01be4fc971a8877d9379df45d96fa4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 21:14:00 -0500 Subject: [PATCH 005/498] Added a few others to .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 8862c08..1e78ec2 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ *.[oa] +bwa +test From ac93bbc6e4354631a6bc5ad50f050304884f0847 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 21:19:02 -0500 Subject: [PATCH 006/498] Provided links to help information --- README | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README b/README index e69de29..377d323 100644 --- a/README +++ b/README @@ -0,0 +1,15 @@ +FAQ is available at: + + http://bio-bwa.sourceforge.net + +Manual page at: + + http://bio-bwa.sourceforge.net/bwa.shtml + +Mailing list: + + bio-bwa-help@lists.sourceforge.net + +To sign up: + + http://sourceforge.net/mail/?group_id=276243 From f85533c3c70b40d2e4bf20a5cb0d9209f9237408 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 21:28:26 -0500 Subject: [PATCH 007/498] Updated README --- README | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README b/README index 377d323..3dc2653 100644 --- a/README +++ b/README @@ -1,4 +1,8 @@ -FAQ is available at: +Released packages can be downloaded from SourceForge.net: + + http://sourceforge.net/projects/bio-bwa/files/ + +Introduction and FAQ are available at: http://bio-bwa.sourceforge.net @@ -13,3 +17,14 @@ Mailing list: To sign up: http://sourceforge.net/mail/?group_id=276243 + +Publications (Open Access): + + http://www.ncbi.nlm.nih.gov/pubmed/20080505 + http://www.ncbi.nlm.nih.gov/pubmed/19451168 + 
+ +Part of citations (via HubMed.org): + + http://www.hubmed.org/references.cgi?uids=20080505 + http://www.hubmed.org/references.cgi?uids=19451168 From 5b362b87186917749462f375ec4a7650e36c0148 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jan 2011 21:38:37 -0500 Subject: [PATCH 008/498] Updated README --- README | 1 - 1 file changed, 1 deletion(-) diff --git a/README b/README index 3dc2653..d92563d 100644 --- a/README +++ b/README @@ -23,7 +23,6 @@ Publications (Open Access): http://www.ncbi.nlm.nih.gov/pubmed/20080505 http://www.ncbi.nlm.nih.gov/pubmed/19451168 - Part of citations (via HubMed.org): http://www.hubmed.org/references.cgi?uids=20080505 From f335b33624c4803a4c35057dc86cad5ff4d1a30b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 15 Jan 2011 10:32:45 -0500 Subject: [PATCH 009/498] fixed a bug in bwase: no RG for unmapped read pairs --- bwase.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwase.c b/bwase.c index 8ddbcc4..2dd2a35 100644 --- a/bwase.c +++ b/bwase.c @@ -518,6 +518,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality printf("%s", p->qual); } else printf("*"); + if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); putchar('\n'); } diff --git a/main.c b/main.c index daafa76..eff0d68 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1-2 (r1605)" +#define PACKAGE_VERSION "0.5.9rc1-9" #endif static int usage() From 10721ca60250ed34ed059c073a4b51180a6f44e8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 15 Jan 2011 14:07:08 -0500 Subject: [PATCH 010/498] Added an option to accept Illumina 1.3+ fastq --- bwape.c | 8 +++++--- bwase.c | 6 ++++-- bwaseqio.c | 6 ++++-- bwtaln.c | 10 +++++++--- bwtaln.h | 1 + main.c | 2 +- 6 files changed, 22 insertions(+), 11 deletions(-) diff 
--git a/bwape.c b/bwape.c index 65de3e2..dec52c3 100644 --- a/bwape.c +++ b/bwape.c @@ -645,7 +645,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, j, n_seqs, tot_seqs = 0; + int i, j, n_seqs, tot_seqs = 0, read_flag = 0; bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; @@ -691,12 +691,14 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // core loop bwa_print_sam_SQ(bns); bwa_print_sam_PG(); - while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { + read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 1 : 0; + read_flag |= ((opt.mode & BWA_MODE_IL13)? 1 : 0)<<1; + while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; ubyte_t *pacseq; - seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual); + seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, read_flag, opt.trim_qual); tot_seqs += n_seqs; t = clock(); diff --git a/bwase.c b/bwase.c index 2dd2a35..12f01fb 100644 --- a/bwase.c +++ b/bwase.c @@ -587,7 +587,7 @@ int bwa_set_rg(const char *s) void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, n_seqs, tot_seqs = 0, m_aln; + int i, n_seqs, tot_seqs = 0, m_aln, read_flag = 0; bwt_aln1_t *aln = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; @@ -611,7 +611,9 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode & BWA_MODE_COMPREAD, opt.trim_qual)) != 0) { + read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 
1 : 0; + read_flag |= ((opt.mode & BWA_MODE_IL13)? 1 : 0)<<1; + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); diff --git a/bwaseqio.c b/bwaseqio.c index 07a3082..10ff83b 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -139,17 +139,19 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com return seqs; } - bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) +bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int flag, int trim_qual) { bwa_seq_t *seqs, *p; kseq_t *seq = bs->ks; - int n_seqs, l, i; + int n_seqs, l, i, is_comp = flag&1, is_64 = flag&2; long n_trimmed = 0, n_tot = 0; if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { + if (is_64 && seq->qual.l) + for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; p = &seqs[n_seqs++]; p->tid = -1; // no assigned to a thread p->qual = 0; diff --git a/bwtaln.c b/bwtaln.c index de63676..3aa9a5c 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -172,7 +172,7 @@ bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) { - int i, n_seqs, tot_seqs = 0; + int i, n_seqs, tot_seqs = 0, read_flag = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; @@ -190,7 +190,9 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) // core loop fwrite(opt, sizeof(gap_opt_t), 1, stdout); - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode & BWA_MODE_COMPREAD, opt->trim_qual)) != 0) { + read_flag |= (opt->mode & BWA_MODE_COMPREAD)? 1 : 0; + read_flag |= ((opt->mode & BWA_MODE_IL13)? 
1 : 0)<<1; + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt->trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); @@ -246,7 +248,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012I")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -273,6 +275,7 @@ int bwa_aln(int argc, char *argv[]) case '0': opt->mode |= BWA_MODE_BAM_SE; break; case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; + case 'I': opt->mode |= BWA_MODE_IL13; break; default: return 1; } } @@ -303,6 +306,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); + fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); fprintf(stderr, " -b the input read file is in the BAM format\n"); fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); diff --git a/bwtaln.h b/bwtaln.h index 0331b56..9e841f8 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -87,6 +87,7 @@ typedef struct { #define BWA_MODE_BAM_SE 0x40 #define BWA_MODE_BAM_READ1 0x80 #define BWA_MODE_BAM_READ2 0x100 +#define BWA_MODE_IL13 0x200 typedef struct { int s_mm, s_gapo, s_gape; diff --git a/main.c b/main.c index eff0d68..25e32e8 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1-9" +#define PACKAGE_VERSION "0.5.9rc1-r10" #endif static int usage() From 51d354cd289e377aaa3af20e9b739e1819ba2ac3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 15 Jan 2011 15:35:39 -0500 
Subject: [PATCH 011/498] Added barcode support --- .gitignore | 1 + bwape.c | 11 +++++------ bwase.c | 8 ++++---- bwaseqio.c | 28 +++++++++++++++++++++++++--- bwtaln.c | 10 +++++----- bwtaln.h | 6 ++++-- main.c | 2 +- 7 files changed, 45 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 1e78ec2..16d123a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.[oa] bwa test +.*.swp diff --git a/bwape.c b/bwape.c index dec52c3..4e94373 100644 --- a/bwape.c +++ b/bwape.c @@ -645,13 +645,13 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, j, n_seqs, tot_seqs = 0, read_flag = 0; + int i, j, n_seqs, tot_seqs = 0; bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; bntseq_t *bns, *ntbns = 0; FILE *fp_sa[2]; - gap_opt_t opt; + gap_opt_t opt, opt0; khint_t iter; isize_info_t last_ii; // this is for the last batch of reads char str[1024]; @@ -671,6 +671,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); + opt0 = opt; fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); if (!(opt.mode & BWA_MODE_COMPREAD)) { @@ -691,14 +692,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // core loop bwa_print_sam_SQ(bns); bwa_print_sam_PG(); - read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 1 : 0; - read_flag |= ((opt.mode & BWA_MODE_IL13)? 
1 : 0)<<1; - while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) { + while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; ubyte_t *pacseq; - seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, read_flag, opt.trim_qual); + seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual); tot_seqs += n_seqs; t = clock(); diff --git a/bwase.c b/bwase.c index 12f01fb..e9e164d 100644 --- a/bwase.c +++ b/bwase.c @@ -472,6 +472,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } else printf("*"); if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { int i; @@ -519,6 +520,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in printf("%s", p->qual); } else printf("*"); if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); putchar('\n'); } @@ -587,7 +589,7 @@ int bwa_set_rg(const char *s) void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, n_seqs, tot_seqs = 0, m_aln, read_flag = 0; + int i, n_seqs, tot_seqs = 0, m_aln; bwt_aln1_t *aln = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; @@ -611,9 +613,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop - read_flag |= (opt.mode & BWA_MODE_COMPREAD)? 1 : 0; - read_flag |= ((opt.mode & BWA_MODE_IL13)? 
1 : 0)<<1; - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt.trim_qual)) != 0) { + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); diff --git a/bwaseqio.c b/bwaseqio.c index 10ff83b..94271b9 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -1,4 +1,5 @@ #include +#include #include "bwtaln.h" #include "utils.h" #include "bamlite.h" @@ -139,20 +140,41 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com return seqs; } -bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int flag, int trim_qual) +#define BARCODE_LOW_QUAL 13 + +bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) { bwa_seq_t *seqs, *p; kseq_t *seq = bs->ks; - int n_seqs, l, i, is_comp = flag&1, is_64 = flag&2; + int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; - if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); + if (l_bc > 15) { + fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__); + return 0; + } + if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; + if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; + if (l_bc) { // then trim barcode + for (i = 0; i < l_bc; ++i) + p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? 
tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); + p->bc[i] = 0; + for (; i < seq->seq.l; ++i) + seq->seq.s[i - l_bc] = seq->seq.s[i]; + seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; + if (seq->qual.l) { + for (i = l_bc; i < seq->qual.l; ++i) + seq->qual.s[i - l_bc] = seq->qual.s[i]; + seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; + } + l = seq->seq.l; + } else p->bc[0] = 0; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; diff --git a/bwtaln.c b/bwtaln.c index 3aa9a5c..bd2aad2 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -172,7 +172,7 @@ bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) { - int i, n_seqs, tot_seqs = 0, read_flag = 0; + int i, n_seqs, tot_seqs = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; @@ -190,9 +190,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) // core loop fwrite(opt, sizeof(gap_opt_t), 1, stdout); - read_flag |= (opt->mode & BWA_MODE_COMPREAD)? 1 : 0; - read_flag |= ((opt->mode & BWA_MODE_IL13)? 
1 : 0)<<1; - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, read_flag, opt->trim_qual)) != 0) { + while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); @@ -248,7 +246,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012I")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -276,6 +274,7 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } } @@ -303,6 +302,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); + fprintf(stderr, " -B INT length of barcode\n"); fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); diff --git a/bwtaln.h b/bwtaln.h index 9e841f8..f659ac8 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -74,6 +74,8 @@ typedef struct { bwa_cigar_t *cigar; // for multi-threading only int tid; + // barcode + char bc[16]; // null terminated; up to 15 bases // NM and MD tags uint32_t full_len:20, nm:12; char *md; @@ -91,7 +93,7 @@ typedef struct { typedef struct { int s_mm, s_gapo, s_gape; - int mode; + int mode; // bit 24-31 are the barcode length int indel_end_skip, max_del_occ, max_entries; float fnr; int max_diff, max_gapo, 
max_gape; @@ -126,7 +128,7 @@ extern "C" { bwa_seqio_t *bwa_bam_open(const char *fn, int which); void bwa_seq_close(bwa_seqio_t *bs); void seq_reverse(int len, ubyte_t *seq, int is_comp); - bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int is_comp, int trim_qual); + bwa_seq_t *bwa_read_seq(bwa_seqio_t *seq, int n_needed, int *n, int mode, int trim_qual); void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs); int bwa_cal_maxdiff(int l, double err, double thres); diff --git a/main.c b/main.c index 25e32e8..1b9d7f2 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1-r10" +#define PACKAGE_VERSION "0.5.9rc1-r11" #endif static int usage() From 1d7d8be9e8e9f4e3a241e8480d7bf7517e116280 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 18 Jan 2011 20:16:57 -0500 Subject: [PATCH 012/498] Put BC: to both ends --- bwape.c | 10 ++++++++-- main.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bwape.c b/bwape.c index 4e94373..3336538 100644 --- a/bwape.c +++ b/bwape.c @@ -718,8 +718,14 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... 
"); for (i = 0; i < n_seqs; ++i) { - bwa_print_sam1(bns, seqs[0] + i, seqs[1] + i, opt.mode, opt.max_top2); - bwa_print_sam1(bns, seqs[1] + i, seqs[0] + i, opt.mode, opt.max_top2); + bwa_seq_t *p[2]; + p[0] = seqs[0] + i; p[1] = seqs[1] + i; + if (p[0]->bc[0] || p[1]->bc[0]) { + strcat(p[0]->bc, p[1]->bc); + strcpy(p[1]->bc, p[0]->bc); + } + bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2); + bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); diff --git a/main.c b/main.c index 1b9d7f2..af5cc37 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1-r11" +#define PACKAGE_VERSION "0.5.9rc1-r12" #endif static int usage() From 0380cf02bf873a11bfe51b015074f0432f75ae1e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 18 Jan 2011 23:27:55 -0500 Subject: [PATCH 013/498] update the manual page --- bwa.1 | 64 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/bwa.1 b/bwa.1 index 71fb80a..d6af7a0 100644 --- a/bwa.1 +++ b/bwa.1 @@ -56,10 +56,10 @@ Index database sequences in the FASTA format. .B -c Build color-space index. The input fast should be in nucleotide space. .TP -.B -p STR +.BI -p \ STR Prefix of the output database [same as db filename] .TP -.B -a STR +.BI -a \ STR Algorithm for constructing BWT index. Available options are: .RS .TP @@ -95,47 +95,47 @@ differences are allowed in the whole sequence. .B OPTIONS: .RS .TP 10 -.B -n NUM +.BI -n \ NUM Maximum edit distance if the value is INT, or the fraction of missing alignments given 2% uniform base error rate if FLOAT. In the latter case, the maximum edit distance is automatically chosen for different read lengths. 
[0.04] .TP -.B -o INT +.BI -o \ INT Maximum number of gap opens [1] .TP -.B -e INT +.BI -e \ INT Maximum number of gap extensions, -1 for k-difference mode (disallowing long gaps) [-1] .TP -.B -d INT +.BI -d \ INT Disallow a long deletion within INT bp towards the 3'-end [16] .TP -.B -i INT +.BI -i \ INT Disallow an indel within INT bp towards the ends [5] .TP -.B -l INT +.BI -l \ INT Take the first INT subsequence as seed. If INT is larger than the query sequence, seeding will be disabled. For long reads, this option is typically ranged from 25 to 35 for `-k 2'. [inf] .TP -.B -k INT +.BI -k \ INT Maximum edit distance in the seed [2] .TP -.B -t INT +.BI -t \ INT Number of threads (multi-threading mode) [1] .TP -.B -M INT +.BI -M \ INT Mismatch penalty. BWA will not search for suboptimal hits with a score lower than (bestScore-misMsc). [3] .TP -.B -O INT +.BI -O \ INT Gap open penalty [11] .TP -.B -E INT +.BI -E \ INT Gap extension penalty [4] .TP -.B -R INT +.BI -R \ INT Proceed with suboptimal alignments if there are no more than INT equally best hits. This option only affects paired-end mapping. Increasing this threshold helps to improve the pairing accuracy at the cost of speed, @@ -150,11 +150,22 @@ Disable iterative search. All hits with no more than .I maxDiff differences will be found. This mode is much slower than the default. .TP -.B -q INT +.BI -q \ INT Parameter for read trimming. BWA trims a read down to argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l file. .B OPTIONS: .RS .TP 10 -.B -a INT +.BI -a \ INT Score of a match [1] .TP -.B -b INT +.BI -b \ INT Mismatch penalty [3] .TP -.B -q INT +.BI -q \ INT Gap open penalty [5] .TP -.B -r INT +.BI -r \ INT Gap extension penalty. The penalty for a contiguous gap of size k is q+k*r. 
[2] .TP -.B -t INT +.BI -t \ INT Number of threads in the multi-threading mode [1] .TP -.B -w INT +.BI -w \ INT Band width in the banded alignment [33] .TP -.B -T INT +.BI -T \ INT Minimum score threshold divided by a [37] .TP -.B -c FLOAT +.BI -c \ FLOAT Coefficient for threshold adjustment according to query length. Given an l-long query, the threshold for a hit to be retained is a*max{T,c*log(l)}. [5.5] .TP -.B -z INT +.BI -z \ INT Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1] .TP -.B -s INT +.BI -s \ INT Maximum SA interval size for initiating a seed. Higher -s increases accuracy at the cost of speed. [3] .TP -.B -N INT +.BI -N \ INT Minimum number of seeds supporting the resultant alignment to skip reverse alignment. [5] .RE @@ -361,6 +372,7 @@ _ NM Edit distance MD Mismatching positions/bases AS Alignment score +BC Barcode sequence _ X0 Number of best hits X1 Number of suboptimal hits found by BWA From dac5395126cad88fbefc36675f31ac8ccafe3dbb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Jan 2011 23:56:18 -0500 Subject: [PATCH 014/498] fixed a typo in help --- bwtsw2_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwtsw2_main.c b/bwtsw2_main.c index e601381..0f1c660 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -47,7 +47,7 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); // fprintf(stderr, " -y FLOAT error recurrence coef. 
(4..16) [%.1f]\n", opt->yita); fprintf(stderr, "\n"); - fprintf(stderr, " -t INT nmber of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -s INT size of a chunk of reads [%d]\n", opt->chunk_size); fprintf(stderr, "\n"); fprintf(stderr, " -w INT band width [%d]\n", opt->bw); @@ -62,7 +62,7 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); fprintf(stderr, "\n"); - { + if (0) { double c, theta, eps, delta; c = opt->a / log(opt->yita); theta = exp(-opt->b / c) / opt->yita; From 7fd894868928db7f61fcff693963d7ee2f84f0ed Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 22 Jan 2011 13:20:11 -0500 Subject: [PATCH 015/498] Added recommendation for PacBio reads --- bwtsw2_main.c | 6 +++++- main.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 0f1c660..afbad2e 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -59,7 +59,11 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds); fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); fprintf(stderr, " -H in SAM output, use hard clipping rather than soft\n"); - fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n\n"); + fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); + fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n"); + fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. 
One may also\n"); + fprintf(stderr, " increase '-z' for better sensitivity.\n"); fprintf(stderr, "\n"); if (0) { diff --git a/main.c b/main.c index af5cc37..a87d1a1 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1-r12" +#define PACKAGE_VERSION "0.5.9rc1-r15" #endif static int usage() From 87664941b09d8ded9705ccdb5af7b375751d5187 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Jan 2011 22:00:24 -0500 Subject: [PATCH 016/498] Release bwa-0.5.9 (r16) --- NEWS | 27 +++++++++++++++++++++++++++ README | 2 +- bwa.1 | 2 +- main.c | 2 +- 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 027beec..a49db00 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,30 @@ +Beta Release 0.5.9 (24 January, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Feature: barcode support via the `-B' option. + + * Feature: Illumina 1.3+ read format support via the `-I' option. + + * Bugfix: RG tags are not attached to unmapped reads. + + * Bugfix: very rare bwasw mismappings + + * Recommend options for PacBio reads in bwasw help message. + + +Also, since January 13, the BWA master repository has been moved to github: + + https://github.com/lh3/bwa + +The revision number has been reset. All recent changes will be first +committed to this repository. 
+ +(0.5.9: 24 January 2011, r16) + + + Beta Release Candidate 0.5.9rc1 (10 December, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/README b/README index d92563d..f398cec 100644 --- a/README +++ b/README @@ -23,7 +23,7 @@ Publications (Open Access): http://www.ncbi.nlm.nih.gov/pubmed/20080505 http://www.ncbi.nlm.nih.gov/pubmed/19451168 -Part of citations (via HubMed.org): +Incomplete list of citations (via HubMed.org): http://www.hubmed.org/references.cgi?uids=20080505 http://www.hubmed.org/references.cgi?uids=19451168 diff --git a/bwa.1 b/bwa.1 index d6af7a0..c82fdc7 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "10 December 2010" "bwa-0.5.9rc1" "Bioinformatics tools" +.TH bwa 1 "24 January 2011" "bwa-0.5.9" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index a87d1a1..3990296 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "main.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9rc1-r15" +#define PACKAGE_VERSION "0.5.9-r16" #endif static int usage() From 8d6b859bf8cadf5cd8438630f5bfb4fe931ae36d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Apr 2011 17:26:08 -0400 Subject: [PATCH 017/498] speed up multi-threading --- bwtaln.c | 15 +-------------- bwtsw2_aux.c | 14 +------------- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/bwtaln.c b/bwtaln.c index bd2aad2..66310d3 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -13,9 +13,7 @@ #include "utils.h" #ifdef HAVE_PTHREAD -#define THREAD_BLOCK_SIZE 1024 #include -static pthread_mutex_t g_seq_lock = PTHREAD_MUTEX_INITIALIZER; #endif gap_opt_t *gap_init_opt() @@ -98,18 +96,7 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seq for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; #ifdef HAVE_PTHREAD - if (opt->n_threads > 1) { - pthread_mutex_lock(&g_seq_lock); - if (p->tid < 0) { // unassigned - int j; - for (j = i; j < n_seqs && j < i + THREAD_BLOCK_SIZE; ++j) - 
seqs[j].tid = tid; - } else if (p->tid != tid) { - pthread_mutex_unlock(&g_seq_lock); - continue; - } - pthread_mutex_unlock(&g_seq_lock); - } + if (i % opt->n_threads != tid) continue; #endif p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; seq[0] = p->seq; seq[1] = p->rseq; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 96d0d0a..8ba6455 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -309,10 +309,6 @@ typedef struct { bsw2seq1_t *seq; } bsw2seq_t; -#ifdef HAVE_PTHREAD -static pthread_mutex_t g_dbwtsw_lock = PTHREAD_MUTEX_INITIALIZER; -#endif - static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) { // FIXME: this routine does not work if the query bridge three reference sequences @@ -469,15 +465,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const l = p->l; #ifdef HAVE_PTHREAD - if (_opt->n_threads > 1) { - pthread_mutex_lock(&g_dbwtsw_lock); - if (p->tid < 0) p->tid = tid; - else if (p->tid != tid) { - pthread_mutex_unlock(&g_dbwtsw_lock); - continue; - } // in pinciple else should not happen - pthread_mutex_unlock(&g_dbwtsw_lock); - } + if (x % _opt->n_threads != tid) continue; #endif // set opt->t From 4d064c69cecef019920ee6dce8d60b92a3bb34b0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 4 May 2011 09:41:30 -0400 Subject: [PATCH 018/498] added -s to ar --- bwt_gen/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt_gen/Makefile b/bwt_gen/Makefile index 131b1c9..a7fe6cf 100644 --- a/bwt_gen/Makefile +++ b/bwt_gen/Makefile @@ -15,7 +15,7 @@ SUBDIRS= lib:libbwtgen.a libbwtgen.a:$(OBJS) - $(AR) -cru $@ $(OBJS) + $(AR) -scru $@ $(OBJS) cleanlocal: rm -f gmon.out *.o a.out $(PROG) *~ *.a From 243e735431a0348f5764958ae913014c4e971f33 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 4 May 2011 09:46:50 -0400 Subject: [PATCH 019/498] applied patches from Alec Wysoker --- bwase.c | 68 
++++++++++++++++++++++++++++---------------------------- bwtaln.c | 6 ++--- main.c | 5 ++++- utils.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ utils.h | 16 +++++++++++++ 5 files changed, 123 insertions(+), 38 deletions(-) diff --git a/bwase.c b/bwase.c index e9e164d..7ef4bec 100644 --- a/bwase.c +++ b/bwase.c @@ -437,15 +437,15 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (mate->strand) flag |= SAM_FMR; } else flag |= SAM_FMU; } - printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); - printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); + err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); + err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); // print CIGAR if (p->cigar) { for (j = 0; j != p->n_cigar; ++j) - printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]); - } else if (p->type == BWA_TYPE_NO_MATCH) printf("*"); - else printf("%dM", p->len); + err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]); + } else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*"); + else err_printf("%dM", p->len); // print mate coordinate if (mate && mate->type != BWA_TYPE_NO_MATCH) { @@ -454,12 +454,12 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality // redundant calculation here, but should not matter too much m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid); - printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); + err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); isize = (seqid == m_seqid)? 
pos_5(mate) - pos_5(p) : 0; if (p->type == BWA_TYPE_NO_MATCH) isize = 0; - printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); - } else if (mate) printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); - else printf("\t*\t0\t0\t"); + err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); + } else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); + else err_printf("\t*\t0\t0\t"); // print sequence and quality if (p->strand == 0) @@ -468,42 +468,42 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality - printf("%s", p->qual); - } else printf("*"); + err_printf("%s", p->qual); + } else err_printf("*"); - if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); - if (p->bc[0]) printf("\tBC:Z:%s", p->bc); - if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); + if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); + if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { int i; // calculate XT tag XT = "NURM"[p->type]; if (nn > 10) XT = 'N'; // print tags - printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm); - if (nn) printf("\tXN:i:%d", nn); - if (mate) printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); + err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? 
"NM" : "CM", p->nm); + if (nn) err_printf("\tXN:i:%d", nn); + if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment - printf("\tX0:i:%d", p->c1); - if (p->c1 <= max_top2) printf("\tX1:i:%d", p->c2); + err_printf("\tX0:i:%d", p->c1); + if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2); } - printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); - if (p->md) printf("\tMD:Z:%s", p->md); + err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); + if (p->md) err_printf("\tMD:Z:%s", p->md); // print multiple hits if (p->n_multi) { - printf("\tXA:Z:"); + err_printf("\tXA:Z:"); for (i = 0; i < p->n_multi; ++i) { bwt_multi1_t *q = p->multi + i; int k; j = pos_end_multi(q, p->len) - q->pos; nn = bns_coor_pac2real(bns, q->pos, j, &seqid); - printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', + err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', (int)(q->pos - bns->anns[seqid].offset + 1)); if (q->cigar) { for (k = 0; k < q->n_cigar; ++k) - printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]); - } else printf("%dM", p->len); - printf(",%d;", q->gap + q->mm); + err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]); + } else err_printf("%dM", p->len); + err_printf(",%d;", q->gap + q->mm); } } } @@ -512,16 +512,16 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in ubyte_t *s = p->strand? 
p->rseq : p->seq; int flag = p->extra_flag | SAM_FSU; if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; - printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); + err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality - printf("%s", p->qual); - } else printf("*"); - if (bwa_rg_id) printf("\tRG:Z:%s", bwa_rg_id); - if (p->bc[0]) printf("\tBC:Z:%s", p->bc); - if (p->clip_len < p->full_len) printf("\tXC:i:%d", p->clip_len); + err_printf("%s", p->qual); + } else err_printf("*"); + if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); + if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); putchar('\n'); } } @@ -541,8 +541,8 @@ void bwa_print_sam_SQ(const bntseq_t *bns) { int i; for (i = 0; i < bns->n_seqs; ++i) - printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (bwa_rg_line) printf("%s\n", bwa_rg_line); + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (bwa_rg_line) err_printf("%s\n", bwa_rg_line); } void bwase_initialize() diff --git a/bwtaln.c b/bwtaln.c index 66310d3..905c2d2 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -176,7 +176,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) } // core loop - fwrite(opt, sizeof(gap_opt_t), 1, stdout); + err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; t = clock(); @@ -213,8 +213,8 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) fprintf(stderr, "[bwa_aln_core] write to the disk... 
"); for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; - fwrite(&p->n_aln, 4, 1, stdout); - if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + err_fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); diff --git a/main.c b/main.c index 3990296..4f70bb5 100644 --- a/main.c +++ b/main.c @@ -1,9 +1,10 @@ #include #include #include "main.h" +#include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9-r16" +#define PACKAGE_VERSION "0.5.9-r18-dev" #endif static int usage() @@ -59,5 +60,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } + err_fflush(stdout); + err_fclose(stdout); return 0; } diff --git a/utils.c b/utils.c index 89693e0..d47ec5c 100644 --- a/utils.c +++ b/utils.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "utils.h" FILE *err_xopen_core(const char *func, const char *fn, const char *mode) @@ -80,3 +81,68 @@ void err_fatal_simple_core(const char *func, const char *msg) fprintf(stderr, "[%s] %s Abort!\n", func, msg); abort(); } + +size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = fwrite(ptr, size, nmemb, stream); + if (ret != nmemb) + { + err_fatal_simple_core("fwrite", strerror(errno)); + } + return ret; +} + +int err_printf(const char *format, ...) +{ + va_list arg; + int done; + + va_start(arg, format); + done = vfprintf(stdout, format, arg); + int saveErrno = errno; + va_end(arg); + + if (done < 0) + { + err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); + } + return done; +} + +int err_fprintf(FILE *stream, const char *format, ...) 
+{ + va_list arg; + int done; + + va_start(arg, format); + done = vfprintf(stream, format, arg); + int saveErrno = errno; + va_end(arg); + + if (done < 0) + { + err_fatal_simple_core("vfprintf", strerror(saveErrno)); + } + return done; +} + +int err_fflush(FILE *stream) +{ + int ret = fflush(stream); + if (ret != 0) + { + err_fatal_simple_core("fflush", strerror(errno)); + } + return ret; +} + +int err_fclose(FILE *stream) +{ + int ret = fclose(stream); + if (ret != 0) + { + err_fatal_simple_core("fclose", strerror(errno)); + } + return ret; +} + diff --git a/utils.h b/utils.h index 31d6086..a7fecbc 100644 --- a/utils.h +++ b/utils.h @@ -31,6 +31,15 @@ #include #include +#ifdef __GNUC__ +// Tell GCC to validate printf format string and args +#define ATTRIBUTE(list) __attribute__ (list) +#else +#define ATTRIBUTE(list) +#endif + + + #define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) @@ -46,6 +55,13 @@ extern "C" { FILE *err_xopen_core(const char *func, const char *fn, const char *mode); FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); + size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); + int err_fprintf(FILE *stream, const char *format, ...) + ATTRIBUTE((format(printf, 2, 3))); + int err_printf(const char *format, ...) 
+ ATTRIBUTE((format(printf, 1, 2))); + int err_fflush(FILE *stream); + int err_fclose(FILE *stream); #ifdef __cplusplus } From a74523a68d77cf0d1b76b3de0a05b90af43eea10 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 9 Jun 2011 17:17:13 -0400 Subject: [PATCH 020/498] increase maximum barcode length limit to 63bp --- bwaseqio.c | 4 ++-- bwtaln.h | 4 +++- main.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bwaseqio.c b/bwaseqio.c index 94271b9..12ac765 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -149,8 +149,8 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; - if (l_bc > 15) { - fprintf(stderr, "[%s] the maximum barcode length is 15.\n", __func__); + if (l_bc > BWA_MAX_BCLEN) { + fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); return 0; } if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input diff --git a/bwtaln.h b/bwtaln.h index f659ac8..02f54f7 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -22,6 +22,8 @@ #define BWA_AVG_ERR 0.02 #define BWA_MIN_RDLEN 35 // for read trimming +#define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum + #ifndef bns_pac #define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) #endif @@ -75,7 +77,7 @@ typedef struct { // for multi-threading only int tid; // barcode - char bc[16]; // null terminated; up to 15 bases + char bc[BWA_MAX_BCLEN+1]; // null terminated; up to BWA_MAX_BCLEN bases // NM and MD tags uint32_t full_len:20, nm:12; char *md; diff --git a/main.c b/main.c index 4f70bb5..b22552e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9-r18-dev" +#define PACKAGE_VERSION "0.5.9-r19-dev" #endif static int usage() From 72563c38f383b4d3df444caaf5cfb8327ab629a3 Mon Sep 17 
00:00:00 2001 From: Heng Li Date: Thu, 9 Jun 2011 17:33:25 -0400 Subject: [PATCH 021/498] automatically choose the algorithm for BWT --- bntseq.c | 5 ++++- bntseq.h | 2 +- bwtindex.c | 14 ++++++++++---- main.c | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bntseq.c b/bntseq.c index 86888c1..21ba91f 100644 --- a/bntseq.c +++ b/bntseq.c @@ -163,7 +163,7 @@ void bns_destroy(bntseq_t *bns) } } -void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) +int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) { kseq_t *seq; char name[1024]; @@ -172,6 +172,7 @@ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) int l_buf; unsigned char buf[0x10000]; int32_t m_seqs, m_holes, l, i; + int64_t ret = -1; FILE *fp; // initialization @@ -235,6 +236,7 @@ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) bns->l_pac += seq->seq.l; } xassert(bns->l_pac, "zero length sequence."); + ret = bns->l_pac; { // finalize .pac file ubyte_t ct; fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 
0 : 1), fp); @@ -251,6 +253,7 @@ void bns_fasta2bntseq(gzFile fp_fa, const char *prefix) bns_dump(bns, prefix); bns_destroy(bns); kseq_destroy(seq); + return ret; } int bwa_fa2pac(int argc, char *argv[]) diff --git a/bntseq.h b/bntseq.h index 21b831e..189e017 100644 --- a/bntseq.h +++ b/bntseq.h @@ -70,7 +70,7 @@ extern "C" { bntseq_t *bns_restore(const char *prefix); bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); - void bns_fasta2bntseq(gzFile fp_fa, const char *prefix); + int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix); int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); #ifdef __cplusplus diff --git a/bwtindex.c b/bwtindex.c index 68792f7..c752a2f 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -42,12 +42,13 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev); int bwa_index(int argc, char *argv[]) { char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 3, is_color = 0; + int c, algo_type = 0, is_color = 0; clock_t t; + int64_t l_pac; while ((c = getopt(argc, argv, "ca:p:")) >= 0) { switch (c) { - case 'a': + case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; else if (strcmp(optarg, "is") == 0) algo_type = 3; @@ -79,7 +80,7 @@ int bwa_index(int argc, char *argv[]) gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); - bns_fasta2bntseq(fp, prefix); + l_pac = bns_fasta2bntseq(fp, prefix); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); } else { // color indexing @@ -87,7 +88,7 @@ int bwa_index(int argc, char *argv[]) strcat(strcpy(str, prefix), ".nt"); t = clock(); fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... 
"); - bns_fasta2bntseq(fp, str); + l_pac = bns_fasta2bntseq(fp, str); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); { @@ -99,6 +100,11 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } } + if (l_pac > 0xffffffffu) { + fprintf(stderr, "[%s] BWA only works with reference sequences shorter than 4GB in total. Abort!\n", __func__); + return 1; + } + if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { strcpy(str, prefix); strcat(str, ".pac"); strcpy(str2, prefix); strcat(str2, ".rpac"); diff --git a/main.c b/main.c index b22552e..0e545a7 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9-r19-dev" +#define PACKAGE_VERSION "0.5.9-r20-dev" #endif static int usage() From d11674367d5784c4f0cbcd1ee06d6a368d692f0f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 9 Jun 2011 18:08:49 -0400 Subject: [PATCH 022/498] convert XA to multiple lines --- xa2multi.pl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100755 xa2multi.pl diff --git a/xa2multi.pl b/xa2multi.pl new file mode 100755 index 0000000..6dae38b --- /dev/null +++ b/xa2multi.pl @@ -0,0 +1,16 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; + +while (<>) { + if (/\tXA:Z:(\S+)/) { + my $l = $1; + print; + my @t = split("\t"); + while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) { + my $mchr = ($t[6] eq $1)? '=' : $t[6]; # FIXME: TLEN/ISIZE is not calculated! + print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, @t[9..10], "NM:i:$4"), "\n"); + } + } else { print; } +} From 36cd4f9882cec5b35836cc1ee09ced2aa9887a9d Mon Sep 17 00:00:00 2001 From: RoelKluin Date: Fri, 8 Jul 2011 16:49:09 +0200 Subject: [PATCH 023/498] In Casava 1.8 the fastq output changed, the name had a space which bwa wasn't parsing correctly. 
This patch fixes that and enables bwa to filter sequences marked by Casava, removing this tag from the output. Signed-off-by: RoelKluin --- bwaseqio.c | 19 +++++++++++++++++++ bwtaln.c | 4 +++- bwtaln.h | 1 + kseq.h | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/bwaseqio.c b/bwaseqio.c index 12ac765..ac29ba7 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -157,6 +157,25 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { + // skip reads that are marked to be filtered by Casava + if (mode & BWA_MODE_CFY) { + char *s = rindex(seq->name.s, ' '); + if (s) { + *s = '\0'; + for(++s; *s != '\0'; ++s) { + if (*s == ':') { + ++s; + break; + } + } + if (*s == 'Y') + continue; + } + if (!s || *s != 'N') { + fprintf(stderr, "No Casava filter character found.\n"); + return 0; + } + } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length diff --git a/bwtaln.c b/bwtaln.c index 905c2d2..de6b001 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -233,7 +233,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -261,6 +261,7 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } @@ -298,6 +299,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -0 use single-end reads only (effective 
with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index 02f54f7..c2faa98 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -86,6 +86,7 @@ typedef struct { #define BWA_MODE_GAPE 0x01 #define BWA_MODE_COMPREAD 0x02 #define BWA_MODE_LOGGAP 0x04 +#define BWA_MODE_CFY 0x08 #define BWA_MODE_NONSTOP 0x10 #define BWA_MODE_BAM 0x20 #define BWA_MODE_BAM_SE 0x40 diff --git a/kseq.h b/kseq.h index ad8937c..f44ac31 100644 --- a/kseq.h +++ b/kseq.h @@ -102,7 +102,7 @@ typedef struct __kstring_t { if (ks->buf[i] == delimiter) break; \ } else { \ for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ + if (isspace(ks->buf[i]) && (ks->buf[i] != ' ')) break; \ } \ if (str->m - str->l < i - ks->begin + 1) { \ str->m = str->l + (i - ks->begin) + 1; \ From 8f115a8e00bd6f5c38292f04786f6ba1fd955a65 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 10 Jul 2011 16:40:42 +0200 Subject: [PATCH 024/498] Revert "In Casava 1.8 the fastq output changed, the name had a space which bwa" This reverts commit 36cd4f9882cec5b35836cc1ee09ced2aa9887a9d. The comment shouldn't be included in the sequence name. 
--- bwaseqio.c | 19 ------------------- bwtaln.c | 4 +--- bwtaln.h | 1 - kseq.h | 2 +- 4 files changed, 2 insertions(+), 24 deletions(-) diff --git a/bwaseqio.c b/bwaseqio.c index ac29ba7..12ac765 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -157,25 +157,6 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { - // skip reads that are marked to be filtered by Casava - if (mode & BWA_MODE_CFY) { - char *s = rindex(seq->name.s, ' '); - if (s) { - *s = '\0'; - for(++s; *s != '\0'; ++s) { - if (*s == ':') { - ++s; - break; - } - } - if (*s == 'Y') - continue; - } - if (!s || *s != 'N') { - fprintf(stderr, "No Casava filter character found.\n"); - return 0; - } - } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length diff --git a/bwtaln.c b/bwtaln.c index de6b001..905c2d2 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -233,7 +233,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -261,7 +261,6 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; - case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } @@ -299,7 +298,6 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair 
(effective with -b)\n"); - fprintf(stderr, " -Y filter Casava-filtered sequences\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index c2faa98..02f54f7 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -86,7 +86,6 @@ typedef struct { #define BWA_MODE_GAPE 0x01 #define BWA_MODE_COMPREAD 0x02 #define BWA_MODE_LOGGAP 0x04 -#define BWA_MODE_CFY 0x08 #define BWA_MODE_NONSTOP 0x10 #define BWA_MODE_BAM 0x20 #define BWA_MODE_BAM_SE 0x40 diff --git a/kseq.h b/kseq.h index f44ac31..ad8937c 100644 --- a/kseq.h +++ b/kseq.h @@ -102,7 +102,7 @@ typedef struct __kstring_t { if (ks->buf[i] == delimiter) break; \ } else { \ for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && (ks->buf[i] != ' ')) break; \ + if (isspace(ks->buf[i])) break; \ } \ if (str->m - str->l < i - ks->begin + 1) { \ str->m = str->l + (i - ks->begin) + 1; \ From db59a605d1b31ec20645676c4f58d0af21198fec Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 10 Jul 2011 17:04:06 +0200 Subject: [PATCH 025/498] Remove sequences marked to be filtered by Casava-1.8 with bwa aln -Y In Casava 1.8 the fastq output changed. e.g. @EAS139:136:FC706VJ:2:5:1000:12850 1:Y:18:ATCACG AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + BBBBCCCC?::: With `Y' Casava indicates that a sequence should be filtered. This patch enables bwa, with an -Y flag, to filter these sequences. 
Signed-off-by: Roel Kluin --- bwaseqio.c | 7 +++++++ bwtaln.c | 4 +++- bwtaln.h | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bwaseqio.c b/bwaseqio.c index 12ac765..600754e 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -157,6 +157,13 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { + if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { + // skip reads that are marked to be filtered by Casava + char *s = index(seq->comment.s, ':'); + if (s && *(++s) == 'Y') { + continue; + } + } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length diff --git a/bwtaln.c b/bwtaln.c index 905c2d2..de6b001 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -233,7 +233,7 @@ int bwa_aln(int argc, char *argv[]) gap_opt_t *opt; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -261,6 +261,7 @@ int bwa_aln(int argc, char *argv[]) case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } @@ -298,6 +299,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); fprintf(stderr, " -1 use the 1st read in a pair (effective with -b)\n"); fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); + fprintf(stderr, " -Y filter Casava-filtered sequences\n"); fprintf(stderr, "\n"); return 1; } diff --git a/bwtaln.h b/bwtaln.h index 
02f54f7..c2faa98 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -86,6 +86,7 @@ typedef struct { #define BWA_MODE_GAPE 0x01 #define BWA_MODE_COMPREAD 0x02 #define BWA_MODE_LOGGAP 0x04 +#define BWA_MODE_CFY 0x08 #define BWA_MODE_NONSTOP 0x10 #define BWA_MODE_BAM 0x20 #define BWA_MODE_BAM_SE 0x40 From 3536e2113ceef0c4cd5e7b11e89f76dc7043190c Mon Sep 17 00:00:00 2001 From: Tobias Marschall Date: Wed, 7 Sep 2011 14:31:28 +0200 Subject: [PATCH 026/498] Bugfix: reverse (complement) sequence and phred string if alternative alignment has different orientation than primary alignment --- xa2multi.pl | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/xa2multi.pl b/xa2multi.pl index 6dae38b..2409c29 100755 --- a/xa2multi.pl +++ b/xa2multi.pl @@ -10,7 +10,16 @@ my @t = split("\t"); while ($l =~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) { my $mchr = ($t[6] eq $1)? '=' : $t[6]; # FIXME: TLEN/ISIZE is not calculated! - print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, @t[9..10], "NM:i:$4"), "\n"); + my $seq = $t[9]; + my $phred = $t[10]; + # if alternative alignment has other orientation than primary, + # then print the reverse (complement) of sequence and phred string + if ((($t[1]&0x10)>0) xor ($2<0)) { + $seq = reverse $seq; + $seq =~ tr/ACGTacgt/TGCAtgca/; + $phred = reverse $phred; + } + print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, $seq, $phred, "NM:i:$4"), "\n"); } } else { print; } } From 2255c4cd4bdfffe66703ab1706820b00552f574c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 12 Oct 2011 00:05:01 -0400 Subject: [PATCH 027/498] fixed a long existing bug This bug may cause segfault (though never to me) and lead to missing suboptimal hits. But the top hits should not be affected. 
--- bwtgap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwtgap.c b/bwtgap.c index 3db1af2..9234f18 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -57,7 +57,7 @@ static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint p = q->stack + q->n_entries; p->info = (u_int32_t)score<<21 | a<<20 | i; p->k = k; p->l = l; p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; - if (is_diff) p->last_diff_pos = i; + p->last_diff_pos = is_diff? i : -1; ++(q->n_entries); ++(stack->n_entries); if (stack->best > score) stack->best = score; From 80e02281d5057d0049aa74e8a208e1c7edf97ef7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 14 Oct 2011 10:31:48 -0400 Subject: [PATCH 028/498] minor change --- bwtgap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwtgap.c b/bwtgap.c index 9234f18..d32c3bb 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -57,7 +57,7 @@ static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint p = q->stack + q->n_entries; p->info = (u_int32_t)score<<21 | a<<20 | i; p->k = k; p->l = l; p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; - p->last_diff_pos = is_diff? i : -1; + p->last_diff_pos = is_diff? 
i : 0; ++(q->n_entries); ++(stack->n_entries); if (stack->best > score) stack->best = score; From d70754e2342b1dc62171e55821015f18eed285ff Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 14 Oct 2011 10:32:31 -0400 Subject: [PATCH 029/498] update revision number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 0e545a7..f3447e6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9-r20-dev" +#define PACKAGE_VERSION "0.5.9-r26-dev" #endif static int usage() From d2f357af3a7cf78d003a6a8748fb6097ff168a4f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 18 Oct 2011 16:39:18 -0400 Subject: [PATCH 030/498] a little bit code clean up --- bwt_gen/bwt_gen.c | 28 ---------------------------- bwt_gen/bwt_gen.h | 28 +++++++++------------------- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c index d208a81..9d881f0 100644 --- a/bwt_gen/bwt_gen.c +++ b/bwt_gen/bwt_gen.c @@ -257,13 +257,11 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) bwt = (BWT*)calloc(1, sizeof(BWT)); bwt->textLength = 0; - bwt->inverseSa = 0; bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*)); initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; - bwt->saValueOnBoundary = NULL; // Generate decode tables if (decodeTable == NULL) { @@ -279,14 +277,6 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) bwt->occSizeInWord = 0; bwt->occValue = NULL; - bwt->saInterval = ALL_ONE_MASK; - bwt->saValueSize = 0; - bwt->saValue = NULL; - - bwt->inverseSaInterval = ALL_ONE_MASK; - bwt->inverseSaSize = 0; - bwt->inverseSa = NULL; - return bwt; } @@ -1047,7 +1037,6 @@ void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restri wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; // Calculate occValue - // [lh3] by default: 
OCC_INTERVAL_MAJOR=65536, OCC_INTERVAL=256 numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; @@ -1464,11 +1453,7 @@ void BWTFree(BWT *bwt) free(bwt->bwtCode); free(bwt->occValue); free(bwt->occValueMajor); - free(bwt->saValue); - free(bwt->inverseSa); free(bwt->decodeTable); - free(bwt->saIndexRange); - free(bwt->saValueOnBoundary); free(bwt); } @@ -1503,19 +1488,6 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o bwtLength = BWTFileSizeInWord(bwt->textLength); fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); fclose(bwtFile); -/* - occValueFile = (FILE*)fopen(occValueFileName, "wb"); - if (occValueFile == NULL) { - fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open occ value file!\n"); - exit(1); - } - - fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, occValueFile); - fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, occValueFile); - fwrite(bwt->occValue, sizeof(unsigned int), bwt->occSizeInWord, occValueFile); - fwrite(bwt->occValueMajor, sizeof(unsigned int), bwt->occMajorSizeInWord, occValueFile); - fclose(occValueFile); -*/ } void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) diff --git a/bwt_gen/bwt_gen.h b/bwt_gen/bwt_gen.h index d6cc1ef..8461418 100644 --- a/bwt_gen/bwt_gen.h +++ b/bwt_gen/bwt_gen.h @@ -25,6 +25,11 @@ #ifndef BWT_GEN_H #define BWT_GEN_H +#include + +//typedef int64_t bgint_t; +typedef unsigned bgint_t; + #define ALPHABET_SIZE 4 #define BIT_PER_CHAR 2 #define CHAR_PER_WORD 16 @@ -56,32 +61,17 @@ #define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) #define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) -typedef struct SaIndexRange { - unsigned int startSaIndex; - unsigned int endSaIndex; -} SaIndexRange; 
- typedef struct BWT { - unsigned int textLength; // length of the text - unsigned int saInterval; // interval between two SA values stored explicitly - unsigned int inverseSaInterval; // interval between two inverse SA stored explicitly - unsigned int inverseSa0; // SA-1[0] - unsigned int *cumulativeFreq; // cumulative frequency + bgint_t textLength; // length of the text + bgint_t inverseSa0; // SA-1[0] + bgint_t *cumulativeFreq; // cumulative frequency unsigned int *bwtCode; // BWT code unsigned int *occValue; // Occurrence values stored explicitly - unsigned int *occValueMajor; // Occurrence values stored explicitly - unsigned int *saValue; // SA values stored explicitly - unsigned int *inverseSa; // Inverse SA stored explicitly - SaIndexRange *saIndexRange; // SA index range - int saIndexRangeNumOfChar; // Number of characters indexed in SA index range - unsigned int *saValueOnBoundary; // Pre-calculated frequently referred data + bgint_t *occValueMajor; // Occurrence values stored explicitly unsigned int *decodeTable; // For decoding BWT by table lookup - unsigned int decodeTableGenerated; // == TRUE if decode table is generated on load and will be freed unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated unsigned int occSizeInWord; // Temporary variable to hold the memory allocated unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated - unsigned int saValueSize; // Temporary variable to hold the memory allocated - unsigned int inverseSaSize; // Temporary variable to hold the memory allocated unsigned int saIndexRangeSize; // Temporary variable to hold the memory allocated } BWT; From 3114edcb7c9130da4c8382c36b657150beff758e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 18 Oct 2011 16:41:44 -0400 Subject: [PATCH 031/498] further code clean up --- bwt_gen/bwt_gen.c | 1 - bwt_gen/bwt_gen.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c index 9d881f0..b187c18 
100644 --- a/bwt_gen/bwt_gen.c +++ b/bwt_gen/bwt_gen.c @@ -306,7 +306,6 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; } - bwtInc->targetTextLength = textLength; bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); diff --git a/bwt_gen/bwt_gen.h b/bwt_gen/bwt_gen.h index 8461418..fe7fcf9 100644 --- a/bwt_gen/bwt_gen.h +++ b/bwt_gen/bwt_gen.h @@ -72,7 +72,6 @@ typedef struct BWT { unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated unsigned int occSizeInWord; // Temporary variable to hold the memory allocated unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated - unsigned int saIndexRangeSize; // Temporary variable to hold the memory allocated } BWT; typedef struct BWTInc { @@ -80,7 +79,6 @@ typedef struct BWTInc { unsigned int numberOfIterationDone; unsigned int *cumulativeCountInCurrentBuild; unsigned int availableWord; - unsigned int targetTextLength; float targetNBit; unsigned int buildSize; unsigned int initialMaxBuildSize; From 95b1ab7e965bb9a207de1e958cefccbe0773b89b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 18 Oct 2011 23:41:26 -0400 Subject: [PATCH 032/498] first attempt to make bwt_gen work for >4GB seq --- bwt_gen/QSufSort.c | 207 +++++++++--------------------- bwt_gen/QSufSort.h | 11 +- bwt_gen/bwt_gen.c | 313 +++++++++++++++++++++------------------------ bwt_gen/bwt_gen.h | 20 +-- 4 files changed, 231 insertions(+), 320 deletions(-) diff --git a/bwt_gen/QSufSort.c b/bwt_gen/QSufSort.c index 5bf35de..92a8594 100644 --- a/bwt_gen/QSufSort.c +++ b/bwt_gen/QSufSort.c @@ -36,59 +36,29 @@ #include "QSufSort.h" // Static functions -static void 
QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar); -static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar); -static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar); -static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int numChar, const int alphabetSize); -static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated); - -// from MiscUtilities.c -static unsigned int leadingZero(const unsigned int input) { - - unsigned int l; - const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - if (input & 0xFFFF0000) { - if (input & 0xFF000000) { - l = leadingZero8bit[input >> 24]; - } else { - l = 8 + leadingZero8bit[input >> 16]; - } - } else { - if (input & 0x0000FF00) { - l = 16 + leadingZero8bit[input >> 8]; - } else { - l = 24 + leadingZero8bit[input]; - } - } - return l; - -} +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + 
const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar); +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original contents of x[n] is disregarded, the n-th symbol being regarded as end-of-string smaller than all other symbols.*/ -void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int skipTransform) { - - int i, j; - int s, negatedSortedGroupLength; - int numSymbolAggregated; - int maxNumInputSymbol; - int numSortedPos = 1; - int newAlphabetSize; +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform) { + + qsint_t i, j; + qsint_t s, negatedSortedGroupLength; + qsint_t numSymbolAggregated; + qsint_t maxNumInputSymbol; + qsint_t numSortedPos = 1; + qsint_t newAlphabetSize; maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; @@ -102,7 +72,7 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, numSortedPos = numSymbolAggregated; } - while ((int)(I[0]) >= -(int)numChar) { + while ((qsint_t)(I[0]) >= -(qsint_t)numChar) { i = 0; negatedSortedGroupLength = 0; do { @@ -129,9 +99,9 @@ void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, } -void 
QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int numChar) { +void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) { - int i; + qsint_t i; for (i=0; i<=numChar; i++) { I[V[i]] = i + 1; } @@ -143,21 +113,14 @@ void QSufSortGenerateSaFromInverse(const int* V, int* __restrict I, const int nu quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This function is based on Program 7.*/ -static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar) { - - int a, b, c, d; - int l, m; - int f, v, s, t; - int tmp; - int numItem; - - #ifdef DEBUG - if (lowestPos > highestPos) { - fprintf(stderr, "QSufSortSortSplit(): lowestPos > highestPos!\n"); - exit(1); - } - #endif +static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { + + qsint_t a, b, c, d; + qsint_t l, m; + qsint_t f, v, s, t; + qsint_t tmp; + qsint_t numItem; numItem = highestPos - lowestPos + 1; @@ -171,7 +134,7 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo a = b = lowestPos; c = d = highestPos; - while (TRUE) { + while (1) { while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { if (f == v) { swap(I[a], I[b], tmp); @@ -235,30 +198,16 @@ static void QSufSortSortSplit(int* __restrict V, int* __restrict I, const int lo } /* Algorithm by Bentley & McIlroy.*/ -static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar) { - - int m; - int keyl, keym, keyn; - int key1, key2, key3; - int s; - int numItem; - - #ifdef DEBUG - if (lowestPos > highestPos) { - fprintf(stderr, "QSufSortChoosePivot(): lowestPos > highestPos!\n"); - exit(1); - } - #endif +static qsint_t 
QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { - numItem = highestPos - lowestPos + 1; + qsint_t m; + qsint_t keyl, keym, keyn; + qsint_t key1, key2, key3; + qsint_t s; + qsint_t numItem; - #ifdef DEBUG - if (numItem <= INSERT_SORT_NUM_ITEM) { - fprintf(stderr, "QSufSortChoosePivot(): number of items <= INSERT_SORT_NUM_ITEM!\n"); - exit(1); - } - #endif + numItem = highestPos - lowestPos + 1; m = lowestPos + numItem / 2; @@ -282,39 +231,19 @@ static int QSufSortChoosePivot(int* __restrict V, int* __restrict I, const int l } /* Quadratic sorting method to use for small subarrays. */ -static void QSufSortInsertSortSplit(int* __restrict V, int* __restrict I, const int lowestPos, - const int highestPos, const int numSortedChar) { - - int i, j; - int tmpKey, tmpPos; - int numItem; - int key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; - int negativeSortedLength; - int groupNum; - - #ifdef DEBUG - if (lowestPos > highestPos) { - fprintf(stderr, "QSufSortInsertSortSplit(): lowestPos > highestPos!\n"); - exit(1); - } - #endif +static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, + const qsint_t highestPos, const qsint_t numSortedChar) { - numItem = highestPos - lowestPos + 1; + qsint_t i, j; + qsint_t tmpKey, tmpPos; + qsint_t numItem; + qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; + qsint_t negativeSortedLength; + qsint_t groupNum; - #ifdef DEBUG - if (numItem > INSERT_SORT_NUM_ITEM) { - fprintf(stderr, "QSufSortInsertSortSplit(): number of items > INSERT_SORT_NUM_ITEM!\n"); - exit(1); - } - #endif + numItem = highestPos - lowestPos + 1; for (i=0; i0; i--) { c = I[i-1]; - d = (int)(V[c]); + d = (qsint_t)(V[c]); groupNum = currentIndex; V[c] = groupNum; if (d >= 0) { @@ -424,20 +353,20 @@ static void QSufSortBucketSort(int* __restrict V, int* __restrict I, const int n Output: Returns an integer j 
in the range 1...q representing the size of the new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is set to the number of old symbols grouped into one. Only x[n] is 0.*/ -static int QSufSortTransform(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int maxNewAlphabetSize, int *numSymbolAggregated) { +static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) { - int c, i, j; - int a; // numSymbolAggregated - int mask; - int minSymbolInChunk = 0, maxSymbolInChunk = 0; - int newAlphabetSize; - int maxNumInputSymbol, maxNumBit, maxSymbol; + qsint_t c, i, j; + qsint_t a; // numSymbolAggregated + qsint_t mask; + qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0; + qsint_t newAlphabetSize; + qsint_t maxNumInputSymbol, maxNumBit, maxSymbol; maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; - maxNumBit = BITS_IN_WORD - leadingZero(maxNumInputSymbol); - maxSymbol = INT_MAX >> maxNumBit; + for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit; + maxSymbol = QSINT_MAX >> maxNumBit; c = maxNumInputSymbol; for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { @@ -449,14 +378,6 @@ static int QSufSortTransform(int* __restrict V, int* __restrict I, const int num mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ - #ifdef DEBUG - // Section of code for maxSymbolInChunk > numChar removed! 
- if (maxSymbolInChunk > numChar) { - fprintf(stderr, "QSufSortTransform(): maxSymbolInChunk > numChar!\n"); - exit(1); - } - #endif - /* bucketing possible, compact alphabet.*/ for (i=0; i<=maxSymbolInChunk; i++) { I[i] = 0; /* zero transformation table.*/ diff --git a/bwt_gen/QSufSort.h b/bwt_gen/QSufSort.h index 8724d30..6faf9f6 100644 --- a/bwt_gen/QSufSort.h +++ b/bwt_gen/QSufSort.h @@ -29,12 +29,17 @@ #ifndef __QSUFSORT_H__ #define __QSUFSORT_H__ +#include + #define KEY(V, I, p, h) ( V[ I[p] + h ] ) #define INSERT_SORT_NUM_ITEM 16 -void QSufSortSuffixSort(int* __restrict V, int* __restrict I, const int numChar, const int largestInputSymbol, - const int smallestInputSymbol, const int skipTransform); -void QSufSortGenerateSaFromInverse(const int *V, int* __restrict I, const int numChar); +typedef int64_t qsint_t; +#define QSINT_MAX INT64_MAX + +void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, + const qsint_t smallestInputSymbol, const int skipTransform); +void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar); #endif diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c index b187c18..fecbf56 100644 --- a/bwt_gen/bwt_gen.c +++ b/bwt_gen/bwt_gen.c @@ -25,22 +25,27 @@ #include #include #include +#include #include "bwt_gen.h" #include "QSufSort.h" -static unsigned int TextLengthFromBytePacked(unsigned int bytePackedLength, unsigned int bitPerChar, +#define MIN_AVAILABLE_WORD 0x10000 + +static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, unsigned int lastByteLength) { - if (bytePackedLength > ALL_ONE_MASK / (BITS_IN_BYTE / bitPerChar)) { - fprintf(stderr, "TextLengthFromBytePacked(): text length > 2^32!\n"); - exit(1); - } return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; } -static void initializeVAL(unsigned int *startAddr, const unsigned int length, const unsigned int initValue) 
+static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue) { - unsigned int i; + bgint_t i; + for (i=0; ibwt->textLength == 0) { // initial build // Minus 2 because n+1 entries of seq and rank needed for n char - maxBuildSize = (bwtInc->availableWord - 2 - OCC_INTERVAL / CHAR_PER_WORD) - / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD; + maxBuildSize = (bwtInc->availableWord - 2 * (sizeof(bgint_t) / 4) - OCC_INTERVAL / CHAR_PER_WORD) + / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); if (bwtInc->initialMaxBuildSize > 0) { bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); } else { @@ -104,9 +109,9 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) } else { // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char // Minus numberOfIterationDone because bwt slightly shift to left in each iteration - maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 + maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 * (sizeof(bgint_t) / 4) - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) - / 3; + / 3 / (sizeof(bgint_t) / 4); if (maxBuildSize < CHAR_PER_WORD) { fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); exit(1); @@ -116,9 +121,8 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) } else { bwtInc->buildSize = maxBuildSize; } - if (bwtInc->buildSize < CHAR_PER_WORD) { + if (bwtInc->buildSize < CHAR_PER_WORD) bwtInc->buildSize = CHAR_PER_WORD; - } } if (bwtInc->buildSize < CHAR_PER_WORD) { @@ -128,9 +132,8 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; - bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1); - bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + bwtInc->buildSize + 1); - + 
bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4); + bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4)); } // for ceilLog2() @@ -186,17 +189,17 @@ static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) } static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, - const unsigned int textLength) + const bgint_t textLength) { - unsigned int i, j, k; - unsigned int c; + bgint_t i; + unsigned int j, k, c; unsigned int bitPerBytePackedChar; unsigned int bitPerWordPackedChar; unsigned int charPerWord; unsigned int charPerByte; unsigned int bytePerIteration; - unsigned int byteProcessed = 0; - unsigned int wordProcessed = 0; + bgint_t byteProcessed = 0; + bgint_t wordProcessed = 0; unsigned int mask, shift; unsigned int buffer[BITS_IN_WORD]; @@ -250,7 +253,7 @@ static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned i output[wordProcessed] = c; } -BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) +BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) { BWT *bwt; @@ -258,8 +261,8 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) bwt->textLength = 0; - bwt->cumulativeFreq = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int*)); - initializeVAL(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); + bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; @@ -272,7 +275,7 @@ BWT *BWTCreate(const unsigned int textLength, unsigned int *decodeTable) } bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); - bwt->occValueMajor = (unsigned*)calloc(bwt->occMajorSizeInWord, sizeof(unsigned int)); + bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); 
bwt->occSizeInWord = 0; bwt->occValue = NULL; @@ -297,8 +300,8 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, bwtInc->initialMaxBuildSize = initialMaxBuildSize; bwtInc->incMaxBuildSize = incMaxBuildSize; bwtInc->targetNBit = targetNBit; - bwtInc->cumulativeCountInCurrentBuild = (unsigned*)calloc((ALPHABET_SIZE + 1), sizeof(unsigned int)); - initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); // Build frequently accessed data bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); @@ -306,7 +309,8 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, bwtInc->packedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; } - bwtInc->availableWord = (unsigned int)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); + bwtInc->availableWord = (bgint_t)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); + if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); exit(1); @@ -317,14 +321,16 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, } // for BWTIncConstruct() -static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned int* __restrict rank, - unsigned int* __restrict cumulativeCount, const unsigned int numChar) +static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, + bgint_t* __restrict cumulativeCount, const bgint_t numChar) { - unsigned int i, j; + bgint_t i; + unsigned int j; unsigned int c, t; unsigned int 
packedMask; - unsigned int rankIndex; - unsigned int lastWord, numCharInLastWord; + bgint_t rankIndex; + bgint_t lastWord; + unsigned int numCharInLastWord; lastWord = (numChar - 1) / CHAR_PER_WORD; numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; @@ -359,17 +365,17 @@ static void BWTIncPutPackedTextToRank(const unsigned int *packedText, unsigned i } -static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigned int index, - unsigned int* __restrict occCount, const unsigned int* dnaDecodeTable) +static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index, + bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable) { static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; - unsigned int iteration, wordToCount, charToCount; - unsigned int i, j, c; - unsigned int sum; + bgint_t iteration, i; + unsigned int wordToCount, charToCount; + unsigned int j, c, sum; occCount[0] = 0; occCount[1] = 0; @@ -432,13 +438,14 @@ static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const unsigne occCount[3] += sum; } -static void BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* __restrict bwt, const unsigned int numChar, - const unsigned int *cumulativeCount, const unsigned int *packedShift) { +static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar, + const bgint_t *cumulativeCount, const unsigned int *packedShift) { - unsigned int i, c, r; - unsigned int previousRank, currentRank; - unsigned int wordIndex, charIndex; - unsigned int inverseSa0; + bgint_t i, r; + unsigned int c; + bgint_t previousRank, currentRank; + bgint_t wordIndex, charIndex; + bgint_t inverseSa0; inverseSa0 = previousRank = relativeRank[0]; @@ -463,10 +470,10 @@ static void 
BWTIncBuildPackedBwt(const unsigned int *relativeRank, unsigned int* } } -static inline unsigned int BWTOccValueExplicit(const BWT *bwt, const unsigned int occIndexExplicit, +static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit, const unsigned int character) { - unsigned int occIndexMajor; + bgint_t occIndexMajor; occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; @@ -546,46 +553,43 @@ static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned } -unsigned int BWTOccValue(const BWT *bwt, unsigned int index, const unsigned int character) { - - unsigned int occValue; - unsigned int occExplicitIndex, occIndex; +bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character) +{ + bgint_t occValue; + bgint_t occExplicitIndex, occIndex; // $ is supposed to be positioned at inverseSa0 but it is not encoded // therefore index is subtracted by 1 for adjustment - if (index > bwt->inverseSa0) { + if (index > bwt->inverseSa0) index--; - } occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding occIndex = occExplicitIndex * OCC_INTERVAL; occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); - if (occIndex == index) { + if (occIndex == index) return occValue; - } if (occIndex < index) { return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); } else { return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); } - } -static unsigned int BWTIncGetAbsoluteRank(BWT *bwt, unsigned int* __restrict absoluteRank, unsigned int* __restrict seq, - const unsigned int *packedText, const unsigned int numChar, - const unsigned int* cumulativeCount, const unsigned int firstCharInLastIteration) +static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq, + const unsigned int 
*packedText, const bgint_t numChar, + const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration) { - unsigned int saIndex; - unsigned int lastWord; + bgint_t saIndex; + bgint_t lastWord; unsigned int packedMask; - unsigned int i, j; - unsigned int c, t; - unsigned int rankIndex; + bgint_t i; + unsigned int c, t, j; + bgint_t rankIndex; unsigned int shift; - unsigned int seqIndexFromStart[ALPHABET_SIZE]; - unsigned int seqIndexFromEnd[ALPHABET_SIZE]; + bgint_t seqIndexFromStart[ALPHABET_SIZE]; + bgint_t seqIndexFromEnd[ALPHABET_SIZE]; for (i=0; i oldInverseSa0) { + if (r > oldInverseSa0) sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt - } s = seq[i]; if (i < freq) { - if (lastIndex >= freq) { + if (lastIndex >= freq) lastRank++; // to trigger the group across alphabet boundary to be split - } c--; freq = cumulativeCount[c]; } @@ -846,10 +848,10 @@ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigne relativeRank[s] = lastIndex; } else { if (i == lastIndex - 1) { - if (lastIndex < numItem && (int)seq[lastIndex + 1] < 0) { + if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) { seq[lastIndex] = seq[lastIndex + 1] - 1; } else { - seq[lastIndex] = (unsigned int)-1; + seq[lastIndex] = (bgint_t)-1; } } lastIndex = i; @@ -865,11 +867,12 @@ static void BWTIncBuildRelativeRank(unsigned int* __restrict sortedRank, unsigne } -static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, const unsigned int numChar, - const unsigned int *cumulativeCount) +static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar, + const bgint_t *cumulativeCount) { - unsigned int i, c; - unsigned int previousRank, currentRank; + unsigned int c; + bgint_t i; + bgint_t previousRank, currentRank; previousRank = relativeRank[0]; @@ -877,20 +880,20 @@ static void BWTIncBuildBwt(unsigned int* seq, const unsigned int *relativeRank, currentRank = relativeRank[i]; c = 
(previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) + (previousRank >= cumulativeCount[3]); - seq[currentRank] = c; + insertBwt[currentRank] = c; previousRank = currentRank; } } -static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, - unsigned int* __restrict mergedBwt, const unsigned int numOldBwt, const unsigned int numInsertBwt) +static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, + unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) { unsigned int bitsInWordMinusBitPerChar; - unsigned int leftShift, rightShift; - unsigned int o; - unsigned int oIndex, iIndex, mIndex; - unsigned int mWord, mChar, oWord, oChar; - unsigned int numInsert; + bgint_t leftShift, rightShift; + bgint_t o; + bgint_t oIndex, iIndex, mIndex; + bgint_t mWord, mChar, oWord, oChar; + bgint_t numInsert; bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; @@ -997,9 +1000,9 @@ static void BWTIncMergeBwt(const unsigned int *sortedRank, const unsigned int* o void BWTClearTrailingBwtCode(BWT *bwt) { - unsigned int bwtResidentSizeInWord; - unsigned int wordIndex, offset; - unsigned int i; + bgint_t bwtResidentSizeInWord; + bgint_t wordIndex, offset; + bgint_t i; bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); @@ -1020,18 +1023,18 @@ void BWTClearTrailingBwtCode(BWT *bwt) void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, - unsigned int* __restrict occValueMajor, - const unsigned int textLength, const unsigned int* decodeTable) + bgint_t* __restrict occValueMajor, + const bgint_t textLength, const unsigned int* decodeTable) { - unsigned int numberOfOccValueMajor, numberOfOccValue; + bgint_t numberOfOccValueMajor, numberOfOccValue; unsigned int wordBetweenOccValue; - unsigned int numberOfOccIntervalPerMajor; + bgint_t numberOfOccIntervalPerMajor; unsigned 
int c; - unsigned int i, j; - unsigned int occMajorIndex; - unsigned int occIndex, bwtIndex; - unsigned int sum; - unsigned int tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; + bgint_t i, j; + bgint_t occMajorIndex; + bgint_t occIndex, bwtIndex; + bgint_t sum; // perhaps unsigned is big enough + bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; @@ -1231,31 +1234,25 @@ void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restri } -static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) +static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) { unsigned int i; - unsigned int mergedBwtSizeInWord, mergedOccSizeInWord; + bgint_t mergedBwtSizeInWord, mergedOccSizeInWord; unsigned int firstCharInThisIteration; - unsigned int *relativeRank, *seq, *sortedRank, *insertBwt, *mergedBwt; - unsigned int newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; - - #ifdef DEBUG - if (numChar > bwtInc->buildSize) { - fprintf(stderr, "BWTIncConstruct(): numChar > buildSize!\n"); - exit(1); - } - #endif + bgint_t *relativeRank, *seq, *sortedRank; + unsigned int *insertBwt, *mergedBwt; + bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); - initializeVAL(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); + initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); if (bwtInc->bwt->textLength == 0) { // Initial build // Set address - seq = bwtInc->workingMemory; + seq = (bgint_t*)bwtInc->workingMemory; relativeRank = seq + bwtInc->buildSize + 1; mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place @@ -1265,7 +1262,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const 
unsigned int numChar) relativeRank[numChar] = 0; // Sort suffix - QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)ALPHABET_SIZE - 1, 0, FALSE); + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE); newInverseSa0 = relativeRank[0]; // Clear BWT area @@ -1279,9 +1276,9 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) } else { // Incremental build // Set address - sortedRank = bwtInc->workingMemory; + sortedRank = (bgint_t*)bwtInc->workingMemory; seq = sortedRank + bwtInc->buildSize + 1; - insertBwt = seq; + insertBwt = (unsigned*)seq; // insertBwt and seq share memory relativeRank = seq + bwtInc->buildSize + 1; // Store the first character of this iteration @@ -1318,15 +1315,10 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); -#ifdef DEBUG - if (relativeRank[numChar] != oldInverseSa0RelativeRank) { - fprintf(stderr, "BWTIncConstruct(): relativeRank[numChar] != oldInverseSa0RelativeRank!\n"); - exit(1); - } -#endif + assert(relativeRank[numChar] == oldInverseSa0RelativeRank); // Sort suffix - QSufSortSuffixSort((int*)relativeRank, (int*)seq, (int)numChar, (int)numChar, 1, TRUE); + QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE); newInverseSa0RelativeRank = relativeRank[0]; newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; @@ -1334,7 +1326,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) sortedRank[newInverseSa0RelativeRank] = 0; // 
a special value so that this is skipped in the merged bwt // Build BWT - BWTIncBuildBwt(seq, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); + BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); // Merge BWT mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord @@ -1349,10 +1341,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) bwtInc->bwt->bwtCode = mergedBwt; bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; - if (mergedBwt < bwtInc->workingMemory + mergedOccSizeInWord) { - fprintf(stderr, "BWTIncConstruct() : Not enough memory allocated!\n"); - exit(1); - } + assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord); bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; @@ -1376,14 +1365,14 @@ static void BWTIncConstruct(BWTInc *bwtInc, const unsigned int numChar) } BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit, - const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) + bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) { FILE *packedFile; - unsigned int packedFileLen; - unsigned int totalTextLength; - unsigned int textToLoad, textSizeInByte; - unsigned int processedTextLength; + bgint_t packedFileLen; + bgint_t totalTextLength; + bgint_t textToLoad, textSizeInByte; + bgint_t processedTextLength; unsigned char lastByteLength; BWTInc *bwtInc; @@ -1397,10 +1386,6 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN fseek(packedFile, -1, SEEK_END); packedFileLen = ftell(packedFile); - if ((int)packedFileLen < 0) { - fprintf(stderr, "BWTIncConstructFromPacked: Cannot determine file length!\n"); - exit(1); - } fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); @@ -1416,9 +1401,9 @@ BWTInc 
*BWTIncConstructFromPacked(const char *inputFileName, const float targetN textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte fseek(packedFile, -2, SEEK_CUR); - fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); - fseek(packedFile, -((int)textSizeInByte + 1), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); @@ -1431,15 +1416,15 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN textToLoad = totalTextLength - processedTextLength; } textSizeInByte = textToLoad / CHAR_PER_BYTE; - fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); - fseek(packedFile, -((int)textSizeInByte), SEEK_CUR); + fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; if (bwtInc->numberOfIterationDone % 10 == 0) { - printf("[BWTIncConstructFromPacked] %u iterations done. %u characters processed.\n", - bwtInc->numberOfIterationDone, processedTextLength); + printf("[BWTIncConstructFromPacked] %lu iterations done. 
%lu characters processed.\n", + (long)bwtInc->numberOfIterationDone, (long)processedTextLength); } } return bwtInc; @@ -1464,7 +1449,7 @@ void BWTIncFree(BWTInc *bwtInc) free(bwtInc); } -static unsigned int BWTFileSizeInWord(const unsigned int numChar) +static bgint_t BWTFileSizeInWord(const bgint_t numChar) { // The $ in BWT at the position of inverseSa0 is not encoded return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; @@ -1474,7 +1459,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o { FILE *bwtFile; /* FILE *occValueFile; */ - unsigned int bwtLength; + bgint_t bwtLength; bwtFile = (FILE*)fopen(bwtFileName, "wb"); if (bwtFile == NULL) { @@ -1482,8 +1467,8 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o exit(1); } - fwrite(&bwt->inverseSa0, sizeof(unsigned int), 1, bwtFile); - fwrite(bwt->cumulativeFreq + 1, sizeof(unsigned int), ALPHABET_SIZE, bwtFile); + fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); + fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); bwtLength = BWTFileSizeInWord(bwt->textLength); fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); fclose(bwtFile); diff --git a/bwt_gen/bwt_gen.h b/bwt_gen/bwt_gen.h index fe7fcf9..954c1c0 100644 --- a/bwt_gen/bwt_gen.h +++ b/bwt_gen/bwt_gen.h @@ -27,8 +27,8 @@ #include -//typedef int64_t bgint_t; -typedef unsigned bgint_t; +typedef uint64_t bgint_t; +typedef int64_t sbgint_t; #define ALPHABET_SIZE 4 #define BIT_PER_CHAR 2 @@ -69,20 +69,20 @@ typedef struct BWT { unsigned int *occValue; // Occurrence values stored explicitly bgint_t *occValueMajor; // Occurrence values stored explicitly unsigned int *decodeTable; // For decoding BWT by table lookup - unsigned int bwtSizeInWord; // Temporary variable to hold the memory allocated - unsigned int occSizeInWord; // Temporary variable to hold the memory allocated - unsigned int occMajorSizeInWord; // Temporary variable to hold the memory allocated + 
bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated } BWT; typedef struct BWTInc { BWT *bwt; unsigned int numberOfIterationDone; - unsigned int *cumulativeCountInCurrentBuild; - unsigned int availableWord; + bgint_t *cumulativeCountInCurrentBuild; + bgint_t availableWord; float targetNBit; - unsigned int buildSize; - unsigned int initialMaxBuildSize; - unsigned int incMaxBuildSize; + bgint_t buildSize; + bgint_t initialMaxBuildSize; + bgint_t incMaxBuildSize; unsigned int firstCharInLastIteration; unsigned int *workingMemory; unsigned int *packedText; From 573ad0c98f9c20bcf5308f861a691c521cc7c563 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 18 Oct 2011 23:48:46 -0400 Subject: [PATCH 033/498] merge bwt_gen.h to bwt_gen.c --- bwt_gen/QSufSort.c | 58 ++++++++++++----------------- bwt_gen/bwt_gen.c | 67 ++++++++++++++++++++++++++++++++- bwt_gen/bwt_gen.h | 93 ---------------------------------------------- 3 files changed, 88 insertions(+), 130 deletions(-) delete mode 100644 bwt_gen/bwt_gen.h diff --git a/bwt_gen/QSufSort.c b/bwt_gen/QSufSort.c index 92a8594..e437ac3 100644 --- a/bwt_gen/QSufSort.c +++ b/bwt_gen/QSufSort.c @@ -32,9 +32,12 @@ #include #include #include -#include "bwt_gen.h" #include "QSufSort.h" +#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) +#define med3(a, b, c) ( ac ? b : a>c ? 
c : a)) +#define swap(a, b, t); t = a; a = b; b = t; + // Static functions static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, const qsint_t highestPos, const qsint_t numSortedChar); @@ -51,8 +54,8 @@ static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, c contents of x[n] is disregarded, the n-th symbol being regarded as end-of-string smaller than all other symbols.*/ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, - const qsint_t smallestInputSymbol, const int skipTransform) { - + const qsint_t smallestInputSymbol, const int skipTransform) +{ qsint_t i, j; qsint_t s, negatedSortedGroupLength; qsint_t numSymbolAggregated; @@ -96,16 +99,13 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin } numSortedPos *= 2; /* double sorted-depth.*/ } - } -void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) { - +void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) +{ qsint_t i; - for (i=0; i<=numChar; i++) { + for (i=0; i<=numChar; i++) I[V[i]] = i + 1; - } - } /* Sorting routine called for each unsorted group. 
Sorts the array of integers @@ -149,9 +149,8 @@ static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, cons } c--; } - if (b > c) { + if (b > c) break; - } swap(I[b], I[c], tmp); b++; c--; @@ -173,9 +172,8 @@ static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, cons s = b - a; t = d - c; - if (s > 0) { + if (s > 0) QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar); - } // Update group number for equal portion a = lowestPos + s; @@ -186,14 +184,12 @@ static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, cons I[a] = -1; } else { // Unsorted group - for (c=a; c<=b; c++) { + for (c=a; c<=b; c++) V[I[c]] = b; - } } - if (t > 0) { + if (t > 0) QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar); - } } @@ -232,8 +228,8 @@ static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, /* Quadratic sorting method to use for small subarrays. */ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, - const qsint_t highestPos, const qsint_t numSortedChar) { - + const qsint_t highestPos, const qsint_t numSortedChar) +{ qsint_t i, j; qsint_t tmpKey, tmpPos; qsint_t numItem; @@ -269,9 +265,8 @@ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I if (key[i-1] == key[i]) { negativeSortedLength = 0; } else { - if (negativeSortedLength < 0) { + if (negativeSortedLength < 0) I[i+lowestPos] = negativeSortedLength; - } groupNum = i + lowestPos - 1; negativeSortedLength--; } @@ -280,10 +275,8 @@ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I I[lowestPos] = pos[0]; V[I[lowestPos]] = groupNum; - if (negativeSortedLength < 0) { + if (negativeSortedLength < 0) I[lowestPos] = negativeSortedLength; - } - } /* Bucketsort for first iteration. 
@@ -295,17 +288,16 @@ static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I Output: x is V and p is I after the initial sorting stage of the refined suffix sorting algorithm.*/ -static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) { - +static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) +{ qsint_t i, c; qsint_t d; qsint_t groupNum; qsint_t currentIndex; // mark linked list empty - for (i=0; i #include #include -#include "bwt_gen.h" +#include #include "QSufSort.h" +typedef uint64_t bgint_t; +typedef int64_t sbgint_t; + +#define ALPHABET_SIZE 4 +#define BIT_PER_CHAR 2 +#define CHAR_PER_WORD 16 +#define CHAR_PER_BYTE 4 + +#define BITS_IN_WORD 32 +#define BITS_IN_BYTE 8 +#define BYTES_IN_WORD 4 + +#define ALL_ONE_MASK 0xFFFFFFFF +#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 + +#define BITS_PER_OCC_VALUE 16 +#define OCC_VALUE_PER_WORD 2 +#define OCC_INTERVAL 256 +#define OCC_INTERVAL_MAJOR 65536 + +#define TRUE 1 +#define FALSE 0 + +#define BWTINC_INSERT_SORT_NUM_ITEM 7 + #define MIN_AVAILABLE_WORD 0x10000 +#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) +#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) +#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) +#define med3(a, b, c) ( ac ? b : a>c ? 
c : a)) +#define swap(a, b, t); t = a; a = b; b = t; +#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) +#define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) +#define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) + +typedef struct BWT { + bgint_t textLength; // length of the text + bgint_t inverseSa0; // SA-1[0] + bgint_t *cumulativeFreq; // cumulative frequency + unsigned int *bwtCode; // BWT code + unsigned int *occValue; // Occurrence values stored explicitly + bgint_t *occValueMajor; // Occurrence values stored explicitly + unsigned int *decodeTable; // For decoding BWT by table lookup + bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occSizeInWord; // Temporary variable to hold the memory allocated + bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated +} BWT; + +typedef struct BWTInc { + BWT *bwt; + unsigned int numberOfIterationDone; + bgint_t *cumulativeCountInCurrentBuild; + bgint_t availableWord; + float targetNBit; + bgint_t buildSize; + bgint_t initialMaxBuildSize; + bgint_t incMaxBuildSize; + unsigned int firstCharInLastIteration; + unsigned int *workingMemory; + unsigned int *packedText; + unsigned char *textBuffer; + unsigned int *packedShift; +} BWTInc; + static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, unsigned int lastByteLength) { @@ -1477,7 +1540,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) { BWTInc *bwtInc; - bwtInc = BWTIncConstructFromPacked(fn_pac, 2.5, 10000000, 10000000); + bwtInc = BWTIncConstructFromPacked(fn_pac, 5.0, 10000000, 10000000); printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTIncFree(bwtInc); diff --git a/bwt_gen/bwt_gen.h b/bwt_gen/bwt_gen.h deleted file mode 100644 index 
954c1c0..0000000 --- a/bwt_gen/bwt_gen.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - - BWTConstruct.h BWT-Index Construction - - This module constructs BWT and auxiliary data structures. - - Copyright (C) 2004, Wong Chi Kwong. - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - as published by the Free Software Foundation; either version 2 - of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -*/ - -#ifndef BWT_GEN_H -#define BWT_GEN_H - -#include - -typedef uint64_t bgint_t; -typedef int64_t sbgint_t; - -#define ALPHABET_SIZE 4 -#define BIT_PER_CHAR 2 -#define CHAR_PER_WORD 16 -#define CHAR_PER_BYTE 4 - -#define BITS_IN_WORD 32 -#define BITS_IN_BYTE 8 -#define BYTES_IN_WORD 4 - -#define ALL_ONE_MASK 0xFFFFFFFF -#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 - -#define BITS_PER_OCC_VALUE 16 -#define OCC_VALUE_PER_WORD 2 -#define OCC_INTERVAL 256 -#define OCC_INTERVAL_MAJOR 65536 - -#define TRUE 1 -#define FALSE 0 - -#define BWTINC_INSERT_SORT_NUM_ITEM 7 - -#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) -#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) -#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) -#define med3(a, b, c) ( ac ? b : a>c ? 
c : a)) -#define swap(a, b, t); t = a; a = b; b = t; -#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) -#define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) -#define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) - -typedef struct BWT { - bgint_t textLength; // length of the text - bgint_t inverseSa0; // SA-1[0] - bgint_t *cumulativeFreq; // cumulative frequency - unsigned int *bwtCode; // BWT code - unsigned int *occValue; // Occurrence values stored explicitly - bgint_t *occValueMajor; // Occurrence values stored explicitly - unsigned int *decodeTable; // For decoding BWT by table lookup - bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated - bgint_t occSizeInWord; // Temporary variable to hold the memory allocated - bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated -} BWT; - -typedef struct BWTInc { - BWT *bwt; - unsigned int numberOfIterationDone; - bgint_t *cumulativeCountInCurrentBuild; - bgint_t availableWord; - float targetNBit; - bgint_t buildSize; - bgint_t initialMaxBuildSize; - bgint_t incMaxBuildSize; - unsigned int firstCharInLastIteration; - unsigned int *workingMemory; - unsigned int *packedText; - unsigned char *textBuffer; - unsigned int *packedShift; -} BWTInc; - -#endif From 1f02acbe092d2ea25d361c6c5eb9c7ec4635034d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Oct 2011 00:26:56 -0400 Subject: [PATCH 034/498] added a few assertions --- bwt_gen/bwt_gen.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c index 0693e9a..1c6454a 100644 --- a/bwt_gen/bwt_gen.c +++ b/bwt_gen/bwt_gen.c @@ -83,7 +83,6 @@ typedef struct BWTInc { unsigned int numberOfIterationDone; bgint_t *cumulativeCountInCurrentBuild; bgint_t availableWord; - float targetNBit; bgint_t buildSize; bgint_t initialMaxBuildSize; bgint_t incMaxBuildSize; @@ -362,18 +361,16 @@ BWTInc *BWTIncCreate(const 
unsigned int textLength, const float targetNBit, bwtInc->bwt = BWTCreate(textLength, NULL); bwtInc->initialMaxBuildSize = initialMaxBuildSize; bwtInc->incMaxBuildSize = incMaxBuildSize; - bwtInc->targetNBit = targetNBit; bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); // Build frequently accessed data bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); - for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; - } - bwtInc->availableWord = (bgint_t)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * bwtInc->targetNBit); - if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; + bwtInc->availableWord = (bgint_t)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * targetNBit); + if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); exit(1); @@ -1317,8 +1314,13 @@ static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) // Set address seq = (bgint_t*)bwtInc->workingMemory; relativeRank = seq + bwtInc->buildSize + 1; + // mergedBwt and packedTex may share memory mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place + assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText); + assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)mergedBwt); + + // ->packedText is not used any more and may be overwritten by mergedBwt BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); firstCharInThisIteration = 
relativeRank[0]; @@ -1342,8 +1344,11 @@ static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) sortedRank = (bgint_t*)bwtInc->workingMemory; seq = sortedRank + bwtInc->buildSize + 1; insertBwt = (unsigned*)seq; // insertBwt and seq share memory + // relativeRank and ->packedText may share memory relativeRank = seq + bwtInc->buildSize + 1; + assert((void*)relativeRank <= (void*)bwtInc->packedText); + // Store the first character of this iteration firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); @@ -1357,6 +1362,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) // Get rank of new suffix among processed suffix // The seq array is built into ALPHABET_SIZE + 2 groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group + // ->packedText is not used any more and will be overwritten by relativeRank oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); @@ -1388,15 +1394,14 @@ static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt - // Build BWT + // Build BWT; seq is overwritten by insertBwt BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); - // Merge BWT + // Merge BWT; relativeRank may be overwritten by mergedBwt mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord - - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR; - // minus numberOfIteration * occInterval to create a buffer for merging + - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR; // minus numberOfIteration * occInterval to create a buffer for merging + assert(mergedBwt >= insertBwt + numChar); BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, 
bwtInc->bwt->textLength, numChar); - } // Build auxiliary structure and update info and pointers in BWT From b6d807b0b7c4444780f443dc9e2cfd11c2be8069 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Oct 2011 13:50:50 -0400 Subject: [PATCH 035/498] minor changes in case something goes wrong --- bwt_gen/bwt_gen.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c index 1c6454a..0dcc7f4 100644 --- a/bwt_gen/bwt_gen.c +++ b/bwt_gen/bwt_gen.c @@ -161,7 +161,7 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) if (bwtInc->bwt->textLength == 0) { // initial build // Minus 2 because n+1 entries of seq and rank needed for n char - maxBuildSize = (bwtInc->availableWord - 2 * (sizeof(bgint_t) / 4) - OCC_INTERVAL / CHAR_PER_WORD) + maxBuildSize = (bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4)) / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); if (bwtInc->initialMaxBuildSize > 0) { bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); @@ -171,8 +171,8 @@ static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) } else { // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char // Minus numberOfIterationDone because bwt slightly shift to left in each iteration - maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - 3 * (sizeof(bgint_t) / 4) - - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) + maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord + - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) / 3 / (sizeof(bgint_t) / 4); if (maxBuildSize < CHAR_PER_WORD) { fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); @@ -1399,7 +1399,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) // Merge BWT; 
relativeRank may be overwritten by mergedBwt mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord - - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR; // minus numberOfIteration * occInterval to create a buffer for merging + - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging assert(mergedBwt >= insertBwt + numChar); BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); } @@ -1545,7 +1545,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) { BWTInc *bwtInc; - bwtInc = BWTIncConstructFromPacked(fn_pac, 5.0, 10000000, 10000000); + bwtInc = BWTIncConstructFromPacked(fn_pac, 4.4, 10000000, 10000000); printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTIncFree(bwtInc); From c948c647a0b4fad187db4c29cca2a6c2e332ae82 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Oct 2011 17:42:42 -0400 Subject: [PATCH 036/498] make changes to bwt.c --- bwt.c | 25 +++++++------------------ bwt.h | 16 +++++++++++----- bwt_gen/bwt_gen.c | 2 +- bwtaln.h | 2 +- bwtio.c | 2 +- bwtmisc.c | 8 ++++---- bwtsw2_core.c | 2 +- 7 files changed, 26 insertions(+), 31 deletions(-) diff --git a/bwt.c b/bwt.c index 10b838a..390e90e 100644 --- a/bwt.c +++ b/bwt.c @@ -97,8 +97,8 @@ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) if (k >= bwt->primary) --k; // because $ is not in bwt // retrieve Occ at k/OCC_INTERVAL - n = (p = bwt_occ_intv(bwt, k))[c]; - p += 4; // jump to the start of the first BWT cell + n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; + p += sizeof(bwtint_t); // jump to the start of the first BWT cell // calculate Occ up to the last k/32 j = k >> 5 << 5; @@ -116,10 +116,6 @@ inline bwtint_t 
bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) { bwtint_t _k, _l; - if (k == l) { - *ok = *ol = bwt_occ(bwt, k, c); - return; - } _k = (k >= bwt->primary)? k-1 : k; _l = (l >= bwt->primary)? l-1 : l; if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { @@ -130,8 +126,8 @@ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint uint32_t *p; if (k >= bwt->primary) --k; if (l >= bwt->primary) --l; - n = (p = bwt_occ_intv(bwt, k))[c]; - p += 4; + n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; + p += sizeof(bwtint_t); // calculate *ok j = k >> 5 << 5; for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2) @@ -164,8 +160,8 @@ inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) } if (k >= bwt->primary) --k; // because $ is not in bwt p = bwt_occ_intv(bwt, k); - memcpy(cnt, p, 16); - p += 4; + memcpy(cnt, p, 4 * sizeof(bwtint_t)); + p += sizeof(bwtint_t); j = k >> 4 << 4; for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) x += __occ_aux4(bwt, *p); @@ -177,11 +173,6 @@ inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { bwtint_t _k, _l; - if (k == l) { - bwt_occ4(bwt, k, cntk); - memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); - return; - } _k = (k >= bwt->primary)? k-1 : k; _l = (l >= bwt->primary)? 
l-1 : l; if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { @@ -190,13 +181,11 @@ inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4] } else { bwtint_t i, j, x, y; uint32_t *p; - int cl[4]; if (k >= bwt->primary) --k; // because $ is not in bwt if (l >= bwt->primary) --l; - cl[0] = cl[1] = cl[2] = cl[3] = 0; p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); - p += 4; + p += sizeof(bwtint_t); // prepare cntk[] j = k >> 4 << 4; for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i += 16, ++p) diff --git a/bwt.h b/bwt.h index 4aef38d..2f12be6 100644 --- a/bwt.h +++ b/bwt.h @@ -30,14 +30,15 @@ #include -// requirement: (OCC_INTERVAL%16 == 0) +// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line #define OCC_INTERVAL 0x80 #ifndef BWA_UBYTE #define BWA_UBYTE typedef unsigned char ubyte_t; #endif -typedef uint32_t bwtint_t; + +typedef uint64_t bwtint_t; typedef struct { bwtint_t primary; // S^{-1}(0), or the primary index of BWT @@ -53,15 +54,20 @@ typedef struct { bwtint_t *sa; } bwt_t; -#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL*12 + 4 + (k)%OCC_INTERVAL/16]) +/* For general OCC_INTERVAL, the following is correct: +#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) +#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) +*/ + +// The following two lines are ONLY correct when OCC_INTERVAL==0x80 +#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)]) +#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4)) /* retrieve a character from the $-removed BWT string. 
Note that * bwt_t::bwt is not exactly the BWT string and therefore this macro is * called bwt_B0 instead of bwt_B */ #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) -#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL*12) - // inverse Psi function #define bwt_invPsi(bwt, k) \ (((k) == (bwt)->primary)? 0 : \ diff --git a/bwt_gen/bwt_gen.c b/bwt_gen/bwt_gen.c index 0dcc7f4..5caaf5f 100644 --- a/bwt_gen/bwt_gen.c +++ b/bwt_gen/bwt_gen.c @@ -1545,7 +1545,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) { BWTInc *bwtInc; - bwtInc = BWTIncConstructFromPacked(fn_pac, 4.4, 10000000, 10000000); + bwtInc = BWTIncConstructFromPacked(fn_pac, 3.7, 10000000, 10000000); printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTIncFree(bwtInc); diff --git a/bwtaln.h b/bwtaln.h index c2faa98..0dc7363 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -51,8 +51,8 @@ typedef uint16_t bwa_cigar_t; #define __cigar_create(__op, __len) ((__op)<primary, sizeof(bwtint_t), 1, fp); fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fwrite(bwt->bwt, sizeof(bwtint_t), bwt->bwt_size, fp); + fwrite(bwt->bwt, 4, bwt->bwt_size, fp); fclose(fp); } diff --git a/bwtmisc.c b/bwtmisc.c index 1082065..8d20287 100644 --- a/bwtmisc.c +++ b/bwtmisc.c @@ -125,20 +125,20 @@ void bwt_bwtupdate_core(bwt_t *bwt) uint32_t *buf; n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; - bwt->bwt_size += n_occ * 4; // the new size + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt c[0] = c[1] = c[2] = c[3] = 0; for (i = k = 0; i < bwt->seq_len; ++i) { if (i % OCC_INTERVAL == 0) { memcpy(buf + k, c, sizeof(bwtint_t) * 4); - k += 4; + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) } - if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; + if 
(i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 ++c[bwt_B00(bwt, i)]; } // the last element memcpy(buf + k, c, sizeof(bwtint_t) * 4); - xassert(k + 4 == bwt->bwt_size, "inconsistent bwt_size"); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); // update bwt free(bwt->bwt); bwt->bwt = buf; } diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 03360a3..4d5984c 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -475,7 +475,7 @@ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *qu // get Occ for the DAG bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl); for (tj = 0; tj != 4; ++tj) { // descend to the children - uint32_t qcntk[4], qcntl[4]; + bwtint_t qcntk[4], qcntl[4]; int qj, *curr_score_mat = score_mat + tj * 4; khiter_t iter; bsw2entry_t *u; From d6155ecf9058142916a10ca017fcc0db8c8e54fc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Oct 2011 17:45:27 -0400 Subject: [PATCH 037/498] fixed a couple of gcc warnings --- bwape.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwape.c b/bwape.c index 3336538..bf00203 100644 --- a/bwape.c +++ b/bwape.c @@ -151,10 +151,10 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double for (y = 1.0; y < 10.0; y += 0.01) if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); - fprintf(stderr, "[infer_isize] low and high boundaries: %d and %d for estimating avg and std\n", ii->low, ii->high); + fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high); fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std); fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior); - fprintf(stderr, "[infer_isize] inferred maximum insert size: %d (%.2lf 
sigma)\n", ii->high_bayesian, y); + fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y); return 0; } From b7e8c4c5aa4444e1c43665c99d5fbd53a6129ab1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 11:23:23 -0400 Subject: [PATCH 038/498] fixed a bug in 2occ4() --- bwt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.c b/bwt.c index 390e90e..038a8a3 100644 --- a/bwt.c +++ b/bwt.c @@ -196,7 +196,7 @@ inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4] j = l >> 4 << 4; for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); - memcpy(cntl, cntk, 16); + memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; } From 70da24e177ebda6f891b7e29d3a179fe3a7f720e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 11:53:44 -0400 Subject: [PATCH 039/498] restructure bns_fasta2bntseq() for further changes --- bntseq.c | 100 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 48 deletions(-) diff --git a/bntseq.c b/bntseq.c index 21ba91f..b83b4e1 100644 --- a/bntseq.c +++ b/bntseq.c @@ -163,16 +163,63 @@ void bns_destroy(bntseq_t *bns) } } +static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int *l_buf, int *m_seqs, int *m_holes, bntamb1_t **q) +{ + bntann1_t *p; + int i, lasts; + if (bns->n_seqs == *m_seqs) { + *m_seqs <<= 1; + bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); + } + p = bns->anns + bns->n_seqs; + p->name = strdup((char*)seq->name.s); + p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); + p->gi = 0; p->len = seq->seq.l; + p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; + p->n_ambs = 0; + for (i = lasts = 0; i < seq->seq.l; ++i) { + int c = nst_nt4_table[(int)seq->seq.s[i]]; + if (c >= 4) { // N + if (lasts == seq->seq.s[i]) { // contiguous N + ++(*q)->len; + } else { + if (bns->n_holes == *m_holes) { + (*m_holes) <<= 1; + bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); + } + *q = bns->ambs + bns->n_holes; + (*q)->len = 1; + (*q)->offset = p->offset + i; + (*q)->amb = seq->seq.s[i]; + ++p->n_ambs; + ++bns->n_holes; + } + } + lasts = seq->seq.s[i]; + { // fill buffer + if (c >= 4) c = lrand48()&0x3; + if (*l_buf == 0x40000) { + fwrite(buf, 1, 0x10000, fp); + memset(buf, 0, 0x10000); + *l_buf = 0; + } + buf[*l_buf>>2] |= c << ((3 - (*l_buf&3)) << 1); + ++(*l_buf); + } + } + ++bns->n_seqs; + bns->l_pac += seq->seq.l; +} + int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) { kseq_t *seq; char name[1024]; bntseq_t *bns; - bntamb1_t *q; - int l_buf; unsigned char buf[0x10000]; - int32_t m_seqs, m_holes, l, i; + int32_t l_buf, m_seqs, m_holes; int64_t ret = -1; + bntamb1_t *q; FILE *fp; // initialization @@ -189,51 +236,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) fp = xopen(name, "wb"); memset(buf, 0, 0x10000); // read sequences - while ((l = kseq_read(seq)) >= 0) { - bntann1_t *p; - int lasts; - if (bns->n_seqs == m_seqs) { - m_seqs <<= 1; - bns->anns = (bntann1_t*)realloc(bns->anns, m_seqs * sizeof(bntann1_t)); - } - p = bns->anns + bns->n_seqs; - p->name = strdup((char*)seq->name.s); - p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); - p->gi = 0; p->len = l; - p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; - p->n_ambs = 0; - for (i = 0, lasts = 0; i < l; ++i) { - int c = nst_nt4_table[(int)seq->seq.s[i]]; - if (c >= 4) { // N - if (lasts == seq->seq.s[i]) { // contiguous N - ++q->len; - } else { - if (bns->n_holes == m_holes) { - m_holes <<= 1; - bns->ambs = (bntamb1_t*)realloc(bns->ambs, m_holes * sizeof(bntamb1_t)); - } - q = bns->ambs + bns->n_holes; - q->len = 1; - q->offset = p->offset + i; - q->amb = seq->seq.s[i]; - ++p->n_ambs; - ++bns->n_holes; - } - } - lasts = seq->seq.s[i]; - { // fill buffer - if (c >= 4) c = lrand48()&0x3; - if (l_buf == 0x40000) { - fwrite(buf, 1, 0x10000, fp); - memset(buf, 0, 0x10000); - l_buf = 0; - } - buf[l_buf>>2] |= c << ((3 - (l_buf&3)) << 1); - ++l_buf; - } - } - ++bns->n_seqs; - bns->l_pac += seq->seq.l; + while (kseq_read(seq) >= 0) { + add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); } xassert(bns->l_pac, "zero length sequence."); ret = bns->l_pac; From b96f180a15fc182ba0c2b409df594a45aa02a56c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 11:56:24 -0400 Subject: [PATCH 040/498] move bwt_gen/* to the root directory --- Makefile | 26 +++++++------------------- bwt_gen/QSufSort.c => QSufSort.c | 0 bwt_gen/QSufSort.h => QSufSort.h | 0 bwt_gen/bwt_gen.c => bwt_gen.c | 0 4 files changed, 7 insertions(+), 19 deletions(-) rename bwt_gen/QSufSort.c => QSufSort.c (100%) rename bwt_gen/QSufSort.h => QSufSort.h (100%) rename bwt_gen/bwt_gen.c => bwt_gen.c (100%) diff --git a/Makefile b/Makefile index 53f241d..c9588f2 100644 --- a/Makefile +++ b/Makefile @@ -3,14 +3,14 @@ CXX= g++ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 -OBJS= utils.o bwt.o bwtio.o bwtaln.o bwtgap.o is.o \ - bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ +OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ + is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o 
bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o bamlite.o PROG= bwa INCLUDES= -LIBS= -lm -lz -lpthread -Lbwt_gen -lbwtgen +LIBS= -lm -lz -lpthread SUBDIRS= . bwt_gen .SUFFIXES:.c .o .cc @@ -22,21 +22,11 @@ SUBDIRS= . bwt_gen all:$(PROG) -lib-recur all-recur clean-recur cleanlocal-recur install-recur: - @target=`echo $@ | sed s/-recur//`; \ - wdir=`pwd`; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - cd $$subdir; \ - $(MAKE) CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ - INCLUDES="$(INCLUDES)" $$target || exit 1; \ - cd $$wdir; \ - done; - -lib: - -bwa:lib-recur $(OBJS) main.o +bwa:$(OBJS) main.o $(CC) $(CFLAGS) $(DFLAGS) $(OBJS) main.o -o $@ $(LIBS) +QSufSort.o:QSufSort.h + bwt.o:bwt.h bwtio.o:bwt.h bwtaln.o:bwt.h bwtaln.h kseq.h @@ -49,7 +39,5 @@ bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_main.o:bwtsw2.h -cleanlocal: +clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a - -clean:cleanlocal-recur diff --git a/bwt_gen/QSufSort.c b/QSufSort.c similarity index 100% rename from bwt_gen/QSufSort.c rename to QSufSort.c diff --git a/bwt_gen/QSufSort.h b/QSufSort.h similarity index 100% rename from bwt_gen/QSufSort.h rename to QSufSort.h diff --git a/bwt_gen/bwt_gen.c b/bwt_gen.c similarity index 100% rename from bwt_gen/bwt_gen.c rename to bwt_gen.c From 46123639cff6247bb6bc351f29fed91dfc498af8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 12:09:35 -0400 Subject: [PATCH 041/498] removed reverse pac; bwa is not working right now --- bntseq.c | 3 +++ bwtindex.c | 45 --------------------------------------------- bwtmisc.c | 37 ------------------------------------- main.c | 2 -- main.h | 1 - 5 files changed, 3 insertions(+), 85 deletions(-) diff --git a/bntseq.c b/bntseq.c index b83b4e1..41fb68c 100644 --- a/bntseq.c +++ b/bntseq.c @@ -213,6 +213,7 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int * int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) 
{ + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c kseq_t *seq; char name[1024]; bntseq_t *bns; @@ -238,6 +239,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) // read sequences while (kseq_read(seq) >= 0) { add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); + seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1); + add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); } xassert(bns->l_pac, "zero length sequence."); ret = bns->l_pac; diff --git a/bwtindex.c b/bwtindex.c index c752a2f..2dadea3 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -105,14 +105,6 @@ int bwa_index(int argc, char *argv[]) return 1; } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT - { - strcpy(str, prefix); strcat(str, ".pac"); - strcpy(str2, prefix); strcat(str2, ".rpac"); - t = clock(); - fprintf(stderr, "[bwa_index] Reverse the packed sequence... "); - bwa_pac_rev_core(str, str2); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } { strcpy(str, prefix); strcat(str, ".pac"); strcpy(str2, prefix); strcat(str2, ".bwt"); @@ -127,20 +119,6 @@ int bwa_index(int argc, char *argv[]) } fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); } - { - strcpy(str, prefix); strcat(str, ".rpac"); - strcpy(str2, prefix); strcat(str2, ".rbwt"); - t = clock(); - fprintf(stderr, "[bwa_index] Construct BWT for the reverse packed sequence...\n"); - if (algo_type == 2) bwt_bwtgen(str, str2); - else if (algo_type == 1 || algo_type == 3) { - bwt_t *bwt; - bwt = bwt_pac2bwt(str, algo_type == 3); - bwt_dump_bwt(str2, bwt); - bwt_destroy(bwt); - } - fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); @@ -152,17 +130,6 @@ int bwa_index(int argc, char *argv[]) bwt_destroy(bwt); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } - { - bwt_t 
*bwt; - strcpy(str, prefix); strcat(str, ".rbwt"); - t = clock(); - fprintf(stderr, "[bwa_index] Update reverse BWT... "); - bwt = bwt_restore_bwt(str); - bwt_bwtupdate_core(bwt); - bwt_dump_bwt(str, bwt); - bwt_destroy(bwt); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); @@ -175,18 +142,6 @@ int bwa_index(int argc, char *argv[]) bwt_destroy(bwt); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } - { - bwt_t *bwt; - strcpy(str, prefix); strcat(str, ".rbwt"); - strcpy(str3, prefix); strcat(str3, ".rsa"); - t = clock(); - fprintf(stderr, "[bwa_index] Construct SA from reverse BWT and Occ... "); - bwt = bwt_restore_bwt(str); - bwt_cal_sa(bwt, 32); - bwt_dump_sa(str3, bwt); - bwt_destroy(bwt); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } free(str3); free(str2); free(str); free(prefix); return 0; } diff --git a/bwtmisc.c b/bwtmisc.c index 8d20287..c35d684 100644 --- a/bwtmisc.c +++ b/bwtmisc.c @@ -157,43 +157,6 @@ int bwa_bwtupdate(int argc, char *argv[]) return 0; } -void bwa_pac_rev_core(const char *fn, const char *fn_rev) -{ - int64_t seq_len, i; - bwtint_t pac_len, j; - ubyte_t *bufin, *bufout, ct; - FILE *fp; - seq_len = bwa_seq_len(fn); - pac_len = (seq_len >> 2) + 1; - bufin = (ubyte_t*)calloc(pac_len, 1); - bufout = (ubyte_t*)calloc(pac_len, 1); - fp = xopen(fn, "rb"); - fread(bufin, 1, pac_len, fp); - fclose(fp); - for (i = seq_len - 1, j = 0; i >= 0; --i) { - int c = bufin[i>>2] >> ((~i&3)<<1) & 3; - bwtint_t j = seq_len - 1 - i; - bufout[j>>2] |= c << ((~j&3)<<1); - } - free(bufin); - fp = xopen(fn_rev, "wb"); - fwrite(bufout, 1, pac_len, fp); - ct = seq_len % 4; - fwrite(&ct, 1, 1, fp); - fclose(fp); - free(bufout); -} - -int bwa_pac_rev(int argc, char *argv[]) -{ - if (argc < 3) { - fprintf(stderr, "Usage: bwa pac_rev \n"); - return 1; - } - bwa_pac_rev_core(argv[1], argv[2]); - return 0; -} - const int 
nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; /* this function is not memory efficient, but this will make life easier diff --git a/main.c b/main.c index f3447e6..0e5ee8a 100644 --- a/main.c +++ b/main.c @@ -24,7 +24,6 @@ static int usage() fprintf(stderr, " pac2bwt generate BWT from PAC\n"); fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " bwtupdate update .bwt to the new format\n"); - fprintf(stderr, " pac_rev generate reverse PAC\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); fprintf(stderr, " stdsw standard SW/NW alignment\n"); @@ -44,7 +43,6 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "pac2bwt") == 0) return bwa_pac2bwt(argc-1, argv+1); else if (strcmp(argv[1], "pac2bwtgen") == 0) return bwt_bwtgen_main(argc-1, argv+1); else if (strcmp(argv[1], "bwtupdate") == 0) return bwa_bwtupdate(argc-1, argv+1); - else if (strcmp(argv[1], "pac_rev") == 0) return bwa_pac_rev(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) return bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) return bwa_aln(argc-1, argv+1); diff --git a/main.h b/main.h index 5e7697a..15ec189 100644 --- a/main.h +++ b/main.h @@ -6,7 +6,6 @@ extern "C" { #endif int bwa_fa2pac(int argc, char *argv[]); - int bwa_pac_rev(int argc, char *argv[]); int bwa_pac2cspac(int argc, char *argv[]); int bwa_pac2bwt(int argc, char *argv[]); int bwa_bwtupdate(int argc, char *argv[]); From 2d2db5d50f61e7eeb10103137c6f1f3178951f84 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 16:13:35 -0400 Subject: [PATCH 042/498] aln seems working --- bntseq.c | 6 ++++-- bwtaln.c | 59 +++++++++++++++++++++++++------------------------------- bwtaln.h | 2 +- bwtgap.c | 51 +++++++++++++++++++++--------------------------- bwtgap.h | 6 +++--- 5 files changed, 
56 insertions(+), 68 deletions(-) diff --git a/bntseq.c b/bntseq.c index 41fb68c..4d532c4 100644 --- a/bntseq.c +++ b/bntseq.c @@ -178,7 +178,7 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int * p->offset = (bns->n_seqs == 0)? 0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; for (i = lasts = 0; i < seq->seq.l; ++i) { - int c = nst_nt4_table[(int)seq->seq.s[i]]; + int c = seq->seq.s[i]; if (c >= 4) { // N if (lasts == seq->seq.s[i]) { // contiguous N ++(*q)->len; @@ -218,7 +218,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) char name[1024]; bntseq_t *bns; unsigned char buf[0x10000]; - int32_t l_buf, m_seqs, m_holes; + int32_t i, l_buf, m_seqs, m_holes; int64_t ret = -1; bntamb1_t *q; FILE *fp; @@ -238,6 +238,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) memset(buf, 0, 0x10000); // read sequences while (kseq_read(seq) >= 0) { + for (i = 0; i < seq->seq.l; ++i) // convert to 2-bit encoding + seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1); add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); diff --git a/bwtaln.c b/bwtaln.c index de6b001..c5002c6 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -49,22 +49,22 @@ int bwa_cal_maxdiff(int l, double err, double thres) } // width must be filled as zero -static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_width_t *width) +static int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) { bwtint_t k, l, ok, ol; int i, bid; bid = 0; - k = 0; l = rbwt->seq_len; + k = 0; l = bwt->seq_len; for (i = 0; i < len; ++i) { ubyte_t c = str[i]; if (c < 4) { - bwt_2occ(rbwt, k - 1, l, c, &ok, &ol); - k = rbwt->L2[c] + ok + 1; - l = rbwt->L2[c] + ol; + bwt_2occ(bwt, k - 1, l, c, &ok, &ol); + k = bwt->L2[c] + ok + 1; + l = bwt->L2[c] + ol; } if (k > l || c > 3) { // then restart k = 0; - l = rbwt->seq_len; + l = 
bwt->seq_len; ++bid; } width[i].w = l - k + 1; @@ -75,12 +75,11 @@ static int bwt_cal_width(const bwt_t *rbwt, int len, const ubyte_t *str, bwt_wid return bid; } -void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) +void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) { - int i, max_l = 0, max_len; + int i, j, max_l = 0, max_len; gap_stack_t *stack; - bwt_width_t *w[2], *seed_w[2]; - const ubyte_t *seq[2]; + bwt_width_t *w, *seed_w; gap_opt_t local_opt = *opt; // initiate priority stack @@ -90,46 +89,40 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seq if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); - seed_w[0] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - seed_w[1] = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - w[0] = w[1] = 0; + seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + w = 0; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; #ifdef HAVE_PTHREAD if (i % opt->n_threads != tid) continue; #endif p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; - seq[0] = p->seq; seq[1] = p->rseq; if (max_l < p->len) { max_l = p->len; - w[0] = (bwt_width_t*)realloc(w[0], (max_l + 1) * sizeof(bwt_width_t)); - w[1] = (bwt_width_t*)realloc(w[1], (max_l + 1) * sizeof(bwt_width_t)); - memset(w[0], 0, (max_l + 1) * sizeof(bwt_width_t)); - memset(w[1], 0, (max_l + 1) * sizeof(bwt_width_t)); + w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); + memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); } - bwt_cal_width(bwt[0], p->len, seq[0], w[0]); - bwt_cal_width(bwt[1], p->len, seq[1], w[1]); + bwt_cal_width(bwt, p->len, p->seq, w); if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, 
opt->fnr); local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; - if (p->len > opt->seed_len) { - bwt_cal_width(bwt[0], opt->seed_len, seq[0] + (p->len - opt->seed_len), seed_w[0]); - bwt_cal_width(bwt[1], opt->seed_len, seq[1] + (p->len - opt->seed_len), seed_w[1]); - } + if (p->len > opt->seed_len) + bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); // core function - p->aln = bwt_match_gap(bwt, p->len, seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); - // store the alignment + for (j = 0; j < p->len; ++j) // we need to complement + p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; + p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); + // clean up the record free(p->name); free(p->seq); free(p->rseq); free(p->qual); p->name = 0; p->seq = p->rseq = p->qual = 0; } - free(seed_w[0]); free(seed_w[1]); - free(w[0]); free(w[1]); + free(seed_w); free(w); gap_destroy_stack(stack); } #ifdef HAVE_PTHREAD typedef struct { int tid; - bwt_t *bwt[2]; + bwt_t *bwt; int n_seqs; bwa_seq_t *seqs; const gap_opt_t *opt; @@ -163,15 +156,15 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; - bwt_t *bwt[2]; + bwt_t *bwt; // initialization ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT + extern uint8_t nst_nt4_table[]; char *str = (char*)calloc(strlen(prefix) + 10, 1); - strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); } @@ -223,7 +216,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) } // destroy - bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + bwt_destroy(bwt); bwa_seq_close(ks); } diff --git a/bwtaln.h b/bwtaln.h index 0dc7363..20191ad 100644 --- a/bwtaln.h 
+++ b/bwtaln.h @@ -135,7 +135,7 @@ extern "C" { void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs); int bwa_cal_maxdiff(int l, double err, double thres); - void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt[2], int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); + void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); diff --git a/bwtgap.c b/bwtgap.c index d32c3bb..c996f9f 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -42,7 +42,7 @@ static void gap_reset_stack(gap_stack_t *stack) stack->n_entries = 0; } -static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, +static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int state, int is_diff, const gap_opt_t *opt) { int score; @@ -55,7 +55,7 @@ static inline void gap_push(gap_stack_t *stack, int a, int i, bwtint_t k, bwtint q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; - p->info = (u_int32_t)score<<21 | a<<20 | i; p->k = k; p->l = l; + p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; p->last_diff_pos = is_diff? 
i : 0; ++(q->n_entries); @@ -101,8 +101,8 @@ static inline int int_log2(uint32_t v) return c; } -bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2], - bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) +bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, + bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) { int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; @@ -115,7 +115,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], // check whether there are too many N for (j = _j = 0; j < len; ++j) - if (seq[0][j] > 3) ++_j; + if (seq[j] > 3) ++_j; if (_j > max_diff) { *_n_aln = n_aln; return aln; @@ -123,31 +123,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); gap_reset_stack(stack); // reset stack - gap_push(stack, 0, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt); - gap_push(stack, 1, len, 0, bwts[0]->seq_len, 0, 0, 0, 0, 0, opt); + gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt); while (stack->n_entries) { gap_entry_t e; - int a, i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; + int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; bwtint_t k, l, cnt_k[4], cnt_l[4], occ; - const bwt_t *bwt; - const ubyte_t *str; - const bwt_width_t *seed_width = 0; - bwt_width_t *width; if (max_entries < stack->n_entries) max_entries = stack->n_entries; if (stack->n_entries > opt->max_entries) break; gap_pop(stack, &e); // get the best entry k = e.k; l = e.l; // SA interval - a = e.info>>20&1; i = e.info&0xffff; // strand, length + i = e.info&0xffff; // length if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // 
no need to proceed m = max_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; if (m < 0) continue; - bwt = bwts[1-a]; str = seq[a]; width = w[a]; - if (seed_w) { // apply seeding - seed_width = seed_w[a]; + if (seed_width) { // apply seeding m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; } @@ -158,7 +151,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], hit_found = 0; if (i == 0) hit_found = 1; else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed - if (bwt_match_exact_alt(bwt, i, str, &k, &l)) hit_found = 1; + if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; else continue; // no hit, skip } @@ -189,7 +182,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); } p = aln + n_aln; - p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a; + p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->k = k; p->l = l; p->score = score; ++n_aln; @@ -206,7 +199,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], int ii = i - (len - opt->seed_len); if (width[i-1].bid > m-1) allow_diff = 0; else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; - if (seed_w && ii > 0) { + if (seed_width && ii > 0) { if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; @@ -218,24 +211,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], if (e.state == STATE_M) { // gap open if (e.n_gapo < opt->max_gapo) { // gap open is allowed // insertion - gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 
1, e.n_gape, STATE_I, 1, opt); // deletion for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); } } } else if (e.state == STATE_I) { // extention of an insertion if (e.n_gape < opt->max_gape) // gap extention is allowed - gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); } else if (e.state == STATE_D) { // extention of a deletion if (e.n_gape < opt->max_gape) { // gap extention is allowed if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, a, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); } } } @@ -244,17 +237,17 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwts[2], int len, const ubyte_t *seq[2], // mismatches if (allow_diff && allow_M) { // mismatch is allowed for (j = 1; j <= 4; ++j) { - int c = (str[i] + j) & 3; - int is_mm = (j != 4 || str[i] > 3); + int c = (seq[i] + j) & 3; + int is_mm = (j != 4 || seq[i] > 3); k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, a, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); } - } else if (str[i] < 4) { // try exact match only - int c = str[i] & 3; + } else if (seq[i] < 4) { // try exact match only + int c = seq[i] & 3; k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, 
e.n_gape, STATE_M, 0, opt); } } diff --git a/bwtgap.h b/bwtgap.h index fc910bc..01ee359 100644 --- a/bwtgap.h +++ b/bwtgap.h @@ -5,7 +5,7 @@ #include "bwtaln.h" typedef struct { // recursion stack - u_int32_t info; // score<<21 | a<<20 | i + u_int32_t info; // score<<21 | i u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; bwtint_t k, l; // (k,l) is the SA region of [i,n-1] int last_diff_pos; @@ -27,8 +27,8 @@ extern "C" { gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); void gap_destroy_stack(gap_stack_t *stack); - bwt_aln1_t *bwt_match_gap(bwt_t *const bwt[2], int len, const ubyte_t *seq[2], bwt_width_t *w[2], - bwt_width_t *seed_w[2], const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); + bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, + bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); #ifdef __cplusplus From 156852b1dddb1315166d32706032d95ab0374d8f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 16:15:16 -0400 Subject: [PATCH 043/498] no change --- bwtaln.c | 1 - 1 file changed, 1 deletion(-) diff --git a/bwtaln.c b/bwtaln.c index c5002c6..a686c92 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -162,7 +162,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT - extern uint8_t nst_nt4_table[]; char *str = (char*)calloc(strlen(prefix) + 10, 1); strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); From 098f44cd51713de22fca9db7d9a691236952e0fb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 16:26:14 -0400 Subject: [PATCH 044/498] nothing --- bwtaln.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwtaln.c b/bwtaln.c index a686c92..08a6d5c 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -111,7 +111,7 @@ void bwa_cal_sa_reg_gap(int tid, 
bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, for (j = 0; j < p->len; ++j) // we need to complement p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); - // clean up the record + // clean up the unused data in the record free(p->name); free(p->seq); free(p->rseq); free(p->qual); p->name = 0; p->seq = p->rseq = p->qual = 0; } From ec307a10e68b90f3660d614fc2269b29274d8cb2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 17:45:41 -0400 Subject: [PATCH 045/498] PE seems working; more testing needed --- bntseq.c | 1 + bwape.c | 57 +++++++++++++++++++++++++------------------------- bwase.c | 63 +++++++++++++++++++++++++++----------------------------- bwase.h | 4 +++- 4 files changed, 63 insertions(+), 62 deletions(-) diff --git a/bntseq.c b/bntseq.c index 4d532c4..1997038 100644 --- a/bntseq.c +++ b/bntseq.c @@ -294,6 +294,7 @@ int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *r } else right = mid; } *real_seq = mid; + if (len == 0) return 0; // binary search for holes left = 0; right = bns->n_holes; nn = 0; while (left < right) { diff --git a/bwape.c b/bwape.c index bf00203..8d2a695 100644 --- a/bwape.c +++ b/bwape.c @@ -9,6 +9,7 @@ #include "bntseq.h" #include "utils.h" #include "stdaln.h" +#include "bwase.h" typedef struct { int n; @@ -37,10 +38,8 @@ typedef struct { extern int g_log_n[256]; // in bwase.c static kh_64_t *g_hash; -void bwase_initialize(); void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); int bwa_approx_mapQ(const bwa_seq_t *p, int mm); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); bntseq_t *bwa_open_nt(const char *prefix); @@ 
-277,12 +276,12 @@ typedef struct { kvec_t(bwt_aln1_t) aln; } aln_buf_t; -int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, +int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii) { int i, j, cnt_chg = 0; char str[1024]; - bwt_t *bwt[2]; + bwt_t *bwt; pe_data_t *d; aln_buf_t *buf[2]; @@ -290,12 +289,10 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); - if (_bwt[0] == 0) { // load forward SA - strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]); - strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); - } else bwt[0] = _bwt[0], bwt[1] = _bwt[1]; + if (_bwt == 0) { // load forward SA + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); + } else bwt = _bwt; // SE for (i = 0; i != n_seqs; ++i) { @@ -314,16 +311,17 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa // generate SE alignment and mapping quality bwa_aln2seq(n_aln, d->aln[j].a, p[j]); if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) { + int strand; int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; - p[j]->pos = p[j]->strand? 
bwt_sa(bwt[0], p[j]->sa) - : bwt[1]->seq_len - (bwt_sa(bwt[1], p[j]->sa) + p[j]->len); p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); + p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len, &strand); + p[j]->strand = strand; } } } // infer isize - infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt[0]->seq_len); + infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2); if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii; if (opt->force_isize) { fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__); @@ -361,8 +359,11 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa poslist_t *z = &kh_val(g_hash, iter); z->n = r->l - r->k + 1; z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); - for (l = r->k; l <= r->l; ++l) - z->a[l - r->k] = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len); + for (l = r->k; l <= r->l; ++l) { + int strand; + z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); + r->a = strand; + } } for (l = 0; l < kh_val(g_hash, iter).n; ++l) { x = kh_val(g_hash, iter).a[l]; @@ -371,7 +372,9 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa } } else { // then calculate on the fly for (l = r->k; l <= r->l; ++l) { - x = r->a? bwt_sa(bwt[0], l) : bwt[1]->seq_len - (bwt_sa(bwt[1], l) + p[j]->len); + int strand; + x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); + r->a = strand; x = x<<32 | k<<1 | j; kv_push(uint64_t, d->arr, x); } @@ -389,8 +392,10 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? opt->n_multi : opt->N_multi); } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); for (k = 0; k < p[j]->n_multi; ++k) { + int strand; bwt_multi1_t *q = p[j]->multi + k; - q->pos = q->strand? 
bwt_sa(bwt[0], q->pos) : bwt[1]->seq_len - (bwt_sa(bwt[1], q->pos) + p[j]->len); + q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len, &strand); + q->strand = strand; } } } @@ -403,9 +408,7 @@ int bwa_cal_pac_pos_pe(const char *prefix, bwt_t *const _bwt[2], int n_seqs, bwa kv_destroy(buf[1][i].aln); } free(buf[0]); free(buf[1]); - if (_bwt[0] == 0) { - bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); - } + if (_bwt == 0) bwt_destroy(bwt); kv_destroy(d->arr); kv_destroy(d->pos[0]); kv_destroy(d->pos[1]); kv_destroy(d->aln[0]); kv_destroy(d->aln[1]); @@ -655,12 +658,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f khint_t iter; isize_info_t last_ii; // this is for the last batch of reads char str[1024]; - bwt_t *bwt[2]; + bwt_t *bwt; uint8_t *pac; // initialization bwase_initialize(); // initialize g_log_n[] in bwase.c - pac = 0; bwt[0] = bwt[1] = 0; + pac = 0; bwt = 0; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); bns = bns_restore(prefix); srand48(bns->seed); @@ -679,10 +682,8 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f ntbns = bwa_open_nt(prefix); } else { // for Illumina alignment only if (popt->is_preload) { - strcpy(str, prefix); strcat(str, ".bwt"); bwt[0] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt[0]); - strcpy(str, prefix); strcat(str, ".rbwt"); bwt[1] = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt[1]); + strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); + strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); rewind(bns->fp_pac); fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); @@ -702,7 +703,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f t = clock(); fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... 
\n"); - cnt_chg = bwa_cal_pac_pos_pe(prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); + cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg); @@ -746,7 +747,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); kh_destroy(64, g_hash); if (pac) { - free(pac); bwt_destroy(bwt[0]); bwt_destroy(bwt[1]); + free(pac); bwt_destroy(bwt); } } diff --git a/bwase.c b/bwase.c index 7ef4bec..8f8d9a8 100644 --- a/bwase.c +++ b/bwase.c @@ -109,54 +109,52 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm) return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; } +bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) +{ + bwtint_t pacpos; + int32_t ref_id; + pacpos = bwt_sa(bwt, sapos); + bns_coor_pac2real(bns, pacpos, 0, &ref_id); + *strand = !(ref_id&1); + /* NB: For gapped alignment, pacpos may not be correct, which will be fixed + * in refine_gapped_core(). This line also determines the way "x" is + * calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */ + if (ref_id&1) // mapped to the forward strand + pacpos = bns->anns[ref_id].len - (pacpos + len - bns->anns[ref_id].offset) + bns->anns[ref_id-1].offset; + return pacpos; +} + /** * Derive the actual position in the read from the given suffix array * coordinates. Note that the position will be approximate based on * whether indels appear in the read and whether calculations are * performed from the start or end of the read. 
*/ -void bwa_cal_pac_pos_core(const bwt_t *forward_bwt, const bwt_t *reverse_bwt, bwa_seq_t *seq, const int max_mm, const float fnr) +void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr) { - int max_diff; + int max_diff, strand; if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; max_diff = fnr > 0.0? bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; - if (seq->strand) { // reverse strand only - seq->pos = bwt_sa(forward_bwt, seq->sa); - seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - } else { // forward strand only - /* NB: For gapped alignment, p->pos may not be correct, which - * will be fixed in refine_gapped_core(). This line also - * determines the way "x" is calculated in - * refine_gapped_core() when (ext < 0 && is_end == 0). */ - seq->pos = reverse_bwt->seq_len - (bwt_sa(reverse_bwt, seq->sa) + seq->len); - seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - } + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand); + seq->strand = strand; + seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); } -void bwa_cal_pac_pos(const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) +void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) { - int i, j; + int i, j, strand; char str[1024]; bwt_t *bwt; // load forward SA strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); for (i = 0; i != n_seqs; ++i) { - if (seqs[i].strand) bwa_cal_pac_pos_core(bwt, 0, &seqs[i], max_mm, fnr); - for (j = 0; j < seqs[i].n_multi; ++j) { - bwt_multi1_t *p = seqs[i].multi + j; - if (p->strand) p->pos = bwt_sa(bwt, p->pos); - } - } - bwt_destroy(bwt); - // load reverse BWT and SA - strcpy(str, prefix); strcat(str, ".rbwt"); bwt = bwt_restore_bwt(str); - strcpy(str, 
prefix); strcat(str, ".rsa"); bwt_restore_sa(str, bwt); - for (i = 0; i != n_seqs; ++i) { - if (!seqs[i].strand) bwa_cal_pac_pos_core(0, bwt, &seqs[i], max_mm, fnr); + bwa_cal_pac_pos_core(bns, bwt, &seqs[i], max_mm, fnr); for (j = 0; j < seqs[i].n_multi; ++j) { bwt_multi1_t *p = seqs[i].multi + j; - if (!p->strand) p->pos = bwt->seq_len - (bwt_sa(bwt, p->pos) + seqs[i].len); + p->pos = bwa_sa2pos(bns, bwt, p->pos, seqs[i].len, &strand); + p->strand = strand; } } bwt_destroy(bwt); @@ -174,7 +172,7 @@ static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, in int l = 0, path_len, ref_len; AlnParam ap = aln_param_bwa; path_t *path; - int64_t k, __pos = *_pos > l_pac? (int64_t)((int32_t)*_pos) : *_pos; + int64_t k, __pos = *_pos; ref_len = len + abs(ext); if (ext > 0) { @@ -192,7 +190,7 @@ static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, in aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); cigar = bwa_aln_path2cigar(path, path_len, n_cigar); - if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped on the forward strand + if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand for (l = k = 0; k < *n_cigar; ++k) { if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); @@ -238,8 +236,7 @@ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_ } else ++u; } x += l; y += l; -/* } else if (cigar[k]>>14 == FROM_I || cigar[k]>>14 == 3) { */ - } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) { + } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) { y += l; if (__cigar_op(cigar[k]) == FROM_I) nm += l; } else if (__cigar_op(cigar[k]) == FROM_D) { @@ -631,7 +628,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f } fprintf(stderr, "[bwa_aln_core] convert to sequence 
coordinate... "); - bwa_cal_pac_pos(prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here + bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); diff --git a/bwase.h b/bwase.h index 28ba224..f8e9b0a 100644 --- a/bwase.h +++ b/bwase.h @@ -12,13 +12,15 @@ extern "C" { // Initialize mapping tables in the bwa single-end mapper. void bwase_initialize(); // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. - void bwa_cal_pac_pos_core(const bwt_t* forward_bwt, const bwt_t* reverse_bwt, bwa_seq_t* seq, const int max_mm, const float fnr); + void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); // Refine the approximate position of the sequence to an actual placement for the sequence. void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); // Backfill certain alignment properties mainly centering around number of matches. void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. 
int64_t pos_end(const bwa_seq_t *p); + // + bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); #ifdef __cplusplus } From 66629512d914c4d9a9add880cb9bda2e760647f5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Oct 2011 23:49:03 -0400 Subject: [PATCH 046/498] more carefully computed the availableWords --- bwt_gen.c | 33 ++++++++++++++------------------- bwtindex.c | 4 ---- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/bwt_gen.c b/bwt_gen.c index 5caaf5f..b9568e9 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -345,17 +345,14 @@ BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) return bwt; } -BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, - const unsigned int initialMaxBuildSize, const unsigned int incMaxBuildSize) +BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize) { BWTInc *bwtInc; - unsigned int i; + unsigned int i, n_iter; + + if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; + if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; - if (targetNBit == 0) { - fprintf(stderr, "BWTIncCreate() : targetNBit = 0!\n"); - exit(1); - } - bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); bwtInc->numberOfIterationDone = 0; bwtInc->bwt = BWTCreate(textLength, NULL); @@ -369,16 +366,15 @@ BWTInc *BWTIncCreate(const unsigned int textLength, const float targetNBit, for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; - bwtInc->availableWord = (bgint_t)((textLength + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL / BITS_IN_WORD * targetNBit); + n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1; + bwtInc->availableWord = BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement + + OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array + + incMaxBuildSize/5 * 3 * 
(sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small - if (bwtInc->availableWord < BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength)) { - fprintf(stderr, "BWTIncCreate() : targetNBit is too low!\n"); - exit(1); - } + fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); return bwtInc; - } // for BWTIncConstruct() static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, @@ -699,7 +695,7 @@ static void BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, cons int stackDepth; int64_t i, j; bgint_t tempSeq, tempKey; - int numberOfEqualKey; + int64_t numberOfEqualKey; if (numItem < 2) return; @@ -1432,8 +1428,7 @@ static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) } -BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetNBit, - bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) +BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) { FILE *packedFile; @@ -1457,7 +1452,7 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, const float targetN fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); - bwtInc = BWTIncCreate(totalTextLength, targetNBit, initialMaxBuildSize, incMaxBuildSize); + bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); BWTIncSetBuildSizeAndTextAddr(bwtInc); @@ -1545,7 +1540,7 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) { BWTInc *bwtInc; 
- bwtInc = BWTIncConstructFromPacked(fn_pac, 3.7, 10000000, 10000000); + bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000); printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); BWTIncFree(bwtInc); diff --git a/bwtindex.c b/bwtindex.c index 2dadea3..8672687 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -100,10 +100,6 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } } - if (l_pac > 0xffffffffu) { - fprintf(stderr, "[%s] BWA only works with reference sequences shorter than 4GB in total. Abort!\n", __func__); - return 1; - } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { strcpy(str, prefix); strcat(str, ".pac"); From fe9da3c7043cf13fd6377622fa14ac768f9b22ac Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Oct 2011 10:57:12 -0400 Subject: [PATCH 047/498] allow to pac forward strand only --- bntseq.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/bntseq.c b/bntseq.c index 1997038..a12b95c 100644 --- a/bntseq.c +++ b/bntseq.c @@ -211,7 +211,7 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int * bns->l_pac += seq->seq.l; } -int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) +int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) { extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c kseq_t *seq; @@ -241,8 +241,10 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) for (i = 0; i < seq->seq.l; ++i) // convert to 2-bit encoding seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); - seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1); - add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); + if (!for_only) { + seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1); + add1(seq, 
bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); + } } xassert(bns->l_pac, "zero length sequence."); ret = bns->l_pac; @@ -267,13 +269,19 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix) int bwa_fa2pac(int argc, char *argv[]) { + int c, for_only = 0; gzFile fp; - if (argc < 2) { - fprintf(stderr, "Usage: bwa fa2pac []\n"); + while ((c = getopt(argc, argv, "f")) >= 0) { + switch (c) { + case 'f': for_only = 1; break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: bwa fa2pac [-f] []\n"); return 1; } - fp = xzopen(argv[1], "r"); - bns_fasta2bntseq(fp, (argc < 3)? argv[1] : argv[2]); + fp = xzopen(argv[optind], "r"); + bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only); gzclose(fp); return 0; } From 1cb409aaf2d4e47069703b8961a5da41d43bfd23 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Oct 2011 12:03:14 -0400 Subject: [PATCH 048/498] use forward-only pac to reduce memory --- bntseq.c | 44 ++++++++++++++++++++++++++------------------ bntseq.h | 6 ++++-- bwase.c | 21 ++++++++++----------- bwtindex.c | 12 ++++++++++-- bwtsw2_aux.c | 4 ++-- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/bntseq.c b/bntseq.c index a12b95c..15b369f 100644 --- a/bntseq.c +++ b/bntseq.c @@ -286,36 +286,44 @@ int bwa_fa2pac(int argc, char *argv[]) return 0; } -int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq) +int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev) { - int left, mid, right, nn; - if (pac_coor >= bns->l_pac) - err_fatal("bns_coor_pac2real", "bug! Coordinate is longer than sequence (%lld>=%lld).", pac_coor, bns->l_pac); - // binary search for the sequence ID. Note that this is a bit different from the following one... + int left, mid, right; + is_fr = is_fr? 
1 : 0; left = 0; mid = 0; right = bns->n_seqs; while (left < right) { mid = (left + right) >> 1; - if (pac_coor >= bns->anns[mid].offset) { + if (pos >= bns->anns[mid].offset<n_seqs - 1) break; - if (pac_coor < bns->anns[mid+1].offset) break; + if (pos < bns->anns[mid+1].offset<anns[mid]; + if (pos - (p->offset<<1) < p->len) *is_rev = 0, pos -= p->offset; + else *is_rev = 1, pos = p->len - (pos - (p->offset<<1) - p->len) + p->offset; + } + return pos; +} + +int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) +{ + int left, mid, right, nn; + if (ref_id) bns_pos2refId(bns, pos_f, 0, ref_id, 0); left = 0; right = bns->n_holes; nn = 0; while (left < right) { - int64_t mid = (left + right) >> 1; - if (pac_coor >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; - else if (pac_coor + len <= bns->ambs[mid].offset) right = mid; + mid = (left + right) >> 1; + if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; + else if (pos_f + len <= bns->ambs[mid].offset) right = mid; else { // overlap - if (pac_coor >= bns->ambs[mid].offset) { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? - bns->ambs[mid].offset + bns->ambs[mid].len - pac_coor : len; + if (pos_f >= bns->ambs[mid].offset) { + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? + bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len; } else { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pac_coor + len? - bns->ambs[mid].len : len - (bns->ambs[mid].offset - pac_coor); + nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? 
+ bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f); } break; } diff --git a/bntseq.h b/bntseq.h index 189e017..276ef64 100644 --- a/bntseq.h +++ b/bntseq.h @@ -70,8 +70,10 @@ extern "C" { bntseq_t *bns_restore(const char *prefix); bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); - int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix); - int bns_coor_pac2real(const bntseq_t *bns, int64_t pac_coor, int len, int32_t *real_seq); + int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev); + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + #ifdef __cplusplus } diff --git a/bwase.c b/bwase.c index 8f8d9a8..fda4752 100644 --- a/bwase.c +++ b/bwase.c @@ -111,17 +111,16 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm) bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) { - bwtint_t pacpos; - int32_t ref_id; - pacpos = bwt_sa(bwt, sapos); - bns_coor_pac2real(bns, pacpos, 0, &ref_id); - *strand = !(ref_id&1); + bwtint_t pos_fr, pos_f; + int is_rev, ref_id; + pos_fr = bwt_sa(bwt, sapos); + pos_f = bns_pos2refId(bns, pos_fr, 1, &ref_id, &is_rev); // pos_f + *strand = !is_rev; /* NB: For gapped alignment, pacpos may not be correct, which will be fixed * in refine_gapped_core(). This line also determines the way "x" is * calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */ - if (ref_id&1) // mapped to the forward strand - pacpos = bns->anns[ref_id].len - (pacpos + len - bns->anns[ref_id].offset) + bns->anns[ref_id-1].offset; - return pacpos; + if (is_rev) pos_f = pos_f < len? 
0 : pos_f - len; // mapped to the forward strand + return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } /** @@ -423,7 +422,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment // get seqid - nn = bns_coor_pac2real(bns, p->pos, j, &seqid); + nn = bns_cnt_ambi(bns, p->pos, j, &seqid); if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len) flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences @@ -450,7 +449,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in long long isize; am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality // redundant calculation here, but should not matter too much - m_is_N = bns_coor_pac2real(bns, mate->pos, mate->len, &m_seqid); + m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; if (p->type == BWA_TYPE_NO_MATCH) isize = 0; @@ -493,7 +492,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in bwt_multi1_t *q = p->multi + i; int k; j = pos_end_multi(q, p->len) - q->pos; - nn = bns_coor_pac2real(bns, q->pos, j, &seqid); + nn = bns_cnt_ambi(bns, q->pos, j, &seqid); err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', (int)(q->pos - bns->anns[seqid].offset + 1)); if (q->cigar) { diff --git a/bwtindex.c b/bwtindex.c index 8672687..8d40245 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -80,7 +80,7 @@ int bwa_index(int argc, char *argv[]) gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... 
"); - l_pac = bns_fasta2bntseq(fp, prefix); + l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); } else { // color indexing @@ -88,7 +88,7 @@ int bwa_index(int argc, char *argv[]) strcat(strcpy(str, prefix), ".nt"); t = clock(); fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); - l_pac = bns_fasta2bntseq(fp, str); + l_pac = bns_fasta2bntseq(fp, str, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); { @@ -126,6 +126,14 @@ int bwa_index(int argc, char *argv[]) bwt_destroy(bwt); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); } + { + gzFile fp = xzopen(argv[optind], "r"); + t = clock(); + fprintf(stderr, "[bwa_index] Pack forward-only FASTA... "); + l_pac = bns_fasta2bntseq(fp, prefix, 1); + fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); + gzclose(fp); + } { bwt_t *bwt; strcpy(str, prefix); strcat(str, ".bwt"); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 8ba6455..fe8b96b 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -314,7 +314,7 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n // FIXME: this routine does not work if the query bridge three reference sequences int32_t coor, refl, lq; int x, y, i, seqid; - bns_coor_pac2real(bns, p->k, p->len, &seqid); + bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; refl = bns->anns[seqid].len; x = coor; y = 0; @@ -404,7 +404,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks int beg, end; if (p->l == 0) { b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]); - nn = bns_coor_pac2real(bns, p->k, p->len, &seqid); + nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; } ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10); From 26b77eabefb3b756f9a8ca25aec33e1c84960aa6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Oct 
2011 12:32:00 -0400 Subject: [PATCH 049/498] updated version number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 0e5ee8a..1e91c17 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.5.9-r26-dev" +#define PACKAGE_VERSION "0.6.0-r46-dev" #endif static int usage() From b3397a1f1491b7513c8cc0b6c3d087a8c3481b9b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 21 Oct 2011 13:32:45 -0400 Subject: [PATCH 050/498] changes to bwa-sw for the 64-bit support; unfinish --- bwtsw2.h | 3 ++- bwtsw2_aux.c | 13 +++++++------ bwtsw2_core.c | 33 +++++++++++++++++++++------------ 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index d5dbe71..0a0571f 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -14,7 +14,8 @@ typedef struct { } bsw2opt_t; typedef struct { - uint32_t k, l, flag:18, n_seeds:14; + bwtint_t k, l; + uint32_t flag:18, n_seeds:14; int len, G, G2; int beg, end; } bsw2hit_t; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index fe8b96b..ced8b54 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -77,7 +77,7 @@ void bsw2_destroy(bwtsw2_t *b) #define __rpac(pac, l, i) (pac[(l-i-1)>>2] >> (~(l-i-1)&3)*2 & 0x3) -void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem) +void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t *_mem) { int i, matrix[25]; bwtint_t k; @@ -128,10 +128,10 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq free(query); free(target); } -void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, uint32_t l_pac, int is_rev, uint8_t *_mem) +void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t *_mem) { int i, matrix[25]; - 
uint32_t k; + bwtint_t k; uint8_t *target; AlnParam par; @@ -189,7 +189,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pa for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; uint8_t *query; - uint32_t k; + bwtint_t k; int score, path_len, beg, end; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; @@ -223,7 +223,7 @@ void bsw2_debug_hits(const bwtsw2_t *b) for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; if (p->l == 0) - printf("%d, %d, %d, %u, %u\n", p->G, p->beg, p->end, p->k, p->l); + printf("%d, %d, %d, %lu, %lu\n", p->G, p->beg, p->end, (long)p->k, (long)p->l); } } @@ -328,7 +328,8 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n lq = y; // length of the query sequence if (x > refl) { // then fix it int j, nc, mq[2], nlen[2]; - uint32_t *cn, kk = 0; + uint32_t *cn; + bwtint_t kk = 0; nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0; cn = calloc(n_cigar + 3, 4); x = coor; y = 0; diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 4d5984c..c3431cf 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -8,7 +8,15 @@ #include "bwt.h" #include "kvec.h" +typedef struct { + bwtint_t k, l; +} qintv_t; + +#define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l) +#define qintv_hash(a) ((a).k>>7^(a).l<<17) + #include "khash.h" +KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq) KHASH_MAP_INIT_INT64(64, uint64_t) #define MINUS_INF -0x3fffffff @@ -17,7 +25,7 @@ KHASH_MAP_INIT_INT64(64, uint64_t) struct __mempool_t; static void mp_destroy(struct __mempool_t*); typedef struct { - uint32_t qk, ql; + bwtint_t qk, ql; int I, D, G; uint32_t pj:2, qlen:30; int tlen; @@ -34,7 +42,7 @@ static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF typedef struct { int n, max; - uint32_t tk, tl; + uint32_t tk, tl; // this is fine bsw2cell_t *array; } bsw2entry_t, *bsw2entry_p; @@ -87,7 +95,7 @@ static void mp_destroy(struct __mempool_t *mp) static khash_t(64) 
*bsw2_connectivity(const bwtl_t *b) { khash_t(64) *h; - uint32_t k, l, cntk[4], cntl[4]; + uint32_t k, l, cntk[4], cntl[4]; // this is fine uint64_t x; khiter_t iter; int j, ret; @@ -144,17 +152,17 @@ static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux) } } // remove duplicated cells -static inline void remove_duplicate(bsw2entry_t *u, khash_t(64) *hash) +static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash) { int i, ret, j; khiter_t k; - uint64_t key; - kh_clear(64, hash); + qintv_t key; + kh_clear(qintv, hash); for (i = 0; i != u->n; ++i) { bsw2cell_t *p = u->array + i; if (p->ql == 0) continue; - key = (uint64_t)p->qk << 32 | p->ql; - k = kh_put(64, hash, key, &ret); + key.k = p->qk; key.l = p->ql; + k = kh_put(qintv, hash, key, &ret); j = -1; if (ret == 0) { if ((uint32_t)kh_value(hash, k) >= p->G) j = i; @@ -211,7 +219,7 @@ static inline double time_elapse(const struct rusage *curr, const struct rusage static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u) { int i; - uint32_t k; + uint32_t k; // this is fine for (i = 0; i < u->n; ++i) { bsw2cell_t *p = u->array + i; if (p->G < thres) continue; @@ -432,7 +440,8 @@ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *qu bwtsw2_t *b, *b1, **b_ret; int i, j, score_mat[16], *heap, heap_size, n_tot = 0; struct rusage curr, last; - khash_t(64) *rhash, *chash; + khash_t(qintv) *rhash; + khash_t(64) *chash; // initialize connectivity hash (chash) chash = bsw2_connectivity(target); @@ -441,7 +450,7 @@ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *qu for (j = 0; j != 4; ++j) score_mat[i<<2|j] = (i == j)? 
opt->a : -opt->b; // initialize other variables - rhash = kh_init(64); + rhash = kh_init(qintv); init_bwtsw2(target, query, stack); heap_size = opt->z; heap = calloc(heap_size, sizeof(int)); @@ -587,7 +596,7 @@ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *qu //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); // free free(heap); - kh_destroy(64, rhash); + kh_destroy(qintv, rhash); kh_destroy(64, chash); stack->pending.n = stack->stack0.n = 0; return b_ret; From 29c3acfb3181e6b247c75fe5345f47720967b26c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 09:36:52 -0400 Subject: [PATCH 051/498] not segfault immediately; but buggy --- bwtsw2.h | 6 +++--- bwtsw2_aux.c | 48 ++++++++++++++++++++++++++++++++---------------- bwtsw2_core.c | 22 ++++++++++++---------- bwtsw2_main.c | 10 ++++------ 4 files changed, 51 insertions(+), 35 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 0a0571f..3c93509 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -15,7 +15,7 @@ typedef struct { typedef struct { bwtint_t k, l; - uint32_t flag:18, n_seeds:14; + uint32_t flag:18, n_seeds:13, is_rev:1; int len, G, G2; int beg, end; } bsw2hit_t; @@ -38,8 +38,8 @@ extern "C" { #endif bsw2opt_t *bsw2_init_opt(); - bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn); + bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); + void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn); void bsw2_destroy(bwtsw2_t *b); bsw2global_t *bsw2_global_init(); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index ced8b54..9836435 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -42,7 +42,7 @@ unsigned char nt_comp_table[256] = { 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 
'N','N','N','N' }; -extern int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS); +extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); bsw2opt_t *bsw2_init_opt() @@ -253,25 +253,41 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool) { extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); - bwtsw2_t *b[2], **bb[2]; - int k; + bwtsw2_t *b[2], **bb[2], **_b, *p; + int k, j; + bwtl_t *query; + query = bwtl_seq2bwtl(l, seq[0]); + _b = bsw2_core(bns, opt, query, target, pool); + bwtl_destroy(query); for (k = 0; k < 2; ++k) { - bwtl_t *query = bwtl_seq2bwtl(l, seq[k]); - bb[k] = bsw2_core(opt, query, target, pool); - bwtl_destroy(query); + bb[k] = calloc(2, sizeof(void*)); + bb[k][0] = calloc(1, sizeof(bwtsw2_t)); + bb[k][1] = calloc(1, sizeof(bwtsw2_t)); + } + fprintf(stderr, "here!\n"); + for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand + for (j = 0; j < _b[k]->n; ++j) { + p = bb[k][_b[k]->hits[j].is_rev]; + if (p->n == p->max) { + p->max = p->max? 
p->max<<1 : 8; + p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); + } + p->hits[p->n++] = _b[k]->hits[j]; + } } b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" bsw2_chain_filter(opt, l, b); for (k = 0; k < 2; ++k) { bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here - bsw2_resolve_duphits(0, bb[k][0], 0); + bsw2_resolve_duphits(0, 0, bb[k][0], 0); bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); b[k] = bb[k][0]; free(bb[k]); } merge_hits(b, l, 1); // again, b[1] is merged to b[0] bsw2_resolve_query_overlaps(b[0], opt->mask_level); + bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b); return b[0]; } @@ -453,7 +469,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks /* Core routine to align reads in _seq. It is separated from * process_seqs() to realize multi-threading */ -static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2]) +static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) { int x; bsw2opt_t opt = *_opt; @@ -502,11 +518,11 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const free(seq[0]); continue; } // alignment - b[0] = bsw2_aln1_core(&opt, bns, pac, target[0], l, seq, 0, pool); + b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, 0, pool); for (k = 0; k < b[0]->n; ++k) if (b[0]->hits[k].n_seeds < opt.t_seeds) break; - if (k < b[0]->n) { - b[1] = bsw2_aln1_core(&opt, bns, pac, target[1], l, rseq, 1, pool); + if (0 && k < b[0]->n) { + b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, 1, pool); for (i = 0; i < b[1]->n; ++i) { bsw2hit_t *p = b[1]->hits + i; int x = p->beg; @@ -516,7 +532,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const } 
flag_fr(b); merge_hits(b, l, 0); - bsw2_resolve_duphits(0, b[0], 0); + bsw2_resolve_duphits(0, 0, b[0], 0); bsw2_resolve_query_overlaps(b[0], opt.mask_level); } else b[1] = 0; // generate CIGAR and print SAM @@ -536,7 +552,7 @@ typedef struct { const bsw2opt_t *_opt; const bntseq_t *bns; uint8_t *pac; - bwt_t *target[2]; + bwt_t *target; } thread_aux_t; /* another interface to bsw2_aln_core() to facilitate pthread_create() */ @@ -550,7 +566,7 @@ static void *worker(void *data) /* process sequences stored in _seq, generate SAM lines for these * sequences and reset _seq afterwards. */ -static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target[2]) +static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) { int i; @@ -569,7 +585,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; - p->pac = pac; p->target[0] = target[0]; p->target[1] = target[1]; + p->pac = pac; p->target = target; pthread_create(&tid[j], &attr, worker, p); } for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); @@ -591,7 +607,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * _seq->n = 0; } -void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target[2], const char *fn) +void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn) { gzFile fp; kseq_t *ks; diff --git a/bwtsw2_core.c b/bwtsw2_core.c index c3431cf..0440e93 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -266,14 +266,14 @@ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, i } /* after this, "narrow SA hits" will be expanded and the coordinates * will be obtained and stored in b->hits[*].k. 
*/ -int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS) +int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) { - int i, j, n; + int i, j, n, ref_id, is_rev; if (b->n == 0) return 0; - if (bwt) { // convert to chromosomal coordinates if suitable + if (bwt && bns) { // convert to chromosomal coordinates if suitable int old_n = b->n; bsw2hit_t *old_hits = b->hits; - for (i = n = 0; i < b->n; ++i) { + for (i = n = 0; i < b->n; ++i) { // compute memory needed to be allocated bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; else if (p->G > 0) ++n; @@ -282,19 +282,21 @@ int bsw2_resolve_duphits(const bwt_t *bwt, bwtsw2_t *b, int IS) b->hits = calloc(b->max, sizeof(bsw2hit_t)); for (i = j = 0; i < old_n; ++i) { bsw2hit_t *p = old_hits + i; - if (p->l - p->k + 1 <= IS) { + if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive bwtint_t k; for (k = p->k; k <= p->l; ++k) { b->hits[j] = *p; - b->hits[j].k = bwt_sa(bwt, k); + b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev); b->hits[j].l = 0; + b->hits[j].is_rev = is_rev; ++j; } } else if (p->G > 0) { b->hits[j] = *p; - b->hits[j].k = bwt_sa(bwt, p->k); + b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, p->k), 1, &ref_id, &is_rev); b->hits[j].l = 0; b->hits[j].flag |= 1; + b->hits[j].is_rev = is_rev; ++j; } } @@ -434,7 +436,7 @@ static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s stack_push0(s, u); } /* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) 
*/ -bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) +bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) { bsw2stack_t *stack = (bsw2stack_t*)pool->stack; bwtsw2_t *b, *b1, **b_ret; @@ -591,8 +593,8 @@ bwtsw2_t **bsw2_core(const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *qu mp_free(stack->pool, v); } // while(top) getrusage(0, &curr); - bsw2_resolve_duphits(query, b, opt->is); - bsw2_resolve_duphits(query, b1, opt->is); + bsw2_resolve_duphits(bns, query, b, opt->is); + bsw2_resolve_duphits(bns, query, b1, opt->is); //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); // free free(heap); diff --git a/bwtsw2_main.c b/bwtsw2_main.c index afbad2e..281efb1 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -10,7 +10,7 @@ int bwa_bwtsw2(int argc, char *argv[]) { bsw2opt_t *opt; - bwt_t *target[2]; + bwt_t *target; char buf[1024]; bntseq_t *bns; int c; @@ -82,16 +82,14 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - strcpy(buf, argv[optind]); target[0] = bwt_restore_bwt(strcat(buf, ".bwt")); - strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target[0]); - strcpy(buf, argv[optind]); target[1] = bwt_restore_bwt(strcat(buf, ".rbwt")); - strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".rsa"), target[1]); + strcpy(buf, argv[optind]); target = bwt_restore_bwt(strcat(buf, ".bwt")); + strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target); bns = bns_restore(argv[optind]); bsw2_aln(opt, bns, target, argv[optind+1]); bns_destroy(bns); - bwt_destroy(target[0]); bwt_destroy(target[1]); + bwt_destroy(target); free(opt); return 0; From 4c43c5914df181c746d7326e49c1e5e071cf08c4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 11:50:11 -0400 Subject: [PATCH 052/498] this is better; but still buggy --- bwtsw2_aux.c | 22 
++++++++++++++++------ bwtsw2_core.c | 1 + bwtsw2_main.c | 1 - 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 9836435..9dd8684 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -223,7 +223,7 @@ void bsw2_debug_hits(const bwtsw2_t *b) for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; if (p->l == 0) - printf("%d, %d, %d, %lu, %lu\n", p->G, p->beg, p->end, (long)p->k, (long)p->l); + printf("G=%d, [%d,%d), k=%lu, l=%lu\n", p->G, p->beg, p->end, (long)p->k, (long)p->l); } } @@ -264,21 +264,31 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 bb[k][0] = calloc(1, sizeof(bwtsw2_t)); bb[k][1] = calloc(1, sizeof(bwtsw2_t)); } - fprintf(stderr, "here!\n"); for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand for (j = 0; j < _b[k]->n; ++j) { - p = bb[k][_b[k]->hits[j].is_rev]; + bsw2hit_t *q; + p = bb[_b[k]->hits[j].is_rev][k]; if (p->n == p->max) { p->max = p->max? p->max<<1 : 8; p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); } - p->hits[p->n++] = _b[k]->hits[j]; + q = &p->hits[p->n++]; + *q = _b[k]->hits[j]; + if (_b[k]->hits[j].is_rev) { + int x = q->beg; + q->beg = l - q->end; + q->end = l - x; + q->k -= q->len; + } } } + //bsw2_debug_hits(bb[0][1]); + bsw2_debug_hits(bb[1][1]); b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" bsw2_chain_filter(opt, l, b); for (k = 0; k < 2; ++k) { bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); + if (k == 1) bsw2_debug_hits(bb[k][1]); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here bsw2_resolve_duphits(0, 0, bb[k][0], 0); bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); @@ -510,8 +520,8 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled seq[0][i] = c; seq[1][l-1-i] = 3 - c; - rseq[0][l-1-i] 
= c; - rseq[1][i] = 3 - c; + rseq[0][l-1-i] = 3 - c; + rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases print_hits(bns, &opt, p, 0); diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 0440e93..244e121 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -284,6 +284,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive bwtint_t k; + if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; for (k = p->k; k <= p->l; ++k) { b->hits[j] = *p; b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev); diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 281efb1..3654372 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -48,7 +48,6 @@ int bwa_bwtsw2(int argc, char *argv[]) // fprintf(stderr, " -y FLOAT error recurrence coef. (4..16) [%.1f]\n", opt->yita); fprintf(stderr, "\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -s INT size of a chunk of reads [%d]\n", opt->chunk_size); fprintf(stderr, "\n"); fprintf(stderr, " -w INT band width [%d]\n", opt->bw); fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); From 8512b55ce32122ebfd78328ca0b65dc2dab66127 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 13:42:32 -0400 Subject: [PATCH 053/498] bwasw works on a couple of sequences --- bwtsw2_aux.c | 55 ++++++++++++++++----------------------------------- bwtsw2_core.c | 21 +++++++++++++------- 2 files changed, 31 insertions(+), 45 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 9dd8684..8f25734 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -75,9 +75,7 @@ void bsw2_destroy(bwtsw2_t *b) (par).row = 5; (par).band_width = opt->bw; \ } while (0) -#define __rpac(pac, l, i) (pac[(l-i-1)>>2] >> (~(l-i-1)&3)*2 & 0x3) - -void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t 
*_mem) +void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { int i, matrix[25]; bwtint_t k; @@ -103,19 +101,14 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (j = score = 0; j < i; ++j) { bsw2hit_t *q = b->hits + j; if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) { - if (q->n_seeds < (1<<14) - 2) ++q->n_seeds; + if (q->n_seeds < (1<<13) - 2) ++q->n_seeds; ++score; } } if (score) continue; if (lt > p->k) lt = p->k; - if (is_rev) { - for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! - target[j++] = __rpac(pac, l_pac, k); - } else { - for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! - target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; - } + for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! + target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); if (score > p->G) { // extensible @@ -128,7 +121,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq free(query); free(target); } -void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, int is_rev, uint8_t *_mem) +void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { int i, matrix[25]; bwtint_t k; @@ -144,13 +137,8 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, int j, score; path_t path; if (p->l) continue; - if (is_rev) { - for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) - target[j++] = __rpac(pac, l_pac, k); - } else { - for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) - target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; - } + for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) + target[j++] = pac[k>>2] >> (~k&3)*2 & 
0x3; lt = j; score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); @@ -222,8 +210,8 @@ void bsw2_debug_hits(const bwtsw2_t *b) printf("# raw hits: %d\n", b->n); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; - if (p->l == 0) - printf("G=%d, [%d,%d), k=%lu, l=%lu\n", p->G, p->beg, p->end, (long)p->k, (long)p->l); + if (p->G > 0) + printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); } } @@ -250,7 +238,7 @@ static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) } /* seq[0] is the forward sequence and seq[1] is the reverse complement. */ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, - int l, uint8_t *seq[2], int is_rev, bsw2global_t *pool) + int l, uint8_t *seq[2], bsw2global_t *pool) { extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); bwtsw2_t *b[2], **bb[2], **_b, *p; @@ -278,20 +266,16 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 int x = q->beg; q->beg = l - q->end; q->end = l - x; - q->k -= q->len; } } } - //bsw2_debug_hits(bb[0][1]); - bsw2_debug_hits(bb[1][1]); b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" bsw2_chain_filter(opt, l, b); for (k = 0; k < 2; ++k) { - bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); - if (k == 1) bsw2_debug_hits(bb[k][1]); + bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here bsw2_resolve_duphits(0, 0, bb[k][0], 0); - bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, is_rev, pool->aln_mem); + bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem); b[k] = bb[k][0]; free(bb[k]); } @@ -528,18 +512,13 @@ static 
void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const free(seq[0]); continue; } // alignment - b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, 0, pool); + b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool); for (k = 0; k < b[0]->n; ++k) if (b[0]->hits[k].n_seeds < opt.t_seeds) break; - if (0 && k < b[0]->n) { - b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, 1, pool); - for (i = 0; i < b[1]->n; ++i) { - bsw2hit_t *p = b[1]->hits + i; - int x = p->beg; - p->beg = l - p->end; - p->end = l - x; - if (p->l == 0) p->k = bns->l_pac - (p->k + p->len); - } + if (k < b[0]->n) { + b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool); + for (i = 0; i < b[1]->n; ++i) // flip the strand flag + b[1]->hits[i].flag ^= 0x10, b[1]->hits[i].is_rev ^= 1; flag_fr(b); merge_hits(b, l, 0); bsw2_resolve_duphits(0, 0, b[0], 0); diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 244e121..f7b72de 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -35,7 +35,7 @@ typedef struct { #include "ksort.h" KSORT_INIT_GENERIC(int) -#define __hitG_lt(a, b) ((a).G > (b).G) +#define __hitG_lt(a, b) ((a).n_seeds > (b).n_seeds || ((a).n_seeds == (b).n_seeds && (a).G > (b).G)) KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; @@ -270,10 +270,10 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int { int i, j, n, ref_id, is_rev; if (b->n == 0) return 0; - if (bwt && bns) { // convert to chromosomal coordinates if suitable + if (bwt && bns) { // convert to chromosomal coordinates if requested int old_n = b->n; bsw2hit_t *old_hits = b->hits; - for (i = n = 0; i < b->n; ++i) { // compute memory needed to be allocated + for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; else if (p->G > 0) ++n; @@ -290,6 +290,7 @@ int 
bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev); b->hits[j].l = 0; b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len; ++j; } } else if (p->G > 0) { @@ -298,25 +299,28 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int b->hits[j].l = 0; b->hits[j].flag |= 1; b->hits[j].is_rev = is_rev; + if (is_rev) b->hits[j].k -= p->len; ++j; } } free(old_hits); } + for (i = j = 0; i < b->n; ++i) // squeeze out empty elements + if (b->hits[i].G) b->hits[j++] = b->hits[i]; + b->n = j; ks_introsort(hitG, b->n, b->hits); for (i = 1; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; - if (p->G == 0) break; for (j = 0; j < i; ++j) { bsw2hit_t *q = b->hits + j; int compatible = 1; - if (q->G == 0) continue; + if (p->is_rev != q->is_rev) continue; // hits from opposite strands are not duplicates if (p->l == 0 && q->l == 0) { - int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); + int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap if (qol < 0) qol = 0; if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) { int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) - - (int64_t)(p->k > q->k? p->k : q->k); + - (int64_t)(p->k > q->k? 
p->k : q->k); // length of target overlap if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL) compatible = 0; } @@ -594,6 +598,9 @@ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *ta mp_free(stack->pool, v); } // while(top) getrusage(0, &curr); + for (i = 0; i < 2; ++i) + for (j = 0; j < b_ret[i]->n; ++j) + b_ret[i]->hits[j].n_seeds = 0; bsw2_resolve_duphits(bns, query, b, opt->is); bsw2_resolve_duphits(bns, query, b1, opt->is); //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); From b204437c7e5f17a13ab40c7d2d992f702bd48451 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 14:00:36 -0400 Subject: [PATCH 054/498] fixed negative mapping quality; still byggy --- bwtsw2_aux.c | 1 + bwtsw2_core.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 8f25734..5637b70 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -428,6 +428,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks if (p->n_seeds < 2) c *= .2; qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); if (qual > 250) qual = 250; + if (qual < 0) qual = 0; if (p->flag&1) qual = 0; } ksprintf(&str, "\t%d\t", qual); diff --git a/bwtsw2_core.c b/bwtsw2_core.c index f7b72de..cc3dc63 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -35,7 +35,7 @@ typedef struct { #include "ksort.h" KSORT_INIT_GENERIC(int) -#define __hitG_lt(a, b) ((a).n_seeds > (b).n_seeds || ((a).n_seeds == (b).n_seeds && (a).G > (b).G)) +#define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2)) KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; From eed46e802a9852c1c767f9d761412b43270d01a7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 14:09:05 -0400 Subject: [PATCH 055/498] fixed a bug about reverse 
alignment --- bwtsw2_aux.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 5637b70..ce4324d 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -518,8 +518,13 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const if (b[0]->hits[k].n_seeds < opt.t_seeds) break; if (k < b[0]->n) { b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool); - for (i = 0; i < b[1]->n; ++i) // flip the strand flag - b[1]->hits[i].flag ^= 0x10, b[1]->hits[i].is_rev ^= 1; + for (i = 0; i < b[1]->n; ++i) { + bsw2hit_t *p = &b[1]->hits[i]; + int x = p->beg; + p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand + p->beg = l - p->end; + p->end = l - x; + } flag_fr(b); merge_hits(b, l, 0); bsw2_resolve_duphits(0, 0, b[0], 0); From 1f970b45570378e6f1f30b0b0bd2c35ea8f345eb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 14:14:42 -0400 Subject: [PATCH 056/498] updated revision number --- bwa.1 | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwa.1 b/bwa.1 index c82fdc7..85ebf04 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "24 January 2011" "bwa-0.5.9" "Bioinformatics tools" +.TH bwa 1 "24 October 2011" "bwa-0.6.0" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index 1e91c17..dbb8faa 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r46-dev" +#define PACKAGE_VERSION "0.6.0-r53-dev" #endif static int usage() From 8f3c7805526b82545c6eaf240a6b2b92a8b9dccb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 14:22:39 -0400 Subject: [PATCH 057/498] fixed a potential int overflow --- bwtsw2_aux.c | 5 +++-- main.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index ce4324d..37113fb 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -410,7 +410,8 @@ static void 
print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks } for (i = 0; b && i < b->n; ++i) { bsw2hit_t *p = b->hits + i; - int32_t seqid = -1, coor = -1; + int seqid = -1; + int64_t coor = -1; int j, qual, nn = 0; int beg, end; if (p->l == 0) { @@ -419,7 +420,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks coor = p->k - bns->anns[seqid].offset; } ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10); - ksprintf(&str, "\t%s\t%d", seqid>=0? bns->anns[seqid].name : "*", coor + 1); + ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1); if (p->l == 0) { { // estimate mapping quality float c = 1.0; diff --git a/main.c b/main.c index dbb8faa..5642ce1 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r53-dev" +#define PACKAGE_VERSION "0.6.0-r54-dev" #endif static int usage() From b59fd2bf47f8013e717b13aa17be3ff05d198b91 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 14:39:57 -0400 Subject: [PATCH 058/498] fixed an integer overflow --- bwtsw2_core.c | 3 ++- main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bwtsw2_core.c b/bwtsw2_core.c index cc3dc63..846c184 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -474,7 +474,8 @@ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *ta while (!stack_isempty(stack)) { int old_n, tj; bsw2entry_t *v; - uint32_t k, l, tcntk[4], tcntl[4]; + uint32_t tcntk[4], tcntl[4]; + bwtint_t k, l; v = stack_pop(stack); old_n = v->n; n_tot += v->n; diff --git a/main.c b/main.c index 5642ce1..ad3841f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r54-dev" +#define PACKAGE_VERSION "0.6.0-r55-dev" #endif static int usage() From 7b4266a6e5afd0f2a73f896760e224261e5a0a53 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 24 Oct 2011 17:07:12 -0400 Subject: [PATCH 
059/498] bugfix: integer overflow and strand error in sampe --- bwape.c | 131 +++++++++++++++++++++++++++++-------------------------- bwase.c | 10 ++--- bwtaln.h | 2 +- main.c | 2 +- 4 files changed, 76 insertions(+), 69 deletions(-) diff --git a/bwape.c b/bwape.c index 8d2a695..30cba12 100644 --- a/bwape.c +++ b/bwape.c @@ -21,22 +21,31 @@ typedef struct { bwtint_t low, high, high_bayesian; } isize_info_t; +typedef struct { + uint64_t x, y; +} b128_t; + +#define b128_lt(a, b) ((a).x < (b).x) +#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) +#define b128_hash(a) ((uint32_t)(a).x) + #include "khash.h" -KHASH_MAP_INIT_INT64(64, poslist_t) +KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq) #include "ksort.h" +KSORT_INIT(b128, b128_t, b128_lt) KSORT_INIT_GENERIC(uint64_t) typedef struct { - kvec_t(uint64_t) arr; - kvec_t(uint64_t) pos[2]; + kvec_t(b128_t) arr; + kvec_t(b128_t) pos[2]; kvec_t(bwt_aln1_t) aln[2]; } pe_data_t; #define MIN_HASH_WIDTH 1000 extern int g_log_n[256]; // in bwase.c -static kh_64_t *g_hash; +static kh_b128_t *g_hash; void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); @@ -160,68 +169,69 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii) { int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; - uint64_t last_pos[2][2], o_pos[2], subo_score, o_score; + uint64_t o_score, subo_score; + b128_t last_pos[2][2], o_pos[2]; max_len = p[0]->full_len; if (max_len < p[1]->full_len) max_len = p[1]->full_len; if (low_bound < max_len) low_bound = max_len; // here v>=u. 
When ii is set, we check insert size with ii; otherwise with opt->max_isize -#define __pairing_aux(u,v) do { \ - bwtint_t l = ((v)>>32) + p[(v)&1]->len - ((u)>>32); \ - if ((u) != (uint64_t)-1 && (v)>>32 > (u)>>32 && l >= max_len \ +#define __pairing_aux(u,v) do { \ + bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \ + if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \ && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \ - { \ - uint64_t s = d->aln[(v)&1].a[(uint32_t)(v)>>1].score + d->aln[(u)&1].a[(uint32_t)(u)>>1].score; \ - s *= 10; \ + { \ + uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \ + s *= 10; \ if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \ - s = s<<32 | (uint32_t)hash_64((u)>>32<<32 | (v)>>32); \ - if (s>>32 == o_score>>32) ++o_n; \ - else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ - else ++subo_n; \ - if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u)&1] = (u), o_pos[(v)&1] = (v); \ - else if (s < subo_score) subo_score = s; \ - } \ + s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \ + if (s>>32 == o_score>>32) ++o_n; \ + else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ + else ++subo_n; \ + if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \ + else if (s < subo_score) subo_score = s; \ + } \ } while (0) -#define __pairing_aux2(q, w) do { \ - const bwt_aln1_t *r = d->aln[(w)&1].a + ((uint32_t)(w)>>1); \ - (q)->extra_flag |= SAM_FPP; \ - if ((q)->pos != (w)>>32 || (q)->strand != r->a) { \ - (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = r->a; \ - (q)->score = r->score; \ - (q)->pos = (w)>>32; \ - if ((q)->mapQ > 0) ++cnt_chg; \ - } \ +#define __pairing_aux2(q, w) do { \ + const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \ + (q)->extra_flag |= SAM_FPP; \ + if ((q)->pos != (w).x || (q)->strand != 
((w).y>>1&1)) { \ + (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \ + (q)->score = r->score; \ + (q)->pos = (w).x; \ + if ((q)->mapQ > 0) ++cnt_chg; \ + } \ } while (0) o_score = subo_score = (uint64_t)-1; o_n = subo_n = 0; - ks_introsort(uint64_t, d->arr.n, d->arr.a); - for (j = 0; j < 2; ++j) last_pos[j][0] = last_pos[j][1] = (uint64_t)-1; + ks_introsort(b128, d->arr.n, d->arr.a); + for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; if (opt->type == BWA_PET_STD) { for (i = 0; i < d->arr.n; ++i) { - uint64_t x = d->arr.a[i]; - int strand = d->aln[x&1].a[(uint32_t)x>>1].a; + b128_t x = d->arr.a[i]; + int strand = x.y>>1&1; if (strand == 1) { // reverse strand, then check - int y = 1 - (x&1); + int y = 1 - (x.y&1); __pairing_aux(last_pos[y][1], x); __pairing_aux(last_pos[y][0], x); } else { // forward strand, then push - last_pos[x&1][0] = last_pos[x&1][1]; - last_pos[x&1][1] = x; + last_pos[x.y&1][0] = last_pos[x.y&1][1]; + last_pos[x.y&1][1] = x; } } } else if (opt->type == BWA_PET_SOLID) { for (i = 0; i < d->arr.n; ++i) { - uint64_t x = d->arr.a[i]; - int strand = d->aln[x&1].a[(uint32_t)x>>1].a; - if ((strand^x)&1) { // push - int y = 1 - (x&1); + b128_t x = d->arr.a[i]; + int strand = x.y>>1&1; + if ((strand^x.y)&1) { // push + int y = 1 - (x.y&1); __pairing_aux(last_pos[y][1], x); __pairing_aux(last_pos[y][0], x); } else { // check - last_pos[x&1][0] = last_pos[x&1][1]; - last_pos[x&1][1] = x; + last_pos[x.y&1][0] = last_pos[x.y&1][1]; + last_pos[x.y&1][1] = x; } } } else { @@ -229,10 +239,9 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, exit(1); } // set pairing - //fprintf(stderr, "[%d, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); + //fprintf(stderr, "[%ld, %d, %d, %d]\n", d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); if (o_score != (uint64_t)-1) { int mapQ_p = 0; // 
this is the maximum mapping quality when one end is moved - int rr[2]; //fprintf(stderr, "%d, %d\n", o_n, subo_n); if (o_n == 1) { if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair @@ -243,9 +252,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, if (mapQ_p < 0) mapQ_p = 0; } } - rr[0] = d->aln[o_pos[0]&1].a[(uint32_t)o_pos[0]>>1].a; - rr[1] = d->aln[o_pos[1]&1].a[(uint32_t)o_pos[1]>>1].a; - if ((p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) && (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1])) { // both ends not moved + if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved if (p[0]->mapQ > 0 && p[1]->mapQ > 0) { int mapQ = p[0]->mapQ + p[1]->mapQ; if (mapQ > 60) mapQ = 60; @@ -254,10 +261,10 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ; if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? 
mapQ_p + 7 : p[0]->mapQ; } - } else if (p[0]->pos == o_pos[0]>>32 && p[0]->strand == rr[0]) { // [1] moved + } else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ; if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p; - } else if (p[1]->pos == o_pos[1]>>32 && p[1]->strand == rr[1]) { // [0] moved + } else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ; if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p; } else { // both ends moved @@ -338,7 +345,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) { // only when both ends mapped - uint64_t x; + b128_t x; int j, k, n_occ[2]; for (j = 0; j < 2; ++j) { n_occ[j] = 0; @@ -351,32 +358,32 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw for (k = 0; k < d->aln[j].n; ++k) { bwt_aln1_t *r = d->aln[j].a + k; bwtint_t l; - if (r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table - uint64_t key = (uint64_t)r->k<<32 | r->l; + if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table + b128_t key; int ret; - khint_t iter = kh_put(64, g_hash, key, &ret); + key.x = r->k; key.y = r->l; + khint_t iter = kh_put(b128, g_hash, key, &ret); if (ret) { // not in the hash table; ret must equal 1 as we never remove elements poslist_t *z = &kh_val(g_hash, iter); z->n = r->l - r->k + 1; z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); for (l = r->k; l <= r->l; ++l) { int strand; - z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); - r->a = strand; + z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand)<<1; + z->a[l - r->k] |= strand; } } for (l = 0; l < kh_val(g_hash, iter).n; ++l) { - x = kh_val(g_hash, iter).a[l]; - x = x<<32 | k<<1 | j; - kv_push(uint64_t, d->arr, x); 
+ x.x = kh_val(g_hash, iter).a[l]>>1; + x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; + kv_push(b128_t, d->arr, x); } } else { // then calculate on the fly for (l = r->k; l <= r->l; ++l) { int strand; - x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); - r->a = strand; - x = x<<32 | k<<1 | j; - kv_push(uint64_t, d->arr, x); + x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); + x.y = k<<2 | strand<<1 | j; + kv_push(b128_t, d->arr, x); } } } @@ -669,7 +676,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f srand48(bns->seed); fp_sa[0] = xopen(fn_sa[0], "r"); fp_sa[1] = xopen(fn_sa[1], "r"); - g_hash = kh_init(64); + g_hash = kh_init(b128); last_ii.avg = -1.0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); @@ -745,7 +752,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); - kh_destroy(64, g_hash); + kh_destroy(b128, g_hash); if (pac) { free(pac); bwt_destroy(bwt); } diff --git a/bwase.c b/bwase.c index fda4752..0b6581e 100644 --- a/bwase.c +++ b/bwase.c @@ -31,7 +31,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma const bwt_aln1_t *p = aln + i; if (p->score > best) break; if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { - s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->strand = p->a; + s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; s->score = p->score; s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); } @@ -67,8 +67,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma for (l = q->k; l <= q->l; ++l) { s->multi[z].pos = l; s->multi[z].gap = q->n_gapo + q->n_gape; - s->multi[z].mm = q->n_mm; - s->multi[z++].strand = q->a; + s->multi[z++].mm = q->n_mm; } rest -= q->l - q->k + 1; } else { // Random sampling (http://code.activestate.com/recipes/272884/). 
In fact, we never come here. @@ -78,18 +77,19 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; s->multi[z].gap = q->n_gapo + q->n_gape; - s->multi[z].mm = q->n_mm; - s->multi[z++].strand = q->a; + s->multi[z++].mm = q->n_mm; } rest = 0; break; } } s->n_multi = z; + /*// the following code removes the primary hit, but this leads to a bug in the PE mode for (k = z = 0; k < s->n_multi; ++k) if (s->multi[k].pos != s->sa) s->multi[z++] = s->multi[k]; s->n_multi = z < n_multi? z : n_multi; + */ } } diff --git a/bwtaln.h b/bwtaln.h index 20191ad..a3eace2 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -34,7 +34,7 @@ typedef struct { } bwt_width_t; typedef struct { - uint32_t n_mm:8, n_gapo:8, n_gape:8, a:1; + uint32_t n_mm:16, n_gapo:8, n_gape:8; bwtint_t k, l; int score; } bwt_aln1_t; diff --git a/main.c b/main.c index ad3841f..8d663b6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r55-dev" +#define PACKAGE_VERSION "0.6.0-r56-dev" #endif static int usage() From 22c2252e156b089bae0b154d51b1db777b1be34a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 00:22:28 -0400 Subject: [PATCH 060/498] added bidirectional bwt; seems buggy --- Makefile | 2 +- bwt.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwt.h | 11 ++++++ fastmap.c | 56 ++++++++++++++++++++++++++++ main.c | 1 + main.h | 2 + 6 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 fastmap.c diff --git a/Makefile b/Makefile index c9588f2..2035050 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o bamlite.o + bwtsw2_chain.o bamlite.o fastmap.o PROG= bwa INCLUDES= LIBS= 
-lm -lz -lpthread diff --git a/bwt.c b/bwt.c index 038a8a3..e7da788 100644 --- a/bwt.c +++ b/bwt.c @@ -32,6 +32,7 @@ #include #include "utils.h" #include "bwt.h" +#include "kvec.h" void bwt_gen_cnt_table(bwt_t *bwt) { @@ -237,3 +238,111 @@ int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0 = k; *l0 = l; return l - k + 1; } + +/********************* + * Bidirectional BWT * + *********************/ + +void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) +{ + bwtint_t tk[4], tl[4]; + int i; + bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); + for (i = 0; i != 4; ++i) { + ok[i].x[!is_back] = bwt->L2[i] + tk[i] + 1; + ok[i].x[2] = (tl[i] -= tk[i]); + } + ok[3].x[is_back] = ik->x[is_back]; + ok[2].x[is_back] = ok[3].x[is_back] + tl[3]; + ok[1].x[is_back] = ok[2].x[is_back] + tl[2]; + ok[0].x[is_back] = ok[1].x[is_back] + tl[1]; +} + +static void bwt_reverse_intvs(bwtintv_v *p) +{ + if (p->n > 1) { + int j; + for (j = 0; j < p->n>>1; ++j) { + bwtintv_t tmp = p->a[p->n - 1 - j]; + p->a[p->n - 1 - j] = p->a[j]; + p->a[j] = tmp; + } + } +} + +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +{ + int i, j, c, ret; + bwtintv_t ik, ok[4]; + bwtintv_v a[2], *prev, *curr, *swap; + + mem->n = 0; + if (q[x] > 3) return x + 1; + kv_init(a[0]); kv_init(a[1]); + prev = tmpvec[0]? tmpvec[0] : &a[0]; + curr = tmpvec[1]? tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); + + ik.info = x + 1; + for (i = x + 1; i < len; ++i) { // forward search + if (q[i] > 3) break; + c = 3 - q[i]; + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] != ik.x[2]) // change of the interval size + kv_push(bwtintv_t, *curr, ik); + if (ok[c].x[2] == 0) break; // cannot be extended + ik = ok[c]; ik.info = i + 1; + } + if (i == len) kv_push(bwtintv_t, *curr, ik); // push the last interval if we reach the end + bwt_reverse_intvs(curr); // s.t. 
smaller intervals visited first + ret = curr->a[0].info; // this will be the returned value + swap = curr; curr = prev; prev = swap; + + for (i = x - 1; i >= -1; --i) { // backward search for MEMs + if (q[i] > 3) break; + c = i < 0? 0 : q[i]; + for (j = 0, curr->n = 0; j < prev->n; ++j) { + bwtintv_t *p = &prev->a[j]; + bwt_extend(bwt, p, ok, 1); + if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further + if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches + ik = *p; ik.info |= (uint64_t)(i + 1)<<32; + kv_push(bwtintv_t, *mem, ik); + } + } // otherwise the match is contained in another longer match + } + if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + ok[c].info = p->info; + kv_push(bwtintv_t, *curr, ok[c]); + } + } + if (curr->n == 0) break; + swap = curr; curr = prev; prev = swap; + } + bwt_reverse_intvs(mem); // s.t. sorted by the start coordinate + + if (tmpvec[0] == 0) free(a[0].a); + if (tmpvec[1] == 0) free(a[1].a); + return ret; +} + +int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]) +{ + int x = 0, i; + bwtintv_v a[3], *tvec[2], *mem1; + kv_init(a[0]); kv_init(a[1]); kv_init(a[2]); // no memory allocation here + tvec[0] = tmpvec[0]? tmpvec[0] : &a[0]; + tvec[1] = tmpvec[1]? tmpvec[1] : &a[1]; + mem1 = tmpvec[2]? 
tmpvec[2] : &a[2]; + mem->n = 0; + do { + x = bwt_smem1(bwt, len, q, x, mem1, tvec); + for (i = 0; i < mem1->n; ++i) + kv_push(bwtintv_t, *mem, mem1->a[i]); + } while (x < len); + if (tmpvec[0] == 0) free(a[0].a); + if (tmpvec[1] == 0) free(a[1].a); + if (tmpvec[2] == 0) free(a[2].a); + return mem->n; +} diff --git a/bwt.h b/bwt.h index 2f12be6..e635b13 100644 --- a/bwt.h +++ b/bwt.h @@ -54,6 +54,12 @@ typedef struct { bwtint_t *sa; } bwt_t; +typedef struct { + bwtint_t x[3], info; +} bwtintv_t; + +typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; + /* For general OCC_INTERVAL, the following is correct: #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) @@ -75,6 +81,8 @@ typedef struct { (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) +#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)], (ik).x[2] = (bwt)->L2[(int)(c)+1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)], (ik).info = 0) + #ifdef __cplusplus extern "C" { #endif @@ -104,6 +112,9 @@ extern "C" { int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); + void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); + int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c new file mode 100644 index 0000000..ee8e9b4 --- /dev/null +++ b/fastmap.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include "bwt.h" +#include "kvec.h" +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +extern unsigned char 
nst_nt4_table[256]; + +int main_fastmap(int argc, char *argv[]) +{ + int c, i; + kseq_t *seq; + gzFile fp; + bwt_t *bwt; + bwtintv_v a[3], mem, *tvec[3]; + while ((c = getopt(argc, argv, "")) >= 0) { + switch (c) { + } + } + if (optind + 1 >= argc) { + fprintf(stderr, "bwa fastmap \n"); + return 1; + } + fp = gzopen(argv[optind + 1], "r"); + seq = kseq_init(fp); + { // load the BWT + char *tmp = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(tmp, argv[optind]), ".bwt"); + bwt = bwt_restore_bwt(tmp); + free(tmp); + } + for (i = 0; i < 3; ++i) { + kv_init(a[i]); + tvec[i] = &a[i]; + } + kv_init(mem); + while (kseq_read(seq) >= 0) { + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); + printf(">%s\t%ld\n", seq->name.s, mem.n); + for (i = 0; i < mem.n; ++i) { + bwtintv_t *p = &mem.a[i]; + printf("%d\t%d\t%ld\n", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + } + puts("//"); + } + free(mem.a); + for (i = 0; i < 3; ++i) free(a[i].a); + bwt_destroy(bwt); + kseq_destroy(seq); + gzclose(fp); + return 0; +} diff --git a/main.c b/main.c index 8d663b6..11de84e 100644 --- a/main.c +++ b/main.c @@ -54,6 +54,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwtsw2") == 0) return bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) return bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) return bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "fastmap") == 0) return main_fastmap(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h index 15ec189..026a80b 100644 --- a/main.h +++ b/main.h @@ -21,6 +21,8 @@ extern "C" { int bwa_bwtsw2(int argc, char *argv[]); + int main_fastmap(int argc, char *argv[]); + #ifdef __cplusplus } #endif From 7626595e3ad1e5c973c3548b28805eccc3eea34e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 
10:03:57 -0400 Subject: [PATCH 061/498] backup the current debugging code; more changes --- bntseq.c | 13 +++++++++---- bwt.c | 5 +++-- fastmap.c | 21 +++++++++++++++++++++ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/bntseq.c b/bntseq.c index 15b369f..56df921 100644 --- a/bntseq.c +++ b/bntseq.c @@ -190,14 +190,14 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int * *q = bns->ambs + bns->n_holes; (*q)->len = 1; (*q)->offset = p->offset + i; - (*q)->amb = seq->seq.s[i]; + (*q)->amb = 'N'; ++p->n_ambs; ++bns->n_holes; } } lasts = seq->seq.s[i]; { // fill buffer - if (c >= 4) c = lrand48()&0x3; + if (c >= 4) c = c>>4; if (*l_buf == 0x40000) { fwrite(buf, 1, 0x10000, fp); memset(buf, 0, 0x10000); @@ -238,11 +238,16 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) memset(buf, 0, 0x10000); // read sequences while (kseq_read(seq) >= 0) { - for (i = 0; i < seq->seq.l; ++i) // convert to 2-bit encoding + for (i = 0; i < seq->seq.l; ++i) { // convert to 2-bit encoding seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + if (seq->seq.s[i] > 3) + seq->seq.s[i] |= (lrand48()&3) << 4; + } add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); if (!for_only) { - seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 1); + seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 0); // reversed but not complemented + for (i = 0; i < seq->seq.l; ++i) // complement + seq->seq.s[i] = seq->seq.s[i] < 4? 3 - seq->seq.s[i] : ((3 - (seq->seq.s[i]>>4)) << 4 | 4); add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); } } diff --git a/bwt.c b/bwt.c index e7da788..6f5eb67 100644 --- a/bwt.c +++ b/bwt.c @@ -282,9 +282,9 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem prev = tmpvec[0]? tmpvec[0] : &a[0]; curr = tmpvec[1]? 
tmpvec[1] : &a[1]; bwt_set_intv(bwt, q[x], ik); - ik.info = x + 1; - for (i = x + 1; i < len; ++i) { // forward search + + for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search if (q[i] > 3) break; c = 3 - q[i]; bwt_extend(bwt, &ik, ok, 0); @@ -298,6 +298,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem ret = curr->a[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; + if (x == 40) printf("[%lld,%lld,%lld]\n", prev->a[0].x[0], prev->a[0].x[1], prev->a[0].x[2]); for (i = x - 1; i >= -1; --i) { // backward search for MEMs if (q[i] > 3) break; c = i < 0? 0 : q[i]; diff --git a/fastmap.c b/fastmap.c index ee8e9b4..cf952b7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -39,6 +39,27 @@ int main_fastmap(int argc, char *argv[]) while (kseq_read(seq) >= 0) { for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + { + int beg = 98; + bwtintv_t ik, ok[4]; + bwt_set_intv(bwt, seq->seq.s[seq->seq.l - 1], ik); + for (i = seq->seq.l - 2; i >= beg; --i) { + //printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i+1); + bwt_extend(bwt, &ik, ok, 1); + ik = ok[seq->seq.s[i]]; + if (ik.x[2] == 0) break; + } + printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i+1); + printf("======================== %lld, [%lld,%lld,%lld,%lld]\n", bwt->primary, bwt->L2[1], bwt->L2[2]-bwt->L2[1], bwt->L2[3]-bwt->L2[2], bwt->L2[4]-bwt->L2[3]); + bwt_set_intv(bwt, seq->seq.s[beg], ik); + for (i = beg + 1; i < seq->seq.l; ++i) { + //printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i-1); + bwt_extend(bwt, &ik, ok, 0); + ik = ok[3-seq->seq.s[i]]; + if (ik.x[2] == 0) break; + } + printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i-1); + } bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); printf(">%s\t%ld\n", seq->name.s, mem.n); for (i = 0; i < mem.n; ++i) { From ca809a44d9da31296ad5b68f1e7b1058e5509e4e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 
Oct 2011 10:27:13 -0400 Subject: [PATCH 062/498] build .pac in memory; prepare for further changes --- bntseq.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/bntseq.c b/bntseq.c index 56df921..3a90b99 100644 --- a/bntseq.c +++ b/bntseq.c @@ -163,7 +163,7 @@ void bns_destroy(bntseq_t *bns) } } -static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int *l_buf, int *m_seqs, int *m_holes, bntamb1_t **q) +static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) { bntann1_t *p; int i, lasts; @@ -198,17 +198,17 @@ static void add1(const kseq_t *seq, bntseq_t *bns, FILE *fp, uint8_t *buf, int * lasts = seq->seq.s[i]; { // fill buffer if (c >= 4) c = c>>4; - if (*l_buf == 0x40000) { - fwrite(buf, 1, 0x10000, fp); - memset(buf, 0, 0x10000); - *l_buf = 0; + if (bns->l_pac == *m_pac) { // double the pac size + *m_pac <<= 1; + pac = realloc(pac, *m_pac/4); + memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); } - buf[*l_buf>>2] |= c << ((3 - (*l_buf&3)) << 1); - ++(*l_buf); + pac[bns->l_pac>>2] |= c << ((3 - (bns->l_pac&3)) << 1); + ++bns->l_pac; } } ++bns->n_seqs; - bns->l_pac += seq->seq.l; + return pac; } int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) @@ -217,9 +217,9 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) kseq_t *seq; char name[1024]; bntseq_t *bns; - unsigned char buf[0x10000]; - int32_t i, l_buf, m_seqs, m_holes; - int64_t ret = -1; + uint8_t *pac = 0; + int32_t i, m_seqs, m_holes; + int64_t ret = -1, m_pac; bntamb1_t *q; FILE *fp; @@ -228,14 +228,13 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); bns->seed = 11; // fixed seed for random generator srand48(bns->seed); - m_seqs = m_holes = 8; + m_seqs = m_holes = 8; m_pac = 0x10000; bns->anns = (bntann1_t*)calloc(m_seqs, 
sizeof(bntann1_t)); bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); + pac = calloc(m_pac/4, 1); q = bns->ambs; - l_buf = 0; strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); - memset(buf, 0, 0x10000); // read sequences while (kseq_read(seq) >= 0) { for (i = 0; i < seq->seq.l; ++i) { // convert to 2-bit encoding @@ -243,19 +242,18 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) if (seq->seq.s[i] > 3) seq->seq.s[i] |= (lrand48()&3) << 4; } - add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); + pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); if (!for_only) { seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 0); // reversed but not complemented for (i = 0; i < seq->seq.l; ++i) // complement seq->seq.s[i] = seq->seq.s[i] < 4? 3 - seq->seq.s[i] : ((3 - (seq->seq.s[i]>>4)) << 4 | 4); - add1(seq, bns, fp, buf, &l_buf, &m_seqs, &m_holes, &q); + pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); } } - xassert(bns->l_pac, "zero length sequence."); ret = bns->l_pac; { // finalize .pac file ubyte_t ct; - fwrite(buf, 1, (l_buf>>2) + ((l_buf&3) == 0? 0 : 1), fp); + fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 
0 : 1), fp); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % 4 == 0) { ct = 0; @@ -269,6 +267,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) bns_dump(bns, prefix); bns_destroy(bns); kseq_destroy(seq); + free(pac); return ret; } From aabb80773456d29c3551c097f190428110080b81 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 11:22:08 -0400 Subject: [PATCH 063/498] concatenate for-rev sequences in the end --- bntseq.c | 70 +++++++++++++++++++++------------------------------ bntseq.h | 7 ++++-- bwase.c | 7 +++--- bwtsw2_core.c | 6 ++--- 4 files changed, 39 insertions(+), 51 deletions(-) diff --git a/bntseq.c b/bntseq.c index 3a90b99..98a5a49 100644 --- a/bntseq.c +++ b/bntseq.c @@ -163,6 +163,9 @@ void bns_destroy(bntseq_t *bns) } } +#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1)) +#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3) + static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) { bntann1_t *p; @@ -178,7 +181,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; for (i = lasts = 0; i < seq->seq.l; ++i) { - int c = seq->seq.s[i]; + int c = nst_nt4_table[(int)seq->seq.s[i]]; if (c >= 4) { // N if (lasts == seq->seq.s[i]) { // contiguous N ++(*q)->len; @@ -190,20 +193,20 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ *q = bns->ambs + bns->n_holes; (*q)->len = 1; (*q)->offset = p->offset + i; - (*q)->amb = 'N'; + (*q)->amb = seq->seq.s[i]; ++p->n_ambs; ++bns->n_holes; } } lasts = seq->seq.s[i]; { // fill buffer - if (c >= 4) c = c>>4; + if (c >= 4) c = lrand48()&3; if (bns->l_pac == *m_pac) { // double the pac size *m_pac <<= 1; pac = realloc(pac, *m_pac/4); memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); } - pac[bns->l_pac>>2] |= c << ((3 - (bns->l_pac&3)) << 1); + _set_pac(pac, bns->l_pac, c); ++bns->l_pac; } } @@ -218,8 +221,8 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) char name[1024]; bntseq_t *bns; uint8_t *pac = 0; - int32_t i, m_seqs, m_holes; - int64_t ret = -1, m_pac; + int32_t m_seqs, m_holes; + int64_t ret = -1, m_pac, l; bntamb1_t *q; FILE *fp; @@ -236,19 +239,13 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); // read sequences - while (kseq_read(seq) >= 0) { - for (i = 0; i < seq->seq.l; ++i) { // convert to 2-bit encoding - seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - if (seq->seq.s[i] > 3) - seq->seq.s[i] |= (lrand48()&3) << 4; - } - pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); - if (!for_only) { - seq_reverse(seq->seq.l, (uint8_t*)seq->seq.s, 0); // reversed but not complemented - for (i = 0; i < seq->seq.l; ++i) // complement - seq->seq.s[i] = seq->seq.s[i] < 4? 
3 - seq->seq.s[i] : ((3 - (seq->seq.s[i]>>4)) << 4 | 4); - pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); - } + while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); + if (!for_only) { // add the reverse complemented sequence + m_pac = (bns->l_pac * 2 + 3) / 4 * 4; + pac = realloc(pac, m_pac/4); + memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); + for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) + _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l)); } ret = bns->l_pac; { // finalize .pac file @@ -290,32 +287,21 @@ int bwa_fa2pac(int argc, char *argv[]) return 0; } -int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev) -{ - int left, mid, right; - is_fr = is_fr? 1 : 0; - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { - mid = (left + right) >> 1; - if (pos >= bns->anns[mid].offset<n_seqs - 1) break; - if (pos < bns->anns[mid+1].offset<anns[mid]; - if (pos - (p->offset<<1) < p->len) *is_rev = 0, pos -= p->offset; - else *is_rev = 1, pos = p->len - (pos - (p->offset<<1) - p->len) + p->offset; - } - return pos; -} - int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { int left, mid, right, nn; - if (ref_id) bns_pos2refId(bns, pos_f, 0, ref_id, 0); + if (ref_id) { + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + *ref_id = mid; + } left = 0; right = bns->n_holes; nn = 0; while (left < right) { mid = (left + right) >> 1; diff --git a/bntseq.h b/bntseq.h index 276ef64..0becc01 100644 --- a/bntseq.h +++ b/bntseq.h @@ -71,12 +71,15 @@ extern "C" { bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); int64_t 
bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); - int64_t bns_pos2refId(const bntseq_t *bns, int64_t pos, int is_fr, int *ref_id, int *is_rev); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); - #ifdef __cplusplus } #endif +static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) +{ + return (*is_rev = (pos >= bns->l_pac))? (bns->l_pac<<1) - pos : pos; +} + #endif diff --git a/bwase.c b/bwase.c index 0b6581e..124d6b4 100644 --- a/bwase.c +++ b/bwase.c @@ -111,10 +111,9 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm) bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) { - bwtint_t pos_fr, pos_f; - int is_rev, ref_id; - pos_fr = bwt_sa(bwt, sapos); - pos_f = bns_pos2refId(bns, pos_fr, 1, &ref_id, &is_rev); // pos_f + bwtint_t pos_f; + int is_rev; + pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f *strand = !is_rev; /* NB: For gapped alignment, pacpos may not be correct, which will be fixed * in refine_gapped_core(). This line also determines the way "x" is diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 846c184..fe87a5f 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -268,7 +268,7 @@ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, i * will be obtained and stored in b->hits[*].k. 
*/ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) { - int i, j, n, ref_id, is_rev; + int i, j, n, is_rev; if (b->n == 0) return 0; if (bwt && bns) { // convert to chromosomal coordinates if requested int old_n = b->n; @@ -287,7 +287,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; for (k = p->k; k <= p->l; ++k) { b->hits[j] = *p; - b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, k), 1, &ref_id, &is_rev); + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev); b->hits[j].l = 0; b->hits[j].is_rev = is_rev; if (is_rev) b->hits[j].k -= p->len; @@ -295,7 +295,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int } } else if (p->G > 0) { b->hits[j] = *p; - b->hits[j].k = bns_pos2refId(bns, bwt_sa(bwt, p->k), 1, &ref_id, &is_rev); + b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev); b->hits[j].l = 0; b->hits[j].flag |= 1; b->hits[j].is_rev = is_rev; From f56edd07dd2f0b15609694b4814bb65b3a517c6d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 12:31:36 -0400 Subject: [PATCH 064/498] forward-backward search seems working --- Makefile | 1 + bwt.c | 13 ++++++------- bwt.h | 2 +- fastmap.c | 6 +++++- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 2035050..e4b073f 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ bwt1away.o:bwt.h bwtaln.h bwt2fmv.o:bwt.h bntseq.o:bntseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h +fastmap:bwt.h bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h diff --git a/bwt.c b/bwt.c index 6f5eb67..5b139d2 100644 --- a/bwt.c +++ b/bwt.c @@ -249,13 +249,13 @@ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_b int i; bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); for (i = 0; i != 4; ++i) { - ok[i].x[!is_back] = bwt->L2[i] + 
tk[i] + 1; - ok[i].x[2] = (tl[i] -= tk[i]); + ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; + ok[i].x[2] = tl[i] - tk[i]; } - ok[3].x[is_back] = ik->x[is_back]; - ok[2].x[is_back] = ok[3].x[is_back] + tl[3]; - ok[1].x[is_back] = ok[2].x[is_back] + tl[2]; - ok[0].x[is_back] = ok[1].x[is_back] + tl[1]; + ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); + ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; + ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; + ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; } static void bwt_reverse_intvs(bwtintv_v *p) @@ -298,7 +298,6 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem ret = curr->a[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; - if (x == 40) printf("[%lld,%lld,%lld]\n", prev->a[0].x[0], prev->a[0].x[1], prev->a[0].x[2]); for (i = x - 1; i >= -1; --i) { // backward search for MEMs if (q[i] > 3) break; c = i < 0? 
0 : q[i]; diff --git a/bwt.h b/bwt.h index e635b13..75dd21c 100644 --- a/bwt.h +++ b/bwt.h @@ -81,7 +81,7 @@ typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) -#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)], (ik).x[2] = (bwt)->L2[(int)(c)+1] - (bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)], (ik).info = 0) +#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) #ifdef __cplusplus extern "C" { diff --git a/fastmap.c b/fastmap.c index cf952b7..de5bc0d 100644 --- a/fastmap.c +++ b/fastmap.c @@ -40,7 +40,9 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; { - int beg = 98; + int beg = 97; + uint8_t str[3]; + bwtint_t b, e; bwtintv_t ik, ok[4]; bwt_set_intv(bwt, seq->seq.s[seq->seq.l - 1], ik); for (i = seq->seq.l - 2; i >= beg; --i) { @@ -50,6 +52,7 @@ int main_fastmap(int argc, char *argv[]) if (ik.x[2] == 0) break; } printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i+1); + //str[0] = '\1'; str[1] = '\3'; bwt_match_exact(bwt, 2, str, &b, &e); printf("%lld,%lld,%lld\n", b, e, e-b+1); printf("======================== %lld, [%lld,%lld,%lld,%lld]\n", bwt->primary, bwt->L2[1], bwt->L2[2]-bwt->L2[1], bwt->L2[3]-bwt->L2[2], bwt->L2[4]-bwt->L2[3]); bwt_set_intv(bwt, seq->seq.s[beg], ik); for (i = beg + 1; i < seq->seq.l; ++i) { @@ -59,6 +62,7 @@ int main_fastmap(int argc, char *argv[]) if (ik.x[2] == 0) break; } printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i-1); + //str[0] = '\0'; str[1] = '\2'; bwt_match_exact(bwt, 2, str, &b, &e); printf("%lld,%lld,%lld\n", b, e, e-b+1); } bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); printf(">%s\t%ld\n", seq->name.s, mem.n); From 
4813257d4f90fc988efffcc0ceffa7c15f36c3cb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 12:38:33 -0400 Subject: [PATCH 065/498] remove debugging code --- fastmap.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/fastmap.c b/fastmap.c index de5bc0d..ee8e9b4 100644 --- a/fastmap.c +++ b/fastmap.c @@ -39,31 +39,6 @@ int main_fastmap(int argc, char *argv[]) while (kseq_read(seq) >= 0) { for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - { - int beg = 97; - uint8_t str[3]; - bwtint_t b, e; - bwtintv_t ik, ok[4]; - bwt_set_intv(bwt, seq->seq.s[seq->seq.l - 1], ik); - for (i = seq->seq.l - 2; i >= beg; --i) { - //printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i+1); - bwt_extend(bwt, &ik, ok, 1); - ik = ok[seq->seq.s[i]]; - if (ik.x[2] == 0) break; - } - printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i+1); - //str[0] = '\1'; str[1] = '\3'; bwt_match_exact(bwt, 2, str, &b, &e); printf("%lld,%lld,%lld\n", b, e, e-b+1); - printf("======================== %lld, [%lld,%lld,%lld,%lld]\n", bwt->primary, bwt->L2[1], bwt->L2[2]-bwt->L2[1], bwt->L2[3]-bwt->L2[2], bwt->L2[4]-bwt->L2[3]); - bwt_set_intv(bwt, seq->seq.s[beg], ik); - for (i = beg + 1; i < seq->seq.l; ++i) { - //printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i-1); - bwt_extend(bwt, &ik, ok, 0); - ik = ok[3-seq->seq.s[i]]; - if (ik.x[2] == 0) break; - } - printf("[%lld,%lld,%lld] @ %d\n", ik.x[0], ik.x[1], ik.x[2], i-1); - //str[0] = '\0'; str[1] = '\2'; bwt_match_exact(bwt, 2, str, &b, &e); printf("%lld,%lld,%lld\n", b, e, e-b+1); - } bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); printf(">%s\t%ld\n", seq->name.s, mem.n); for (i = 0; i < mem.n; ++i) { From 7168f5c10ace159bfb901fcca3c7abddb87e3070 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 12:50:19 -0400 Subject: [PATCH 066/498] updated revision number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/main.c b/main.c index 11de84e..eda54f4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r56-dev" +#define PACKAGE_VERSION "0.6.0-r63-dev" #endif static int usage() From 7664795ffb15a568b2e5e2dffe26f492f4f230e9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 13:00:41 -0400 Subject: [PATCH 067/498] fixed a minor issue about +/-1 --- bntseq.h | 2 +- bwase.c | 2 +- main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bntseq.h b/bntseq.h index 0becc01..843db64 100644 --- a/bntseq.h +++ b/bntseq.h @@ -79,7 +79,7 @@ extern "C" { static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) { - return (*is_rev = (pos >= bns->l_pac))? (bns->l_pac<<1) - pos : pos; + return (*is_rev = (pos >= bns->l_pac))? (bns->l_pac<<1) - 1 - pos : pos; } #endif diff --git a/bwase.c b/bwase.c index 124d6b4..76480d4 100644 --- a/bwase.c +++ b/bwase.c @@ -118,7 +118,7 @@ bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int l /* NB: For gapped alignment, pacpos may not be correct, which will be fixed * in refine_gapped_core(). This line also determines the way "x" is * calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */ - if (is_rev) pos_f = pos_f < len? 0 : pos_f - len; // mapped to the forward strand + if (is_rev) pos_f = pos_f + 1 < len? 
0 : pos_f - len + 1; // mapped to the forward strand return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } diff --git a/main.c b/main.c index eda54f4..bc4eca4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r63-dev" +#define PACKAGE_VERSION "0.6.0-r64-dev" #endif static int usage() From 55059443bd0252053926e2b12186de770a41ba66 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 15:06:13 -0400 Subject: [PATCH 068/498] print msg to stderr; output more in fastmap --- bwt_gen.c | 4 ++-- fastmap.c | 37 ++++++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/bwt_gen.c b/bwt_gen.c index b9568e9..cac6a5f 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -1486,8 +1486,8 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; if (bwtInc->numberOfIterationDone % 10 == 0) { - printf("[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n", - (long)bwtInc->numberOfIterationDone, (long)processedTextLength); + fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. 
%lu characters processed.\n", + (long)bwtInc->numberOfIterationDone, (long)processedTextLength); } } return bwtInc; diff --git a/fastmap.c b/fastmap.c index ee8e9b4..e3485fc 100644 --- a/fastmap.c +++ b/fastmap.c @@ -1,6 +1,8 @@ #include -#include #include +#include +#include +#include "bntseq.h" #include "bwt.h" #include "kvec.h" #include "kseq.h" @@ -10,28 +12,37 @@ extern unsigned char nst_nt4_table[256]; int main_fastmap(int argc, char *argv[]) { - int c, i; + int c, i, min_iwidth = 3, min_len = 17; kseq_t *seq; + bwtint_t k; gzFile fp; bwt_t *bwt; + bntseq_t *bns; bwtintv_v a[3], mem, *tvec[3]; - while ((c = getopt(argc, argv, "")) >= 0) { + + while ((c = getopt(argc, argv, "w:l:")) >= 0) { switch (c) { + case 'w': min_iwidth = atoi(optarg); break; + case 'l': min_len = atoi(optarg); break; } } if (optind + 1 >= argc) { fprintf(stderr, "bwa fastmap \n"); return 1; } + fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); - { // load the BWT + { // load the packed sequences, BWT and SA char *tmp = calloc(strlen(argv[optind]) + 5, 1); strcat(strcpy(tmp, argv[optind]), ".bwt"); bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, argv[optind]), ".sa"); + bwt_restore_sa(tmp, bwt); free(tmp); + bns = bns_restore(argv[optind]); } - for (i = 0; i < 3; ++i) { + for (i = 0; i < 3; ++i) { // initiate the temporary array kv_init(a[i]); tvec[i] = &a[i]; } @@ -40,15 +51,27 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); - printf(">%s\t%ld\n", seq->name.s, mem.n); + printf(">%s\t%ld\n", seq->name.s, seq->seq.l); for (i = 0; i < mem.n; ++i) { bwtintv_t *p = &mem.a[i]; - printf("%d\t%d\t%ld\n", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + if ((uint32_t)p->info - (p->info>>32) < min_len) continue; + printf("%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + if (p->x[2] <= min_iwidth) { + for (k = 
0; k < p->x[2]; ++k) { + int is_rev; + int64_t pos; + pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + printf("\t%c%ld", is_rev?'-':'+', (long)pos); + } + } + putchar('\n'); } puts("//"); } + free(mem.a); for (i = 0; i < 3; ++i) free(a[i].a); + bns_destroy(bns); bwt_destroy(bwt); kseq_destroy(seq); gzclose(fp); From e890b8ac2e5c2d493998bb104d4362c38edd6ec4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 19:45:55 -0400 Subject: [PATCH 069/498] preliminary code to generate fake sam --- fastmap.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 129 insertions(+), 12 deletions(-) diff --git a/fastmap.c b/fastmap.c index e3485fc..c0c040a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -8,22 +8,121 @@ #include "kseq.h" KSEQ_INIT(gzFile, gzread) +typedef struct { + int qbeg, len; + int64_t tbeg; +} fmhit_t; + +#define hit_qlt(a, b) ((a).qbeg < (b).qbeg) +#define hit_tlt(a, b) ((a).tbeg < (b).tbeg) + +#include "ksort.h" +KSORT_INIT(hitq, fmhit_t, hit_qlt) +KSORT_INIT(hitt, fmhit_t, hit_tlt) + extern unsigned char nst_nt4_table[256]; +typedef struct { size_t n, m; fmhit_t *a; } fmhit_v; + +static uint64_t cluster_qhits(fmhit_v *hits, int beg, int end, int max_dist, int *_score) +{ + int qend, score, max = 0, cbeg = beg; + size_t i; + uint64_t max_cluster = 0; + ks_introsort(hitq, end - beg, hits->a + beg); + qend = hits->a[beg].qbeg + hits->a[beg].len; + score = hits->a[beg].len; + for (i = beg + 1; i < end; ++i) { + fmhit_t *p = &hits->a[i]; + if (p->qbeg - qend > max_dist) { + if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i; + score = 0; cbeg = i; + } + score += p->len; + qend = p->qbeg + p->len; + } + if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i; + *_score = score; + return max_cluster; +} + +static int cluster_hits(fmhit_v *hits, int max_dist) +{ + size_t i; + int64_t tend; + uint64_t cluster, max_cluster = 0; + int j, score, max = 0, max2 = 0, cbeg = 0, cend; + ks_introsort(hitt, 
hits->n, hits->a); + tend = hits->a[0].tbeg + hits->a[0].len; + for (i = 1; i < hits->n; ++i) { + fmhit_t *p = &hits->a[i]; + if (p->tbeg - tend > max_dist) { + cluster = cluster_qhits(hits, cbeg, i, max_dist, &score); + if (score > max) max2 = max, max = score, max_cluster = cluster; + else if (score > max2) max2 = score; + cbeg = i; + } + tend = p->tbeg + p->len; + } + cluster = cluster_qhits(hits, cbeg, i, max_dist, &score); + if (score > max) max2 = max, max = score, max_cluster = cluster; + else if (score > max2) max2 = score; + cbeg = max_cluster>>32; cend = (uint32_t)max_cluster; + for (i = 0, j = cbeg; j < cend; ++j) hits->a[i++] = hits->a[j]; + hits->n = i; + return (int)(200.0 * (max - max2) / max + .499); +} + +static int fake_cigar(const bntseq_t *bns, fmhit_v *hits, int beg, int end, int len, uint32_t *cigar, int64_t *pos, int *is_rev) +{ + size_t i; + int qbeg, qend, n_cigar = 0; + int64_t tbeg, tend, tmp; + qbeg = len; qend = 0; + tbeg = bns->l_pac<<1; tend = 0; + for (i = beg; i < end; ++i) { + fmhit_t *p = &hits->a[i]; + if (p->qbeg < qbeg) qbeg = p->qbeg; + if (p->qbeg + p->len > qend) qend = p->qbeg + p->len; + if (p->tbeg < tbeg) tbeg = p->tbeg; + if (p->tbeg + p->len > qend) tend = p->tbeg + p->len; + } + if (tbeg >= bns->l_pac) { + tmp = tend; tend = bns->l_pac*2 - tbeg; tbeg = bns->l_pac*2 - tmp; + tmp = qend; qend = len - qbeg; qbeg = len - tmp; + *is_rev = 1; + } else *is_rev = 0; + *pos = tbeg; + if (qbeg) cigar[n_cigar++] = qbeg<<4|4; + if (tend - tbeg < qend - qbeg) { // reference is shorter + cigar[n_cigar++] = (uint32_t)(tend - tbeg)<<4 | 0; + cigar[n_cigar++] = (uint32_t)((qend - qbeg) - (tend - tbeg))<<4 | 2; + } else if (tend - tbeg > qend - qbeg) { // query is shorter + cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0; + cigar[n_cigar++] = (uint32_t)((tend - tbeg) - (qend - qbeg))<<4 | 1; + } else cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0; + if (len > qend) cigar[n_cigar++] = (uint32_t)(len - qend)<<4|4; + return n_cigar; 
+} + int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 3, min_len = 17; + int c, i, min_iwidth = 3, min_len = 17, max_dist = 100, mem_only = 0; kseq_t *seq; bwtint_t k; gzFile fp; bwt_t *bwt; bntseq_t *bns; bwtintv_v a[3], mem, *tvec[3]; + fmhit_v hits; + uint32_t cigar[1024]; - while ((c = getopt(argc, argv, "w:l:")) >= 0) { + while ((c = getopt(argc, argv, "w:l:d:p")) >= 0) { switch (c) { case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; + case 'd': max_dist = atoi(optarg); break; + case 'p': mem_only = 1; break; } } if (optind + 1 >= argc) { @@ -46,27 +145,45 @@ int main_fastmap(int argc, char *argv[]) kv_init(a[i]); tvec[i] = &a[i]; } - kv_init(mem); + kv_init(mem); kv_init(hits); while (kseq_read(seq) >= 0) { for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); - printf(">%s\t%ld\n", seq->name.s, seq->seq.l); - for (i = 0; i < mem.n; ++i) { + if (mem_only) printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l); + for (i = 0, hits.n = 0; i < mem.n; ++i) { bwtintv_t *p = &mem.a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; - printf("%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + if (mem_only) printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { for (k = 0; k < p->x[2]; ++k) { - int is_rev; - int64_t pos; - pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); - printf("\t%c%ld", is_rev?'-':'+', (long)pos); + fmhit_t z; + z.tbeg = bwt_sa(bwt, p->x[0] + k); + z.qbeg = p->info>>32; + z.len = (uint32_t)p->info - z.qbeg; + kv_push(fmhit_t, hits, z); + if (mem_only) { + int is_rev, ref_id; + int64_t pos = bns_depos(bns, z.tbeg, &is_rev); + if (is_rev) pos -= z.len - 1; + bns_cnt_ambi(bns, pos, z.len, &ref_id); + printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } } } 
- putchar('\n'); + if (mem_only) putchar('\n'); } - puts("//"); + if (!mem_only) { + int64_t pos; + int n_cigar, is_rev, ref_id, mapq; + mapq = cluster_hits(&hits, max_dist); + n_cigar = fake_cigar(bns, &hits, 0, hits.n, seq->seq.l, cigar, &pos, &is_rev); + bns_cnt_ambi(bns, pos, 1, &ref_id); + printf("%s\t%d\t%s\t%ld\t%d\t", seq->name.s, is_rev?16:0, bns->anns[ref_id].name, (long)(pos - bns->anns[ref_id].offset) + 1, mapq); + for (i = 0; i < n_cigar; ++i) + printf("%d%c", cigar[i]>>4, "MIDNSHP"[cigar[i]&0xf]); + printf("\t*\t0\t0\t*\t*\n"); + } else puts("//"); } free(mem.a); From 7467671c30159a3492ac42350c896134713b0138 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Oct 2011 21:39:38 -0400 Subject: [PATCH 070/498] minor change --- fastmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index c0c040a..37c640c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -107,7 +107,7 @@ static int fake_cigar(const bntseq_t *bns, fmhit_v *hits, int beg, int end, int int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 3, min_len = 17, max_dist = 100, mem_only = 0; + int c, i, min_iwidth = 20, min_len = 17, max_dist = 50, mem_only = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -176,6 +176,7 @@ int main_fastmap(int argc, char *argv[]) if (!mem_only) { int64_t pos; int n_cigar, is_rev, ref_id, mapq; + if (hits.n == 0) continue; mapq = cluster_hits(&hits, max_dist); n_cigar = fake_cigar(bns, &hits, 0, hits.n, seq->seq.l, cigar, &pos, &is_rev); bns_cnt_ambi(bns, pos, 1, &ref_id); From 7babb54e4c7d26cf1a8a5890da4584852c44d5f3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 27 Oct 2011 10:56:09 -0400 Subject: [PATCH 071/498] drop smem based mapping algorithm While we can compute smems very efficiently, there is still a long way to get the alignment. 
On simulated data, this smem-based algorithm is 4X faster than bwasw and twice as fast as bowtie2, but the accuracy is far lower than bwasw and even lower than bowtie2 in the high-mapQ range. I am kind of sure that if we continue to increase the mapping accuracy, the speed will approach to bwasw, if not slower. Smem-based mapping algorithm is still interesting, but given that I am short of time, I will not explore it further. --- fastmap.c | 147 ++++++------------------------------------------------ main.c | 3 +- 2 files changed, 18 insertions(+), 132 deletions(-) diff --git a/fastmap.c b/fastmap.c index 37c640c..585a043 100644 --- a/fastmap.c +++ b/fastmap.c @@ -8,125 +8,26 @@ #include "kseq.h" KSEQ_INIT(gzFile, gzread) -typedef struct { - int qbeg, len; - int64_t tbeg; -} fmhit_t; - -#define hit_qlt(a, b) ((a).qbeg < (b).qbeg) -#define hit_tlt(a, b) ((a).tbeg < (b).tbeg) - -#include "ksort.h" -KSORT_INIT(hitq, fmhit_t, hit_qlt) -KSORT_INIT(hitt, fmhit_t, hit_tlt) - extern unsigned char nst_nt4_table[256]; -typedef struct { size_t n, m; fmhit_t *a; } fmhit_v; - -static uint64_t cluster_qhits(fmhit_v *hits, int beg, int end, int max_dist, int *_score) -{ - int qend, score, max = 0, cbeg = beg; - size_t i; - uint64_t max_cluster = 0; - ks_introsort(hitq, end - beg, hits->a + beg); - qend = hits->a[beg].qbeg + hits->a[beg].len; - score = hits->a[beg].len; - for (i = beg + 1; i < end; ++i) { - fmhit_t *p = &hits->a[i]; - if (p->qbeg - qend > max_dist) { - if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i; - score = 0; cbeg = i; - } - score += p->len; - qend = p->qbeg + p->len; - } - if (score > max) max = score, max_cluster = (uint64_t)cbeg<<32 | i; - *_score = score; - return max_cluster; -} - -static int cluster_hits(fmhit_v *hits, int max_dist) -{ - size_t i; - int64_t tend; - uint64_t cluster, max_cluster = 0; - int j, score, max = 0, max2 = 0, cbeg = 0, cend; - ks_introsort(hitt, hits->n, hits->a); - tend = hits->a[0].tbeg + hits->a[0].len; - for 
(i = 1; i < hits->n; ++i) { - fmhit_t *p = &hits->a[i]; - if (p->tbeg - tend > max_dist) { - cluster = cluster_qhits(hits, cbeg, i, max_dist, &score); - if (score > max) max2 = max, max = score, max_cluster = cluster; - else if (score > max2) max2 = score; - cbeg = i; - } - tend = p->tbeg + p->len; - } - cluster = cluster_qhits(hits, cbeg, i, max_dist, &score); - if (score > max) max2 = max, max = score, max_cluster = cluster; - else if (score > max2) max2 = score; - cbeg = max_cluster>>32; cend = (uint32_t)max_cluster; - for (i = 0, j = cbeg; j < cend; ++j) hits->a[i++] = hits->a[j]; - hits->n = i; - return (int)(200.0 * (max - max2) / max + .499); -} - -static int fake_cigar(const bntseq_t *bns, fmhit_v *hits, int beg, int end, int len, uint32_t *cigar, int64_t *pos, int *is_rev) -{ - size_t i; - int qbeg, qend, n_cigar = 0; - int64_t tbeg, tend, tmp; - qbeg = len; qend = 0; - tbeg = bns->l_pac<<1; tend = 0; - for (i = beg; i < end; ++i) { - fmhit_t *p = &hits->a[i]; - if (p->qbeg < qbeg) qbeg = p->qbeg; - if (p->qbeg + p->len > qend) qend = p->qbeg + p->len; - if (p->tbeg < tbeg) tbeg = p->tbeg; - if (p->tbeg + p->len > qend) tend = p->tbeg + p->len; - } - if (tbeg >= bns->l_pac) { - tmp = tend; tend = bns->l_pac*2 - tbeg; tbeg = bns->l_pac*2 - tmp; - tmp = qend; qend = len - qbeg; qbeg = len - tmp; - *is_rev = 1; - } else *is_rev = 0; - *pos = tbeg; - if (qbeg) cigar[n_cigar++] = qbeg<<4|4; - if (tend - tbeg < qend - qbeg) { // reference is shorter - cigar[n_cigar++] = (uint32_t)(tend - tbeg)<<4 | 0; - cigar[n_cigar++] = (uint32_t)((qend - qbeg) - (tend - tbeg))<<4 | 2; - } else if (tend - tbeg > qend - qbeg) { // query is shorter - cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0; - cigar[n_cigar++] = (uint32_t)((tend - tbeg) - (qend - qbeg))<<4 | 1; - } else cigar[n_cigar++] = (uint32_t)(qend - qbeg)<<4 | 0; - if (len > qend) cigar[n_cigar++] = (uint32_t)(len - qend)<<4|4; - return n_cigar; -} - int main_fastmap(int argc, char *argv[]) { - int c, i, 
min_iwidth = 20, min_len = 17, max_dist = 50, mem_only = 0; + int c, i, min_iwidth = 20, min_len = 17; kseq_t *seq; bwtint_t k; gzFile fp; bwt_t *bwt; bntseq_t *bns; bwtintv_v a[3], mem, *tvec[3]; - fmhit_v hits; - uint32_t cigar[1024]; - while ((c = getopt(argc, argv, "w:l:d:p")) >= 0) { + while ((c = getopt(argc, argv, "w:l:")) >= 0) { switch (c) { case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; - case 'd': max_dist = atoi(optarg); break; - case 'p': mem_only = 1; break; } } if (optind + 1 >= argc) { - fprintf(stderr, "bwa fastmap \n"); + fprintf(stderr, "Usage: bwa fastmap [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); return 1; } @@ -145,46 +46,30 @@ int main_fastmap(int argc, char *argv[]) kv_init(a[i]); tvec[i] = &a[i]; } - kv_init(mem); kv_init(hits); + kv_init(mem); while (kseq_read(seq) >= 0) { for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); - if (mem_only) printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l); - for (i = 0, hits.n = 0; i < mem.n; ++i) { + printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l); + for (i = 0; i < mem.n; ++i) { bwtintv_t *p = &mem.a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; - if (mem_only) printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { for (k = 0; k < p->x[2]; ++k) { - fmhit_t z; - z.tbeg = bwt_sa(bwt, p->x[0] + k); - z.qbeg = p->info>>32; - z.len = (uint32_t)p->info - z.qbeg; - kv_push(fmhit_t, hits, z); - if (mem_only) { - int is_rev, ref_id; - int64_t pos = bns_depos(bns, z.tbeg, &is_rev); - if (is_rev) pos -= z.len - 1; - bns_cnt_ambi(bns, pos, z.len, &ref_id); - printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); - } + bwtint_t pos; + int len, is_rev, ref_id; + 
len = (uint32_t)p->info - (p->info>>32); + pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + if (is_rev) pos -= len - 1; + bns_cnt_ambi(bns, pos, len, &ref_id); + printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); } } - if (mem_only) putchar('\n'); + putchar('\n'); } - if (!mem_only) { - int64_t pos; - int n_cigar, is_rev, ref_id, mapq; - if (hits.n == 0) continue; - mapq = cluster_hits(&hits, max_dist); - n_cigar = fake_cigar(bns, &hits, 0, hits.n, seq->seq.l, cigar, &pos, &is_rev); - bns_cnt_ambi(bns, pos, 1, &ref_id); - printf("%s\t%d\t%s\t%ld\t%d\t", seq->name.s, is_rev?16:0, bns->anns[ref_id].name, (long)(pos - bns->anns[ref_id].offset) + 1, mapq); - for (i = 0; i < n_cigar; ++i) - printf("%d%c", cigar[i]>>4, "MIDNSHP"[cigar[i]&0xf]); - printf("\t*\t0\t0\t*\t*\n"); - } else puts("//"); + puts("//"); } free(mem.a); diff --git a/main.c b/main.c index bc4eca4..b077179 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r64-dev" +#define PACKAGE_VERSION "0.6.0-r68-dev" #endif static int usage() @@ -19,6 +19,7 @@ static int usage() fprintf(stderr, " samse generate alignment (single ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); + fprintf(stderr, " fastmap identify super-maximal exact matches\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); From 02946df28a09f26f9c3384933554293588466d18 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 27 Oct 2011 13:55:48 -0400 Subject: [PATCH 072/498] fixed a off-by-1 bug --- bwtsw2_core.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwtsw2_core.c b/bwtsw2_core.c index fe87a5f..398a276 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -290,7 +290,7 @@ int bsw2_resolve_duphits(const bntseq_t 
*bns, const bwt_t *bwt, bwtsw2_t *b, int b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev); b->hits[j].l = 0; b->hits[j].is_rev = is_rev; - if (is_rev) b->hits[j].k -= p->len; + if (is_rev) b->hits[j].k -= p->len - 1; ++j; } } else if (p->G > 0) { @@ -299,7 +299,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int b->hits[j].l = 0; b->hits[j].flag |= 1; b->hits[j].is_rev = is_rev; - if (is_rev) b->hits[j].k -= p->len; + if (is_rev) b->hits[j].k -= p->len - 1; ++j; } } diff --git a/main.c b/main.c index b077179..9456140 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r68-dev" +#define PACKAGE_VERSION "0.6.0-r69-dev" #endif static int usage() From 673ae4aaf8c6423cb494592f43328157d630d6b3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Oct 2011 13:26:24 -0400 Subject: [PATCH 073/498] throw an error if insufficient memory during index --- bwt.c | 4 ++++ main.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bwt.c b/bwt.c index 5b139d2..38a3c21 100644 --- a/bwt.c +++ b/bwt.c @@ -56,6 +56,10 @@ void bwt_cal_sa(bwt_t *bwt, int intv) bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + if (bwa->sa == 0) { + fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. 
Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0); + abort(); + } // calculate SA value isa = 0; sa = bwt->seq_len; for (i = 0; i < bwt->seq_len; ++i) { diff --git a/main.c b/main.c index 9456140..7f9601f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r69-dev" +#define PACKAGE_VERSION "0.6.0-r70-dev" #endif static int usage() From 4083fe9413c8f0e2132e7f74d5fa307e398be571 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 2 Nov 2011 09:03:05 -0400 Subject: [PATCH 074/498] fixed a typo --- bwt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.c b/bwt.c index 38a3c21..a19f6d8 100644 --- a/bwt.c +++ b/bwt.c @@ -56,7 +56,7 @@ void bwt_cal_sa(bwt_t *bwt, int intv) bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); - if (bwa->sa == 0) { + if (bwt->sa == 0) { fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0); abort(); } From a26096dd750711dd58b90638b18b551058182d61 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 5 Nov 2011 10:56:25 -0400 Subject: [PATCH 075/498] pair two single-end SAMs --- pairsam.pl | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100755 pairsam.pl diff --git a/pairsam.pl b/pairsam.pl new file mode 100755 index 0000000..bb901bb --- /dev/null +++ b/pairsam.pl @@ -0,0 +1,69 @@ +#!/usr/bin/perl -w + +use strict; +use warnings; +use Getopt::Std; + +my %opts = (a=>700, b=>100); +getopts('a:b:', \%opts); +die("Usage: pairsam.pl [-a $opts{a}] \n") if (@ARGV < 2); + +my ($fh0, $fh1, $l0, $l1); + +open($fh0, $ARGV[0] =~ /\.gz$/? "gzip -dc $ARGV[0] |" : $ARGV[0]) || die; +open($fh1, $ARGV[1] =~ /\.gz$/? 
"gzip -dc $ARGV[1] |" : $ARGV[1]) || die; + +while ($l0 = <$fh0>) { last if $l0 !~ /^@/; print $l0 } +while ($l1 = <$fh1>) { last if $l1 !~ /^@/ } + +while (defined($l0) && defined($l1)) { + my ($r0, $r1) = &pair_line(\$l0, \$l1, $opts{a}, $opts{b}); + while ($l0 = <$fh0>) { last if $l0 !~ /^$r0/; print $l0; } + while ($l1 = <$fh1>) { last if $l1 !~ /^$r1/; print $l1; } +} + +close($fh0); close($fh1); + +sub pair_line { + my ($l0, $l1, $max_ins, $min_ins) = @_; + my @t0 = split("\t", $$l0); + my @t1 = split("\t", $$l1); + my ($n0, $n1) = ($t0[0], $t1[0]); + my ($cigar, $a0, $a1, $p0, $p1); + # length in alignment + $cigar = $t0[5]; $a0 = 0; $cigar =~ s/(\d+)[MI]/$a0 += $1/eg; + $cigar = $t1[5]; $a1 = 0; $cigar =~ s/(\d+)[MI]/$a1 += $1/eg; + # 5'-end alignment position on the read + $p0 = $t0[1] == 16? $t0[3] + $a0 : $t0[3]; + $p1 = $t1[1] == 16? $t1[3] + $a1 : $t1[3]; + # adjust mapping quality + if ($t0[2] eq $t1[2] && $t0[1]+$t1[1] == 16) { # on the same chr and forward-reverse + if (abs($p0 - $p1) <= $max_ins && abs($p0 - $p1) >= $min_ins) { # within the right insert size distribution + $t0[1] |= 2; $t1[1] |= 2; # flag as paired + if ($t0[4] < $t1[4]) { # increase mapQ + $t0[4] = $t0[4] + 10 < $t1[4]? $t0[4] + 10 : $t1[4]; + } else { + $t1[4] = $t1[4] + 10 < $t0[4]? $t1[4] + 10 : $t0[4]; + } + } + } + unless ($t0[1]&2) { # decrease mapQ if unpaired + $t0[4] = $t0[4] > 10? $t0[4] - 10 : 0; + $t1[4] = $t1[4] > 10? $t1[4] - 10 : 0; + } + # strip off /[12] + $t0[0] =~ s/\/[12]$//; $t1[0] =~ s/\/[12]$//; + # update FLAG + $t0[1] |= 0x41 | (($t1[1]&16)? 0x20 : 0) | (($t1[1]&4)? 0x8 : 0); + $t1[1] |= 0x81 | (($t0[1]&16)? 0x20 : 0) | (($t0[1]&4)? 
0x8 : 0); + # update mate positions + if ($t0[2] eq $t1[2]) { + $t0[6] = $t1[6] = '='; + $t0[8] = $p1 - $p0; $t1[8] = $p0 - $p1; + } else { $t0[6] = $t1[2]; $t1[6] = $t0[2]; } + $t0[7] = $t1[3]; $t1[7] = $t0[3]; + # print out + print join("\t", @t0); + print join("\t", @t1); + return ($n0, $n1); +} From 06687a33b95254dd15e3ec18c92dc9396b64b01f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 5 Nov 2011 14:00:01 -0400 Subject: [PATCH 076/498] bwasw: read and align two fastq at a time --- bwtsw2.h | 2 +- bwtsw2_aux.c | 76 +++++++++++++++++++++++++++++++++------------------ bwtsw2_main.c | 13 ++------- 3 files changed, 52 insertions(+), 39 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 3c93509..bd6d219 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -39,7 +39,7 @@ extern "C" { bsw2opt_t *bsw2_init_opt(); bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn); + void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2); void bsw2_destroy(bwtsw2_t *b); bsw2global_t *bsw2_global_init(); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 37113fb..b66744b 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -465,7 +465,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks /* Core routine to align reads in _seq. 
It is separated from * process_seqs() to realize multi-threading */ -static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) +static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int x; bsw2opt_t opt = *_opt; @@ -543,32 +543,32 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const #ifdef HAVE_PTHREAD typedef struct { - int tid; + int tid, is_pe; bsw2seq_t *_seq; const bsw2opt_t *_opt; const bntseq_t *bns; uint8_t *pac; - bwt_t *target; + const bwt_t *target; } thread_aux_t; /* another interface to bsw2_aln_core() to facilitate pthread_create() */ static void *worker(void *data) { thread_aux_t *p = (thread_aux_t*)data; - bsw2_aln_core(p->tid, p->_seq, p->_opt, p->bns, p->pac, p->target); + bsw2_aln_core(p->tid, p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); return 0; } #endif /* process sequences stored in _seq, generate SAM lines for these * sequences and reset _seq afterwards. 
*/ -static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, bwt_t * const target) +static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int i; #ifdef HAVE_PTHREAD if (opt->n_threads <= 1) { - bsw2_aln_core(0, _seq, opt, bns, pac, target); + bsw2_aln_core(0, _seq, opt, bns, pac, target, is_pe); } else { pthread_t *tid; pthread_attr_t attr; @@ -580,7 +580,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; - p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; + p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; p->pac = pac; p->target = target; pthread_create(&tid[j], &attr, worker, p); } @@ -588,7 +588,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * free(data); free(tid); } #else - bsw2_aln_core(0, _seq, opt, bns, pac, target); + bsw2_aln_core(0, _seq, opt, bns, pac, target, is_pe); #endif // print and reset @@ -603,11 +603,21 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * _seq->n = 0; } -void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn) +static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p) { - gzFile fp; - kseq_t *ks; - int l, size = 0; + p->tid = -1; + p->l = ks->seq.l; + p->name = strdup(ks->name.s); + p->seq = strdup(ks->seq.s); + p->qual = ks->qual.l? 
strdup(ks->qual.s) : 0; + p->sam = 0; +} + +void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) +{ + gzFile fp, fp2; + kseq_t *ks, *ks2; + int l, size = 0, is_pe = 0; uint8_t *pac; bsw2seq_t *_seq; @@ -622,30 +632,42 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c fp = xzopen(fn, "r"); ks = kseq_init(fp); _seq = calloc(1, sizeof(bsw2seq_t)); - while ((l = kseq_read(ks)) >= 0) { - bsw2seq1_t *p; + if (fn2) { + fp2 = xzopen(fn2, "r"); + ks2 = kseq_init(fp2); + is_pe = 1; + } else fp2 = 0, ks2 = 0, is_pe = 0; + while (kseq_read(ks) >= 0) { if (_seq->n == _seq->max) { _seq->max = _seq->max? _seq->max<<1 : 1024; _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } - p = &_seq->seq[_seq->n++]; - p->tid = -1; - p->l = l; - p->name = strdup(ks->name.s); - p->seq = strdup(ks->seq.s); - p->qual = ks->qual.l? strdup(ks->qual.s) : 0; - p->sam = 0; - size += l; + kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]); + size += ks->seq.l; + if (ks2) { + if (kseq_read(ks2) >= 0) { + kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge + size += ks->seq.l; + } else { + fprintf(stderr, "[%s] The second query file has fewer reads. 
Switched to the single-end mode for the following batches.\n", __func__); + is_pe = 0; + } + } if (size > opt->chunk_size) { - fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target); + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); + process_seqs(_seq, opt, bns, pac, target, is_pe); size = 0; } } - fprintf(stderr, "[bsw2_aln] read %d sequences (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target); + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); + process_seqs(_seq, opt, bns, pac, target, is_pe); + // free + free(pac); free(_seq->seq); free(_seq); kseq_destroy(ks); gzclose(fp); - free(pac); + if (fn2) { + kseq_destroy(ks2); + gzclose(fp2); + } } diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 3654372..86eddd7 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -40,7 +40,7 @@ int bwa_bwtsw2(int argc, char *argv[]) if (optind + 2 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa bwasw [options] \n\n"); + fprintf(stderr, "Usage: bwa bwasw [options] [query2.fa]\n\n"); fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); @@ -65,15 +65,6 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " increase '-z' for better sensitivity.\n"); fprintf(stderr, "\n"); - if (0) { - double c, theta, eps, delta; - c = opt->a / log(opt->yita); - theta = exp(-opt->b / c) / opt->yita; - eps = exp(-opt->q / c); - delta = exp(-opt->r / c); - fprintf(stderr, "mismatch: %lf, gap_open: %lf, gap_ext: %lf\n\n", - theta, eps, delta); - } return 1; } @@ -85,7 +76,7 @@ int bwa_bwtsw2(int argc, char *argv[]) strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target); bns = bns_restore(argv[optind]); - bsw2_aln(opt, bns, target, argv[optind+1]); + bsw2_aln(opt, bns, target, 
argv[optind+1], optind+2 < argc? argv[optind+2] : 0); bns_destroy(bns); bwt_destroy(target); From a29b1790454f8e279b97c466b1309fe350d648d6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 5 Nov 2011 14:31:30 -0400 Subject: [PATCH 077/498] further preparation for pairing --- bwtsw2_aux.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index b66744b..2d91b2d 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -66,6 +66,25 @@ void bsw2_destroy(bwtsw2_t *b) free(b); } +bwtsw2_t *bsw2_dup(const bwtsw2_t *b) +{ + bwtsw2_t *p; + int i; + p = calloc(1, sizeof(bwtsw2_t)); + p->max = p->n = b->n; + kroundup32(p->max); + p->hits = calloc(p->max, sizeof(bsw2hit_t)); + p->n_cigar = calloc(p->max, sizeof(int)); + p->cigar = calloc(p->max, sizeof(void*)); + memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); + for (i = 0; i < p->n; ++i) { + p->n_cigar[i] = b->n_cigar[i]; + p->cigar[i] = malloc(p->n_cigar[i] * 4); + memcpy(p->cigar[i], b->cigar[i], p->n_cigar[i] * 4); + } + return p; +} + #define __gen_ap(par, opt) do { \ int i; \ for (i = 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \ @@ -470,6 +489,8 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const int x; bsw2opt_t opt = *_opt; bsw2global_t *pool = bsw2_global_init(); + bwtsw2_t **buf; + buf = calloc(_seq->n, sizeof(void*)); for (x = 0; x < _seq->n; ++x) { bsw2seq1_t *p = _seq->seq + x; uint8_t *seq[2], *rseq[2]; @@ -533,11 +554,16 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const } else b[1] = 0; // generate CIGAR and print SAM gen_cigar(&opt, l, seq, pac, b[0]); - print_hits(bns, &opt, p, b[0]); + buf[x] = bsw2_dup(b[0]); // free free(seq[0]); bsw2_destroy(b[0]); } + for (x = 0; x < _seq->n; ++x) { + print_hits(bns, &opt, &_seq->seq[x], buf[x]); + bsw2_destroy(buf[x]); + } + free(buf); bsw2_global_destroy(pool); } From 17eaac5a2151c3d48e885abba26860a2dd6ec80b Mon Sep 17 00:00:00 2001 
From: Heng Li Date: Sat, 5 Nov 2011 23:17:52 -0400 Subject: [PATCH 078/498] compute insert size distribution --- Makefile | 2 +- bwtsw2.h | 7 +++++ bwtsw2_aux.c | 6 +---- bwtsw2_pair.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 6 deletions(-) create mode 100644 bwtsw2_pair.c diff --git a/Makefile b/Makefile index e4b073f..63acf1c 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o bamlite.o fastmap.o + bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread diff --git a/bwtsw2.h b/bwtsw2.h index bd6d219..3272929 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -33,6 +33,11 @@ typedef struct { uint8_t *aln_mem; } bsw2global_t; +typedef struct { + int l, tid; + char *name, *seq, *qual, *sam; +} bsw2seq1_t; + #ifdef __cplusplus extern "C" { #endif @@ -45,6 +50,8 @@ extern "C" { bsw2global_t *bsw2_global_init(); void bsw2_global_destroy(bsw2global_t *_pool); + void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); + #ifdef __cplusplus } #endif diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 2d91b2d..e992aaf 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -328,11 +328,6 @@ static void flag_fr(bwtsw2_t *b[2]) } } -typedef struct { - int l, tid; - char *name, *seq, *qual, *sam; -} bsw2seq1_t; - typedef struct { int n, max; bsw2seq1_t *seq; @@ -559,6 +554,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const free(seq[0]); bsw2_destroy(b[0]); } + bwtsw2_pair(pac, _seq->n, _seq->seq, buf); for (x = 0; x < _seq->n; ++x) { print_hits(bns, &opt, &_seq->seq[x], buf[x]); bsw2_destroy(buf[x]); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c new file mode 100644 index 0000000..cf88f8e --- /dev/null +++ 
b/bwtsw2_pair.c @@ -0,0 +1,71 @@ +#include +#include +#include +#include "bwt.h" +#include "bntseq.h" +#include "bwtsw2.h" +#include "ksw.h" + +#define MAX_INS 20000 +#define MIN_RATIO 0.8 +#define OUTLIER_BOUND 2.0 +#define MAX_STDDEV 4.0 + +typedef struct { + int low, high; + double avg, std; +} bsw2pestat_t; + +bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) +{ + extern void ks_introsort_uint64_t(size_t n, uint64_t *a); + int i, k, x, p25, p50, p75, tmp, max_len = 0; + uint64_t *isize; + bsw2pestat_t r; + + isize = calloc(n, 8); + for (i = k = 0; i < n; i += 2) { + bsw2hit_t *t[2]; + int l; + if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits + t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; + if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough + if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough + l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + (t[1]->end - t[1]->beg) : t[1]->k - t[0]->k + (t[0]->end - t[0]->beg); + max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; + max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; + isize[k++] = l; + } + ks_introsort_uint64_t(k, isize); + p25 = isize[(int)(.25 * k + .499)]; + p50 = isize[(int)(.50 * k + .499)]; + p75 = isize[(int)(.75 * k + .499)]; + tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + r.low = tmp > max_len? 
tmp : max_len; + r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + fprintf(stderr, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + fprintf(stderr, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); + for (i = x = 0, r.avg = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.avg += isize[i], ++x; + r.avg /= x; + for (i = 0, r.std = 0; i < k; ++i) + if (isize[i] >= r.low && isize[i] <= r.high) + r.std += (isize[i] - r.avg) * (isize[i] - r.avg); + r.std = sqrt(r.std / x); + fprintf(stderr, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); + tmp = (int)(p25 - 3. * (p75 - p25) + .499); + r.low = tmp > max_len? tmp : max_len; + r.high = (int)(p75 + 3. * (p75 - p25) + .499); + if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499); + r.low = tmp > max_len? tmp : max_len; + if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499); + fprintf(stderr, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + return r; +} + +void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +{ + bsw2pestat_t pes; + pes = bwtsw2_stat(n, hits); +} From c8c79ef0241de9693130201a7cc6832900455253 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 6 Nov 2011 16:20:40 -0500 Subject: [PATCH 079/498] mate rescue seems working (not MT) --- Makefile | 2 +- bwtsw2.h | 4 +- bwtsw2_aux.c | 64 ++++---- bwtsw2_core.c | 1 + bwtsw2_pair.c | 125 +++++++++++++++- ksw.c | 399 ++++++++++++++++++++++++++++++++++++++++++++++++++ ksw.h | 54 +++++++ 7 files changed, 617 insertions(+), 32 deletions(-) create mode 100644 ksw.c create mode 100644 ksw.h diff --git a/Makefile b/Makefile index 63acf1c..65ce6dd 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o 
bwtaln.o bwtgap.o \ - is.o bntseq.o bwtmisc.o bwtindex.o stdaln.o simple_dp.o \ + is.o bntseq.o bwtmisc.o bwtindex.o ksw.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o diff --git a/bwtsw2.h b/bwtsw2.h index 3272929..deae04b 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -6,6 +6,8 @@ #include "bwt_lite.h" #include "bwt.h" +#define BSW2_FLAG_MATESW 0x100 + typedef struct { int a, b, q, r, t, qr, bw; int z, is, t_seeds, hard_clip; @@ -50,7 +52,7 @@ extern "C" { bsw2global_t *bsw2_global_init(); void bsw2_global_destroy(bsw2global_t *_pool); - void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); + void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); #ifdef __cplusplus } diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index e992aaf..471c450 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -66,22 +66,14 @@ void bsw2_destroy(bwtsw2_t *b) free(b); } -bwtsw2_t *bsw2_dup(const bwtsw2_t *b) +bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) { bwtsw2_t *p; - int i; p = calloc(1, sizeof(bwtsw2_t)); p->max = p->n = b->n; kroundup32(p->max); p->hits = calloc(p->max, sizeof(bsw2hit_t)); - p->n_cigar = calloc(p->max, sizeof(int)); - p->cigar = calloc(p->max, sizeof(void*)); memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); - for (i = 0; i < p->n; ++i) { - p->n_cigar[i] = b->n_cigar[i]; - p->cigar[i] = malloc(p->n_cigar[i] * 4); - memcpy(p->cigar[i], b->cigar[i], p->n_cigar[i] * 4); - } return p; } @@ -406,9 +398,22 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n return n_cigar; } +static int est_mapq(const bsw2hit_t *p, const bsw2opt_t *opt) +{ + float c = 1.0; + int qual, subo = p->G2 > opt->t? 
p->G2 : opt->t; + if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; + if (p->n_seeds < 2) c *= .2; + qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); + if (qual > 250) qual = 250; + if (qual < 0) qual = 0; + if (p->flag&1) qual = 0; // this is a random hit + return qual; +} + /* generate SAM lines for a sequence in ks with alignment stored in * b. ks->name and ks->seq will be freed and set to NULL in the end. */ -static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b) +static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate) { int i, k; kstring_t str; @@ -433,18 +438,15 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; } - ksprintf(&str, "%s\t%d", ks->name, p->flag&0x10); + ksprintf(&str, "%s\t%d", ks->name, (p->flag&0xff)|(is_pe?1:0)); ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1); if (p->l == 0) { { // estimate mapping quality - float c = 1.0; - int subo = p->G2 > opt->t? p->G2 : opt->t; - if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; - if (p->n_seeds < 2) c *= .2; - qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); - if (qual > 250) qual = 250; - if (qual < 0) qual = 0; - if (p->flag&1) qual = 0; + qual = est_mapq(p, opt); + if ((p->flag & BSW2_FLAG_MATESW) && bmate && bmate->n == 1) { // this alignment is from Smith-Waterman rescue + int mate_qual = est_mapq(bmate->hits, opt); + qual = qual < mate_qual? 
qual : mate_qual; + } } ksprintf(&str, "\t%d\t", qual); for (k = 0; k < b->n_cigar[i]; ++k) @@ -469,6 +471,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks } else ksprintf(&str, "\t*"); ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn); if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); + if (p->flag&BSW2_FLAG_MATESW) ksprintf(&str, "\tXT:i:1"); kputc('\n', &str); } ks->sam = str.s; @@ -526,7 +529,7 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases - print_hits(bns, &opt, p, 0); + buf[x] = 0; free(seq[0]); continue; } // alignment @@ -548,17 +551,28 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bsw2_resolve_query_overlaps(b[0], opt.mask_level); } else b[1] = 0; // generate CIGAR and print SAM - gen_cigar(&opt, l, seq, pac, b[0]); - buf[x] = bsw2_dup(b[0]); + buf[x] = bsw2_dup_no_cigar(b[0]); // free free(seq[0]); bsw2_destroy(b[0]); } - bwtsw2_pair(pac, _seq->n, _seq->seq, buf); + if (is_pe) bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf); for (x = 0; x < _seq->n; ++x) { - print_hits(bns, &opt, &_seq->seq[x], buf[x]); - bsw2_destroy(buf[x]); + bsw2seq1_t *p = _seq->seq + x; + uint8_t *seq[2]; + int i; + seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; + for (i = 0; i < p->l; ++i) { + int c = nst_nt4_table[(int)p->seq[i]]; + if (c >= 4) c = (int)(drand48() * 4); + seq[0][i] = c; + seq[1][p->l-1-i] = 3 - c; + } + gen_cigar(&opt, p->l, seq, pac, buf[x]); + print_hits(bns, &opt, p, buf[x], is_pe, buf[x^1]); + free(seq[0]); } + for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); free(buf); bsw2_global_destroy(pool); } diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 398a276..67f126c 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -327,6 +327,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, 
int } if (!compatible) { p->G = 0; + if (q->G2 < p->G2) q->G2 = p->G2; break; } } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index cf88f8e..e4721d4 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "bwt.h" #include "bntseq.h" #include "bwtsw2.h" @@ -10,13 +11,14 @@ #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 #define MAX_STDDEV 4.0 +#define EXT_STDDEV 4.0 typedef struct { int low, high; double avg, std; } bsw2pestat_t; -bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; @@ -27,11 +29,11 @@ bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) for (i = k = 0; i < n; i += 2) { bsw2hit_t *t[2]; int l; - if (buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits + if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough - l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + (t[1]->end - t[1]->beg) : t[1]->k - t[0]->k + (t[0]->end - t[0]->beg); + l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; max_len = max_len > t[1]->end - t[1]->beg? 
max_len : t[1]->end - t[1]->beg; isize[k++] = l; @@ -64,8 +66,121 @@ bsw2pestat_t bwtsw2_stat(int n, bwtsw2_t **buf) return r; } -void bwtsw2_pair(const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +typedef struct { + int n_cigar, beg, end, len; + int64_t pos; + uint32_t *cigar; +} pairaux_t; + +extern unsigned char nst_nt4_table[256]; +static int8_t g_mat[25]; + +void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a) { + extern void seq_reverse(int len, ubyte_t *seq, int is_comp); + int64_t k, beg, end; + uint8_t *seq, *ref; + int i; + ksw_query_t *q; + ksw_aux_t aux[2]; + // compute the region start and end + a->n_seeds = 1; a->l = 0; a->flag |= BSW2_FLAG_MATESW; + if (h->is_rev == 0) { + beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); + end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); + a->is_rev = 1; a->flag |= 16; + } else { + beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); + end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); + a->is_rev = 0; + } + if (beg < 1) beg = 1; + if (end > l_pac) end = l_pac; + // generate the sequence + seq = malloc(l_mseq + (end - beg)); + ref = seq + l_mseq; + for (k = beg; k < end; ++k) + ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; + if (h->is_rev == 0) { + for (i = 0; i < l_mseq; ++i) { // on the reverse strand + int c = nst_nt4_table[(int)mseq[i]]; + seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; + } + } else { + for (i = 0; i < l_mseq; ++i) // on the forward strand + seq[i] = nst_nt4_table[(int)mseq[i]]; + } + /* The following code can be made up to 2-fold as fast. I am just lazy... */ + // forward Smith-Waterman + aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; + q = ksw_qinit(l_mseq * g_mat[0] < 250? 
1 : 2, l_mseq, seq, 5, g_mat); + ksw_sse2(q, end - beg, ref, &aux[0]); + free(q); + if (aux[0].score == 0) { + free(seq); + return; + } + // reverse Smith-Waterman + seq_reverse(l_mseq, seq, 0); + seq_reverse(end - beg, ref, 0); + q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); + ksw_sse2(q, end - beg, ref, &aux[1]); + free(q); + aux[1].te = end - beg - 1 - aux[1].te; // change to the forward-strand coordinate + // write output + a->G = aux[0].score; + a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; + a->k = beg + aux[1].te; + a->len = aux[0].te + 1 - aux[1].te; + a->beg = l_mseq - 1 - aux[1].qe; + a->end = aux[0].qe + 1; + free(seq); +} + +void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) +{ + extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); bsw2pestat_t pes; - pes = bwtsw2_stat(n, hits); + int i, j, k, n_rescued = 0; + pes = bsw2_stat(n, hits); + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + g_mat[k++] = i == j? opt->a : -opt->b; + g_mat[k++] = 0; + } + for (i = 0; i < n; i += 2) { + bsw2hit_t a[2]; + memset(&a, 0, sizeof(bsw2hit_t) * 2); + a[0].flag = 1<<6; a[1].flag = 1<<7; + for (j = 0; j < 2; ++j) { // set the read1/2 flag + if (hits[i+j] == 0) continue; + for (k = 0; k < hits[i+j]->n; ++k) { + bsw2hit_t *p = &hits[i+j]->hits[k]; + p->flag |= 1<<(6+j); + } + } + if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N + if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit + if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit + if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]); + if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]); + // the following enumerate all possibilities. 
It is tedious but necessary... + //if (strstr(seq[i].name, "22_49258265_49258755_4")) fprintf(stderr, "%lld\t%lld\t(%d,%d)\n", hits[i+1]->hits[0].k, a[1].k, a[0].G, a[0].G2); + if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not + bwtsw2_t *p[2]; + int which; + if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1; + else p[0] = hits[i+1], p[1] = hits[i], which = 0; + if (a[which].G == 0) continue; + if (p[1]->max == 0) { + p[1]->max = 1; + p[1]->hits = malloc(sizeof(bsw2hit_t)); + } + memcpy(p[1]->hits, &a[which], sizeof(bsw2hit_t)); + p[1]->n = 1; + ++n_rescued; + } else { // then both ends mapped + } + } + fprintf(stderr, "[%s] rescued %d reads\n", __func__, n_rescued); } diff --git a/ksw.c b/ksw.c new file mode 100644 index 0000000..c2b5f9c --- /dev/null +++ b/ksw.c @@ -0,0 +1,399 @@ +/* The MIT License + + Copyright (c) 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#include +#include +#include +#include "ksw.h" + +#ifdef __GNUC__ +#define LIKELY(x) __builtin_expect((x),1) +#define UNLIKELY(x) __builtin_expect((x),0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +struct _ksw_query_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +}; + +ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +{ + ksw_query_t *q; + int slen, a, tmp, p; + + size = size > 1? 2 : 1; + p = 8 * (3 - size); // # values per __m128i + slen = (qlen + p - 1) / p; // segmented length + q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory + q->H0 = q->qp + slen * m; + q->H1 = q->H0 + slen; + q->E = q->H1 + slen; + q->Hmax = q->E + slen; + q->slen = slen; q->qlen = qlen; q->size = size; + // compute shift + tmp = m * m; + for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score + if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; + if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; + } + q->max = q->mdiff; + q->shift = 256 - q->shift; // NB: q->shift is uint8_t + q->mdiff += q->shift; // this is the difference between the min and max scores + // An example: p=8, qlen=19, slen=3 and segmentation: + // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} + if (size == 1) { + int8_t *t = (int8_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; + } + } else { + int16_t *t = (int16_t*)q->qp; + for (a = 0; a < m; ++a) { + int i, k, nlen = slen * p; + const int8_t *ma = mat + a * m; + for (i = 0; i < slen; ++i) + for (k = i; k < nlen; k += slen) // p iterations + *t++ = (k >= qlen? 
0 : ma[query[k]]); + } + } + return q; +} + +int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0; + uint64_t *b; + __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + +#define __max_16(ret, xx) do { \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ + (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ + (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ + } while (0) + + // initialization + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi8(a->gapo + a->gape); + gape = _mm_set1_epi8(a->gape); + shift = _mm_set1_epi8(q->shift); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, cmp, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian + for (j = 0; LIKELY(j < slen); ++j) { + /* SW cells are computed in the following order: + * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} + * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} + */ + // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) + h = _mm_adds_epu8(h, _mm_load_si128(S + j)); + h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) + e = _mm_load_si128(E + j); // e=E'(i,j) + h = _mm_max_epu8(h, e); + h = _mm_max_epu8(h, f); // h=H'(i,j) + max = _mm_max_epu8(max, h); // set max + _mm_store_si128(H1 + j, h); // save to H'(i,j) + // now compute E'(i+1,j) + h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo 
+ e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape + e = _mm_max_epu8(e, h); // e=E'(i+1,j) + _mm_store_si128(E + j, e); // save to E'(i+1,j) + // now compute F'(i,j+1) + f = _mm_subs_epu8(f, gape); + f = _mm_max_epu8(f, h); + // get H'(i-1,j) and prepare for the next j + h = _mm_load_si128(H0 + j); // h=H'(i-1,j) + } + // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion + for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max + f = _mm_slli_si128(f, 1); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epu8(h, f); // h=H'(i,j) + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu8(h, gapoe); + f = _mm_subs_epu8(f, gape); + cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); + if (UNLIKELY(cmp == 0xffff)) goto end_loop16; + } + } +end_loop16: + //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); + __max_16(imax, max); // imax is the maximum number in max + if (imax >= a->T) { // write the b array; this condition adds branching unfornately + if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append + if (n_b == m_b) { + m_b = m_b? 
m_b<<1 : 8; + b = realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; // te is the end position on the target + for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax + q->shift >= 255) break; + } + S = H1; H1 = H0; H0 = S; // swap H0 and H1 + } + a->score = gmax; a->te = te; + { // get a->qe, the end of query match; find the 2nd best score + int max = -1, low, high, qlen = slen * 16; + uint8_t *t = (uint8_t*)Hmax; + for (i = 0, a->qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen; + //printf("%d,%d\n", max, gmax); + i = (a->score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0, a->score2 = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) + a->score2 = b[i]>>32, a->te2 = e; + } + } + free(b); + return a->score + q->shift >= 255? 
255 : a->score; +} + +int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +{ + int slen, i, m_b, n_b, te = -1, gmax = 0; + uint64_t *b; + __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + +#define __max_8(ret, xx) do { \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ + (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ + (ret) = _mm_extract_epi16((xx), 0); \ + } while (0) + + // initialization + m_b = n_b = 0; b = 0; + zero = _mm_set1_epi32(0); + gapoe = _mm_set1_epi16(a->gapo + a->gape); + gape = _mm_set1_epi16(a->gape); + H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; + slen = q->slen; + for (i = 0; i < slen; ++i) { + _mm_store_si128(E + i, zero); + _mm_store_si128(H0 + i, zero); + _mm_store_si128(Hmax + i, zero); + } + // the core loop + for (i = 0; i < tlen; ++i) { + int j, k, imax; + __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example + h = _mm_slli_si128(h, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_adds_epi16(h, *S++); + e = _mm_load_si128(E + j); + h = _mm_max_epi16(h, e); + h = _mm_max_epi16(h, f); + max = _mm_max_epi16(max, h); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + e = _mm_subs_epu16(e, gape); + e = _mm_max_epi16(e, h); + _mm_store_si128(E + j, e); + f = _mm_subs_epu16(f, gape); + f = _mm_max_epi16(f, h); + h = _mm_load_si128(H0 + j); + } + for (k = 0; LIKELY(k < 16); ++k) { + f = _mm_slli_si128(f, 2); + for (j = 0; LIKELY(j < slen); ++j) { + h = _mm_load_si128(H1 + j); + h = _mm_max_epi16(h, f); + _mm_store_si128(H1 + j, h); + h = _mm_subs_epu16(h, gapoe); + f = _mm_subs_epu16(f, gape); + if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; + } + } +end_loop8: + __max_8(imax, max); + if (imax >= a->T) { + if (n_b == 0 || (int32_t)b[n_b-1] + 1 
!= i) { + if (n_b == m_b) { + m_b = m_b? m_b<<1 : 8; + b = realloc(b, 8 * m_b); + } + b[n_b++] = (uint64_t)imax<<32 | i; + } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last + } + if (imax > gmax) { + gmax = imax; te = i; + for (j = 0; LIKELY(j < slen); ++j) + _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + } + S = H1; H1 = H0; H0 = S; + } + a->score = gmax; a->te = te; + { + int max = -1, low, high, qlen = slen * 8; + uint16_t *t = (uint16_t*)Hmax; + for (i = 0, a->qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen; + i = (a->score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0, a->score2 = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) + a->score2 = b[i]>>32, a->te2 = e; + } + } + free(b); + return a->score; +} + +int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) +{ + if (q->size == 1) return ksw_sse2_16(q, tlen, target, a); + else return ksw_sse2_8(q, tlen, target, a); +} + +/******************************************* + * Main function (not compiled by default) * + *******************************************/ + +#ifdef _KSW_MAIN + +#include +#include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +int main(int argc, char *argv[]) +{ + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2; + int8_t mat[25]; + ksw_aux_t a; + gzFile fpt, fpq; + kseq_t *kst, *ksq; + // parse command line + a.gapo = 5; a.gape = 2; a.T = 10; + while ((c = getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) { + switch (c) { + case 'a': sa = atoi(optarg); break; + case 'b': sb = atoi(optarg); break; + case 'q': a.gapo = atoi(optarg); break; + case 'r': a.gape = atoi(optarg); break; + case 't': a.T = atoi(optarg); break; + case 'f': forward_only = 1; break; + case 's': size = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] \n", size, sa, sb, a.gapo, a.gape); + return 1; + } + // initialize scoring matrix + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? sa : -sb; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; + // open file + fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt); + fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); + // all-pair alignment + while (kseq_read(ksq) > 0) { + ksw_query_t *q[2]; + for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; + q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + if (!forward_only) { // reverse + for (i = 0; i < ksq->seq.l/2; ++i) { + int t = ksq->seq.s[i]; + ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i]; + ksq->seq.s[ksq->seq.l-1-i] = t; + } + for (i = 0; i < ksq->seq.l; ++i) + ksq->seq.s[i] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; + q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + } else q[1] = 0; + gzrewind(fpt); kseq_rewind(kst); + while (kseq_read(kst) > 0) { + int s; + for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a); + printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + if (q[1]) { + s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a); + printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + } + } + free(q[0]); free(q[1]); + } + kseq_destroy(kst); gzclose(fpt); + kseq_destroy(ksq); gzclose(fpq); + return 0; +} +#endif diff --git a/ksw.h b/ksw.h new file mode 100644 index 0000000..d93d6a9 --- /dev/null +++ b/ksw.h @@ -0,0 +1,54 @@ +#ifndef __AC_KSW_H +#define __AC_KSW_H + +struct _ksw_query_t; +typedef struct _ksw_query_t ksw_query_t; + +typedef struct { + // input + unsigned gapo, gape; // the first gap costs gapo+gape + unsigned T; // threshold + // output + int score, te, qe, score2, te2; +} ksw_aux_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ + ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat); // to free, simply call free() + + /** + * Compute the maximum local score for queries initialized with ksw_qinit(1, ...) + * + * @param q Query data structure returned by ksw_qinit(1, ...) 
+ * @param tlen Length of the target sequence + * @param target Target sequence + * @param a Auxiliary data structure (see ksw.h) + * + * @return The maximum local score; if the returned value equals 255, the SW may not be finished + */ + int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + + /** Compute the maximum local score for queries initialized with ksw_qinit(2, ...) */ + int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + + /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ + int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + +#ifdef __cplusplus +} +#endif + +#endif From e06685db45542356d9e6b975918e84c9266fac67 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 7 Nov 2011 00:51:43 -0500 Subject: [PATCH 080/498] bwa-sw PE seems working (SAM is incorrect) --- bwtsw2.h | 2 ++ bwtsw2_aux.c | 19 ++++++++---- bwtsw2_pair.c | 82 +++++++++++++++++++++++++++++++++++++++++---------- main.c | 2 +- 4 files changed, 83 insertions(+), 22 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index deae04b..951d2f1 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -6,7 +6,9 @@ #include "bwt_lite.h" #include "bwt.h" +#define BSW2_FLAG_MOVED 0x80 #define BSW2_FLAG_MATESW 0x100 +#define BSW2_FLAG_TANDEM 0x200 typedef struct { int a, b, q, r, t, qr, bw; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 471c450..e077580 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -431,21 +431,28 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks bsw2hit_t *p = b->hits + i; int seqid = -1; int64_t coor = -1; - int j, qual, nn = 0; + int j, nn = 0; int beg, end; if (p->l == 0) { b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]); nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); coor = p->k - bns->anns[seqid].offset; } - ksprintf(&str, "%s\t%d", ks->name, (p->flag&0xff)|(is_pe?1:0)); + ksprintf(&str, "%s\t%d", ks->name, (p->flag&0x7f)|(is_pe?1:0)); ksprintf(&str, 
"\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1); if (p->l == 0) { - { // estimate mapping quality - qual = est_mapq(p, opt); - if ((p->flag & BSW2_FLAG_MATESW) && bmate && bmate->n == 1) { // this alignment is from Smith-Waterman rescue - int mate_qual = est_mapq(bmate->hits, opt); + int qual = est_mapq(p, opt); + if (is_pe && bmate && bmate->n == 1) { + int mate_qual = est_mapq(bmate->hits, opt); + if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman qual = qual < mate_qual? qual : mate_qual; + } else if (p->flag&2) { // properly paired + if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat + if (qual < mate_qual) { + qual += 20; + if (qual >= mate_qual) qual = mate_qual; + } + } } } ksprintf(&str, "\t%d\t", qual); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index e4721d4..1a1d3ca 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -63,6 +63,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf) r.low = tmp > max_len? tmp : max_len; if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499); fprintf(stderr, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + free(isize); return r; } @@ -110,30 +111,32 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b for (i = 0; i < l_mseq; ++i) // on the forward strand seq[i] = nst_nt4_table[(int)mseq[i]]; } - /* The following code can be made up to 2-fold as fast. I am just lazy... */ // forward Smith-Waterman aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); ksw_sse2(q, end - beg, ref, &aux[0]); free(q); - if (aux[0].score == 0) { + if (aux[0].score < opt->t) { + aux[0].score = 0; free(seq); return; } + ++aux[0].qe; ++aux[0].te; // reverse Smith-Waterman - seq_reverse(l_mseq, seq, 0); - seq_reverse(end - beg, ref, 0); - q = ksw_qinit(l_mseq * g_mat[0] < 250? 
1 : 2, l_mseq, seq, 5, g_mat); - ksw_sse2(q, end - beg, ref, &aux[1]); + seq_reverse(aux[0].qe, seq, 0); + seq_reverse(aux[0].te, ref, 0); + q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); + ksw_sse2(q, aux[0].te, ref, &aux[1]); free(q); - aux[1].te = end - beg - 1 - aux[1].te; // change to the forward-strand coordinate + ++aux[1].qe; ++aux[1].te; // write output a->G = aux[0].score; a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; - a->k = beg + aux[1].te; - a->len = aux[0].te + 1 - aux[1].te; - a->beg = l_mseq - 1 - aux[1].qe; - a->end = aux[0].qe + 1; + a->k = beg + (aux[0].te - aux[1].te); + a->len = aux[1].te; + a->beg = aux[0].qe - aux[1].qe; + a->end = aux[0].qe; + if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; free(seq); } @@ -141,7 +144,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b { extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); bsw2pestat_t pes; - int i, j, k, n_rescued = 0; + int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0; pes = bsw2_stat(n, hits); for (i = k = 0; i < 5; ++i) { for (j = 0; j < 4; ++j) @@ -165,7 +168,6 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]); if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]); // the following enumerate all possibilities. It is tedious but necessary... 
- //if (strstr(seq[i].name, "22_49258265_49258755_4")) fprintf(stderr, "%lld\t%lld\t(%d,%d)\n", hits[i+1]->hits[0].k, a[1].k, a[0].G, a[0].G2); if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not bwtsw2_t *p[2]; int which; @@ -176,11 +178,61 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b p[1]->max = 1; p[1]->hits = malloc(sizeof(bsw2hit_t)); } - memcpy(p[1]->hits, &a[which], sizeof(bsw2hit_t)); + p[1]->hits[0] = a[which]; p[1]->n = 1; ++n_rescued; } else { // then both ends mapped + //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end); + for (j = 0; j < 2; ++j) { // first fix wrong mappings + if (hits[i+j]->hits[0].G < a[j].G) { // the orginal mapping is suboptimal + a[j].G2 = a[j].G2 > hits[i+j]->hits[0].G? a[j].G2 : hits[i+j]->hits[0].G; + hits[i+j]->hits[0] = a[j]; + ++n_fixed; + } + } + if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved + for (j = 0; j < 2; ++j) { + if (hits[i+j]->hits[0].G2 < a[j].G2) + hits[i+j]->hits[0].G2 = a[j].G2; + if (a[j].G2) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; + hits[i+j]->hits[0].flag |= 2; + } + } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match + for (j = 0; j < 2; ++j) { + hits[i+j]->hits[0].flag |= 2; + if (hits[i+j]->hits[0].k != a[j].k) + hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM | 2; + } + } else if (a[0].G || a[1].G) { // it is possible to move one end + if (a[0].G && a[1].G) { // now we have two "proper pairs" + int G[2]; + double diff; + G[0] = hits[i]->hits[0].G + a[1].G; + G[1] = hits[i+1]->hits[0].G + a[0].G; + diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.); + if (diff > 0.05) a[G[0] > G[1]? 
0 : 1].G = 0; + } + if (a[0].G == 0 || a[1].G == 0) { // one proper pair only + bsw2hit_t *p[2]; + int which, isize; + double dev, diff; + if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0; + else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1; + isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k; + dev = fabs(isize - pes.avg) / pes.std; + diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0; + if (diff < dev * 2.) { // then move + int tflag = 0; + if (a[which].G - a[which].G2 < 2 * (opt->a + opt->b)) tflag = BSW2_FLAG_TANDEM; + a[which].G2 = a[which].G; + p[1][0] = a[which]; + p[1]->flag |= BSW2_FLAG_MOVED | 2 | tflag; + p[0]->flag |= 2; + ++n_moved; + } + } + } } } - fprintf(stderr, "[%s] rescued %d reads\n", __func__, n_rescued); + fprintf(stderr, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved); } diff --git a/main.c b/main.c index 7f9601f..1e5613c 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r70-dev" +#define PACKAGE_VERSION "0.6.0-r77-dev" #endif static int usage() From b42910ada6b467f473b74a3e447bcbfe91af1a9a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 00:49:21 -0500 Subject: [PATCH 081/498] proper mate information --- bwtsw2.h | 10 ++- bwtsw2_aux.c | 182 ++++++++++++++++++++++++++++++++------------------ bwtsw2_pair.c | 18 +++-- main.c | 2 +- 4 files changed, 136 insertions(+), 76 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 951d2f1..0718826 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -6,9 +6,9 @@ #include "bwt_lite.h" #include "bwt.h" -#define BSW2_FLAG_MOVED 0x80 #define BSW2_FLAG_MATESW 0x100 #define BSW2_FLAG_TANDEM 0x200 +#define BSW2_FLAG_MOVED 0x400 typedef struct { int a, b, q, r, t, qr, bw; @@ -24,11 +24,15 @@ typedef struct { int beg, end; } bsw2hit_t; +typedef struct { + int flag, nn, n_cigar, 
chr, pos, qual, mchr, mpos, pqual, isize; + uint32_t *cigar; +} bsw2aux_t; + typedef struct { int n, max; bsw2hit_t *hits; - int *n_cigar; - uint32_t **cigar; + bsw2aux_t *aux; } bwtsw2_t; typedef struct { diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index e077580..eff6cd1 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -60,9 +60,9 @@ void bsw2_destroy(bwtsw2_t *b) { int i; if (b == 0) return; - if (b->cigar) - for (i = 0; i < b->n; ++i) free(b->cigar[i]); - free(b->cigar); free(b->n_cigar); free(b->hits); + if (b->aux) + for (i = 0; i < b->n; ++i) free(b->aux[i].cigar); + free(b->aux); free(b->hits); free(b); } @@ -175,18 +175,10 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pa i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length target = calloc(i, 1); path = calloc(i + lq, sizeof(path_t)); - // memory clean up for b - if (b->n < b->max) { - b->max = b->n; - b->hits = realloc(b->hits, b->n * sizeof(bsw2hit_t)); - } - if (b->cigar) free(b->cigar); - if (b->n_cigar) free(b->n_cigar); - b->cigar = (uint32_t**)calloc(b->max, sizeof(void*)); - b->n_cigar = (int*)calloc(b->max, sizeof(int)); // generate CIGAR for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; + bsw2aux_t *q = b->aux + i; uint8_t *query; bwtint_t k; int score, path_len, beg, end; @@ -197,17 +189,17 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pa for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); - b->cigar[i] = aln_path2cigar32(path, path_len, &b->n_cigar[i]); + q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); if (beg != 0 || end < lq) { // write soft clipping - b->cigar[i] = realloc(b->cigar[i], 4 * (b->n_cigar[i] + 2)); + q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { - memmove(b->cigar[i] + 1, b->cigar[i], 
b->n_cigar[i] * 4); - b->cigar[i][0] = beg<<4 | 4; - ++b->n_cigar[i]; + memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); + q->cigar[0] = beg<<4 | 4; + ++q->n_cigar; } if (end < lq) { - b->cigar[i][b->n_cigar[i]] = (lq - end)<<4 | 4; - ++b->n_cigar[i]; + q->cigar[q->n_cigar] = (lq - end)<<4 | 4; + ++q->n_cigar; } } } @@ -325,7 +317,7 @@ typedef struct { bsw2seq1_t *seq; } bsw2seq_t; -static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) +static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) { // FIXME: this routine does not work if the query bridge three reference sequences int32_t coor, refl, lq; @@ -398,17 +390,82 @@ static int fix_cigar(const char *qname, const bntseq_t *bns, bsw2hit_t *p, int n return n_cigar; } -static int est_mapq(const bsw2hit_t *p, const bsw2opt_t *opt) +static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b) { - float c = 1.0; - int qual, subo = p->G2 > opt->t? p->G2 : opt->t; - if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; - if (p->n_seeds < 2) c *= .2; - qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); - if (qual > 250) qual = 250; - if (qual < 0) qual = 0; - if (p->flag&1) qual = 0; // this is a random hit - return qual; + int i; + // allocate for b->aux + if (b->n<<1 < b->max) { + b->max = b->n; + kroundup32(b->max); + b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t)); + } + b->aux = calloc(b->n, sizeof(bsw2aux_t)); + // generate CIGAR + gen_cigar(opt, qlen, seq, pac, b); + // fix CIGAR, generate mapQ, and write chromosomal position + for (i = 0; i < b->n; ++i) { + bsw2hit_t *p = &b->hits[i]; + bsw2aux_t *q = &b->aux[i]; + q->flag = p->flag & 0xfe; + q->isize = 0; + if (p->l == 0) { // unique hit + float c = 1.0; + int subo; + // fix out-of-boundary CIGAR + q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); + // compute mapQ + subo = p->G2 > opt->t? 
p->G2 : opt->t; + if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; + if (p->n_seeds < 2) c *= .2; + q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); + if (q->qual > 250) q->qual = 250; + if (q->qual < 0) q->qual = 0; + if (p->flag&1) q->qual = 0; // this is a random hit + q->pqual = q->qual; // set the paired qual as qual + // get the chromosomal position + q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr); + q->pos = p->k - bns->anns[q->chr].offset; + } else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0; + } +} + +static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m) +{ + int i; + if (m == 0) return; + // update flag, mchr and mpos + for (i = 0; i < b->n; ++i) { + bsw2aux_t *q = &b->aux[i]; + q->flag |= 1; // paired + if (m->n == 0) q->flag |= 8; // mate unmapped + if (m->n == 1) { + q->mchr = m->aux[0].chr; + q->mpos = m->aux[0].pos; + if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand + if (q->chr == q->mchr) { // set insert size + if (q->mpos + m->hits[0].len > q->pos) + q->isize = q->mpos + m->hits[0].len - q->pos; + else q->isize = q->mpos - q->pos - b->hits[0].len; + } else q->isize = 0; + } else q->mchr = q->mpos = -1; + } + // update mapping quality + if (b->n == 1 && m->n == 1) { + bsw2hit_t *p = &b->hits[0]; + int isize; + if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman + if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < m->aux[0].qual) + b->aux[0].pqual = m->aux[0].qual; + } else if (p->flag&2) { // properly paired + if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat + if (b->aux[0].pqual < m->aux[0].qual) { + b->aux[0].pqual += 20; + if (b->aux[0].pqual >= m->aux[0].qual) + b->aux[0].pqual = m->aux[0].qual; + } + } + } + } } /* generate SAM lines for a sequence in ks with alignment stored in @@ -429,46 +486,28 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks } for (i = 0; b && i < b->n; ++i) { bsw2hit_t 
*p = b->hits + i; - int seqid = -1; - int64_t coor = -1; - int j, nn = 0; - int beg, end; - if (p->l == 0) { - b->n_cigar[i] = fix_cigar(ks->name, bns, p, b->n_cigar[i], b->cigar[i]); - nn = bns_cnt_ambi(bns, p->k, p->len, &seqid); - coor = p->k - bns->anns[seqid].offset; - } - ksprintf(&str, "%s\t%d", ks->name, (p->flag&0x7f)|(is_pe?1:0)); - ksprintf(&str, "\t%s\t%ld", seqid>=0? bns->anns[seqid].name : "*", (long)coor + 1); - if (p->l == 0) { - int qual = est_mapq(p, opt); - if (is_pe && bmate && bmate->n == 1) { - int mate_qual = est_mapq(bmate->hits, opt); - if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman - qual = qual < mate_qual? qual : mate_qual; - } else if (p->flag&2) { // properly paired - if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat - if (qual < mate_qual) { - qual += 20; - if (qual >= mate_qual) qual = mate_qual; - } - } - } - } - ksprintf(&str, "\t%d\t", qual); - for (k = 0; k < b->n_cigar[i]; ++k) - ksprintf(&str, "%d%c", b->cigar[i][k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[b->cigar[i][k]&0xf]); + bsw2aux_t *q = b->aux + i; + int j, beg, end, type = 0; + // print mandatory fields before SEQ + ksprintf(&str, "%s\t%d", ks->name, q->flag); + ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); + if (p->l == 0) { // not a repetitive hit + ksprintf(&str, "\t%d\t", q->pqual); + for (k = 0; k < q->n_cigar; ++k) + ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); } else ksprintf(&str, "\t0\t*"); - ksprintf(&str, "\t*\t0\t0\t"); + ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); + // get the sequence begin and end beg = 0; end = ks->l; if (opt->hard_clip) { - if ((b->cigar[i][0]&0xf) == 4) beg += b->cigar[i][0]>>4; - if ((b->cigar[i][b->n_cigar[i]-1]&0xf) == 4) end -= b->cigar[i][b->n_cigar[i]-1]>>4; + if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; + if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; } for (j = beg; j < end; ++j) { if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str); else kputc(ks->seq[j], &str); } + // print base quality if present if (ks->qual) { kputc('\t', &str); for (j = beg; j < end; ++j) { @@ -476,9 +515,13 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks else kputc(ks->qual[j], &str); } } else ksprintf(&str, "\t*"); - ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tXN:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, nn); + // print optional tags + ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds); + if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); - if (p->flag&BSW2_FLAG_MATESW) ksprintf(&str, "\tXT:i:1"); + if (p->flag&BSW2_FLAG_MATESW) type |= 1; + if (p->flag&BSW2_FLAG_TANDEM) type |= 2; + if (type) ksprintf(&str, "\tXT:i:%d", type); kputc('\n', &str); } ks->sam = str.s; @@ -575,10 +618,13 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const seq[0][i] = c; seq[1][p->l-1-i] = 3 - c; } - gen_cigar(&opt, p->l, seq, pac, buf[x]); - print_hits(bns, &opt, p, buf[x], is_pe, buf[x^1]); + write_aux(&opt, bns, p->l, seq, pac, buf[x]); free(seq[0]); } + for (x = 0; x < _seq->n; ++x) { + if (is_pe) update_mate_aux(buf[x], buf[x^1]); + print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]); + } for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); free(buf); bsw2_global_destroy(pool); @@ -681,6 +727,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t 
*bns, bwt_t * const target, c is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; while (kseq_read(ks) >= 0) { + if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/') + ks->name.l -= 2, ks->name.s[ks->name.l] = 0; if (_seq->n == _seq->max) { _seq->max = _seq->max? _seq->max<<1 : 1024; _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); @@ -689,6 +737,8 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c size += ks->seq.l; if (ks2) { if (kseq_read(ks2) >= 0) { + if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/') + ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0; kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge size += ks->seq.l; } else { diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 1a1d3ca..ac149a4 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -85,7 +85,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b ksw_query_t *q; ksw_aux_t aux[2]; // compute the region start and end - a->n_seeds = 1; a->l = 0; a->flag |= BSW2_FLAG_MATESW; + a->n_seeds = 1; a->l = 0; if (h->is_rev == 0) { beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); @@ -168,21 +168,27 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]); if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]); // the following enumerate all possibilities. It is tedious but necessary... 
- if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not + if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not; bwtsw2_t *p[2]; int which; if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1; else p[0] = hits[i+1], p[1] = hits[i], which = 0; if (a[which].G == 0) continue; + a[which].flag |= BSW2_FLAG_MATESW; + if (a[which].G2) a[which].flag |= BSW2_FLAG_TANDEM; if (p[1]->max == 0) { p[1]->max = 1; p[1]->hits = malloc(sizeof(bsw2hit_t)); } p[1]->hits[0] = a[which]; p[1]->n = 1; + p[0]->hits[0].flag |= 2; + p[1]->hits[0].flag |= 2; ++n_rescued; } else { // then both ends mapped + int ori_G2[2]; //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end); + ori_G2[0] = a[0].G2; ori_G2[1] = a[1].G2; for (j = 0; j < 2; ++j) { // first fix wrong mappings if (hits[i+j]->hits[0].G < a[j].G) { // the orginal mapping is suboptimal a[j].G2 = a[j].G2 > hits[i+j]->hits[0].G? a[j].G2 : hits[i+j]->hits[0].G; @@ -194,14 +200,14 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b for (j = 0; j < 2; ++j) { if (hits[i+j]->hits[0].G2 < a[j].G2) hits[i+j]->hits[0].G2 = a[j].G2; - if (a[j].G2) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; + if (ori_G2[j]) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; hits[i+j]->hits[0].flag |= 2; } } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match for (j = 0; j < 2; ++j) { hits[i+j]->hits[0].flag |= 2; if (hits[i+j]->hits[0].k != a[j].k) - hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM | 2; + hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; } } else if (a[0].G || a[1].G) { // it is possible to move one end if (a[0].G && a[1].G) { // now we have two "proper pairs" @@ -223,14 +229,14 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0; if (diff < dev * 2.) 
{ // then move int tflag = 0; - if (a[which].G - a[which].G2 < 2 * (opt->a + opt->b)) tflag = BSW2_FLAG_TANDEM; + if (ori_G2[which]) tflag = BSW2_FLAG_TANDEM; a[which].G2 = a[which].G; p[1][0] = a[which]; p[1]->flag |= BSW2_FLAG_MOVED | 2 | tflag; p[0]->flag |= 2; ++n_moved; } - } + } // else, do nothing } } } diff --git a/main.c b/main.c index 1e5613c..7130310 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r77-dev" +#define PACKAGE_VERSION "0.6.0-r78-dev" #endif static int usage() From fa8cfe5567e1945450328cec9d1ea2b8fddc3441 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 12:12:45 -0500 Subject: [PATCH 082/498] bugfix: wrong mapping quality --- bwtsw2.h | 7 ++++--- bwtsw2_aux.c | 20 +++++++++----------- bwtsw2_pair.c | 48 +++++++++++++++++++++++++----------------------- main.c | 2 +- 4 files changed, 39 insertions(+), 38 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 0718826..c13ef9c 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -6,9 +6,10 @@ #include "bwt_lite.h" #include "bwt.h" -#define BSW2_FLAG_MATESW 0x100 -#define BSW2_FLAG_TANDEM 0x200 -#define BSW2_FLAG_MOVED 0x400 +#define BSW2_FLAG_MATESW 0x100 +#define BSW2_FLAG_TANDEM 0x200 +#define BSW2_FLAG_MOVED 0x400 +#define BSW2_FLAG_RESCUED 0x800 typedef struct { int a, b, q, r, t, qr, bw; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index eff6cd1..6dc4f36 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -452,17 +452,15 @@ static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m) // update mapping quality if (b->n == 1 && m->n == 1) { bsw2hit_t *p = &b->hits[0]; - int isize; - if (p->flag & BSW2_FLAG_MATESW) { // this alignment is rescued by Smith-Waterman - if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < m->aux[0].qual) - b->aux[0].pqual = m->aux[0].qual; - } else if (p->flag&2) { // properly paired - if (!(p->flag & BSW2_FLAG_TANDEM)) { // not around a tandem repeat - if (b->aux[0].pqual < m->aux[0].qual) { - 
b->aux[0].pqual += 20; - if (b->aux[0].pqual >= m->aux[0].qual) - b->aux[0].pqual = m->aux[0].qual; - } + if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman + if (!(p->flag & BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20) + b->aux[0].pqual = 20; + if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; + } else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired + if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual] + b->aux[0].pqual += 20; + if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; + if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual; } } } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index ac149a4..5a83199 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -85,7 +85,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b ksw_query_t *q; ksw_aux_t aux[2]; // compute the region start and end - a->n_seeds = 1; a->l = 0; + a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 if (h->is_rev == 0) { beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); @@ -117,7 +117,6 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b ksw_sse2(q, end - beg, ref, &aux[0]); free(q); if (aux[0].score < opt->t) { - aux[0].score = 0; free(seq); return; } @@ -132,6 +131,8 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b // write output a->G = aux[0].score; a->G2 = aux[0].score2 > aux[1].score2? 
aux[0].score2 : aux[1].score2; + if (a->G2 < opt->t) a->G2 = 0; + if (a->G2) a->flag |= BSW2_FLAG_TANDEM; a->k = beg + (aux[0].te - aux[1].te); a->len = aux[1].te; a->beg = aux[0].qe - aux[1].qe; @@ -174,8 +175,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1; else p[0] = hits[i+1], p[1] = hits[i], which = 0; if (a[which].G == 0) continue; - a[which].flag |= BSW2_FLAG_MATESW; - if (a[which].G2) a[which].flag |= BSW2_FLAG_TANDEM; + a[which].flag |= BSW2_FLAG_RESCUED; if (p[1]->max == 0) { p[1]->max = 1; p[1]->hits = malloc(sizeof(bsw2hit_t)); @@ -186,30 +186,31 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b p[1]->hits[0].flag |= 2; ++n_rescued; } else { // then both ends mapped - int ori_G2[2]; + int is_fixed = 0; //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end); - ori_G2[0] = a[0].G2; ori_G2[1] = a[1].G2; - for (j = 0; j < 2; ++j) { // first fix wrong mappings - if (hits[i+j]->hits[0].G < a[j].G) { // the orginal mapping is suboptimal - a[j].G2 = a[j].G2 > hits[i+j]->hits[0].G? a[j].G2 : hits[i+j]->hits[0].G; - hits[i+j]->hits[0] = a[j]; + for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score + bsw2hit_t *p = &hits[i+j]->hits[0]; + if (p->G < a[j].G) { // the orginal mapping is suboptimal + a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM? 
+ *p = a[j]; ++n_fixed; + is_fixed = 1; + } else if (p->k != a[j].k && p->G2 < a[j].G) { + p->G2 = a[j].G; + } else if (p->k == a[j].k && p->G2 < a[j].G2) { + p->G2 = a[j].G2; } } if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved - for (j = 0; j < 2; ++j) { - if (hits[i+j]->hits[0].G2 < a[j].G2) - hits[i+j]->hits[0].G2 = a[j].G2; - if (ori_G2[j]) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; - hits[i+j]->hits[0].flag |= 2; - } + for (j = 0; j < 2; ++j) + hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM); } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match for (j = 0; j < 2; ++j) { hits[i+j]->hits[0].flag |= 2; if (hits[i+j]->hits[0].k != a[j].k) hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; } - } else if (a[0].G || a[1].G) { // it is possible to move one end + } else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end if (a[0].G && a[1].G) { // now we have two "proper pairs" int G[2]; double diff; @@ -219,7 +220,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0; } if (a[0].G == 0 || a[1].G == 0) { // one proper pair only - bsw2hit_t *p[2]; + bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved int which, isize; double dev, diff; if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0; @@ -227,16 +228,17 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k; dev = fabs(isize - pes.avg) / pes.std; diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0; - if (diff < dev * 2.) { // then move - int tflag = 0; - if (ori_G2[which]) tflag = BSW2_FLAG_TANDEM; + if (diff < dev * 2.) 
{ // then move (heuristic) a[which].G2 = a[which].G; p[1][0] = a[which]; - p[1]->flag |= BSW2_FLAG_MOVED | 2 | tflag; + p[1]->flag |= BSW2_FLAG_MOVED | 2; p[0]->flag |= 2; ++n_moved; } - } // else, do nothing + } + } else if (is_fixed) { + hits[i+0]->hits[0].flag |= 2; + hits[i+1]->hits[0].flag |= 2; } } } diff --git a/main.c b/main.c index 7130310..edebe0b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r78-dev" +#define PACKAGE_VERSION "0.6.0-r79-dev" #endif static int usage() From 806069341169c506777efa9727ab26f616d1cfcb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 16:50:58 -0500 Subject: [PATCH 083/498] multithreading works again --- bwtsw2_aux.c | 46 +++++++++++++++++++++++++++++++--------------- bwtsw2_pair.c | 42 ++++++++++++++++++++++++++++++------------ main.c | 46 ++++++++++++++++++++++++++++------------------ utils.c | 16 ++++++++++++++++ utils.h | 3 +++ 5 files changed, 108 insertions(+), 45 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 6dc4f36..5d57ba8 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -71,9 +71,11 @@ bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) bwtsw2_t *p; p = calloc(1, sizeof(bwtsw2_t)); p->max = p->n = b->n; - kroundup32(p->max); - p->hits = calloc(p->max, sizeof(bsw2hit_t)); - memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); + if (b->n) { + kroundup32(p->max); + p->hits = calloc(p->max, sizeof(bsw2hit_t)); + memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); + } return p; } @@ -530,7 +532,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks /* Core routine to align reads in _seq. 
It is separated from * process_seqs() to realize multi-threading */ -static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) +static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int x; bsw2opt_t opt = *_opt; @@ -543,11 +545,6 @@ static void bsw2_aln_core(int tid, bsw2seq_t *_seq, const bsw2opt_t *_opt, const int i, l, k; bwtsw2_t *b[2]; l = p->l; - -#ifdef HAVE_PTHREAD - if (x % _opt->n_threads != tid) continue; -#endif - // set opt->t opt.t = _opt->t; if (opt.t < log(l) * opt.coef) opt.t = (int)(log(l) * opt.coef + .499); @@ -642,7 +639,7 @@ typedef struct { static void *worker(void *data) { thread_aux_t *p = (thread_aux_t*)data; - bsw2_aln_core(p->tid, p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); + bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); return 0; } #endif @@ -652,10 +649,11 @@ static void *worker(void *data) static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int i; + is_pe = is_pe? 
1 : 0; #ifdef HAVE_PTHREAD if (opt->n_threads <= 1) { - bsw2_aln_core(0, _seq, opt, bns, pac, target, is_pe); + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); } else { pthread_t *tid; pthread_attr_t attr; @@ -667,15 +665,33 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; - p->tid = j; p->_seq = _seq; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; + p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; p->pac = pac; p->target = target; - pthread_create(&tid[j], &attr, worker, p); + p->_seq = calloc(1, sizeof(bsw2seq_t)); + p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; + p->_seq->n = 0; + p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); } + for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + p->seq[p->n++] = _seq->seq[i]; + } + for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]); for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); + for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0; + for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back + bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; + _seq->seq[i] = p->seq[p->n++]; + } + for (j = 0; j < opt->n_threads; ++j) { + thread_aux_t *p = data + j; + free(p->_seq->seq); + free(p->_seq); + } free(data); free(tid); } #else - bsw2_aln_core(0, _seq, opt, bns, pac, target, is_pe); + bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); #endif // print and reset @@ -744,7 +760,7 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c is_pe = 0; } } - if (size > opt->chunk_size) { + if (size > opt->chunk_size * opt->n_threads) { fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); process_seqs(_seq, opt, bns, pac, target, is_pe); 
size = 0; diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 5a83199..581e19c 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -6,6 +6,7 @@ #include "bntseq.h" #include "bwtsw2.h" #include "ksw.h" +#include "kstring.h" #define MAX_INS 20000 #define MIN_RATIO 0.8 @@ -14,17 +15,18 @@ #define EXT_STDDEV 4.0 typedef struct { - int low, high; + int low, high, failed; double avg, std; } bsw2pestat_t; -bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf) +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; uint64_t *isize; bsw2pestat_t r; + memset(&r, 0, sizeof(bsw2pestat_t)); isize = calloc(n, 8); for (i = k = 0; i < n; i += 2) { bsw2hit_t *t[2]; @@ -42,11 +44,19 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf) p25 = isize[(int)(.25 * k + .499)]; p50 = isize[(int)(.50 * k + .499)]; p75 = isize[(int)(.75 * k + .499)]; + ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); + if (k < 8) { + ksprintf(msg, "[%s] fail to infer the insert size distribution.\n", __func__); + free(isize); + r.failed = 1; + return r; + } tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); r.low = tmp > max_len? 
tmp : max_len; + if (r.low < 1) r.low = 1; r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); - fprintf(stderr, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); - fprintf(stderr, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); + ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); for (i = x = 0, r.avg = 0; i < k; ++i) if (isize[i] >= r.low && isize[i] <= r.high) r.avg += isize[i], ++x; @@ -55,14 +65,15 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf) if (isize[i] >= r.low && isize[i] <= r.high) r.std += (isize[i] - r.avg) * (isize[i] - r.avg); r.std = sqrt(r.std / x); - fprintf(stderr, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); + ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); tmp = (int)(p25 - 3. * (p75 - p25) + .499); r.low = tmp > max_len? tmp : max_len; + if (r.low < 1) r.low = 1; r.high = (int)(p75 + 3. * (p75 - p25) + .499); if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499); r.low = tmp > max_len? tmp : max_len; if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. 
+ .499); - fprintf(stderr, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); + ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); free(isize); return r; } @@ -74,9 +85,8 @@ typedef struct { } pairaux_t; extern unsigned char nst_nt4_table[256]; -static int8_t g_mat[25]; -void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a) +void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25]) { extern void seq_reverse(int len, ubyte_t *seq, int is_comp); int64_t k, beg, end; @@ -88,11 +98,13 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 if (h->is_rev == 0) { beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); + if (beg < h->k) beg = h->k; end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); a->is_rev = 1; a->flag |= 16; } else { beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); + if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg); a->is_rev = 0; } if (beg < 1) beg = 1; @@ -146,7 +158,10 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); bsw2pestat_t pes; int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0; - pes = bsw2_stat(n, hits); + int8_t g_mat[25]; + kstring_t msg; + memset(&msg, 0, sizeof(kstring_t)); + pes = bsw2_stat(n, hits, &msg); for (i = k = 0; i < 5; ++i) { for (j = 0; j < 4; ++j) g_mat[k++] 
= i == j? opt->a : -opt->b; @@ -163,11 +178,12 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b p->flag |= 1<<(6+j); } } + if (pes.failed) continue; if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit - if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1]); - if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0]); + if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); + if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); // the following enumerate all possibilities. It is tedious but necessary... if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not; bwtsw2_t *p[2]; @@ -242,5 +258,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b } } } - fprintf(stderr, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved); + ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved); + fputs(msg.s, stderr); + free(msg.s); } diff --git a/main.c b/main.c index edebe0b..8121f36 100644 --- a/main.c +++ b/main.c @@ -39,28 +39,38 @@ void bwa_print_sam_PG() int main(int argc, char *argv[]) { + int i, ret; + double t_real; + t_real = realtime(); if (argc < 2) return usage(); - if (strcmp(argv[1], "fa2pac") == 0) return bwa_fa2pac(argc-1, argv+1); - else if (strcmp(argv[1], "pac2bwt") == 0) return bwa_pac2bwt(argc-1, argv+1); - else if (strcmp(argv[1], "pac2bwtgen") == 0) return bwt_bwtgen_main(argc-1, argv+1); - else if (strcmp(argv[1], "bwtupdate") == 0) return bwa_bwtupdate(argc-1, argv+1); - else if 
(strcmp(argv[1], "bwt2sa") == 0) return bwa_bwt2sa(argc-1, argv+1); - else if (strcmp(argv[1], "index") == 0) return bwa_index(argc-1, argv+1); - else if (strcmp(argv[1], "aln") == 0) return bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "sw") == 0) return bwa_stdsw(argc-1, argv+1); - else if (strcmp(argv[1], "samse") == 0) return bwa_sai2sam_se(argc-1, argv+1); - else if (strcmp(argv[1], "sampe") == 0) return bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "pac2cspac") == 0) return bwa_pac2cspac(argc-1, argv+1); - else if (strcmp(argv[1], "stdsw") == 0) return bwa_stdsw(argc-1, argv+1); - else if (strcmp(argv[1], "bwtsw2") == 0) return bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "dbwtsw") == 0) return bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "bwasw") == 0) return bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "fastmap") == 0) return main_fastmap(argc-1, argv+1); + if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); + else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); + else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); + else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); + else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); + else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1); + else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); + else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); + else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); + else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1); + else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "dbwtsw") == 0) ret = 
bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; } - err_fflush(stdout); - err_fclose(stdout); + err_fflush(stdout); + err_fclose(stdout); + if (ret == 0) { + fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); + fprintf(stderr, "[%s] CMD:", __func__); + for (i = 0; i < argc; ++i) + fprintf(stderr, " %s", argv[i]); + fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); + } return 0; } diff --git a/utils.c b/utils.c index d47ec5c..8c1ad7e 100644 --- a/utils.c +++ b/utils.c @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include "utils.h" FILE *err_xopen_core(const char *func, const char *fn, const char *mode) @@ -146,3 +148,17 @@ int err_fclose(FILE *stream) return ret; } +double cputime() +{ + struct rusage r; + getrusage(RUSAGE_SELF, &r); + return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); +} + +double realtime() +{ + struct timeval tp; + struct timezone tzp; + gettimeofday(&tp, &tzp); + return tp.tv_sec + tp.tv_usec * 1e-6; +} diff --git a/utils.h b/utils.h index a7fecbc..b6839e9 100644 --- a/utils.h +++ b/utils.h @@ -63,6 +63,9 @@ extern "C" { int err_fflush(FILE *stream); int err_fclose(FILE *stream); + double cputime(); + double realtime(); + #ifdef __cplusplus } #endif From 56a18659b6d2caa5218a2d2f3e08f9f10ab2be0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 16:53:53 -0500 Subject: [PATCH 084/498] crash under non-typical setting (by John Marshall) --- bwape.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bwape.c b/bwape.c index 30cba12..63783c8 100644 --- a/bwape.c +++ b/bwape.c @@ -346,7 +346,8 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw 
&& (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) { // only when both ends mapped b128_t x; - int j, k, n_occ[2]; + int j, k; + long long n_occ[2]; for (j = 0; j < 2; ++j) { n_occ[j] = 0; for (k = 0; k < d->aln[j].n; ++k) From 2f3cdcd55ba97c6748cf44658ac1446e8b2583c5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 16:54:38 -0500 Subject: [PATCH 085/498] bugfix: bwa-short MT does not work (by Peter) --- bwtaln.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwtaln.c b/bwtaln.c index 08a6d5c..cb7cd71 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -188,7 +188,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { - data[j].tid = j; data[j].bwt[0] = bwt[0]; data[j].bwt[1] = bwt[1]; + data[j].tid = j; data[j].bwt = bwt; data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; pthread_create(&tid[j], &attr, worker, data + j); } From 7544aca718dc6e87d181deae3cb1c2bfec969d20 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 16:56:21 -0500 Subject: [PATCH 086/498] updated revision number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 8121f36..ad75ce0 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r79-dev" +#define PACKAGE_VERSION "0.6.0-r83-dev" #endif static int usage() From f1517b845cede5067a4f83b174ed8e11f43992d0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 18:35:04 -0500 Subject: [PATCH 087/498] updated manual --- bwa.1 | 119 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 64 insertions(+), 55 deletions(-) diff --git a/bwa.1 b/bwa.1 index 85ebf04..077dd33 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "24 October 2011" "bwa-0.6.0" 
"Bioinformatics tools" +.TH bwa 1 "12 November 2011" "bwa-0.6.0" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -20,19 +20,19 @@ BWA is a fast light-weighted tool that aligns relatively short sequences (queries) to a sequence database (targe), such as the human reference genome. It implements two different algorithms, both based on Burrows-Wheeler Transform (BWT). The first algorithm is designed for -short queries up to ~200bp with low error rate (<3%). It does gapped +short queries up to ~150bp with low error rate (<3%). It does gapped global alignment w.r.t. queries, supports paired-end reads, and is one of the fastest short read alignment algorithms to date while also visiting suboptimal hits. The second algorithm, BWA-SW, is designed for -long reads with more errors. It performs heuristic Smith-Waterman-like -alignment to find high-scoring local hits (and thus chimera). On -low-error short queries, BWA-SW is slower and less accurate than the +reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like +alignment to find high-scoring local hits and split hits. On +low-error short queries, BWA-SW is a little slower and less accurate than the first algorithm, but on long queries, it is better. .PP For both algorithms, the database file in the FASTA format must be first indexed with the .B `index' -command, which typically takes a few hours. The first algorithm is +command, which typically takes a few hours for a 3GB genome. The first algorithm is implemented via the .B `aln' command, which finds the suffix array (SA) coordinates of good hits of @@ -72,8 +72,7 @@ reimplemented by Yuta Mori. .TP .B bwtsw Algorithm implemented in BWT-SW. This method works with the whole human -genome, but it does not work with database smaller than 10MB and it is -usually slower than IS. +genome. .RE .RE @@ -260,9 +259,17 @@ Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. 
[null] .B bwasw bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N -nHspRev] [-c thresCoef] - -Align query sequences in the file. +nHspRev] [-c thresCoef] [mate.fq] + +Align query sequences in the +.I in.fq +file. When +.I mate.fq +is present, perform paired-end alignment. The paired-end mode only works +for reads from Illumina short-insert libraries. In the paired-end mode, BWA-SW +may still output split alignments but they are all marked as not properly +paired; the mate positions will not be written if the mate has multiple +local hits. .B OPTIONS: .RS @@ -413,20 +420,19 @@ subsequence contains no more than differences. .PP When gapped alignment is disabled, BWA is expected to generate the same -alignment as Eland, the Illumina alignment program. However, as BWA +alignment as Eland version 1, the Illumina alignment program. However, as BWA change `N' in the database sequence to random nucleotides, hits to these random sequences will also be counted. As a consequence, BWA may mark a unique hit as a repeat, if the random sequences happen to be identical -to the sequences which should be unqiue in the database. This random -behaviour will be avoided in future releases. +to the sequences which should be unique in the database. .PP -By default, if the best hit is no so repetitive (controlled by -R), BWA +By default, if the best hit is not highly repetitive (controlled by -R), BWA also finds all hits contains one more mismatch; otherwise, BWA finds all equally best hits only. Base quality is NOT considered in evaluating -hits. In paired-end alignment, BWA pairs all hits it found. It further -performs Smith-Waterman alignment for unmapped reads with mates mapped -to rescue mapped mates, and for high-quality anomalous pairs to fix -potential alignment errors. +hits. In the paired-end mode, BWA pairs all hits it found. 
It further +performs Smith-Waterman alignment for unmapped reads to rescue reads with a +high error rate, and for high-quality anomalous pairs to fix potential alignment +errors. .SS Estimating Insert Size Distribution .PP @@ -447,20 +453,20 @@ error output. .SS Memory Requirement .PP -With bwtsw algorithm, 2.5GB memory is required for indexing the complete +With bwtsw algorithm, 5GB memory is required for indexing the complete human genome sequences. For short reads, the -.B `aln' -command uses ~2.3GB memory and the -.B `sampe' -command uses ~3.5GB. +.B aln +command uses ~3.2GB memory and the +.B sampe +command uses ~5.4GB. .SS Speed .PP Indexing the human genome sequences takes 3 hours with bwtsw -algorithm. Indexing smaller genomes with IS or divsufsort algorithms is -several times faster, but requires more memory. +algorithm. Indexing smaller genomes with IS algorithms is +faster, but requires more memory. .PP -Speed of alignment is largely determined by the error rate of the query +The speed of alignment is largely determined by the error rate of the query sequences (r). Firstly, BWA runs much faster for near perfect hits than for hits with many differences, and it stops searching for a hit with l+2 differences if a l-difference hit is found. This means BWA will be @@ -475,36 +481,39 @@ r>0.02. Pairing is slower for shorter reads. This is mainly because shorter reads have more spurious hits and converting SA coordinates to chromosomal coordinates are very costly. -.PP -In a practical experiment, BWA is able to map 2 million 32bp reads to a -bacterial genome in several minutes, map the same amount of reads to -human X chromosome in 8-15 minutes and to the human genome in 15-25 -minutes. This result implies that the speed of BWA is insensitive to the -size of database and therefore BWA is more efficient when the database -is sufficiently large. On smaller genomes, hash based algorithms are -usually much faster. 
.SH NOTES ON LONG-READ ALIGNMENT .PP Command -.B `bwasw' -is designed for long-read alignment. The algorithm behind, BWA-SW, is -similar to BWT-SW, but does not guarantee to find all local hits due to -the heuristic acceleration. It tends to be faster and more accurate if -the resultant alignment is supported by more seeds, and therefore -BWA-SW usually performs better on long queries than on short ones. - -On 350-1000bp reads, BWA-SW is several to tens of times faster than the -existing programs. Its accuracy is comparable to SSAHA2, more accurate -than BLAT. Like BLAT, BWA-SW also finds chimera which may pose a -challenge to SSAHA2. On 10-100kbp queries where chimera detection is -important, BWA-SW is over 10X faster than BLAT while being more -sensitive. - -BWA-SW can also be used to align ~100bp reads, but it is slower than -the short-read algorithm. Its sensitivity and accuracy is lower than -SSAHA2 especially when the sequencing error rate is above 2%. This is -the trade-off of the 30X speed up in comparison to SSAHA2's -454 mode. +.B bwasw +is designed for long-read alignment. BWA-SW essentially aligns the trie +of the reference genome against the directed acyclic word graph (DAWG) of a +read to find seeds not highly repetitive in the genome, and then performs a +standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called +the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the +top Z reference suffix intervals that match the vertex. BWA-SW is more accurate +if the resultant alignment is supported by more seeds, and therefore BWA-SW +usually performs better on long queries or queries with low divergence to the +reference genome. + +BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads +mainly because it gives better gapped alignment. For paired-end reads, it is yet +to know whether BWA-short or BWA-SW yield overall better results. 
+ +.SH CHANGES IN BWA-0.6 +.PP +Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. +This feature makes it possible to integrate the forward and reverse complemented +genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff, +BWA uses more memory because it has to keep all positions and ranks in 64-bit +integers, twice larger than 32-bit integers used in the previous versions. + +The latest BWA-SW also works for paired-end reads longer than 100bp. In +comparison to BWA-short, BWA-SW tends to be more accurate for highly unique +reads and more robust to relative long INDELs and structural variants. +Nonetheless, BWA-short usually has higher power to distinguish the optimal hit +from many suboptimal hits. The choice of the mapping algorithm may depend on +the application. .SH SEE ALSO BWA website , Samtools website @@ -529,12 +538,12 @@ If you use the short-read alignment component, please cite the following paper: .PP Li H. and Durbin R. (2009) Fast and accurate short read alignment with -Burrows-Wheeler transform. Bioinformatics, 25, 1754-60. [PMID: 19451168] +Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] .PP If you use the long-read component (BWA-SW), please cite: .PP Li H. and Durbin R. (2010) Fast and accurate long-read alignment with -Burrows-Wheeler transform. Bioinformatics. [PMID: 20080505] +Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] .SH HISTORY BWA is largely influenced by BWT-SW. 
It uses source codes from BWT-SW From 770a5f2ae0fe0adf8097cb608947eceb9f279f08 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 20:04:39 -0500 Subject: [PATCH 088/498] Release BWA-0.6.0 --- NEWS | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ main.c | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index a49db00..34d7151 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,52 @@ +Release 0.5.10 and 0.6.0 (12 November, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The 0.6.0 release comes with two major changes. Firstly, the index data +structure has been changed to support genomes longer than 4GB. The forward and +reverse backward genome is now integrated in one index. This change speeds up +BWA-short by about 20% and BWA-SW by 90% with the mapping accuracy largely +unchanged. A tradeoff is BWA requires more memory, but this is the price almost +all mappers that index the genome have to pay. + +Secondly, BWA-SW in 0.6.0 now works with paired-end data. It is more accurate +for highly unique reads and more robust to long indels and structural +variations. However, BWA-short still has edges for reads with many suboptimal +hits. It is yet to know which algorithm is the best for variant calling. + +0.5.10 is a bugfix release only and is likely to be the last release in the 0.5 +branch unless I find critical bugs in future. + +Other notable changes: + + * Added the `fastmap' command that finds super-maximal exact matches. It does + not give the final alignment, but runs much faster. It can be a building + block for other alignment algorithms. [0.6.0 only] + + * Output the timing information before BWA exits. This also tells users that + the task has been finished instead of being killed or aborted. [0.6.0 only] + + * Sped up multi-threading when using many (>20) CPU cores. + + * Check I/O error. + + * Increased the maximum barcode length to 63bp. + + * Automatically choose the indexing algorithm. 
+ + * Bugfix: very rare segfault due to an uninitialized variable. The bug also + affects the placement of suboptimal alignments. The effect is very minor. + +This release involves quite a lot of tricky changes. Although it has been +tested on a few data sets, subtle bugs may be still hidden. It is *NOT* +recommended to use this release in a production pipeline. In future, however, +BWA-SW may be better when reads continue to go longer. I would encourage users +to try the 0.6 release. I would also like to hear the users' experience. Thank +you. + +(0.6.0: 12 November 2011, r85) + + + Beta Release 0.5.9 (24 January, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/main.c b/main.c index ad75ce0..3e9a6d4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r83-dev" +#define PACKAGE_VERSION "0.6.0-r85" #endif static int usage() From 9f2c77880de02331bc69a45c2a73f47e92120e93 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 20:06:37 -0500 Subject: [PATCH 089/498] remove a useless file --- bwt_gen/Makefile | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 bwt_gen/Makefile diff --git a/bwt_gen/Makefile b/bwt_gen/Makefile deleted file mode 100644 index a7fe6cf..0000000 --- a/bwt_gen/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -CC= gcc -CFLAGS= -g -Wall -O2 -m64 # comment out `-m64' for 32-bit compilation -DFLAGS= -D_FILE_OFFSET_BITS=64 -OBJS= bwt_gen.o QSufSort.o -INCLUDES= -VERSION= 0.1.0 -LIBS= -SUBDIRS= - -.SUFFIXES:.c .o - -.c.o: - $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ - -lib:libbwtgen.a - -libbwtgen.a:$(OBJS) - $(AR) -scru $@ $(OBJS) - -cleanlocal: - rm -f gmon.out *.o a.out $(PROG) *~ *.a - -clean:cleanlocal From 162f220deb731718bc5a689d5244484afb4ef764 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 20:26:52 -0500 Subject: [PATCH 090/498] updated README --- README | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README 
b/README index f398cec..13eb120 100644 --- a/README +++ b/README @@ -27,3 +27,9 @@ Incomplete list of citations (via HubMed.org): http://www.hubmed.org/references.cgi?uids=20080505 http://www.hubmed.org/references.cgi?uids=19451168 + +Related projects: + + http://pbwa.sourceforge.net/ + http://www.many-core.group.cam.ac.uk/projects/lam.shtml + http://gitorious.org/bwa-cuda From c8edcafb4403498c81f186fa793360bfaf754b03 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 12 Nov 2011 20:48:35 -0500 Subject: [PATCH 091/498] added SEAL --- README | 1 + 1 file changed, 1 insertion(+) diff --git a/README b/README index 13eb120..dd1d335 100644 --- a/README +++ b/README @@ -32,4 +32,5 @@ Related projects: http://pbwa.sourceforge.net/ http://www.many-core.group.cam.ac.uk/projects/lam.shtml + http://biodoop-seal.sourceforge.net/ http://gitorious.org/bwa-cuda From 8f89f55484c40ff7f64b9041bb6c593ced5fbbce Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 17 Nov 2011 22:13:38 -0500 Subject: [PATCH 092/498] fixed a segfault when there are too few good bases. 
--- bntseq.c | 1 + bwtsw2_aux.c | 2 +- main.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bntseq.c b/bntseq.c index 98a5a49..adcd2d7 100644 --- a/bntseq.c +++ b/bntseq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "bntseq.h" #include "main.h" #include "utils.h" diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 5d57ba8..4ac11fd 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -574,7 +574,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases - buf[x] = 0; + buf[x] = calloc(1, sizeof(bwtsw2_t)); free(seq[0]); continue; } // alignment diff --git a/main.c b/main.c index 3e9a6d4..d15d14f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r85" +#define PACKAGE_VERSION "0.6.0-r89-dev" #endif static int usage() From dc4008936c88f5d8ab28f6d730471f2848ebcb4f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 19 Nov 2011 14:52:47 -0500 Subject: [PATCH 093/498] avoid duplicated XA tags --- bwape.c | 7 +++++-- bwase.c | 22 ++++++++++------------ main.c | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/bwape.c b/bwape.c index 63783c8..204bbfe 100644 --- a/bwape.c +++ b/bwape.c @@ -395,16 +395,19 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if (opt->N_multi || opt->n_multi) { for (j = 0; j < 2; ++j) { if (p[j]->type != BWA_TYPE_NO_MATCH) { - int k; + int k, n_multi; if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? 
opt->n_multi : opt->N_multi); } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); - for (k = 0; k < p[j]->n_multi; ++k) { + for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) { int strand; bwt_multi1_t *q = p[j]->multi + k; q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len, &strand); q->strand = strand; + if (q->pos != p[j]->pos) + p[j]->multi[n_multi++] = *q; } + p[j]->n_multi = n_multi; } } } diff --git a/bwase.c b/bwase.c index 76480d4..8fa8d45 100644 --- a/bwase.c +++ b/bwase.c @@ -84,12 +84,6 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma } } s->n_multi = z; - /*// the following code removes the primary hit, but this leads to a bug in the PE mode - for (k = z = 0; k < s->n_multi; ++k) - if (s->multi[k].pos != s->sa) - s->multi[z++] = s->multi[k]; - s->n_multi = z < n_multi? z : n_multi; - */ } } @@ -141,19 +135,23 @@ void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) { - int i, j, strand; + int i, j, strand, n_multi; char str[1024]; bwt_t *bwt; // load forward SA strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); for (i = 0; i != n_seqs; ++i) { - bwa_cal_pac_pos_core(bns, bwt, &seqs[i], max_mm, fnr); - for (j = 0; j < seqs[i].n_multi; ++j) { - bwt_multi1_t *p = seqs[i].multi + j; - p->pos = bwa_sa2pos(bns, bwt, p->pos, seqs[i].len, &strand); - p->strand = strand; + bwa_seq_t *p = &seqs[i]; + bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); + for (j = n_multi = 0; j < p->n_multi; ++j) { + bwt_multi1_t *q = p->multi + j; + q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len, &strand); + q->strand = strand; + if (q->pos != p->pos) + p->multi[n_multi++] = *q; } + p->n_multi = n_multi; } bwt_destroy(bwt); } diff --git a/main.c b/main.c index d15d14f..4d4402f 100644 --- a/main.c +++ b/main.c @@ 
-4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r89-dev" +#define PACKAGE_VERSION "0.6.0-r90-dev" #endif static int usage() From 182cb2e89ca57d1d916d7a967a19414e08751e25 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 19 Nov 2011 19:38:21 -0500 Subject: [PATCH 094/498] use standard SW when no SSE2 --- Makefile | 2 +- bwtsw2_pair.c | 80 +++++++++++++++++++++++++++++++++------------------ ksw.c | 4 ++- main.c | 2 +- stdaln.c | 2 +- 5 files changed, 58 insertions(+), 32 deletions(-) diff --git a/Makefile b/Makefile index 65ce6dd..1fe8462 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CC= gcc CXX= g++ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) -DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 +DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ is.o bntseq.o bwtmisc.o bwtindex.o ksw.o stdaln.o simple_dp.o \ bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 581e19c..ad1bb3f 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -5,8 +5,12 @@ #include "bwt.h" #include "bntseq.h" #include "bwtsw2.h" -#include "ksw.h" #include "kstring.h" +#ifndef _NO_SSE2 +#include "ksw.h" +#else +#include "stdaln.h" +#endif #define MAX_INS 20000 #define MIN_RATIO 0.8 @@ -92,8 +96,6 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b int64_t k, beg, end; uint8_t *seq, *ref; int i; - ksw_query_t *q; - ksw_aux_t aux[2]; // compute the region start and end a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 if (h->is_rev == 0) { @@ -123,32 +125,54 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b for (i = 0; i < l_mseq; ++i) // on the forward strand seq[i] = nst_nt4_table[(int)mseq[i]]; } - // forward Smith-Waterman - aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; 
aux[1] = aux[0]; - q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); - ksw_sse2(q, end - beg, ref, &aux[0]); - free(q); - if (aux[0].score < opt->t) { - free(seq); - return; +#ifndef _NO_SSE2 + { + ksw_query_t *q; + ksw_aux_t aux[2]; + // forward Smith-Waterman + aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; + q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); + ksw_sse2(q, end - beg, ref, &aux[0]); + free(q); + if (aux[0].score < opt->t) { + free(seq); + return; + } + ++aux[0].qe; ++aux[0].te; + // reverse Smith-Waterman + seq_reverse(aux[0].qe, seq, 0); + seq_reverse(aux[0].te, ref, 0); + q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); + ksw_sse2(q, aux[0].te, ref, &aux[1]); + free(q); + ++aux[1].qe; ++aux[1].te; + // write output + a->G = aux[0].score; + a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; + if (a->G2 < opt->t) a->G2 = 0; + if (a->G2) a->flag |= BSW2_FLAG_TANDEM; + a->k = beg + (aux[0].te - aux[1].te); + a->len = aux[1].te; + a->beg = aux[0].qe - aux[1].qe; + a->end = aux[0].qe; + } +#else + { + AlnParam ap; + path_t path[2]; + int matrix[25]; + for (i = 0; i < 25; ++i) matrix[i] = g_mat[i]; + ap.gap_open = opt->q; ap.gap_ext = opt->r; ap.gap_end = opt->r; + ap.matrix = matrix; ap.row = 5; ap.band_width = 50; + a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2); + if (a->G < opt->t) a->G = 0; + if (a->G2 < opt->t) a->G2 = 0; + a->k = beg + path[0].i - 1; + a->len = path[1].i - path[0].i + 1; + a->beg = path[0].j - 1; + a->end = path[1].j; } - ++aux[0].qe; ++aux[0].te; - // reverse Smith-Waterman - seq_reverse(aux[0].qe, seq, 0); - seq_reverse(aux[0].te, ref, 0); - q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); - ksw_sse2(q, aux[0].te, ref, &aux[1]); - free(q); - ++aux[1].qe; ++aux[1].te; - // write output - a->G = aux[0].score; - a->G2 = aux[0].score2 > aux[1].score2? 
aux[0].score2 : aux[1].score2; - if (a->G2 < opt->t) a->G2 = 0; - if (a->G2) a->flag |= BSW2_FLAG_TANDEM; - a->k = beg + (aux[0].te - aux[1].te); - a->len = aux[1].te; - a->beg = aux[0].qe - aux[1].qe; - a->end = aux[0].qe; +#endif if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; free(seq); } diff --git a/ksw.c b/ksw.c index c2b5f9c..bd29e96 100644 --- a/ksw.c +++ b/ksw.c @@ -23,6 +23,7 @@ SOFTWARE. */ +#ifndef _NO_SSE2 #include #include #include @@ -396,4 +397,5 @@ int main(int argc, char *argv[]) kseq_destroy(ksq); gzclose(fpq); return 0; } -#endif +#endif // _KSW_MAIN +#endif // _NO_SSE2 diff --git a/main.c b/main.c index 4d4402f..10e3314 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r90-dev" +#define PACKAGE_VERSION "0.6.0-r91-dev" #endif static int usage() diff --git a/stdaln.c b/stdaln.c index 7b55b2e..eb41882 100644 --- a/stdaln.c +++ b/stdaln.c @@ -631,7 +631,7 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, score_f += of_base; if (score_f < thres) { /* no matching residue at all, 090218 */ - *path_len = 0; + if (path_len) *path_len = 0; goto end_func; } if (path == 0) goto end_func; /* skip path-filling */ From 84aa3fa696f80bf997ee24f9cb98a73c3a6e8113 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 19 Nov 2011 21:48:54 -0500 Subject: [PATCH 095/498] useless script --- pairsam.pl | 69 ------------------------------------------------------ 1 file changed, 69 deletions(-) delete mode 100755 pairsam.pl diff --git a/pairsam.pl b/pairsam.pl deleted file mode 100755 index bb901bb..0000000 --- a/pairsam.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; -use Getopt::Std; - -my %opts = (a=>700, b=>100); -getopts('a:b:', \%opts); -die("Usage: pairsam.pl [-a $opts{a}] \n") if (@ARGV < 2); - -my ($fh0, $fh1, $l0, $l1); - -open($fh0, $ARGV[0] =~ /\.gz$/? 
"gzip -dc $ARGV[0] |" : $ARGV[0]) || die; -open($fh1, $ARGV[1] =~ /\.gz$/? "gzip -dc $ARGV[1] |" : $ARGV[1]) || die; - -while ($l0 = <$fh0>) { last if $l0 !~ /^@/; print $l0 } -while ($l1 = <$fh1>) { last if $l1 !~ /^@/ } - -while (defined($l0) && defined($l1)) { - my ($r0, $r1) = &pair_line(\$l0, \$l1, $opts{a}, $opts{b}); - while ($l0 = <$fh0>) { last if $l0 !~ /^$r0/; print $l0; } - while ($l1 = <$fh1>) { last if $l1 !~ /^$r1/; print $l1; } -} - -close($fh0); close($fh1); - -sub pair_line { - my ($l0, $l1, $max_ins, $min_ins) = @_; - my @t0 = split("\t", $$l0); - my @t1 = split("\t", $$l1); - my ($n0, $n1) = ($t0[0], $t1[0]); - my ($cigar, $a0, $a1, $p0, $p1); - # length in alignment - $cigar = $t0[5]; $a0 = 0; $cigar =~ s/(\d+)[MI]/$a0 += $1/eg; - $cigar = $t1[5]; $a1 = 0; $cigar =~ s/(\d+)[MI]/$a1 += $1/eg; - # 5'-end alignment position on the read - $p0 = $t0[1] == 16? $t0[3] + $a0 : $t0[3]; - $p1 = $t1[1] == 16? $t1[3] + $a1 : $t1[3]; - # adjust mapping quality - if ($t0[2] eq $t1[2] && $t0[1]+$t1[1] == 16) { # on the same chr and forward-reverse - if (abs($p0 - $p1) <= $max_ins && abs($p0 - $p1) >= $min_ins) { # within the right insert size distribution - $t0[1] |= 2; $t1[1] |= 2; # flag as paired - if ($t0[4] < $t1[4]) { # increase mapQ - $t0[4] = $t0[4] + 10 < $t1[4]? $t0[4] + 10 : $t1[4]; - } else { - $t1[4] = $t1[4] + 10 < $t0[4]? $t1[4] + 10 : $t0[4]; - } - } - } - unless ($t0[1]&2) { # decrease mapQ if unpaired - $t0[4] = $t0[4] > 10? $t0[4] - 10 : 0; - $t1[4] = $t1[4] > 10? $t1[4] - 10 : 0; - } - # strip off /[12] - $t0[0] =~ s/\/[12]$//; $t1[0] =~ s/\/[12]$//; - # update FLAG - $t0[1] |= 0x41 | (($t1[1]&16)? 0x20 : 0) | (($t1[1]&4)? 0x8 : 0); - $t1[1] |= 0x81 | (($t0[1]&16)? 0x20 : 0) | (($t0[1]&4)? 
0x8 : 0); - # update mate positions - if ($t0[2] eq $t1[2]) { - $t0[6] = $t1[6] = '='; - $t0[8] = $p1 - $p0; $t1[8] = $p0 - $p1; - } else { $t0[6] = $t1[2]; $t1[6] = $t0[2]; } - $t0[7] = $t1[3]; $t1[7] = $t0[3]; - # print out - print join("\t", @t0); - print join("\t", @t1); - return ($n0, $n1); -} From dec584d50b3703b75d2fd734214217eba9ac1bc0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 23 Nov 2011 23:11:50 -0500 Subject: [PATCH 096/498] fixed a long-existing bug in trimming --- bwa.1 | 6 +++--- bwaseqio.c | 8 +++----- bwtaln.c | 2 +- bwtindex.c | 3 ++- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/bwa.1 b/bwa.1 index 077dd33..d3f0458 100644 --- a/bwa.1 +++ b/bwa.1 @@ -46,7 +46,7 @@ command. It works for single-end reads only. .SH COMMANDS AND OPTIONS .TP .B index -bwa index [-p prefix] [-a algoType] [-c] +bwa index [-p prefix] [-a algoType] Index database sequences in the FASTA format. @@ -54,7 +54,7 @@ Index database sequences in the FASTA format. .RS .TP 10 .B -c -Build color-space index. The input fast should be in nucleotide space. +Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x) .TP .BI -p \ STR Prefix of the output database [same as db filename] @@ -142,7 +142,7 @@ especially for short reads (~32bp). .TP .B -c Reverse query but not complement it, which is required for alignment in -the color space. +the color space. (Disabled since 0.6.x) .TP .B -N Disable iterative search. 
All hits with no more than diff --git a/bwaseqio.c b/bwaseqio.c index 600754e..90abca0 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -73,16 +73,14 @@ void seq_reverse(int len, ubyte_t *seq, int is_comp) int bwa_trim_read(int trim_qual, bwa_seq_t *p) { - int s = 0, l, max = 0, max_l = p->len - 1; + int s = 0, l, max = 0, max_l = p->len; if (trim_qual < 1 || p->qual == 0) return 0; for (l = p->len - 1; l >= BWA_MIN_RDLEN - 1; --l) { s += trim_qual - (p->qual[l] - 33); if (s < 0) break; - if (s > max) { - max = s; max_l = l; - } + if (s > max) max = s, max_l = l; } - p->clip_len = p->len = max_l + 1; + p->clip_len = p->len = max_l; return p->full_len - p->len; } diff --git a/bwtaln.c b/bwtaln.c index cb7cd71..18fa636 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -283,7 +283,7 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); fprintf(stderr, " -B INT length of barcode\n"); - fprintf(stderr, " -c input sequences are in the color space\n"); +// fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); diff --git a/bwtindex.c b/bwtindex.c index 8d40245..a7b126e 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -65,7 +65,8 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "Usage: bwa index [-a bwtsw|div|is] [-c] \n\n"); fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [is]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); - fprintf(stderr, " -c build color-space index\n\n"); +// fprintf(stderr, " -c build color-space index\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, 
while `-a is' and\n"); fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); fprintf(stderr, " according to the length of the genome.\n\n"); From 107a9870eff9b27f266fe0544301f1ccfea2a547 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 23 Nov 2011 23:18:51 -0500 Subject: [PATCH 097/498] incorrect mate pos in BWA-SW SE mode --- bwtsw2_aux.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 4ac11fd..6a283e2 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -496,7 +496,8 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks for (k = 0; k < q->n_cigar; ++k) ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); } else ksprintf(&str, "\t0\t*"); - ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? "*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); + if (!is_pe) kputs("\t*\t0\t0\t", &str); + else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); // get the sequence begin and end beg = 0; end = ks->l; if (opt->hard_clip) { From b17b6577ac636ebd9642a0d11312b83acc9eea6f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 23 Nov 2011 23:30:14 -0500 Subject: [PATCH 098/498] removed a useless bwa-sw CMD option --- bwtsw2.h | 2 +- bwtsw2_aux.c | 2 +- bwtsw2_main.c | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index c13ef9c..8354511 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -14,7 +14,7 @@ typedef struct { int a, b, q, r, t, qr, bw; int z, is, t_seeds, hard_clip; - float yita, mask_level, coef; + float mask_level, coef; int n_threads, chunk_size; } bsw2opt_t; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 6a283e2..863d23d 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -51,7 +51,7 @@ bsw2opt_t *bsw2_init_opt() o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; o->bw = 50; o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; - o->mask_level = 0.50f; o->yita = 5.5f; o->coef = 5.5f; + o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; return o; } diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 86eddd7..34d3822 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -17,7 +17,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:y:s:c:N:Hf:")) >= 0) { + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -27,7 +27,6 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'T': opt->t = atoi(optarg); break; case 't': opt->n_threads = atoi(optarg); break; case 'z': opt->z = atoi(optarg); break; - case 'y': opt->yita = atof(optarg); break; case 's': opt->is = atoi(optarg); break; case 'm': opt->mask_level = atof(optarg); break; case 'c': opt->coef = atof(optarg); break; @@ -45,7 +44,6 @@ int 
bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); -// fprintf(stderr, " -y FLOAT error recurrence coef. (4..16) [%.1f]\n", opt->yita); fprintf(stderr, "\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, "\n"); From 9f84384ed234525a3ec71d7f3eab3f6cab8a52c3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 23 Nov 2011 23:36:08 -0500 Subject: [PATCH 099/498] reordering BWA-SW command-line prompt --- bwtsw2.h | 2 +- bwtsw2_main.c | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 8354511..c156c31 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -13,7 +13,7 @@ typedef struct { int a, b, q, r, t, qr, bw; - int z, is, t_seeds, hard_clip; + int z, is, t_seeds, hard_clip, multi_2nd; float mask_level, coef; int n_threads, chunk_size; } bsw2opt_t; diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 34d3822..ff5595a 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -17,7 +17,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:")) >= 0) { + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:M")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -31,6 +31,7 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'm': opt->mask_level = atof(optarg); break; case 'c': opt->coef = atof(optarg); break; case 'N': opt->t_seeds = atoi(optarg); break; + case 'M': opt->multi_2nd = 1; break; case 'H': opt->hard_clip = 1; break; case 'f': xreopen(optarg, "w", stdout); break; } @@ -44,19 +45,20 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -r INT gap extension 
penalty [%d]\n", opt->r); - fprintf(stderr, "\n"); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, "\n"); fprintf(stderr, " -w INT band width [%d]\n", opt->bw); fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); fprintf(stderr, "\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); + fprintf(stderr, " -M mark multi-part alignments as secondary\n"); + fprintf(stderr, "\n"); fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); - fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); + fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); + fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds); - fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); - fprintf(stderr, " -H in SAM output, use hard clipping rather than soft\n"); - fprintf(stderr, " -f FILE file to output results to instead of stdout\n\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n"); fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. 
One may also\n"); From 196b50dde3a345720a55ae6d751a395525adef73 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 23 Nov 2011 23:39:59 -0500 Subject: [PATCH 100/498] optionally mark multi-part hits as secondary --- bwtsw2_aux.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 863d23d..400fcf2 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -489,7 +489,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks bsw2aux_t *q = b->aux + i; int j, beg, end, type = 0; // print mandatory fields before SEQ - ksprintf(&str, "%s\t%d", ks->name, q->flag); + ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); if (p->l == 0) { // not a repetitive hit ksprintf(&str, "\t%d\t", q->pqual); diff --git a/main.c b/main.c index 10e3314..69379ed 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r91-dev" +#define PACKAGE_VERSION "0.6.0-r97-dev" #endif static int usage() From eeedda105dbec23fc742304affdaee4c28630069 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 24 Nov 2011 01:22:02 -0500 Subject: [PATCH 101/498] perhaps improves samse/sampe --- bwt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bwt.c b/bwt.c index a19f6d8..5a07268 100644 --- a/bwt.c +++ b/bwt.c @@ -49,7 +49,10 @@ void bwt_gen_cnt_table(bwt_t *bwt) void bwt_cal_sa(bwt_t *bwt, int intv) { bwtint_t isa, sa, i; // S(isa) = sa + int intv_round = intv; + kv_roundup32(intv_round); + xassert(intv_round == intv, "SA sample interval is not a power of 2."); xassert(bwt->bwt, "bwt_t::bwt is not initialized."); if (bwt->sa) free(bwt->sa); @@ -73,8 +76,8 @@ void bwt_cal_sa(bwt_t *bwt, int intv) bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) { - bwtint_t sa = 0; - while (k % bwt->sa_intv != 0) { + bwtint_t sa = 0, mask = 
bwt->sa_intv - 1; + while (k & mask) { ++sa; k = bwt_invPsi(bwt, k); } From 717959e8199e29f6ac851b7e3e76055b502457a5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 24 Nov 2011 10:39:41 -0500 Subject: [PATCH 102/498] fixed a minor issue in trimming --- bwaseqio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwaseqio.c b/bwaseqio.c index 90abca0..e22d4cd 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -75,7 +75,7 @@ int bwa_trim_read(int trim_qual, bwa_seq_t *p) { int s = 0, l, max = 0, max_l = p->len; if (trim_qual < 1 || p->qual == 0) return 0; - for (l = p->len - 1; l >= BWA_MIN_RDLEN - 1; --l) { + for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) { s += trim_qual - (p->qual[l] - 33); if (s < 0) break; if (s > max) max = s, max_l = l; From b5170e0efa26daf1695a8c4bf37b9d2d82ecc07c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 24 Nov 2011 11:51:38 -0500 Subject: [PATCH 103/498] output the NM tag --- bwtsw2.h | 2 +- bwtsw2_aux.c | 29 ++++++++++++++++++++++++++--- main.c | 2 +- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index c156c31..89615b5 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -26,7 +26,7 @@ typedef struct { } bsw2hit_t; typedef struct { - int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize; + int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm; uint32_t *cigar; } bsw2aux_t; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 400fcf2..c3f42bd 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -165,7 +165,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, } /* generate CIGAR array(s) in b->cigar[] */ -static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b) +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b) { uint8_t *target; int i, matrix[25]; @@ -392,7 +392,28 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c return n_cigar; } 
-static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], uint8_t *pac, bwtsw2_t *b) +static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const uint8_t *pac, const uint8_t *seq) +{ + int k, x, n_mm = 0, i, n_gap = 0; + bwtint_t y; + x = 0; y = p->k; + for (k = 0; k < n_cigar; ++k) { + int op = cigar[k]&0xf; + int len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) { + int ref = pac[(y+i)>>2] >> (~(y+i)&3)*2 & 0x3; + if (seq[x + i] != ref) ++n_mm; + } + x += len; y += len; + } else if (op == 1) x += len, n_gap += len; + else if (op == 2) y += len, n_gap += len; + else if (op == 4) x += len; + } + return n_mm + n_gap; +} + +static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b) { int i; // allocate for b->aux @@ -415,6 +436,8 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 int subo; // fix out-of-boundary CIGAR q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); + // compute the NM tag + q->nm = compute_nm(p, q->n_cigar, q->cigar, pac, seq[p->is_rev]); // compute mapQ subo = p->G2 > opt->t? 
p->G2 : opt->t; if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; @@ -517,7 +540,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks } } else ksprintf(&str, "\t*"); // print optional tags - ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds); + ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm); if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); if (p->flag&BSW2_FLAG_MATESW) type |= 1; diff --git a/main.c b/main.c index 69379ed..c1d9864 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r97-dev" +#define PACKAGE_VERSION "0.6.0-r100-dev" #endif static int usage() From 150bfbdef47c109db6c8c6495df8a69c91cc3df4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 24 Nov 2011 19:15:14 -0500 Subject: [PATCH 104/498] fixed a deadlock; SMEM iterator --- bwt.c | 36 +++++++---------------- bwt.h | 10 ++++++- fastmap.c | 88 ++++++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 83 insertions(+), 51 deletions(-) diff --git a/bwt.c b/bwt.c index 5a07268..fcc141e 100644 --- a/bwt.c +++ b/bwt.c @@ -292,13 +292,17 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] > 3) break; - c = 3 - q[i]; - bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] != ik.x[2]) // change of the interval size + if (q[i] < 4) { + c = 3 - q[i]; + bwt_extend(bwt, &ik, ok, 0); + if (ok[c].x[2] != ik.x[2]) // change of the interval size + kv_push(bwtintv_t, *curr, ik); + if (ok[c].x[2] == 0) break; // cannot be extended + ik = ok[c]; ik.info = i + 1; + } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] == 0) break; // cannot be extended - ik = ok[c]; ik.info = i + 1; + break; // cannot be 
extended; in this case, in = 0; - do { - x = bwt_smem1(bwt, len, q, x, mem1, tvec); - for (i = 0; i < mem1->n; ++i) - kv_push(bwtintv_t, *mem, mem1->a[i]); - } while (x < len); - if (tmpvec[0] == 0) free(a[0].a); - if (tmpvec[1] == 0) free(a[1].a); - if (tmpvec[2] == 0) free(a[2].a); - return mem->n; -} diff --git a/bwt.h b/bwt.h index 75dd21c..5823f82 100644 --- a/bwt.h +++ b/bwt.h @@ -112,8 +112,16 @@ extern "C" { int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); + /** + * Extend bi-SA-interval _ik_ + */ void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); - int bwt_smem(const bwt_t *bwt, int len, const uint8_t *q, bwtintv_v *mem, bwtintv_v *tmpvec[3]); + + /** + * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. + * Return the end of the longest exact match starting from _x_. 
+ */ + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index 585a043..8776869 100644 --- a/fastmap.c +++ b/fastmap.c @@ -10,6 +10,49 @@ KSEQ_INIT(gzFile, gzread) extern unsigned char nst_nt4_table[256]; +typedef struct { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + bwtintv_v *tmpvec[2], *matches; +} smem_i; + +smem_i *smem_iter_init(const bwt_t *bwt) +{ + smem_i *iter; + iter = calloc(1, sizeof(smem_i)); + iter->bwt = bwt; + iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + iter->matches = calloc(1, sizeof(bwtintv_v)); + return iter; +} + +void smem_iter_destroy(smem_i *iter) +{ + free(iter->tmpvec[0]->a); + free(iter->tmpvec[1]->a); + free(iter->matches->a); + free(iter); +} + +void smem_set_query(smem_i *iter, int len, const uint8_t *query) +{ + iter->query = query; + iter->start = 0; + iter->len = len; +} + +int smem_next(smem_i *iter) +{ + iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0; + if (iter->start >= iter->len || iter->start < 0) return -1; + while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases + if (iter->start == iter->len) return -1; + iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec); + return iter->start; +} + int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17; @@ -18,7 +61,7 @@ int main_fastmap(int argc, char *argv[]) gzFile fp; bwt_t *bwt; bntseq_t *bns; - bwtintv_v a[3], mem, *tvec[3]; + smem_i *iter; while ((c = getopt(argc, argv, "w:l:")) >= 0) { switch (c) { @@ -42,38 +85,35 @@ int main_fastmap(int argc, char *argv[]) free(tmp); bns = bns_restore(argv[optind]); } - for (i = 0; i < 3; ++i) { // initiate the temporary array - kv_init(a[i]); - tvec[i] = &a[i]; - } - kv_init(mem); + iter = smem_iter_init(bwt); while 
(kseq_read(seq) >= 0) { for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - bwt_smem(bwt, seq->seq.l, (uint8_t*)seq->seq.s, &mem, tvec); printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l); - for (i = 0; i < mem.n; ++i) { - bwtintv_t *p = &mem.a[i]; - if ((uint32_t)p->info - (p->info>>32) < min_len) continue; - printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); - if (p->x[2] <= min_iwidth) { - for (k = 0; k < p->x[2]; ++k) { - bwtint_t pos; - int len, is_rev, ref_id; - len = (uint32_t)p->info - (p->info>>32); - pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); - if (is_rev) pos -= len - 1; - bns_cnt_ambi(bns, pos, len, &ref_id); - printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); - } + smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); + while (smem_next(iter) > 0) { + for (i = 0; i < iter->matches->n; ++i) { + bwtintv_t *p = &iter->matches->a[i]; + if ((uint32_t)p->info - (p->info>>32) < min_len) continue; + printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); + if (p->x[2] <= min_iwidth) { + for (k = 0; k < p->x[2]; ++k) { + bwtint_t pos; + int len, is_rev, ref_id; + len = (uint32_t)p->info - (p->info>>32); + pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + if (is_rev) pos -= len - 1; + bns_cnt_ambi(bns, pos, len, &ref_id); + printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } + } else fputs("\t*", stdout); + putchar('\n'); } - putchar('\n'); } puts("//"); } - free(mem.a); - for (i = 0; i < 3; ++i) free(a[i].a); + smem_iter_destroy(iter); bns_destroy(bns); bwt_destroy(bwt); kseq_destroy(seq); From bf65b6463a8815eed9863d925a131bba6f401911 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 24 Nov 2011 19:44:21 -0500 Subject: [PATCH 105/498] fastmap: optionally output the original query seq --- fastmap.c | 13 +++++++++---- main.c | 2 +- 2 
files changed, 10 insertions(+), 5 deletions(-) diff --git a/fastmap.c b/fastmap.c index 8776869..4d7a675 100644 --- a/fastmap.c +++ b/fastmap.c @@ -55,7 +55,7 @@ int smem_next(smem_i *iter) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -63,14 +63,15 @@ int main_fastmap(int argc, char *argv[]) bntseq_t *bns; smem_i *iter; - while ((c = getopt(argc, argv, "w:l:")) >= 0) { + while ((c = getopt(argc, argv, "w:l:s")) >= 0) { switch (c) { + case 's': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); return 1; } @@ -87,9 +88,13 @@ int main_fastmap(int argc, char *argv[]) } iter = smem_iter_init(bwt); while (kseq_read(seq) >= 0) { + printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); + if (print_seq) { + putchar('\t'); + puts(seq->seq.s); + } else putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - printf("SQ\t%s\t%ld\n", seq->name.s, seq->seq.l); smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); while (smem_next(iter) > 0) { for (i = 0; i < iter->matches->n; ++i) { diff --git a/main.c b/main.c index c1d9864..9298bc8 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r100-dev" +#define PACKAGE_VERSION "0.6.0-r102-dev" #endif static int usage() From 64e353ce49dabca10789b940d260b381293cf528 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 28 Nov 2011 09:12:21 -0500 Subject: [PATCH 106/498] bugfix: bwa-sw PE segfault in rare case --- NEWS | 33 +++++++++++++++++++++++++++++++++ bwape.c | 6 +++--- bwtsw2_pair.c | 1 + 3 files changed, 37 
insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 34d7151..69f27e0 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,36 @@ +Release 0.6.1 (25 November, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes to BWA-short: + + * Bugfix: duplicated alternative hits in the XA tag. + + * Bugfix: when trimming enabled, bwa-aln trims 1bp less. + + * Disabled the color-space alignment. 0.6.x is not working with SOLiD reads at + present. + +Notable changes to BWA-SW: + + * Bugfix: segfault due to excessive ambiguous bases. + + * Bugfix: incorrect mate position in the SE mode. + + * When macro _NO_SSE2 is in use, fall back to the standard Smith-Waterman + instead of SSE2-SW. + + * Optionally mark split hits with lower alignment scores as secondary. + +Changes to fastmap: + + * Bugfix: infinite loop caused by ambiguous bases. + + * Optionally output the query sequence. + +(0.6.1: 25 November 2011, r103) + + + Release 0.5.10 and 0.6.0 (12 November, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwape.c b/bwape.c index 204bbfe..e2c4c96 100644 --- a/bwape.c +++ b/bwape.c @@ -437,7 +437,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; - int path_len, ret; + int path_len, ret, subo; AlnParam ap = aln_param_bwa; path_t *path, *p; @@ -454,8 +454,8 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u path = (path_t*)calloc(l+len, sizeof(path_t)); // do alignment - ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, 0); - if (ret < 0) { + ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo); + if (ret < 0 || subo == ret) { // no hit or tandem hits free(path); free(cigar); free(ref_seq); *n_cigar = 0; return 0; } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index ad1bb3f..5e1ec7c 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -111,6 +111,7 @@ void bsw2_pair1(const bsw2opt_t 
*opt, int64_t l_pac, const uint8_t *pac, const b } if (beg < 1) beg = 1; if (end > l_pac) end = l_pac; + if (end - beg < l_mseq) return; // generate the sequence seq = malloc(l_mseq + (end - beg)); ref = seq + l_mseq; From 91a4a0c8ea51fc42fc0f6347db2c77c5c5aeff28 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 28 Nov 2011 09:52:07 -0500 Subject: [PATCH 107/498] Release bwa-0.6.1 --- NEWS | 6 ++++-- bwa.1 | 2 +- main.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 69f27e0..5cced2b 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Release 0.6.1 (25 November, 2011) +Release 0.6.1 (28 November, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Notable changes to BWA-short: @@ -16,6 +16,8 @@ Notable changes to BWA-SW: * Bugfix: incorrect mate position in the SE mode. + * Bugfix: rare segfault in the PE mode + * When macro _NO_SSE2 is in use, fall back to the standard Smith-Waterman instead of SSE2-SW. @@ -27,7 +29,7 @@ Changes to fastmap: * Optionally output the query sequence. 
-(0.6.1: 25 November 2011, r103) +(0.6.1: 28 November 2011, r104) diff --git a/bwa.1 b/bwa.1 index d3f0458..caa60cb 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "12 November 2011" "bwa-0.6.0" "Bioinformatics tools" +.TH bwa 1 "28 November 2011" "bwa-0.6.1" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index 9298bc8..7606a29 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.0-r102-dev" +#define PACKAGE_VERSION "0.6.1-r104" #endif static int usage() From a471f1918ba52a27987bc9a4e76db25369c7aa95 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 19 Mar 2012 13:45:09 -0400 Subject: [PATCH 108/498] bugfix: long-existing out-of-boundary bug --- bwase.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwase.c b/bwase.c index 8fa8d45..f05fff9 100644 --- a/bwase.c +++ b/bwase.c @@ -245,7 +245,7 @@ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_ } } } else { // no gaps - for (z = u = 0; z < (bwtint_t)len; ++z) { + for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) { c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { ksprintf(str, "%d", u); From bdc953cad9ff619dd6d316e35416a6dba77385ca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 29 Mar 2012 12:22:51 -0400 Subject: [PATCH 109/498] Tim's suggestion suffix file name with .64 --- bwape.c | 12 ++++++++++-- bwase.c | 9 ++++++++- bwtaln.c | 34 ++++++++++++++++++++++++++++++++-- bwtindex.c | 6 +++++- bwtsw2_main.c | 13 +++++++++---- main.c | 2 +- 6 files changed, 65 insertions(+), 11 deletions(-) diff --git a/bwape.c b/bwape.c index e2c4c96..779670f 100644 --- a/bwape.c +++ b/bwape.c @@ -766,8 +766,11 @@ int bwa_sai2sam_pe(int argc, char *argv[]) { extern char *bwa_rg_line, *bwa_rg_id; extern int bwa_set_rg(const char *s); + extern char *bwa_infer_prefix(const char *hint); int c; pe_opt_t *popt; + 
char *prefix; + popt = bwa_init_pe_opt(); while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { switch (c) { @@ -809,8 +812,13 @@ int bwa_sai2sam_pe(int argc, char *argv[]) fprintf(stderr, "\n"); return 1; } - bwa_sai2sam_pe_core(argv[optind], argv + optind + 1, argv + optind+3, popt); - free(bwa_rg_line); free(bwa_rg_id); + if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(bwa_rg_line); free(bwa_rg_id); + return 0; + } + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt); + free(bwa_rg_line); free(bwa_rg_id); free(prefix); free(popt); return 0; } diff --git a/bwase.c b/bwase.c index f05fff9..e2754cd 100644 --- a/bwase.c +++ b/bwase.c @@ -650,7 +650,9 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f int bwa_sai2sam_se(int argc, char *argv[]) { + extern char *bwa_infer_prefix(const char *hint); int c, n_occ = 3; + char *prefix; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { switch (c) { case 'h': break; @@ -670,7 +672,12 @@ int bwa_sai2sam_se(int argc, char *argv[]) fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); return 1; } - bwa_sai2sam_se_core(argv[optind], argv[optind+1], argv[optind+2], n_occ); + if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(bwa_rg_line); free(bwa_rg_id); + return 0; + } + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ); free(bwa_rg_line); free(bwa_rg_id); return 0; } diff --git a/bwtaln.c b/bwtaln.c index 18fa636..9db63c8 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -219,10 +219,35 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) bwa_seq_close(ks); } +char *bwa_infer_prefix(const char *hint) +{ + char *prefix; + int l_hint; + FILE *fp; + l_hint = strlen(hint); + prefix = malloc(l_hint + 3 + 4 + 1); + strcpy(prefix, hint); + strcpy(prefix + l_hint, 
".bwt"); + if ((fp = fopen(prefix, "rb")) != 0) { + prefix[l_hint] = 0; + return prefix; + } else { + strcpy(prefix + l_hint, ".64.bwt"); + if ((fp = fopen(prefix, "rb")) == 0) { + free(prefix); + return 0; + } else { + prefix[l_hint + 3] = 0; + return prefix; + } + } +} + int bwa_aln(int argc, char *argv[]) { int c, opte = -1; gap_opt_t *opt; + char *prefix; opt = gap_init_opt(); while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { @@ -303,8 +328,13 @@ int bwa_aln(int argc, char *argv[]) k = l; } } - bwa_aln_core(argv[optind], argv[optind+1], opt); - free(opt); + if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + free(opt); + return 0; + } + bwa_aln_core(prefix, argv[optind+1], opt); + free(opt); free(prefix); return 0; } diff --git a/bwtindex.c b/bwtindex.c index a7b126e..eef3def 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -72,7 +72,11 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, " according to the length of the genome.\n\n"); return 1; } - if (prefix == 0) prefix = strdup(argv[optind]); + if (prefix == 0) { + prefix = malloc(strlen(argv[optind]) + 4); + strcpy(prefix, argv[optind]); + strcat(prefix, ".64"); + } str = (char*)calloc(strlen(prefix) + 10, 1); str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); diff --git a/bwtsw2_main.c b/bwtsw2_main.c index ff5595a..dbd5d8d 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -9,9 +9,10 @@ int bwa_bwtsw2(int argc, char *argv[]) { + extern char *bwa_infer_prefix(const char *hint); bsw2opt_t *opt; bwt_t *target; - char buf[1024]; + char buf[1024], *prefix; bntseq_t *bns; int c; @@ -72,9 +73,13 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - strcpy(buf, argv[optind]); target = bwt_restore_bwt(strcat(buf, ".bwt")); - strcpy(buf, argv[optind]); bwt_restore_sa(strcat(buf, ".sa"), target); - bns = bns_restore(argv[optind]); + if ((prefix = 
bwa_infer_prefix(argv[optind])) == 0) { + fprintf(stderr, "[%s] fail to locate the index\n", __func__); + return 0; + } + strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt")); + strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target); + bns = bns_restore(prefix); bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0); diff --git a/main.c b/main.c index 7606a29..e4eaf66 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.1-r104" +#define PACKAGE_VERSION "0.6.1-r106-master" #endif static int usage() From c875085b2b5eb24141938fef8fcf82b9fe118cca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 29 Mar 2012 12:31:01 -0400 Subject: [PATCH 110/498] do not use .64 suffix by default --- bwtindex.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bwtindex.c b/bwtindex.c index eef3def..938e982 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -42,11 +42,11 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev); int bwa_index(int argc, char *argv[]) { char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_color = 0; + int c, algo_type = 0, is_color = 0, is_64 = 0; clock_t t; int64_t l_pac; - while ((c = getopt(argc, argv, "ca:p:")) >= 0) { + while ((c = getopt(argc, argv, "6ca:p:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; @@ -56,15 +56,17 @@ int bwa_index(int argc, char *argv[]) break; case 'p': prefix = strdup(optarg); break; case 'c': is_color = 1; break; + case '6': is_64 = 1; break; default: return 1; } } if (optind + 1 > argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa index [-a bwtsw|div|is] [-c] \n\n"); - fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [is]\n"); + fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] \n\n"); + fprintf(stderr, "Options: -a STR BWT construction algorithm: 
bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); + fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); // fprintf(stderr, " -c build color-space index\n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); @@ -75,7 +77,7 @@ int bwa_index(int argc, char *argv[]) if (prefix == 0) { prefix = malloc(strlen(argv[optind]) + 4); strcpy(prefix, argv[optind]); - strcat(prefix, ".64"); + if (is_64) strcat(prefix, ".64"); } str = (char*)calloc(strlen(prefix) + 10, 1); str2 = (char*)calloc(strlen(prefix) + 10, 1); From d33abf127f2f5cbe4a3129d10b8f85a501a21857 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 29 Mar 2012 14:45:00 -0400 Subject: [PATCH 111/498] check *.64.bwt first and then *.bwt --- bwtaln.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bwtaln.c b/bwtaln.c index 9db63c8..014734c 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -227,17 +227,19 @@ char *bwa_infer_prefix(const char *hint) l_hint = strlen(hint); prefix = malloc(l_hint + 3 + 4 + 1); strcpy(prefix, hint); - strcpy(prefix + l_hint, ".bwt"); + strcpy(prefix + l_hint, ".64.bwt"); if ((fp = fopen(prefix, "rb")) != 0) { - prefix[l_hint] = 0; + fclose(fp); + prefix[l_hint + 3] = 0; return prefix; } else { - strcpy(prefix + l_hint, ".64.bwt"); + strcpy(prefix + l_hint, ".bwt"); if ((fp = fopen(prefix, "rb")) == 0) { free(prefix); return 0; } else { - prefix[l_hint + 3] = 0; + fclose(fp); + prefix[l_hint] = 0; return prefix; } } From 173b93dfd4f8e89117b9bd6406a2290ba8ffaa46 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 2 Apr 2012 09:37:02 -0400 Subject: [PATCH 112/498] debugging code: bwasw has a rare bug --- bwtsw2_aux.c | 20 ++++++++++++++++---- bwtsw2_main.c | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index c3f42bd..f7eec31 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -165,7 +165,7 @@ void 
bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, } /* generate CIGAR array(s) in b->cigar[] */ -static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b) +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) { uint8_t *target; int i, matrix[25]; @@ -192,6 +192,18 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); + if (name && score != p->G) { // debugging only + int j, glen = 0; + for (j = 0; j < q->n_cigar; ++j) + if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) + glen += q->cigar[j]>>4; + fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen) = (%d, %d, %d, %d)\n", __func__, name, score, p->G, lq, end - beg, p->len, glen); + fprintf(stderr, "%d, %d\n", ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, path_len); + if (p->G - score > 100) { + for (j = 0; j < p->len; ++j) fputc("ACGTN"[target[j]], stderr); fputc('\n', stderr); + for (j = 0; j < end - beg; ++j) fputc("ACGTN"[query[j]], stderr); fputc('\n', stderr); + } + } if (beg != 0 || end < lq) { // write soft clipping q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { @@ -413,7 +425,7 @@ static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const ui return n_mm + n_gap; } -static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b) +static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) { int i; // allocate for b->aux @@ -424,7 +436,7 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 } b->aux = calloc(b->n, 
sizeof(bsw2aux_t)); // generate CIGAR - gen_cigar(opt, qlen, seq, pac, b); + gen_cigar(opt, qlen, seq, pac, b, name); // fix CIGAR, generate mapQ, and write chromosomal position for (i = 0; i < b->n; ++i) { bsw2hit_t *p = &b->hits[i]; @@ -637,7 +649,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t seq[0][i] = c; seq[1][p->l-1-i] = 3 - c; } - write_aux(&opt, bns, p->l, seq, pac, buf[x]); + write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name); free(seq[0]); } for (x = 0; x < _seq->n; ++x) { diff --git a/bwtsw2_main.c b/bwtsw2_main.c index dbd5d8d..041e8ae 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -85,7 +85,7 @@ int bwa_bwtsw2(int argc, char *argv[]) bns_destroy(bns); bwt_destroy(target); - free(opt); + free(opt); free(prefix); return 0; } From d3169804f637088ce0e7a867d48b7a2f60005836 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 2 Apr 2012 10:45:42 -0400 Subject: [PATCH 113/498] debugging; bug persistant --- bwtsw2_aux.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index f7eec31..815b9b0 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -1,6 +1,7 @@ #include #include #include +#include #ifdef HAVE_CONFIG_H #include "config.h" #endif @@ -45,6 +46,8 @@ unsigned char nt_comp_table[256] = { extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); +static int64_t g_l_pac = 0; + bsw2opt_t *bsw2_init_opt() { bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); @@ -188,6 +191,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 
1 : 0] + beg; + assert(p->k + p->len <= g_l_pac); for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); @@ -198,10 +202,12 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) glen += q->cigar[j]>>4; fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen) = (%d, %d, %d, %d)\n", __func__, name, score, p->G, lq, end - beg, p->len, glen); - fprintf(stderr, "%d, %d\n", ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, path_len); if (p->G - score > 100) { - for (j = 0; j < p->len; ++j) fputc("ACGTN"[target[j]], stderr); fputc('\n', stderr); - for (j = 0; j < end - beg; ++j) fputc("ACGTN"[query[j]], stderr); fputc('\n', stderr); + char *t; + t = malloc((p->len > end - beg? p->len : end - beg) + 2); + for (j = 0; j < p->len; ++j) t[j] = "ACGTN"[target[j]]; t[j++] = '\n'; t[j] = 0; fputs(t, stderr); + for (j = 0; j < end - beg; ++j) t[j] = "ACGTN"[query[j]]; t[j++] = '\n'; t[j] = 0; fputs(t, stderr); + free(t); } } if (beg != 0 || end < lq) { // write soft clipping @@ -760,6 +766,7 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c uint8_t *pac; bsw2seq_t *_seq; + g_l_pac = bns->l_pac; pac = calloc(bns->l_pac/4+1, 1); if (pac == 0) { fprintf(stderr, "[bsw2_aln] insufficient memory!\n"); From 36f2fd6238ca9daf61aaa55f1457a992ff1ba4d2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 2 Apr 2012 11:39:40 -0400 Subject: [PATCH 114/498] bugfix: incorrect bandwidth --- bwtsw2_aux.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 815b9b0..0f3a0f6 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -1,7 +1,6 @@ #include #include #include -#include #ifdef HAVE_CONFIG_H #include "config.h" #endif 
@@ -46,8 +45,6 @@ unsigned char nt_comp_table[256] = { extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); -static int64_t g_l_pac = 0; - bsw2opt_t *bsw2_init_opt() { bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); @@ -191,7 +188,6 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 1 : 0] + beg; - assert(p->k + p->len <= g_l_pac); for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); @@ -201,14 +197,8 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 for (j = 0; j < q->n_cigar; ++j) if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) glen += q->cigar[j]>>4; - fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen) = (%d, %d, %d, %d)\n", __func__, name, score, p->G, lq, end - beg, p->len, glen); - if (p->G - score > 100) { - char *t; - t = malloc((p->len > end - beg? 
p->len : end - beg) + 2); - for (j = 0; j < p->len; ++j) t[j] = "ACGTN"[target[j]]; t[j++] = '\n'; t[j] = 0; fputs(t, stderr); - for (j = 0; j < end - beg; ++j) t[j] = "ACGTN"[query[j]]; t[j++] = '\n'; t[j] = 0; fputs(t, stderr); - free(t); - } + fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n", + __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); } if (beg != 0 || end < lq) { // write soft clipping q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); @@ -572,12 +562,27 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks free(ks->name); ks->name = 0; } +static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen) +{ + double ll = log(qlen); + int i, k; + *dst = *src; + dst->t = src->t; + if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499); + // set band width: the query length sets a boundary on the maximum band width + k = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a); + i = (qlen * dst->a - dst->a - dst->t) / dst->r; + if (k > i) k = i; + if (k < 1) k = 1; // I do not know if k==0 causes troubles + dst->bw = src->bw < k? src->bw : k; +} + /* Core routine to align reads in _seq. 
It is separated from * process_seqs() to realize multi-threading */ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) { int x; - bsw2opt_t opt = *_opt; + bsw2opt_t opt; bsw2global_t *pool = bsw2_global_init(); bwtsw2_t **buf; buf = calloc(_seq->n, sizeof(void*)); @@ -587,21 +592,12 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t int i, l, k; bwtsw2_t *b[2]; l = p->l; - // set opt->t - opt.t = _opt->t; - if (opt.t < log(l) * opt.coef) opt.t = (int)(log(l) * opt.coef + .499); + update_opt(&opt, _opt, p->l); if (pool->max_l < l) { // then enlarge working space for aln_extend_core() int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; pool->max_l = l; pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); } - // set opt->bw - opt.bw = _opt->bw; - k = (l * opt.a - 2 * opt.q) / (2 * opt.r + opt.a); - i = (l * opt.a - opt.a - opt.t) / opt.r; - if (k > i) k = i; - if (k < 1) k = 1; // I do not know if k==0 causes troubles - opt.bw = _opt->bw < k? 
_opt->bw : k; // set seq[2] and rseq[2] seq[0] = calloc(l * 4, 1); seq[1] = seq[0] + l; @@ -655,6 +651,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t seq[0][i] = c; seq[1][p->l-1-i] = 3 - c; } + update_opt(&opt, _opt, p->l); write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name); free(seq[0]); } @@ -766,7 +763,6 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c uint8_t *pac; bsw2seq_t *_seq; - g_l_pac = bns->l_pac; pac = calloc(bns->l_pac/4+1, 1); if (pac == 0) { fprintf(stderr, "[bsw2_aln] insufficient memory!\n"); From 790df95e1af8cbb25be8905fe1562961402af0b3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 2 Apr 2012 11:43:32 -0400 Subject: [PATCH 115/498] updated revision number --- bwtsw2_aux.c | 1 - main.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 0f3a0f6..d33c206 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -567,7 +567,6 @@ static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen) double ll = log(qlen); int i, k; *dst = *src; - dst->t = src->t; if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499); // set band width: the query length sets a boundary on the maximum band width k = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a); diff --git a/main.c b/main.c index e4eaf66..bd9c926 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.1-r106-master" +#define PACKAGE_VERSION "0.6.1-r112-master" #endif static int usage() From cff473393c7e8e37ee2b5a0789bbe10742a88e75 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 2 Apr 2012 12:05:15 -0400 Subject: [PATCH 116/498] commented out debugging code --- bwtsw2_aux.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index d33c206..2cce142 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -192,6 +192,7 @@ static void gen_cigar(const bsw2opt_t *opt, int 
lq, uint8_t *seq[2], const uint8 target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); +#if 0 if (name && score != p->G) { // debugging only int j, glen = 0; for (j = 0; j < q->n_cigar; ++j) @@ -200,6 +201,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n", __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); } +#endif if (beg != 0 || end < lq) { // write soft clipping q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { From a1abfe9977f5d193064518d5c8ddcd3d1e81dcb0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 7 Apr 2012 00:23:01 -0400 Subject: [PATCH 117/498] API: aln seems working --- Makefile | 25 ++++++----- bwa.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 44 +++++++++++++++++++ bwtaln.c | 2 +- bwtgap.c | 17 ++++---- bwtgap.h | 1 + 6 files changed, 195 insertions(+), 20 deletions(-) create mode 100644 bwa.c create mode 100644 bwa.h diff --git a/Makefile b/Makefile index 1fe8462..b39bd27 100644 --- a/Makefile +++ b/Makefile @@ -2,16 +2,19 @@ CC= gcc CXX= g++ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) +AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ - is.o bntseq.o bwtmisc.o bwtindex.o ksw.o stdaln.o simple_dp.o \ - bwaseqio.o bwase.o bwape.o kstring.o cs2nt.o \ +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \ + bwaseqio.o +AOBJS= QSufSort.o bwt_gen.o \ + is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ + bwase.o bwape.o kstring.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o bamlite.o fastmap.o bwtsw2_pair.o + bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= LIBS= 
-lm -lz -lpthread -SUBDIRS= . bwt_gen +SUBDIRS= . .SUFFIXES:.c .o .cc @@ -22,19 +25,21 @@ SUBDIRS= . bwt_gen all:$(PROG) -bwa:$(OBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(OBJS) main.o -o $@ $(LIBS) +bwa:libbwa.a $(AOBJS) main.o + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + +libbwa.a:$(LOBJS) + $(AR) -csru $@ $(LOBJS) + +bwa.o:bwa.h QSufSort.o:QSufSort.h bwt.o:bwt.h bwtio.o:bwt.h bwtaln.o:bwt.h bwtaln.h kseq.h -bwt1away.o:bwt.h bwtaln.h -bwt2fmv.o:bwt.h bntseq.o:bntseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h -fastmap:bwt.h bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h diff --git a/bwa.c b/bwa.c new file mode 100644 index 0000000..ab0afea --- /dev/null +++ b/bwa.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include "bwa.h" +#include "bwt.h" +#include "bwtgap.h" +#include "bntseq.h" + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +extern unsigned char nst_nt4_table[256]; + +bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 }; + +struct bwa_idx_t { + bwt_t *bwt; + bntseq_t *bns; + uint8_t *pac; +}; + +struct bwa_aux_t { + int max_buf; + gap_stack_t *stack; + gap_opt_t *opt; + int *diff_tab; + uint8_t *buf; +}; + +bwa_idx_t *bwa_idx_load(const char *prefix) +{ + bwa_idx_t *p; + int l; + char *str; + l = strlen(prefix); + p = calloc(1, sizeof(bwa_idx_t)); + str = malloc(l + 10); + strcpy(str, prefix); + p->bns = bns_restore(str); + strcpy(str + l, ".bwt"); + p->bwt = bwt_restore_bwt(str); + str[l] = 0; + strcpy(str + l, ".sa"); + bwt_restore_sa(str, p->bwt); + free(str); + p->pac = calloc(p->bns->l_pac/4+1, 1); + fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); + fclose(p->bns->fp_pac); + p->bns->fp_pac = 0; + return p; +} + +void bwa_idx_destroy(bwa_idx_t *p) +{ + bns_destroy(p->bns); + bwt_destroy(p->bwt); + free(p->pac); + free(p); +} + +bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score) 
+{ + extern gap_opt_t *gap_init_opt(void); + extern int bwa_cal_maxdiff(int l, double err, double thres); + int i; + bwa_aux_t *p; + p = malloc(sizeof(bwa_aux_t)); + p->stack = gap_init_stack2(max_score); + p->opt = gap_init_opt(); + p->opt->s_gapo = opt->s_gapo; + p->opt->s_gape = opt->s_gape; + p->opt->max_diff = opt->max_diff; + p->opt->max_gapo = opt->max_gapo; + p->opt->max_gape = opt->max_gape; + p->opt->seed_len = opt->seed_len; + p->opt->max_seed_diff = opt->max_seed_diff; + p->opt->fnr = opt->fnr; + p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int)); + for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) + p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + return p; +} + +void bwa_aux_destroy(bwa_aux_t *p) +{ + gap_destroy_stack(p->stack); + free(p->diff_tab); free(p->opt); + free(p); +} + +bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln) +{ + extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); + extern void seq_reverse(int len, uint8_t *seq, int is_comp); + int i, seq_len, buf_len; + bwt_width_t *w, *seed_w; + uint8_t *s; + gap_opt_t opt2 = *aux->opt; + + seq_len = strlen(seq); + // estimate the buffer length + buf_len = (aux->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; + if (buf_len > aux->max_buf) { + aux->max_buf = buf_len; + kroundup32(aux->max_buf); + aux->buf = realloc(aux->buf, aux->max_buf); + } + memset(aux->buf, 0, buf_len); + seed_w = (bwt_width_t*)aux->buf; + w = seed_w + aux->opt->seed_len; + s = (uint8_t*)(w + seq_len + 1); + if (opt2.fnr > 0.) 
opt2.max_diff = aux->diff_tab[seq_len]; + // copy the sequence + for (i = 0; i < seq_len; ++i) + s[i] = nst_nt4_table[(int)seq[i]]; + seq_reverse(seq_len, s, 0); + // mapping + bwt_cal_width(idx->bwt, seq_len, s, w); + if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; + if (seq_len > aux->opt->seed_len) + bwt_cal_width(idx->bwt, aux->opt->seed_len, s + (seq_len - aux->opt->seed_len), seed_w); + for (i = 0; i < seq_len; ++i) // complement; I forgot why... + s[i] = s[i] > 3? 4 : 3 - s[i]; + return (bwa_alnpre_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= aux->opt->seed_len? 0 : seed_w, &opt2, n_aln, aux->stack); +} diff --git a/bwa.h b/bwa.h new file mode 100644 index 0000000..a69f9d2 --- /dev/null +++ b/bwa.h @@ -0,0 +1,44 @@ +#ifndef BWA_H_ +#define BWA_H_ + +#include + +#define BWA_DEF_MAX_SCORE 2048 +#define BWA_MAX_QUERY_LEN 1024 + +struct bwa_idx_t; +typedef struct bwa_idx_t bwa_idx_t; + +struct bwa_aux_t; +typedef struct bwa_aux_t bwa_aux_t; + +typedef struct { + int s_gapo, s_gape; // the mismatch penalty is fixed at 3 + int max_diff, max_gapo, max_gape; + int seed_len, max_seed_diff; + float fnr; +} bwa_opt_t; + +typedef struct { + uint32_t n_mm:16, n_gapo:8, n_gape:8; + int score; + uint64_t k, l; +} bwa_alnpre_t; + +extern bwa_opt_t bwa_def_opt; + +#ifdef __cplusplus +extern "C" { +#endif + + bwa_idx_t *bwa_idx_load(const char *prefix); + void bwa_idx_destroy(bwa_idx_t *p); + bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score); + void bwa_aux_destroy(bwa_aux_t *p); + bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwtaln.c b/bwtaln.c index 014734c..efc7f66 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -49,7 +49,7 @@ int bwa_cal_maxdiff(int l, double err, double thres) } // width must be filled as zero -static int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) +int bwt_cal_width(const bwt_t *bwt, int len, 
const ubyte_t *str, bwt_width_t *width) { bwtint_t k, l, ok, ol; int i, bid; diff --git a/bwtgap.c b/bwtgap.c index c996f9f..364717c 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -10,21 +10,20 @@ #define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) -gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +gap_stack_t *gap_init_stack2(int max_score) { - int i; gap_stack_t *stack; stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); - stack->n_stacks = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt); + stack->n_stacks = max_score; stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); - for (i = 0; i != stack->n_stacks; ++i) { - gap_stack1_t *p = stack->stacks + i; - p->m_entries = 4; - p->stack = (gap_entry_t*)calloc(p->m_entries, sizeof(gap_entry_t)); - } return stack; } +gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) +{ + return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); +} + void gap_destroy_stack(gap_stack_t *stack) { int i; @@ -51,7 +50,7 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, i score = aln_score(n_mm, n_gapo, n_gape, opt); q = stack->stacks + score; if (q->n_entries == q->m_entries) { - q->m_entries <<= 1; + q->m_entries = q->m_entries? 
q->m_entries<<1 : 4; q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; diff --git a/bwtgap.h b/bwtgap.h index 01ee359..8398762 100644 --- a/bwtgap.h +++ b/bwtgap.h @@ -25,6 +25,7 @@ typedef struct { extern "C" { #endif + gap_stack_t *gap_init_stack2(int max_score); gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); void gap_destroy_stack(gap_stack_t *stack); bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, From 66154ff5d2021bd7b1610bd26552399df9f212e5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 7 Apr 2012 01:25:39 -0400 Subject: [PATCH 118/498] towards refined gapped; unfinished --- bwa.c | 22 ++++++++++++++++++++++ bwa.h | 7 +++++++ bwase.c | 14 +++++++------- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/bwa.c b/bwa.c index ab0afea..3c6fbfd 100644 --- a/bwa.c +++ b/bwa.c @@ -124,3 +124,25 @@ bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, s[i] = s[i] > 3? 4 : 3 - s[i]; return (bwa_alnpre_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= aux->opt->seed_len? 
0 : seed_w, &opt2, n_aln, aux->stack); } +/* +bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps) +{ + extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); + extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); + int strand, seq_len, n_cigar; + uint64_t pos; + uint8_t *s[2]; + bwa_aln_t aln; + bwa_cigar_t *cigar16; + + seq_len = strlen(seq); + if (seq_len<<1 > aux->max_buf) { + aux->max_buf = seq_len<<1; + kroundup32(aux->max_buf); + aux->buf = realloc(aux->buf, aux->max_buf); + } + pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); + cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, len, s[strand], &pos, strand? n_gaps : -n_gaps, &n_cigar, 1); + return aln; +} +*/ diff --git a/bwa.h b/bwa.h index a69f9d2..01eddcf 100644 --- a/bwa.h +++ b/bwa.h @@ -25,6 +25,13 @@ typedef struct { uint64_t k, l; } bwa_alnpre_t; +typedef struct { + uint32_t n_cigar:15, gap:8, mm:8, strand:1; + uint32_t ref_id; + uint64_t offset; + uint32_t *cigar; +} bwa_aln_t; + extern bwa_opt_t bwa_def_opt; #ifdef __cplusplus diff --git a/bwase.c b/bwase.c index e2754cd..c88cae5 100644 --- a/bwase.c +++ b/bwase.c @@ -110,8 +110,8 @@ bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int l pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f *strand = !is_rev; /* NB: For gapped alignment, pacpos may not be correct, which will be fixed - * in refine_gapped_core(). This line also determines the way "x" is - * calculated in refine_gapped_core() when (ext < 0 && is_end == 0). */ + * in bwa_refine_gapped_core(). This line also determines the way "x" is + * calculated in bwa_refine_gapped_core() when (ext < 0 && is_end == 0). */ if (is_rev) pos_f = pos_f + 1 < len? 
0 : pos_f - len + 1; // mapped to the forward strand return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } @@ -160,7 +160,7 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se * forward strand. This happens when p->pos is calculated by * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct * coordinate. This happens only for color-converted alignment. */ -static bwa_cigar_t *refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct) { bwa_cigar_t *cigar = 0; @@ -320,12 +320,12 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap == 0) continue; - q->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, (q->strand? 1 : -1) * q->gap, &n_cigar, 1); q->n_cigar = n_cigar; } if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); } @@ -338,13 +338,13 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t int n_cigar; if (q->gap == 0) continue; free(q->cigar); - q->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, + q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, (q->strand? 
1 : -1) * q->gap, &n_cigar, 0); q->n_cigar = n_cigar; } if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again free(s->cigar); - s->cigar = refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, + s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); } } From 1cef219667123e0e06688fb9142fe9877b976848 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 7 Apr 2012 22:00:03 -0400 Subject: [PATCH 119/498] compute CIGAR; rev seq not working --- Makefile | 4 ++-- bwa.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++------- bwa.h | 9 +++++--- bwase.c | 6 ++--- 4 files changed, 70 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index b39bd27..6f388f2 100644 --- a/Makefile +++ b/Makefile @@ -5,10 +5,10 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \ - bwaseqio.o + bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ - bwase.o bwape.o kstring.o cs2nt.o \ + bwape.o cs2nt.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwa.c b/bwa.c index 3c6fbfd..6f3dbd4 100644 --- a/bwa.c +++ b/bwa.c @@ -11,6 +11,7 @@ #endif extern unsigned char nst_nt4_table[256]; +extern void seq_reverse(int len, uint8_t *seq, int is_comp); bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 }; @@ -92,7 +93,6 @@ void bwa_aux_destroy(bwa_aux_t *p) bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln) { extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); - extern void seq_reverse(int len, uint8_t *seq, int is_comp); int i, seq_len, buf_len; bwt_width_t *w, *seed_w; uint8_t *s; @@ -124,25 +124,76 @@ bwa_alnpre_t 
*bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, s[i] = s[i] > 3? 4 : 3 - s[i]; return (bwa_alnpre_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= aux->opt->seed_len? 0 : seed_w, &opt2, n_aln, aux->stack); } -/* + +static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps) +{ + uint64_t x = pos, z; + int k, y = 0; + *n_mm = *n_gaps = 0; + for (k = 0; k < n_cigar; ++k) { + int l = cigar[k]>>4; + int op = cigar[k]&0xf; + if (op == 0) { // match/mismatch + for (z = 0; z < l && x + z < l_pac; ++z) { + int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; + if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm); + } + } + if (op == 1 || op == 2) (*n_gaps) += l; + if (op == 0 || op == 2) x += l; + if (op == 0 || op == 1 || op == 4) y += l; + } +} + bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps) { extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); - int strand, seq_len, n_cigar; - uint64_t pos; + int strand, seq_len, i, n_gap, n_mm; + uint64_t pos3; uint8_t *s[2]; bwa_aln_t aln; - bwa_cigar_t *cigar16; + memset(&aln, 0, sizeof(bwa_aln_t)); seq_len = strlen(seq); if (seq_len<<1 > aux->max_buf) { aux->max_buf = seq_len<<1; kroundup32(aux->max_buf); aux->buf = realloc(aux->buf, aux->max_buf); } - pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); - cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, len, s[strand], &pos, strand? 
n_gaps : -n_gaps, &n_cigar, 1); + s[0] = aux->buf; + s[1] = s[0] + seq_len; + for (i = 0; i < seq_len; ++i) + s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; + seq_reverse(seq_len, s[1], 1); + aln.pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); + if (strand) aln.flag |= 16; + if (n_gaps) { // only for gapped alignment + int n_cigar; + bwa_cigar_t *cigar16; + cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &aln.pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); + aln.n_cigar = n_cigar; + aln.cigar = malloc(n_cigar * 4); + for (i = 0, pos3 = aln.pac_pos; i < n_cigar; ++i) { + int op = cigar16[i]>>14; + int len = cigar16[i]&0x3fff; + if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR + aln.cigar[i] = len<<4 | op; + if (op == 0 || op == 2) pos3 += len; + } + free(cigar16); + } else { // ungapped + aln.n_cigar = 1; + aln.cigar = malloc(4); + aln.cigar[0] = seq_len<<4 | 0; + pos3 = aln.pac_pos + seq_len; + } + aln.n_n = bns_cnt_ambi(idx->bns, aln.pac_pos, pos3 - aln.pac_pos, &aln.ref_id); + aln.offset = aln.pac_pos - idx->bns->anns[aln.ref_id].offset; + if (pos3 - idx->bns->anns[aln.ref_id].offset > idx->bns->anns[aln.ref_id].len) // read mapped beyond the end of a sequence + aln.flag |= 4; // read unmapped + compute_NM(idx->pac, idx->bns->l_pac, s[strand], aln.pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap); + aln.n_mm = n_mm; + aln.n_gap = n_gap; return aln; } -*/ diff --git a/bwa.h b/bwa.h index 01eddcf..303c983 100644 --- a/bwa.h +++ b/bwa.h @@ -26,9 +26,11 @@ typedef struct { } bwa_alnpre_t; typedef struct { - uint32_t n_cigar:15, gap:8, mm:8, strand:1; - uint32_t ref_id; - uint64_t offset; + uint32_t n_n:8, n_gap:12, n_mm:12; + int32_t ref_id; + uint32_t offset; + uint32_t n_cigar:16, flag:16; + uint64_t pac_pos; uint32_t *cigar; } bwa_aln_t; @@ -43,6 +45,7 @@ extern "C" { bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score); void bwa_aux_destroy(bwa_aux_t *p); bwa_alnpre_t 
*bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln); + bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps); #ifdef __cplusplus } diff --git a/bwase.c b/bwase.c index c88cae5..35744e7 100644 --- a/bwase.c +++ b/bwase.c @@ -328,7 +328,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); } - +#if 0 if (ntbns) { // in color space for (i = 0; i < n_seqs; ++i) { bwa_seq_t *s = seqs + i; @@ -349,7 +349,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t } } } - +#endif // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { @@ -602,7 +602,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac ntbns = bwa_open_nt(prefix); bwa_print_sam_SQ(bns); - bwa_print_sam_PG(); + //bwa_print_sam_PG(); // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop From 080726cb4746db94def2e9cf823a9e317a0edb23 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 7 Apr 2012 22:50:07 -0400 Subject: [PATCH 120/498] preliminary doc --- bwa.c | 68 ++++++++++++++++++++++++++------------------------ bwa.h | 76 +++++++++++++++++++++++++++++++++++++++++--------------- bwtaln.h | 2 +- 3 files changed, 92 insertions(+), 54 deletions(-) diff --git a/bwa.c b/bwa.c index 6f3dbd4..57b9a9a 100644 --- a/bwa.c +++ b/bwa.c @@ -21,7 +21,7 @@ struct bwa_idx_t { uint8_t *pac; }; -struct bwa_aux_t { +struct bwa_buf_t { int max_buf; gap_stack_t *stack; gap_opt_t *opt; @@ -60,13 +60,13 @@ void bwa_idx_destroy(bwa_idx_t *p) free(p); } -bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score) +bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) { extern 
gap_opt_t *gap_init_opt(void); extern int bwa_cal_maxdiff(int l, double err, double thres); int i; - bwa_aux_t *p; - p = malloc(sizeof(bwa_aux_t)); + bwa_buf_t *p; + p = malloc(sizeof(bwa_buf_t)); p->stack = gap_init_stack2(max_score); p->opt = gap_init_opt(); p->opt->s_gapo = opt->s_gapo; @@ -83,34 +83,35 @@ bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score) return p; } -void bwa_aux_destroy(bwa_aux_t *p) +void bwa_buf_destroy(bwa_buf_t *p) { gap_destroy_stack(p->stack); free(p->diff_tab); free(p->opt); free(p); } -bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln) +bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) { extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); int i, seq_len, buf_len; bwt_width_t *w, *seed_w; uint8_t *s; - gap_opt_t opt2 = *aux->opt; + gap_opt_t opt2 = *buf->opt; + bwa_sai_t sai; seq_len = strlen(seq); // estimate the buffer length - buf_len = (aux->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; - if (buf_len > aux->max_buf) { - aux->max_buf = buf_len; - kroundup32(aux->max_buf); - aux->buf = realloc(aux->buf, aux->max_buf); + buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; + if (buf_len > buf->max_buf) { + buf->max_buf = buf_len; + kroundup32(buf->max_buf); + buf->buf = realloc(buf->buf, buf->max_buf); } - memset(aux->buf, 0, buf_len); - seed_w = (bwt_width_t*)aux->buf; - w = seed_w + aux->opt->seed_len; + memset(buf->buf, 0, buf_len); + seed_w = (bwt_width_t*)buf->buf; + w = seed_w + buf->opt->seed_len; s = (uint8_t*)(w + seq_len + 1); - if (opt2.fnr > 0.) opt2.max_diff = aux->diff_tab[seq_len]; + if (opt2.fnr > 0.) 
opt2.max_diff = buf->diff_tab[seq_len]; // copy the sequence for (i = 0; i < seq_len; ++i) s[i] = nst_nt4_table[(int)seq[i]]; @@ -118,11 +119,12 @@ bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, // mapping bwt_cal_width(idx->bwt, seq_len, s, w); if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; - if (seq_len > aux->opt->seed_len) - bwt_cal_width(idx->bwt, aux->opt->seed_len, s + (seq_len - aux->opt->seed_len), seed_w); + if (seq_len > buf->opt->seed_len) + bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w); for (i = 0; i < seq_len; ++i) // complement; I forgot why... s[i] = s[i] > 3? 4 : 3 - s[i]; - return (bwa_alnpre_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= aux->opt->seed_len? 0 : seed_w, &opt2, n_aln, aux->stack); + sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack); + return sai; } static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps) @@ -145,36 +147,36 @@ static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t } } -bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps) +bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps) { extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); int strand, seq_len, i, n_gap, n_mm; - uint64_t pos3; + uint64_t pos3, pac_pos; uint8_t *s[2]; bwa_aln_t aln; memset(&aln, 0, sizeof(bwa_aln_t)); seq_len = strlen(seq); - if (seq_len<<1 > aux->max_buf) { - aux->max_buf = seq_len<<1; - kroundup32(aux->max_buf); - aux->buf = realloc(aux->buf, 
aux->max_buf); + if (seq_len<<1 > buf->max_buf) { + buf->max_buf = seq_len<<1; + kroundup32(buf->max_buf); + buf->buf = realloc(buf->buf, buf->max_buf); } - s[0] = aux->buf; + s[0] = buf->buf; s[1] = s[0] + seq_len; for (i = 0; i < seq_len; ++i) s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; seq_reverse(seq_len, s[1], 1); - aln.pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); + pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); if (strand) aln.flag |= 16; if (n_gaps) { // only for gapped alignment int n_cigar; bwa_cigar_t *cigar16; - cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &aln.pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); + cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); aln.n_cigar = n_cigar; aln.cigar = malloc(n_cigar * 4); - for (i = 0, pos3 = aln.pac_pos; i < n_cigar; ++i) { + for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { int op = cigar16[i]>>14; int len = cigar16[i]&0x3fff; if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR @@ -186,13 +188,13 @@ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint aln.n_cigar = 1; aln.cigar = malloc(4); aln.cigar[0] = seq_len<<4 | 0; - pos3 = aln.pac_pos + seq_len; + pos3 = pac_pos + seq_len; } - aln.n_n = bns_cnt_ambi(idx->bns, aln.pac_pos, pos3 - aln.pac_pos, &aln.ref_id); - aln.offset = aln.pac_pos - idx->bns->anns[aln.ref_id].offset; + aln.n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln.ref_id); + aln.offset = pac_pos - idx->bns->anns[aln.ref_id].offset; if (pos3 - idx->bns->anns[aln.ref_id].offset > idx->bns->anns[aln.ref_id].len) // read mapped beyond the end of a sequence aln.flag |= 4; // read unmapped - compute_NM(idx->pac, idx->bns->l_pac, s[strand], aln.pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap); + compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap); 
aln.n_mm = n_mm; aln.n_gap = n_gap; return aln; diff --git a/bwa.h b/bwa.h index 303c983..964b832 100644 --- a/bwa.h +++ b/bwa.h @@ -6,46 +6,82 @@ #define BWA_DEF_MAX_SCORE 2048 #define BWA_MAX_QUERY_LEN 1024 +// BWA index struct bwa_idx_t; typedef struct bwa_idx_t bwa_idx_t; -struct bwa_aux_t; -typedef struct bwa_aux_t bwa_aux_t; +// Buffer for BWA alignment +struct bwa_buf_t; +typedef struct bwa_buf_t bwa_buf_t; +// BWA alignment options typedef struct { - int s_gapo, s_gape; // the mismatch penalty is fixed at 3 - int max_diff, max_gapo, max_gape; - int seed_len, max_seed_diff; - float fnr; + int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3 + int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions + int seed_len, max_seed_diff; // seed length and max differences allowed in the seed + float fnr; // parameter for automatic length-adjusted max differences } bwa_opt_t; +// default BWA alignment options +extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 } + +// an interval hit in the SA coordinate; basic unit in .sai files typedef struct { uint32_t n_mm:16, n_gapo:8, n_gape:8; int score; - uint64_t k, l; -} bwa_alnpre_t; + uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits +} bwa_sai1_t; +// all interval hits in the SA coordinate typedef struct { - uint32_t n_n:8, n_gap:12, n_mm:12; - int32_t ref_id; - uint32_t offset; - uint32_t n_cigar:16, flag:16; - uint64_t pac_pos; - uint32_t *cigar; -} bwa_aln_t; + int n; // number of interval hits + bwa_sai1_t *sai; +} bwa_sai_t; -extern bwa_opt_t bwa_def_opt; +// an alignment +typedef struct { + uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment + int32_t ref_id; // referece sequence index (the first seq is indexed by 0) + uint32_t offset; // coordinate on the reference; zero-based + uint32_t n_cigar:16, flag:16; // number of 
CIGAR operations; SAM flag + uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations +} bwa_aln_t; #ifdef __cplusplus extern "C" { #endif + // load a BWA index bwa_idx_t *bwa_idx_load(const char *prefix); void bwa_idx_destroy(bwa_idx_t *p); - bwa_aux_t *bwa_aux_init(const bwa_opt_t *opt, int max_score); - void bwa_aux_destroy(bwa_aux_t *p); - bwa_alnpre_t *bwa_aln_pre(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, int *n_aln); - bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_aux_t *aux, const char *seq, uint64_t sa, int n_gaps); + + // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE + bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score); + void bwa_buf_destroy(bwa_buf_t *p); + + /** + * Find all the SA intervals + * + * @param idx BWA index; multiple threads can share the same index + * @param buf BWA alignment buffer; each thread should have its own buffer + * @param seq NULL terminated C string, consisting of A/C/G/T/N only + * + * @return SA intervals seq is matched to + */ + bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq); + + /** + * Construct an alignment in the base-pair coordinate + * + * @param idx BWA index + * @param buf BWA alignment buffer + * @param seq NULL terinated C string + * @param sa Suffix array value + * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape + * + * @return An alignment + */ + bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps); #ifdef __cplusplus } diff --git a/bwtaln.h b/bwtaln.h index a3eace2..39eaf4b 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -35,8 +35,8 @@ typedef struct { typedef struct { uint32_t n_mm:16, n_gapo:8, n_gape:8; - bwtint_t k, l; int score; + bwtint_t k, l; } bwt_aln1_t; typedef uint16_t bwa_cigar_t; From ca93a71e6eb19e109b864357976a313e9fd02a90 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 8 Apr 2012 
00:02:06 -0400 Subject: [PATCH 121/498] complete single-end alignment --- bwa.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- bwa.h | 12 ++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/bwa.c b/bwa.c index 57b9a9a..31a8136 100644 --- a/bwa.c +++ b/bwa.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "bwa.h" #include "bwt.h" #include "bwtgap.h" @@ -27,6 +28,7 @@ struct bwa_buf_t { gap_opt_t *opt; int *diff_tab; uint8_t *buf; + int *logn; }; bwa_idx_t *bwa_idx_load(const char *prefix) @@ -80,13 +82,16 @@ bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int)); for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); + p->logn = calloc(256, sizeof(int)); + for (i = 1; i != 256; ++i) + p->logn[i] = (int)(4.343 * log(i) + 0.499); return p; } void bwa_buf_destroy(bwa_buf_t *p) { gap_destroy_stack(p->stack); - free(p->diff_tab); free(p->opt); + free(p->diff_tab); free(p->logn); free(p->opt); free(p); } @@ -199,3 +204,54 @@ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint aln.n_gap = n_gap; return aln; } + +bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) +{ + bwa_one_t *one; + int best, cnt, i, seq_len; + + seq_len = strlen(seq); + one = calloc(1, sizeof(bwa_one_t)); + one->sai = bwa_sai(idx, buf, seq); + if (one->sai.n == 0) return one; + // count number of hits; randomly select one alignment + best = one->sai.sai[0].score; + for (i = cnt = 0; i < one->sai.n; ++i) { + bwa_sai1_t *p = &one->sai.sai[i]; + if (p->score > best) break; + if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { + one->which = p; + one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); + } + cnt += p->l - p->k + 1; + } + one->c1 = cnt; + for (; i < one->sai.n; ++i) + cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1; + one->c2 = cnt - one->c1; + // estimate 
single-end mapping quality + one->mapQs = -1; + if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible? + else if (one->c1 > 1) one->mapQs = 0; + else { + int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape; + if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25; + else if (one->c2 == 0) one->mapQs = 37; + } + if (one->mapQs < 0) { + cnt = (one->c2 >= 255)? 255 : one->c2; + one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt]; + } + one->mapQ = one->mapQs; + // compute CIGAR on request + one->one.ref_id = -1; + if (gen_cigar) one->one = bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape); + return one; +} + +void bwa_one_destroy(bwa_one_t *one) +{ + free(one->sai.sai); + free(one->one.cigar); + free(one); +} diff --git a/bwa.h b/bwa.h index 964b832..97864ee 100644 --- a/bwa.h +++ b/bwa.h @@ -47,6 +47,14 @@ typedef struct { uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations } bwa_aln_t; +typedef struct { + int mapQs, mapQ, c1, c2; + uint64_t sa; + bwa_sai1_t *which; + bwa_sai_t sai; + bwa_aln_t one; +} bwa_one_t; + #ifdef __cplusplus extern "C" { #endif @@ -83,6 +91,10 @@ extern "C" { */ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps); + bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar); + + void bwa_one_destroy(bwa_one_t *one); + #ifdef __cplusplus } #endif From 3b5a9e5595023663c138d503eb8f809ea560cc76 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 8 Apr 2012 00:12:34 -0400 Subject: [PATCH 122/498] simplified bwa_se() interface --- bwa.c | 7 ++++++- bwa.h | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bwa.c b/bwa.c index 31a8136..4e2775f 100644 --- a/bwa.c +++ b/bwa.c @@ -205,7 +205,7 @@ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint return aln; } -bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) 
+bwa_one_t *bwa_se2(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) { bwa_one_t *one; int best, cnt, i, seq_len; @@ -249,6 +249,11 @@ bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen return one; } +bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) +{ + return bwa_se2(idx, buf, seq, 1); +} + void bwa_one_destroy(bwa_one_t *one) { free(one->sai.sai); diff --git a/bwa.h b/bwa.h index 97864ee..5a75c13 100644 --- a/bwa.h +++ b/bwa.h @@ -91,7 +91,7 @@ extern "C" { */ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps); - bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar); + bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq); void bwa_one_destroy(bwa_one_t *one); From 9c486fa41ef43a20bf147ff9ed8e872e99b2058e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 8 Apr 2012 00:55:52 -0400 Subject: [PATCH 123/498] updated APIs abit --- bwa.c | 60 ++++++++++++++++++++++++++++++++++------------------------- bwa.h | 9 +++++++-- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/bwa.c b/bwa.c index 4e2775f..8e99f18 100644 --- a/bwa.c +++ b/bwa.c @@ -24,6 +24,7 @@ struct bwa_idx_t { struct bwa_buf_t { int max_buf; + bwa_pestat_t pes; gap_stack_t *stack; gap_opt_t *opt; int *diff_tab; @@ -152,16 +153,15 @@ static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t } } -bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps) +void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln) { extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); int strand, 
seq_len, i, n_gap, n_mm; uint64_t pos3, pac_pos; uint8_t *s[2]; - bwa_aln_t aln; - memset(&aln, 0, sizeof(bwa_aln_t)); + memset(aln, 0, sizeof(bwa_aln_t)); seq_len = strlen(seq); if (seq_len<<1 > buf->max_buf) { buf->max_buf = seq_len<<1; @@ -174,38 +174,41 @@ bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; seq_reverse(seq_len, s[1], 1); pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); - if (strand) aln.flag |= 16; + if (strand) aln->flag |= 16; if (n_gaps) { // only for gapped alignment int n_cigar; bwa_cigar_t *cigar16; cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); - aln.n_cigar = n_cigar; - aln.cigar = malloc(n_cigar * 4); + aln->n_cigar = n_cigar; + aln->cigar = malloc(n_cigar * 4); for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { int op = cigar16[i]>>14; int len = cigar16[i]&0x3fff; if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR - aln.cigar[i] = len<<4 | op; + aln->cigar[i] = len<<4 | op; if (op == 0 || op == 2) pos3 += len; } free(cigar16); } else { // ungapped - aln.n_cigar = 1; - aln.cigar = malloc(4); - aln.cigar[0] = seq_len<<4 | 0; + aln->n_cigar = 1; + aln->cigar = malloc(4); + aln->cigar[0] = seq_len<<4 | 0; pos3 = pac_pos + seq_len; } - aln.n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln.ref_id); - aln.offset = pac_pos - idx->bns->anns[aln.ref_id].offset; - if (pos3 - idx->bns->anns[aln.ref_id].offset > idx->bns->anns[aln.ref_id].len) // read mapped beyond the end of a sequence - aln.flag |= 4; // read unmapped - compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln.n_cigar, aln.cigar, &n_mm, &n_gap); - aln.n_mm = n_mm; - aln.n_gap = n_gap; - return aln; + aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id); + aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset; + if (pos3 - 
idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence + aln->flag |= 4; // read unmapped + compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap); + aln->n_mm = n_mm; + aln->n_gap = n_gap; } -bwa_one_t *bwa_se2(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) +/************************ + * Single-end alignment * + ************************/ + +bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) { bwa_one_t *one; int best, cnt, i, seq_len; @@ -245,18 +248,25 @@ bwa_one_t *bwa_se2(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int ge one->mapQ = one->mapQs; // compute CIGAR on request one->one.ref_id = -1; - if (gen_cigar) one->one = bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape); + if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one); return one; } -bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) -{ - return bwa_se2(idx, buf, seq, 1); -} - void bwa_one_destroy(bwa_one_t *one) { free(one->sai.sai); free(one->one.cigar); free(one); } + +/************************ + * Paired-end alignment * + ************************/ + +void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2]) +{ +} + +void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2]) +{ +} diff --git a/bwa.h b/bwa.h index 5a75c13..e8172da 100644 --- a/bwa.h +++ b/bwa.h @@ -55,6 +55,11 @@ typedef struct { bwa_aln_t one; } bwa_one_t; +typedef struct { + double avg, std, ap_prior; + uint64_t low, high, high_bayesian; +} bwa_pestat_t; + #ifdef __cplusplus extern "C" { #endif @@ -89,9 +94,9 @@ extern "C" { * * @return An alignment */ - bwa_aln_t bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps); + void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, 
int n_gaps, bwa_aln_t *aln); - bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq); + bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar); void bwa_one_destroy(bwa_one_t *one); From 4f61e2b7f536ead55377756d5031ac31cf08e77b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 17 Apr 2012 19:31:37 -0400 Subject: [PATCH 124/498] unnecessary large .sai output --- bwtaln.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwtaln.h b/bwtaln.h index a3eace2..39eaf4b 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -35,8 +35,8 @@ typedef struct { typedef struct { uint32_t n_mm:16, n_gapo:8, n_gape:8; - bwtint_t k, l; int score; + bwtint_t k, l; } bwt_aln1_t; typedef uint16_t bwa_cigar_t; From cd818687ac4ba6ccb20fa12ed30136f5601b9cc4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 17 Apr 2012 20:43:43 -0400 Subject: [PATCH 125/498] r115: added -I and -S to bwasw --- bwtsw2.h | 5 +++-- bwtsw2_aux.c | 3 ++- bwtsw2_main.c | 8 ++++++-- bwtsw2_pair.c | 12 +++++++----- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 89615b5..0a1b860 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -12,8 +12,9 @@ #define BSW2_FLAG_RESCUED 0x800 typedef struct { - int a, b, q, r, t, qr, bw; - int z, is, t_seeds, hard_clip, multi_2nd; + int skip_sw:16, hard_clip:16; + int a, b, q, r, t, qr, bw, max_ins; + int z, is, t_seeds, multi_2nd; float mask_level, coef; int n_threads, chunk_size; } bsw2opt_t; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 2cce142..710051d 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -50,7 +50,8 @@ bsw2opt_t *bsw2_init_opt() bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; o->bw = 50; - o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; + o->max_ins = 20000; + o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; 
o->chunk_size = 10000000; return o; diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 041e8ae..50355fe 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -18,7 +18,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:M")) >= 0) { + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:S")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -35,6 +35,8 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'M': opt->multi_2nd = 1; break; case 'H': opt->hard_clip = 1; break; case 'f': xreopen(optarg, "w", stdout); break; + case 'I': opt->max_ins = atoi(optarg); break; + case 'S': opt->skip_sw = 1; break; } } opt->qr = opt->q + opt->r; @@ -50,9 +52,11 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); fprintf(stderr, "\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); + fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); fprintf(stderr, " -M mark multi-part alignments as secondary\n"); + fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); + fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); fprintf(stderr, "\n"); fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 5e1ec7c..a6f4d80 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -12,7 +12,6 @@ #include "stdaln.h" #endif -#define MAX_INS 20000 #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 #define MAX_STDDEV 4.0 @@ -23,7 +22,7 @@ typedef struct { double avg, std; } bsw2pestat_t; -bsw2pestat_t 
bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg) +bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; @@ -40,6 +39,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg) if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; + if (l >= max_ins) continue; // skip pairs with excessively large insert max_len = max_len > t[0]->end - t[0]->beg? max_len : t[0]->end - t[0]->beg; max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; isize[k++] = l; @@ -186,7 +186,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b int8_t g_mat[25]; kstring_t msg; memset(&msg, 0, sizeof(kstring_t)); - pes = bsw2_stat(n, hits, &msg); + pes = bsw2_stat(n, hits, &msg, opt->max_ins); for (i = k = 0; i < 5; ++i) { for (j = 0; j < 4; ++j) g_mat[k++] = i == j? 
opt->a : -opt->b; @@ -207,8 +207,10 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit - if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); - if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); + if (!opt->skip_sw) { + if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); + if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); + } // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0 // the following enumerate all possibilities. It is tedious but necessary... if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not; bwtsw2_t *p[2]; From d97ff6bf7245d687ae6a1a7dd210dc8c9ac16b72 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 17 Apr 2012 20:45:07 -0400 Subject: [PATCH 126/498] r124: updated version number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index bd9c926..3522196 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.1-r112-master" +#define PACKAGE_VERSION "0.6.1-r124-api" #endif static int usage() From 29ed2d8287dfca298a335eee14352c885e9cd870 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 19 Jun 2012 13:13:29 -0400 Subject: [PATCH 127/498] rename the "api" branch as "master" --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 3522196..4bb1197 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define 
PACKAGE_VERSION "0.6.1-r124-api" +#define PACKAGE_VERSION "0.6.1-r125" #endif static int usage() From 09ee115dcc555c71e65a0f00456753b65b307115 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 19 Jun 2012 13:29:44 -0400 Subject: [PATCH 128/498] r126: release bwa-0.6.2 --- NEWS | 19 +++++++++++++++++++ bwa.1 | 2 +- main.c | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index 5cced2b..d68c693 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,22 @@ +Release 0.6.2 (19 June, 2012) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is largely a bug-fix release. Notable changes in BWA-short and BWA-SW: + + * Bugfix: BWA-SW may give bad alignments due to incorrect band width. + + * Bugfix: A segmentation fault due to an out-of-boundary error. The fix is a + temporary solution. The real cause has not been identified. + + * Attempt to read index from prefix.64.bwt, such that the 32-bit and 64-bit + index can coexist. + + * Added options '-I' and '-S' to control BWA-SW pairing. + +(0.6.2: 19 June 2012, r126) + + + Release 0.6.1 (28 November, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index caa60cb..66bc9a2 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "28 November 2011" "bwa-0.6.1" "Bioinformatics tools" +.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index 4bb1197..0e7af77 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.1-r125" +#define PACKAGE_VERSION "0.6.2-r126" #endif static int usage() From 84d34e1d0bcc9df2fe2474eb823a5947d9b23c10 Mon Sep 17 00:00:00 2001 From: Shaun Jackman Date: Mon, 25 Jun 2012 12:48:43 -0700 Subject: [PATCH 129/498] Do not inline functions used externally Compiling using CLANG gives the following errors: Undefined symbols for architecture x86_64: "_bwt_occ", referenced from: _bwt_cal_sa in bwt.o _bwt_sa in bwt.o "_bwt_2occ", referenced 
from: _bwt_match_exact in bwt.o _bwt_match_exact_alt in bwt.o _bwt_cal_width in bwtaln.o "_bwt_2occ4", referenced from: _bwt_match_gap in bwtgap.o _bsw2_core in bwtsw2_core.o "_bwtl_2occ4", referenced from: _bsw2_core in bwtsw2_core.o --- bwt.c | 8 ++++---- bwt_lite.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bwt.c b/bwt.c index fcc141e..966b718 100644 --- a/bwt.c +++ b/bwt.c @@ -95,7 +95,7 @@ static inline int __occ_aux(uint64_t y, int c) return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; } -inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) +bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) { bwtint_t n, l, j; uint32_t *p; @@ -121,7 +121,7 @@ inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) } // an analogy to bwt_occ() but more efficient, requiring k <= l -inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) +void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) { bwtint_t _k, _l; _k = (k >= bwt->primary)? k-1 : k; @@ -158,7 +158,7 @@ inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) -inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) +void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { bwtint_t l, j, x; uint32_t *p; @@ -178,7 +178,7 @@ inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) } // an analogy to bwt_occ4() but more efficient, requiring k <= l -inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) +void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { bwtint_t _k, _l; _k = (k >= bwt->primary)? 
k-1 : k; diff --git a/bwt_lite.c b/bwt_lite.c index dd411e1..902e0fc 100644 --- a/bwt_lite.c +++ b/bwt_lite.c @@ -65,7 +65,7 @@ inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) if (c == 0) n -= 15 - (k&15); // corrected for the masked bits return n; } -inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) +void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) { uint32_t x, b; if (k == (uint32_t)(-1)) { @@ -80,7 +80,7 @@ inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) x -= 15 - (k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } -inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) +void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) { bwtl_occ4(bwt, k, cntk); bwtl_occ4(bwt, l, cntl); From 0ae318be0d3f081653f516ced2df307987748e33 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 28 Jun 2012 13:32:58 -0400 Subject: [PATCH 130/498] resolve duphits after right extension as well --- bwtsw2_aux.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 710051d..8f838f5 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -227,7 +227,7 @@ void bsw2_debug_hits(const bwtsw2_t *b) for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; if (p->G > 0) - printf("G=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); + printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); } } @@ -292,6 +292,7 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here bsw2_resolve_duphits(0, 0, bb[k][0], 0); bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem); + 
bsw2_resolve_duphits(0, 0, bb[k][0], 0); b[k] = bb[k][0]; free(bb[k]); } From f44edd4fc9e06f7e81b84d23f6bdb06ab1934a45 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 28 Jun 2012 14:51:02 -0400 Subject: [PATCH 131/498] r128: more conservative chaining filter --- bwtsw2.h | 2 +- bwtsw2_aux.c | 3 ++- bwtsw2_chain.c | 11 ++++++----- bwtsw2_main.c | 6 ++++-- main.c | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index 0a1b860..b1f6a3f 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -13,7 +13,7 @@ typedef struct { int skip_sw:16, hard_clip:16; - int a, b, q, r, t, qr, bw, max_ins; + int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; int z, is, t_seeds, multi_2nd; float mask_level, coef; int n_threads, chunk_size; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 8f838f5..f6f6df8 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -54,6 +54,7 @@ bsw2opt_t *bsw2_init_opt() o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; + o->max_chain_gap = 10000; return o; } @@ -286,7 +287,7 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 } } b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" - bsw2_chain_filter(opt, l, b); + bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained for (k = 0; k < 2; ++k) { bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c index c734657..381d0b7 100644 --- a/bwtsw2_chain.c +++ b/bwtsw2_chain.c @@ -23,15 +23,15 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t hsaip_t *q = chain + k; int x = p->qbeg - q->qbeg; // always positive int y = p->tbeg - q->tbeg; - if (y > 0 && x - y <= opt->bw && y - x <= opt->bw) { + if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && 
x - y <= opt->bw && y - x <= opt->bw) { // chained if (p->qend > q->qend) q->qend = p->qend; if (p->tend > q->tend) q->tend = p->tend; ++q->chain; p->chain = shift + k; break; - } + } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains } - if (k < 0) { + if (k < 0) { // not added to any previous chains chain[m] = *p; chain[m].chain = 1; chain[m].idx = p->chain = shift + m; @@ -44,7 +44,7 @@ static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) { hsaip_t *z[2], *chain[2]; - int i, j, k, n[2], m[2]; + int i, j, k, n[2], m[2], thres = opt->t_seeds * 2; char *flag; // initialization n[0] = b[0]->n; n[1] = b[1]->n; @@ -71,6 +71,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) int tmp = p->qbeg; p->qbeg = len - p->qend; p->qend = len - tmp; } + //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); // filtering flag = calloc(m[0] + m[1], 1); ks_introsort(hsaip, m[0] + m[1], chain[0]); @@ -79,7 +80,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) for (j = 0; j < k; ++j) { hsaip_t *q = chain[0] + j; if (flag[q->idx]) continue; - if (q->qend >= p->qend && q->chain > p->chain * opt->t_seeds * 2) { + if (q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) { flag[p->idx] = 1; break; } diff --git a/bwtsw2_main.c b/bwtsw2_main.c index 50355fe..a802ee7 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -18,7 +18,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:S")) >= 0) { + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -37,6 +37,7 @@ int 
bwa_bwtsw2(int argc, char *argv[]) case 'f': xreopen(optarg, "w", stdout); break; case 'I': opt->max_ins = atoi(optarg); break; case 'S': opt->skip_sw = 1; break; + case 'G': opt->max_chain_gap = atoi(optarg); break; } } opt->qr = opt->q + opt->r; @@ -62,7 +63,8 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); - fprintf(stderr, " -N INT # seeds to trigger reverse alignment [%d]\n", opt->t_seeds); + fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds); + fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap); fprintf(stderr, "\n"); fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); fprintf(stderr, " BACs, the default setting usually works well. 
For the current PacBio\n"); diff --git a/main.c b/main.c index 0e7af77..aa5bb3b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r126" +#define PACKAGE_VERSION "0.6.2-r128" #endif static int usage() From 3abfd0743a1cc822d9739482c064a0cf65b7b719 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 28 Jun 2012 14:52:18 -0400 Subject: [PATCH 132/498] r131: r128 plus remote changes --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index aa5bb3b..e394210 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r128" +#define PACKAGE_VERSION "0.6.2-r131" #endif static int usage() From 292f9061ab9c4b8d0c3d088f60513a84413433dc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 26 Oct 2012 12:54:32 -0400 Subject: [PATCH 133/498] r132: optionally copy FASTA/Q comment to SAM --- bwtsw2.h | 4 ++-- bwtsw2_aux.c | 10 +++++++++- bwtsw2_main.c | 4 +++- main.c | 2 +- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/bwtsw2.h b/bwtsw2.h index b1f6a3f..0ec9676 100644 --- a/bwtsw2.h +++ b/bwtsw2.h @@ -12,7 +12,7 @@ #define BSW2_FLAG_RESCUED 0x800 typedef struct { - int skip_sw:16, hard_clip:16; + int skip_sw:8, cpy_cmt:8, hard_clip:16; int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; int z, is, t_seeds, multi_2nd; float mask_level, coef; @@ -45,7 +45,7 @@ typedef struct { typedef struct { int l, tid; - char *name, *seq, *qual, *sam; + char *name, *seq, *qual, *sam, *comment; } bsw2seq1_t; #ifdef __cplusplus diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index f6f6df8..5e8161c 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -55,6 +55,7 @@ bsw2opt_t *bsw2_init_opt() o->mask_level = 0.50f; o->coef = 5.5f; o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; o->max_chain_gap = 10000; + o->cpy_cmt = 0; return o; } @@ -551,7 +552,7 @@ static void print_hits(const bntseq_t *bns, const 
bsw2opt_t *opt, bsw2seq1_t *ks if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str); else kputc(ks->qual[j], &str); } - } else ksprintf(&str, "\t*"); + } else kputs("\t*", &str); // print optional tags ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm); if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); @@ -559,6 +560,12 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks if (p->flag&BSW2_FLAG_MATESW) type |= 1; if (p->flag&BSW2_FLAG_TANDEM) type |= 2; if (type) ksprintf(&str, "\tXT:i:%d", type); + if (opt->cpy_cmt && ks->comment) { + int l = strlen(ks->comment); + if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') { + kputc('\t', &str); kputs(ks->comment, &str); + } + } kputc('\n', &str); } ks->sam = str.s; @@ -756,6 +763,7 @@ static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p) p->name = strdup(ks->name.s); p->seq = strdup(ks->seq.s); p->qual = ks->qual.l? strdup(ks->qual.s) : 0; + p->comment = ks->comment.l? 
strdup(ks->comment.s) : 0; p->sam = 0; } diff --git a/bwtsw2_main.c b/bwtsw2_main.c index a802ee7..e3f57f8 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -18,7 +18,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt = bsw2_init_opt(); srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:")) >= 0) { + while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) { switch (c) { case 'q': opt->q = atoi(optarg); break; case 'r': opt->r = atoi(optarg); break; @@ -37,6 +37,7 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'f': xreopen(optarg, "w", stdout); break; case 'I': opt->max_ins = atoi(optarg); break; case 'S': opt->skip_sw = 1; break; + case 'C': opt->cpy_cmt = 1; break; case 'G': opt->max_chain_gap = atoi(optarg); break; } } @@ -55,6 +56,7 @@ int bwa_bwtsw2(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); + fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n"); fprintf(stderr, " -M mark multi-part alignments as secondary\n"); fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); diff --git a/main.c b/main.c index e394210..73cbcd9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r131" +#define PACKAGE_VERSION "0.6.2-r132" #endif static int usage() From 752ce69b78bdc34746affa3f32a53c10b5d06285 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Sun, 16 Dec 2012 10:05:32 +0000 Subject: [PATCH 134/498] Added more wrappers for functions that make system calls Added the following wrappers that check the results of system calls and exit non-zero if something went wrong. This is so bwa can be more robust against system failures (e.g. 
IO errors from remote storage, or running out of memory). The existing and new wrappers have also been modified so that they no longer try to dump core on failure. In most cases the resulting core files are not useful (especially if bwa was compiled with optimization turned on) so just pollute whatever directories they got written to. Wrappers for memory allocation functions: xcalloc xmalloc xrealloc xstrdup New wrappers for IO functions: err_fread_noeof (also dies on EOF) err_gzread err_fseek err_rewind err_ftell --- utils.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++++------- utils.h | 29 ++++++++++-- 2 files changed, 143 insertions(+), 20 deletions(-) diff --git a/utils.c b/utils.c index 8c1ad7e..bc39bf5 100644 --- a/utils.c +++ b/utils.c @@ -41,33 +41,44 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode) if (strcmp(fn, "-") == 0) return (strstr(mode, "r"))? stdin : stdout; if ((fp = fopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); - abort(); + err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); } return fp; } FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) { if (freopen(fn, mode, fp) == 0) { - fprintf(stderr, "[%s] fail to open file '%s': ", func, fn); - perror(NULL); - fprintf(stderr, "Abort!\n"); - abort(); + err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); } return fp; } gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) { gzFile fp; - if (strcmp(fn, "-") == 0) - return gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + if (strcmp(fn, "-") == 0) { + fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + /* According to zlib.h, this is the only reason gzdopen can fail */ + if (!fp) err_fatal(func, "Out of memory"); + return fp; + } if ((fp = gzopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. 
Abort!\n", func, fn); - abort(); + err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory"); } return fp; } + void err_fatal(const char *header, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + fprintf(stderr, "[%s] ", header); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n"); + va_end(args); + exit(EXIT_FAILURE); +} + +void err_fatal_core(const char *header, const char *fmt, ...) { va_list args; va_start(args, fmt); @@ -78,7 +89,13 @@ void err_fatal(const char *header, const char *fmt, ...) abort(); } -void err_fatal_simple_core(const char *func, const char *msg) +void _err_fatal_simple(const char *func, const char *msg) +{ + fprintf(stderr, "[%s] %s\n", func, msg); + exit(EXIT_FAILURE); +} + +void _err_fatal_simple_core(const char *func, const char *msg) { fprintf(stderr, "[%s] %s Abort!\n", func, msg); abort(); @@ -89,11 +106,55 @@ size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) size_t ret = fwrite(ptr, size, nmemb, stream); if (ret != nmemb) { - err_fatal_simple_core("fwrite", strerror(errno)); + _err_fatal_simple("fwrite", strerror(errno)); } return ret; } +size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) +{ + size_t ret = fread(ptr, size, nmemb, stream); + if (ret != nmemb) + { + _err_fatal_simple("fread", ferror(stream) ? strerror(errno) : "Unexpected end of file"); + } + return ret; +} + +int err_gzread(gzFile file, void *ptr, unsigned int len) +{ + int ret = gzread(file, ptr, len); + + if (ret < 0) + { + int errnum = 0; + const char *msg = gzerror(file, &errnum); + _err_fatal_simple("gzread", Z_ERRNO == errnum ? 
strerror(errno) : msg); + } + + return ret; +} + +int err_fseek(FILE *stream, long offset, int whence) +{ + int ret = fseek(stream, offset, whence); + if (0 != ret) + { + _err_fatal_simple("fseek", strerror(errno)); + } + return ret; +} + +long err_ftell(FILE *stream) +{ + long ret = ftell(stream); + if (-1 == ret) + { + _err_fatal_simple("ftell", strerror(errno)); + } + return ret; +} + int err_printf(const char *format, ...) { va_list arg; @@ -106,7 +167,7 @@ int err_printf(const char *format, ...) if (done < 0) { - err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); + _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno)); } return done; } @@ -123,7 +184,7 @@ int err_fprintf(FILE *stream, const char *format, ...) if (done < 0) { - err_fatal_simple_core("vfprintf", strerror(saveErrno)); + _err_fatal_simple("vfprintf", strerror(saveErrno)); } return done; } @@ -133,7 +194,7 @@ int err_fflush(FILE *stream) int ret = fflush(stream); if (ret != 0) { - err_fatal_simple_core("fflush", strerror(errno)); + _err_fatal_simple("fflush", strerror(errno)); } return ret; } @@ -143,11 +204,52 @@ int err_fclose(FILE *stream) int ret = fclose(stream); if (ret != 0) { - err_fatal_simple_core("fclose", strerror(errno)); + _err_fatal_simple("fclose", strerror(errno)); } return ret; } +void *err_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func) +{ + void *p = calloc(nmemb, size); + if (NULL == p) + { + err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", nmemb * size, file, line, strerror(errno)); + } + return p; +} + +void *err_malloc(size_t size, const char *file, unsigned int line, const char *func) +{ + void *p = malloc(size); + if (NULL == p) + { + err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", size, file, line, strerror(errno)); + } + return p; +} + +void *err_realloc(void *ptr, size_t size, const char *file, unsigned int line, const char *func) +{ + void *p = realloc(ptr, size); + if 
(NULL == p) + { + err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", size, file, line, strerror(errno)); + } + return p; +} + +char *err_strdup(const char *s, const char *file, unsigned int line, const char *func) +{ + char *p = strdup(s); + + if (NULL == p) + { + err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", strlen(s), file, line, strerror(errno)); + } + return p; +} + double cputime() { struct rusage r; diff --git a/utils.h b/utils.h index b6839e9..c6cfc81 100644 --- a/utils.h +++ b/utils.h @@ -40,22 +40,38 @@ -#define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) +#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg) +#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) -#define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) + +#define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg) + +#define xcalloc(n, s) err_calloc( (n), (s), __FILE__, __LINE__, __func__) +#define xmalloc(s) err_malloc( (s), __FILE__, __LINE__, __func__) +#define xrealloc(p, s) err_realloc((p), (s), __FILE__, __LINE__, __func__) +#define xstrdup(s) err_strdup( (s), __FILE__, __LINE__, __func__) + #ifdef __cplusplus extern "C" { #endif - void err_fatal(const char *header, const char *fmt, ...); - void err_fatal_simple_core(const char *func, const char *msg); + void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); + void err_fatal_core(const char *header, const char *fmt, ...) 
ATTRIBUTE((noreturn)); + void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn)); + void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn)); FILE *err_xopen_core(const char *func, const char *fn, const char *mode); FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); + size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream); + + int err_gzread(gzFile file, void *ptr, unsigned int len); + int err_fseek(FILE *stream, long offset, int whence); +#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET) + long err_ftell(FILE *stream); int err_fprintf(FILE *stream, const char *format, ...) ATTRIBUTE((format(printf, 2, 3))); int err_printf(const char *format, ...) @@ -63,6 +79,11 @@ extern "C" { int err_fflush(FILE *stream); int err_fclose(FILE *stream); + void *err_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func); + void *err_malloc(size_t size, const char *file, unsigned int line, const char *func); + void *err_realloc(void *ptr, size_t size, const char *file, unsigned int line, const char *func); + char *err_strdup(const char *s, const char *file, unsigned int line, const char *func); + double cputime(); double realtime(); From b081ac9b8b16c2a57d9090784dc86dfbd48fad12 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Sun, 16 Dec 2012 10:34:57 +0000 Subject: [PATCH 135/498] Use wrapper functions to catch system errors Use the wrapper functions in utils.c plus a few extra bits of error checking code to catch system errors and exit non-zero when they occur. 
--- Makefile | 47 ++++++++++++++++------- bamlite.c | 38 +++++++++++-------- bamlite.h | 7 ++-- bntseq.c | 101 +++++++++++++++++++++++++++++-------------------- bwa.c | 28 +++++++------- bwape.c | 40 ++++++++++---------- bwase.c | 48 +++++++++++------------ bwaseqio.c | 26 ++++++------- bwt.c | 6 +-- bwt_gen.c | 43 +++++++++++---------- bwt_lite.c | 11 +++--- bwtaln.c | 14 +++---- bwtgap.c | 11 +++--- bwtindex.c | 10 ++--- bwtio.c | 54 +++++++++++++------------- bwtmisc.c | 38 +++++++++---------- bwtsw2_aux.c | 74 +++++++++++++++++------------------- bwtsw2_chain.c | 7 ++-- bwtsw2_core.c | 29 +++++++------- bwtsw2_pair.c | 7 ++-- cs2nt.c | 3 +- fastmap.c | 13 ++++--- is.c | 6 ++- khash.h | 13 ++++--- kseq.h | 13 ++++--- ksort.h | 5 ++- kstring.c | 3 +- kstring.h | 5 ++- ksw.c | 9 +++-- kvec.h | 8 ++-- simple_dp.c | 12 +++--- stdaln.c | 37 +++++++++--------- 32 files changed, 411 insertions(+), 355 deletions(-) diff --git a/Makefile b/Makefile index 6f388f2..b8fa824 100644 --- a/Makefile +++ b/Makefile @@ -31,19 +31,38 @@ bwa:libbwa.a $(AOBJS) main.o libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) -bwa.o:bwa.h - -QSufSort.o:QSufSort.h - -bwt.o:bwt.h -bwtio.o:bwt.h -bwtaln.o:bwt.h bwtaln.h kseq.h -bntseq.o:bntseq.h -bwtgap.o:bwtgap.h bwtaln.h bwt.h - -bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h -bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h -bwtsw2_main.o:bwtsw2.h - clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a + +QSufSort.o: QSufSort.h +bamlite.o: bamlite.h utils.h +bntseq.o: bntseq.h kseq.h main.h utils.h +bwa.o: bntseq.h bwa.h bwt.h bwtaln.h bwtgap.h stdaln.h utils.h +bwape.o: bntseq.h bwase.h bwt.h bwtaln.h khash.h ksort.h kvec.h stdaln.h +bwape.o: utils.h +bwase.o: bntseq.h bwase.h bwt.h bwtaln.h kstring.h stdaln.h utils.h +bwaseqio.o: bamlite.h bwt.h bwtaln.h kseq.h stdaln.h utils.h +bwt.o: bwt.h kvec.h utils.h +bwt_gen.o: QSufSort.h utils.h +bwt_lite.o: bwt_lite.h utils.h +bwtaln.o: bwt.h bwtaln.h bwtgap.h stdaln.h utils.h +bwtgap.o: bwt.h bwtaln.h bwtgap.h 
stdaln.h utils.h +bwtindex.o: bntseq.h bwt.h main.h utils.h +bwtio.o: bwt.h utils.h +bwtmisc.o: bntseq.h bwt.h main.h utils.h +bwtsw2_aux.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kseq.h ksort.h kstring.h +bwtsw2_aux.o: stdaln.h utils.h +bwtsw2_chain.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h ksort.h utils.h +bwtsw2_core.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h khash.h ksort.h kvec.h +bwtsw2_core.o: utils.h +bwtsw2_main.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h utils.h +bwtsw2_pair.o: bntseq.h bwt.h bwt_lite.h bwtsw2.h kstring.h ksw.h utils.h +cs2nt.o: bwt.h bwtaln.h stdaln.h utils.h +fastmap.o: bntseq.h bwt.h kseq.h kvec.h utils.h +is.o: utils.h +kstring.o: kstring.h utils.h +ksw.o: ksw.h utils.h +main.o: main.h utils.h +simple_dp.o: kseq.h stdaln.h utils.h +stdaln.o: stdaln.h utils.h +utils.o: utils.h diff --git a/bamlite.c b/bamlite.c index 5aad392..ec365d1 100644 --- a/bamlite.c +++ b/bamlite.c @@ -2,6 +2,7 @@ #include #include #include +#include "utils.h" #include "bamlite.h" /********************* @@ -53,7 +54,7 @@ int bam_is_be; bam_header_t *bam_header_init() { bam_is_be = bam_is_big_endian(); - return (bam_header_t*)calloc(1, sizeof(bam_header_t)); + return (bam_header_t*)xcalloc(1, sizeof(bam_header_t)); } void bam_header_destroy(bam_header_t *header) @@ -62,11 +63,11 @@ void bam_header_destroy(bam_header_t *header) if (header == 0) return; if (header->target_name) { for (i = 0; i < header->n_targets; ++i) - free(header->target_name[i]); + if (header->target_name[i]) free(header->target_name[i]); + if (header->target_len) free(header->target_len); free(header->target_name); - free(header->target_len); } - free(header->text); + if (header->text) free(header->text); free(header); } @@ -80,28 +81,33 @@ bam_header_t *bam_header_read(bamFile fp) magic_len = bam_read(fp, buf, 4); if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); - return 0; + return NULL; } header = 
bam_header_init(); // read plain text and the number of reference sequences - bam_read(fp, &header->l_text, 4); + if (bam_read(fp, &header->l_text, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->l_text); - header->text = (char*)calloc(header->l_text + 1, 1); - bam_read(fp, header->text, header->l_text); - bam_read(fp, &header->n_targets, 4); + header->text = (char*)xcalloc(header->l_text + 1, 1); + if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail; + if (bam_read(fp, &header->n_targets, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->n_targets); // read reference sequence names and lengths - header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); - header->target_len = (uint32_t*)calloc(header->n_targets, 4); + header->target_name = (char**)xcalloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)xcalloc(header->n_targets, 4); for (i = 0; i != header->n_targets; ++i) { - bam_read(fp, &name_len, 4); + if (bam_read(fp, &name_len, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&name_len); - header->target_name[i] = (char*)calloc(name_len, 1); - bam_read(fp, header->target_name[i], name_len); - bam_read(fp, &header->target_len[i], 4); + header->target_name[i] = (char*)xcalloc(name_len, 1); + if (bam_read(fp, header->target_name[i], name_len) != name_len) { + goto fail; + } + if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); } return header; + fail: + bam_header_destroy(header); + return NULL; } static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) @@ -146,7 +152,7 @@ int bam_read1(bamFile fp, bam1_t *b) if (b->m_data < b->data_len) { b->m_data = b->data_len; kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); + b->data = (uint8_t*)xrealloc(b->data, b->m_data); } if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; b->l_aux = b->data_len - 
c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; diff --git a/bamlite.h b/bamlite.h index 167fa44..2b65c57 100644 --- a/bamlite.h +++ b/bamlite.h @@ -3,12 +3,13 @@ #include #include +#include "utils.h" typedef gzFile bamFile; -#define bam_open(fn, mode) gzopen(fn, mode) +#define bam_open(fn, mode) xzopen(fn, mode) #define bam_dopen(fd, mode) gzdopen(fd, mode) #define bam_close(fp) gzclose(fp) -#define bam_read(fp, buf, size) gzread(fp, buf, size) +#define bam_read(fp, buf, size) err_gzread(fp, buf, size) typedef struct { int32_t n_targets; @@ -71,7 +72,7 @@ typedef struct { #define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) -#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) +#define bam_init1() ((bam1_t*)xcalloc(1, sizeof(bam1_t))) #define bam_destroy1(b) do { \ if (b) { free((b)->data); free(b); } \ } while (0) diff --git a/bntseq.c b/bntseq.c index adcd2d7..b795af8 100644 --- a/bntseq.c +++ b/bntseq.c @@ -30,12 +30,13 @@ #include #include #include +#include #include "bntseq.h" #include "main.h" #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_INIT(gzFile, err_gzread) unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -64,25 +65,25 @@ void bns_dump(const bntseq_t *bns, const char *prefix) { // dump .ann strcpy(str, prefix); strcat(str, ".ann"); fp = xopen(str, "w"); - fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); + err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); for (i = 0; i != bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; - fprintf(fp, "%d %s", p->gi, p->name); - if (p->anno[0]) fprintf(fp, " %s\n", p->anno); - else fprintf(fp, "\n"); - fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); + err_fprintf(fp, "%d %s", p->gi, p->name); + if (p->anno[0]) err_fprintf(fp, " %s\n", 
p->anno); + else err_fprintf(fp, "\n"); + err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); } - fclose(fp); + err_fclose(fp); } { // dump .amb strcpy(str, prefix); strcat(str, ".amb"); fp = xopen(str, "w"); - fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); + err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); for (i = 0; i != bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; - fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); + err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); } - fclose(fp); + err_fclose(fp); } } @@ -90,53 +91,71 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c { char str[1024]; FILE *fp; + const char *fname; bntseq_t *bns; long long xx; int i; - bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + int scanres; + bns = (bntseq_t*)xcalloc(1, sizeof(bntseq_t)); { // read .ann - fp = xopen(ann_filename, "r"); - fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); + fp = xopen(fname = ann_filename, "r"); + scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); + if (scanres != 3) goto badread; bns->l_pac = xx; - bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); + bns->anns = (bntann1_t*)xcalloc(bns->n_seqs, sizeof(bntann1_t)); for (i = 0; i < bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; char *q = str; int c; // read gi and sequence name - fscanf(fp, "%u%s", &p->gi, str); - p->name = strdup(str); + scanres = fscanf(fp, "%u%s", &p->gi, str); + if (scanres != 2) goto badread; + p->name = xstrdup(str); // read fasta comments - while ((c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; + while (q - str < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; /* q - str = bytes stored so far; leaves room for the NUL */ + while (c != '\n' && c != EOF) c = fgetc(fp); + if (c == EOF) { + scanres = EOF; + goto badread; + } *q = 0; - if
(q - str > 1) p->anno = xstrdup(str + 1); // skip leading space + else p->anno = xstrdup(""); // read the rest - fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); + scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); + if (scanres != 3) goto badread; p->offset = xx; } - fclose(fp); + err_fclose(fp); } { // read .amb int64_t l_pac; int32_t n_seqs; - fp = xopen(amb_filename, "r"); - fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); + fp = xopen(fname = amb_filename, "r"); + scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); + if (scanres != 3) goto badread; l_pac = xx; xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); - bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); + bns->ambs = (bntamb1_t*)xcalloc(bns->n_holes, sizeof(bntamb1_t)); for (i = 0; i < bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; - fscanf(fp, "%lld%d%s", &xx, &p->len, str); + scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str); + if (scanres != 3) goto badread; p->offset = xx; p->amb = str[0]; } - fclose(fp); + err_fclose(fp); } { // open .pac bns->fp_pac = xopen(pac_filename, "rb"); } return bns; + + badread: + if (EOF == scanres) { + err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? 
strerror(errno) : "Unexpected end of file"); + } + err_fatal(__func__, "Parse error reading %s\n", fname); } bntseq_t *bns_restore(const char *prefix) @@ -153,7 +172,7 @@ void bns_destroy(bntseq_t *bns) if (bns == 0) return; else { int i; - if (bns->fp_pac) fclose(bns->fp_pac); + if (bns->fp_pac) err_fclose(bns->fp_pac); free(bns->ambs); for (i = 0; i < bns->n_seqs; ++i) { free(bns->anns[i].name); @@ -173,11 +192,11 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ int i, lasts; if (bns->n_seqs == *m_seqs) { *m_seqs <<= 1; - bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); + bns->anns = (bntann1_t*)xrealloc(bns->anns, *m_seqs * sizeof(bntann1_t)); } p = bns->anns + bns->n_seqs; - p->name = strdup((char*)seq->name.s); - p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); + p->name = xstrdup((char*)seq->name.s); + p->anno = seq->comment.s? xstrdup((char*)seq->comment.s) : xstrdup("(null)"); p->gi = 0; p->len = seq->seq.l; p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; @@ -189,7 +208,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ } else { if (bns->n_holes == *m_holes) { (*m_holes) <<= 1; - bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); + bns->ambs = (bntamb1_t*)xrealloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); } *q = bns->ambs + bns->n_holes; (*q)->len = 1; @@ -204,7 +223,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ if (c >= 4) c = lrand48()&3; if (bns->l_pac == *m_pac) { // double the pac size *m_pac <<= 1; - pac = realloc(pac, *m_pac/4); + pac = xrealloc(pac, *m_pac/4); memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); } _set_pac(pac, bns->l_pac, c); @@ -229,13 +248,13 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) // initialization seq = kseq_init(fp_fa); - bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); + bns = (bntseq_t*)xcalloc(1, sizeof(bntseq_t)); bns->seed = 11; // fixed seed for random generator srand48(bns->seed); m_seqs = m_holes = 8; m_pac = 0x10000; - bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); - bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); - pac = calloc(m_pac/4, 1); + bns->anns = (bntann1_t*)xcalloc(m_seqs, sizeof(bntann1_t)); + bns->ambs = (bntamb1_t*)xcalloc(m_holes, sizeof(bntamb1_t)); + pac = xcalloc(m_pac/4, 1); q = bns->ambs; strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); @@ -243,7 +262,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); if (!for_only) { // add the reverse complemented sequence m_pac = (bns->l_pac * 2 + 3) / 4 * 4; - pac = realloc(pac, m_pac/4); + pac = xrealloc(pac, m_pac/4); memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l)); 
@@ -251,16 +270,16 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) ret = bns->l_pac; { // finalize .pac file ubyte_t ct; - fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); + err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); // the following codes make the pac file size always (l_pac/4+1+1) if (bns->l_pac % 4 == 0) { ct = 0; - fwrite(&ct, 1, 1, fp); + err_fwrite(&ct, 1, 1, fp); } ct = bns->l_pac % 4; - fwrite(&ct, 1, 1, fp); + err_fwrite(&ct, 1, 1, fp); // close .pac file - fclose(fp); + err_fclose(fp); } bns_dump(bns, prefix); bns_destroy(bns); diff --git a/bwa.c b/bwa.c index 8e99f18..0b6a420 100644 --- a/bwa.c +++ b/bwa.c @@ -1,7 +1,9 @@ + #include #include #include #include +#include "utils.h" #include "bwa.h" #include "bwt.h" #include "bwtgap.h" @@ -38,8 +40,8 @@ bwa_idx_t *bwa_idx_load(const char *prefix) int l; char *str; l = strlen(prefix); - p = calloc(1, sizeof(bwa_idx_t)); - str = malloc(l + 10); + p = xcalloc(1, sizeof(bwa_idx_t)); + str = xmalloc(l + 10); strcpy(str, prefix); p->bns = bns_restore(str); strcpy(str + l, ".bwt"); @@ -48,9 +50,9 @@ bwa_idx_t *bwa_idx_load(const char *prefix) strcpy(str + l, ".sa"); bwt_restore_sa(str, p->bwt); free(str); - p->pac = calloc(p->bns->l_pac/4+1, 1); - fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); - fclose(p->bns->fp_pac); + p->pac = xcalloc(p->bns->l_pac/4+1, 1); + err_fread_noeof(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); + err_fclose(p->bns->fp_pac); p->bns->fp_pac = 0; return p; } @@ -69,7 +71,7 @@ bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) extern int bwa_cal_maxdiff(int l, double err, double thres); int i; bwa_buf_t *p; - p = malloc(sizeof(bwa_buf_t)); + p = xmalloc(sizeof(bwa_buf_t)); p->stack = gap_init_stack2(max_score); p->opt = gap_init_opt(); p->opt->s_gapo = opt->s_gapo; @@ -80,10 +82,10 @@ bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) p->opt->seed_len = opt->seed_len; p->opt->max_seed_diff 
= opt->max_seed_diff; p->opt->fnr = opt->fnr; - p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int)); + p->diff_tab = xcalloc(BWA_MAX_QUERY_LEN, sizeof(int)); for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); - p->logn = calloc(256, sizeof(int)); + p->logn = xcalloc(256, sizeof(int)); for (i = 1; i != 256; ++i) p->logn[i] = (int)(4.343 * log(i) + 0.499); return p; @@ -111,7 +113,7 @@ bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) if (buf_len > buf->max_buf) { buf->max_buf = buf_len; kroundup32(buf->max_buf); - buf->buf = realloc(buf->buf, buf->max_buf); + buf->buf = xrealloc(buf->buf, buf->max_buf); } memset(buf->buf, 0, buf_len); seed_w = (bwt_width_t*)buf->buf; @@ -166,7 +168,7 @@ void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t if (seq_len<<1 > buf->max_buf) { buf->max_buf = seq_len<<1; kroundup32(buf->max_buf); - buf->buf = realloc(buf->buf, buf->max_buf); + buf->buf = xrealloc(buf->buf, buf->max_buf); } s[0] = buf->buf; s[1] = s[0] + seq_len; @@ -180,7 +182,7 @@ void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t bwa_cigar_t *cigar16; cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? 
n_gaps : -n_gaps, &n_cigar, 1); aln->n_cigar = n_cigar; - aln->cigar = malloc(n_cigar * 4); + aln->cigar = xmalloc(n_cigar * 4); for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { int op = cigar16[i]>>14; int len = cigar16[i]&0x3fff; @@ -191,7 +193,7 @@ void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t free(cigar16); } else { // ungapped aln->n_cigar = 1; - aln->cigar = malloc(4); + aln->cigar = xmalloc(4); aln->cigar[0] = seq_len<<4 | 0; pos3 = pac_pos + seq_len; } @@ -214,7 +216,7 @@ bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen int best, cnt, i, seq_len; seq_len = strlen(seq); - one = calloc(1, sizeof(bwa_one_t)); + one = xcalloc(1, sizeof(bwa_one_t)); one->sai = bwa_sai(idx, buf, seq); if (one->sai.n == 0) return one; // count number of hits; randomly select one alignment diff --git a/bwape.c b/bwape.c index 779670f..f16d684 100644 --- a/bwape.c +++ b/bwape.c @@ -58,7 +58,7 @@ void bwa_print_sam_PG(); pe_opt_t *bwa_init_pe_opt() { pe_opt_t *po; - po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); + po = (pe_opt_t*)xcalloc(1, sizeof(pe_opt_t)); po->max_isize = 500; po->force_isize = 0; po->max_occ = 100000; @@ -104,7 +104,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ii->avg = ii->std = -1.0; ii->low = ii->high = ii->high_bayesian = 0; - isizes = (uint64_t*)calloc(n_seqs, 8); + isizes = (uint64_t*)xcalloc(n_seqs, 8); for (i = 0, tot = 0; i != n_seqs; ++i) { bwa_seq_t *p[2]; p[0] = seqs[0] + i; p[1] = seqs[1] + i; @@ -292,9 +292,9 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw pe_data_t *d; aln_buf_t *buf[2]; - d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); - buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); - buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + d = (pe_data_t*)xcalloc(1, sizeof(pe_data_t)); + buf[0] = (aln_buf_t*)xcalloc(n_seqs, sizeof(aln_buf_t)); + buf[1] = (aln_buf_t*)xcalloc(n_seqs, sizeof(aln_buf_t)); 
if (_bwt == 0) { // load forward SA strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); @@ -309,11 +309,11 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw p[j] = seqs[j] + i; p[j]->n_multi = 0; p[j]->extra_flag |= SAM_FPD | (j == 0? SAM_FR1 : SAM_FR2); - fread(&n_aln, 4, 1, fp_sa[j]); + err_fread_noeof(&n_aln, 4, 1, fp_sa[j]); if (n_aln > kv_max(d->aln[j])) kv_resize(bwt_aln1_t, d->aln[j], n_aln); d->aln[j].n = n_aln; - fread(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); + err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] // generate SE alignment and mapping quality bwa_aln2seq(n_aln, d->aln[j].a, p[j]); @@ -367,7 +367,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if (ret) { // not in the hash table; ret must equal 1 as we never remove elements poslist_t *z = &kh_val(g_hash, iter); z->n = r->l - r->k + 1; - z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); + z->a = (bwtint_t*)xmalloc(sizeof(bwtint_t) * z->n); for (l = r->k; l <= r->l; ++l) { int strand; z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand)<<1; @@ -448,10 +448,10 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; // get reference subsequence - ref_seq = (ubyte_t*)calloc(reglen, 1); + ref_seq = (ubyte_t*)xcalloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - path = (path_t*)calloc(l+len, sizeof(path_t)); + path = (path_t*)xcalloc(l+len, sizeof(path_t)); // do alignment ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo); @@ -480,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u *beg += (p->i? p->i : 1) - 1; start = (p->j? 
p->j : 1) - 1; end = path->j; - cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); + cigar = (bwa_cigar_t*)xrealloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); cigar[0] = __cigar_create(3, start); @@ -525,9 +525,9 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, // load reference sequence if (_pacseq == 0) { - pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - rewind(bns->fp_pac); - fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + pacseq = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1); + err_rewind(bns->fp_pac); + err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = (ubyte_t*)_pacseq; if (!popt->is_sw || ii->avg < 0.0) return pacseq; @@ -683,10 +683,10 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f g_hash = kh_init(b128); last_ii.avg = -1.0; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); opt0 = opt; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! 
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); if (!(opt.mode & BWA_MODE_COMPREAD)) { popt->type = BWA_PET_SOLID; @@ -695,9 +695,9 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); - pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - rewind(bns->fp_pac); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + pac = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1); + err_rewind(bns->fp_pac); + err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); } } @@ -752,7 +752,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f if (ntbns) bns_destroy(ntbns); for (i = 0; i < 2; ++i) { bwa_seq_close(ks[i]); - fclose(fp_sa[i]); + err_fclose(fp_sa[i]); } for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); diff --git a/bwase.c b/bwase.c index 35744e7..afe8154 100644 --- a/bwase.c +++ b/bwase.c @@ -59,7 +59,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma * simply output all hits, but the following samples "rest" * number of random hits. */ rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa - s->multi = calloc(rest, sizeof(bwt_multi1_t)); + s->multi = xcalloc(rest, sizeof(bwt_multi1_t)); for (k = 0; k < n_aln; ++k) { const bwt_aln1_t *q = aln + k; if (q->l - q->k + 1 <= rest) { @@ -172,16 +172,16 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l ref_len = len + abs(ext); if (ext > 0) { - ref_seq = (ubyte_t*)calloc(ref_len, 1); + ref_seq = (ubyte_t*)xcalloc(ref_len, 1); for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; } else { int64_t x = __pos + (is_end_correct? 
len : ref_len); - ref_seq = (ubyte_t*)calloc(ref_len, 1); + ref_seq = (ubyte_t*)xcalloc(ref_len, 1); for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; } - path = (path_t*)calloc(l+len, sizeof(path_t)); + path = (path_t*)xcalloc(l+len, sizeof(path_t)); aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); cigar = bwa_aln_path2cigar(path, path_len, n_cigar); @@ -257,7 +257,7 @@ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_ } ksprintf(str, "%d", u); *_nm = nm; - return strdup(str->s); + return xstrdup(str->s); } void bwa_correct_trimmed(bwa_seq_t *s) @@ -269,11 +269,11 @@ void bwa_correct_trimmed(bwa_seq_t *s) } else { if (s->cigar == 0) { s->n_cigar = 2; - s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar = xcalloc(s->n_cigar, sizeof(bwa_cigar_t)); s->cigar[0] = __cigar_create(0, s->len); } else { ++s->n_cigar; - s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + s->cigar = xrealloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); } s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len)); } @@ -283,11 +283,11 @@ void bwa_correct_trimmed(bwa_seq_t *s) } else { if (s->cigar == 0) { s->n_cigar = 2; - s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar = xcalloc(s->n_cigar, sizeof(bwa_cigar_t)); s->cigar[1] = __cigar_create(0, s->len); } else { ++s->n_cigar; - s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + s->cigar = xrealloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t)); } s->cigar[0] = __cigar_create(3, (s->full_len - s->len)); @@ -303,15 +303,15 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t kstring_t *str; if (ntbns) { // in color space - ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); - rewind(ntbns->fp_pac); - fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); + ntpac = 
(ubyte_t*)xcalloc(ntbns->l_pac/4+1, 1); + err_rewind(ntbns->fp_pac); + err_fread_noeof(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); } if (!_pacseq) { - pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - rewind(bns->fp_pac); - fread(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); + pacseq = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1); + err_rewind(bns->fp_pac); + err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = _pacseq; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; @@ -351,7 +351,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t } #endif // generate MD tag - str = (kstring_t*)calloc(1, sizeof(kstring_t)); + str = (kstring_t*)xcalloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; if (s->type != BWA_TYPE_NO_MATCH) { @@ -523,7 +523,7 @@ bntseq_t *bwa_open_nt(const char *prefix) { bntseq_t *ntbns; char *str; - str = (char*)calloc(strlen(prefix) + 10, 1); + str = (char*)xcalloc(strlen(prefix) + 10, 1); strcat(strcpy(str, prefix), ".nt"); ntbns = bns_restore(str); free(str); @@ -566,14 +566,14 @@ int bwa_set_rg(const char *s) if (strstr(s, "@RG") != s) return -1; if (bwa_rg_line) free(bwa_rg_line); if (bwa_rg_id) free(bwa_rg_id); - bwa_rg_line = strdup(s); + bwa_rg_line = xstrdup(s); bwa_rg_id = 0; bwa_escape(bwa_rg_line); p = strstr(bwa_rg_line, "\tID:"); if (p == 0) return -1; p += 4; for (q = p; *q && *q != '\t' && *q != '\n'; ++q); - bwa_rg_id = calloc(q - p + 1, 1); + bwa_rg_id = xcalloc(q - p + 1, 1); for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) *r++ = *q; return 0; @@ -598,7 +598,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fp_sa = xopen(fn_sa, "r"); m_aln = 0; - fread(&opt, sizeof(gap_opt_t), 1, fp_sa); + err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac ntbns = bwa_open_nt(prefix); bwa_print_sam_SQ(bns); @@ -614,12 +614,12 @@ void 
bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; int n_aln; - fread(&n_aln, 4, 1, fp_sa); + err_fread_noeof(&n_aln, 4, 1, fp_sa); if (n_aln > m_aln) { m_aln = n_aln; - aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); + aln = (bwt_aln1_t*)xrealloc(aln, sizeof(bwt_aln1_t) * m_aln); } - fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); + err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); } @@ -644,7 +644,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bwa_seq_close(ks); if (ntbns) bns_destroy(ntbns); bns_destroy(bns); - fclose(fp_sa); + err_fclose(fp_sa); free(aln); } diff --git a/bwaseqio.c b/bwaseqio.c index e22d4cd..716f9b2 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -5,7 +5,7 @@ #include "bamlite.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_INIT(gzFile, err_gzread) extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; @@ -22,7 +22,7 @@ bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; bam_header_t *h; - bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + bs = (bwa_seqio_t*)xcalloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; bs->fp = bam_open(fn, "r"); @@ -35,7 +35,7 @@ bwa_seqio_t *bwa_seq_open(const char *fn) { gzFile fp; bwa_seqio_t *bs; - bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); + bs = (bwa_seqio_t*)xcalloc(1, sizeof(bwa_seqio_t)); fp = xzopen(fn, "r"); bs->ks = kseq_init(fp); return bs; @@ -93,7 +93,7 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com b = bam_init1(); n_seqs = 0; - seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + seqs = (bwa_seq_t*)xcalloc(n_needed, sizeof(bwa_seq_t)); while (bam_read1(bs->fp, b) >= 0) { uint8_t *s, *q; int go = 0; @@ -108,8 +108,8 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, 
int n_needed, int *n, int is_com p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; s = bam1_seq(b); q = bam1_qual(b); - p->seq = (ubyte_t*)calloc(p->len + 1, 1); - p->qual = (ubyte_t*)calloc(p->len + 1, 1); + p->seq = (ubyte_t*)xcalloc(p->len + 1, 1); + p->qual = (ubyte_t*)xcalloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126; @@ -119,11 +119,11 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); - p->rseq = (ubyte_t*)calloc(p->full_len, 1); + p->rseq = (ubyte_t*)xcalloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); - p->name = strdup((const char*)bam1_qname(b)); + p->name = xstrdup((const char*)bam1_qname(b)); if (n_seqs == n_needed) break; } *n = n_seqs; @@ -153,7 +153,7 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri } if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; - seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); + seqs = (bwa_seq_t*)xcalloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { // skip reads that are marked to be filtered by Casava @@ -184,18 +184,18 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; - p->seq = (ubyte_t*)calloc(p->len, 1); + p->seq = (ubyte_t*)xcalloc(p->full_len, 1); for (i = 0; i != p->full_len; ++i) p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; if (seq->qual.l) { // copy quality - p->qual = (ubyte_t*)strdup((char*)seq->qual.s); + p->qual = 
(ubyte_t*)xstrdup((char*)seq->qual.s); if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); } - p->rseq = (ubyte_t*)calloc(p->full_len, 1); + p->rseq = (ubyte_t*)xcalloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); - p->name = strdup((const char*)seq->name.s); + p->name = xstrdup((const char*)seq->name.s); { // trim /[12]$ int t = strlen(p->name); if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; diff --git a/bwt.c b/bwt.c index 966b718..eb85bb0 100644 --- a/bwt.c +++ b/bwt.c @@ -58,11 +58,7 @@ void bwt_cal_sa(bwt_t *bwt, int intv) if (bwt->sa) free(bwt->sa); bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; - bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); - if (bwt->sa == 0) { - fprintf(stderr, "[%s] Fail to allocate %.3fMB memory. Abort!\n", __func__, bwt->n_sa * sizeof(bwtint_t) / 1024.0/1024.0); - abort(); - } + bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t)); // calculate SA value isa = 0; sa = bwt->seq_len; for (i = 0; i < bwt->seq_len; ++i) { diff --git a/bwt_gen.c b/bwt_gen.c index cac6a5f..48bd662 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -28,6 +28,7 @@ #include #include #include "QSufSort.h" +#include "utils.h" typedef uint64_t bgint_t; typedef int64_t sbgint_t; @@ -319,25 +320,25 @@ BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) { BWT *bwt; - bwt = (BWT*)calloc(1, sizeof(BWT)); + bwt = (BWT*)xcalloc(1, sizeof(BWT)); bwt->textLength = 0; - bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + bwt->cumulativeFreq = (bgint_t*)xcalloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; // Generate decode tables if (decodeTable == NULL) { - bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, 
sizeof(unsigned int)); + bwt->decodeTable = (unsigned*)xcalloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); GenerateDNAOccCountTable(bwt->decodeTable); } else { bwt->decodeTable = decodeTable; } bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); - bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); + bwt->occValueMajor = (bgint_t*)xcalloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); bwt->occSizeInWord = 0; bwt->occValue = NULL; @@ -353,16 +354,16 @@ BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; - bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); + bwtInc = (BWTInc*)xcalloc(1, sizeof(BWTInc)); bwtInc->numberOfIterationDone = 0; bwtInc->bwt = BWTCreate(textLength, NULL); bwtInc->initialMaxBuildSize = initialMaxBuildSize; bwtInc->incMaxBuildSize = incMaxBuildSize; - bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)xcalloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); // Build frequently accessed data - bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); + bwtInc->packedShift = (unsigned*)xcalloc(CHAR_PER_WORD, sizeof(unsigned int)); for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; @@ -372,7 +373,7 @@ BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, + incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); - 
bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); + bwtInc->workingMemory = (unsigned*)xcalloc(bwtInc->availableWord, BYTES_IN_WORD); return bwtInc; } @@ -1447,9 +1448,9 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB exit(1); } - fseek(packedFile, -1, SEEK_END); + err_fseek(packedFile, -1, SEEK_END); packedFileLen = ftell(packedFile); - fread(&lastByteLength, sizeof(unsigned char), 1, packedFile); + err_fread_noeof(&lastByteLength, sizeof(unsigned char), 1, packedFile); totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); @@ -1463,10 +1464,10 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB } textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte - fseek(packedFile, -2, SEEK_CUR); - fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); - fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); - fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); + err_fseek(packedFile, -2, SEEK_CUR); + err_fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + err_fread_noeof(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); + err_fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); @@ -1479,9 +1480,9 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB textToLoad = totalTextLength - processedTextLength; } textSizeInByte = textToLoad / CHAR_PER_BYTE; - fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); - fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); - fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + err_fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + 
err_fread_noeof(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); + err_fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; @@ -1530,11 +1531,11 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o exit(1); } - fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); - fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); + err_fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); + err_fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); bwtLength = BWTFileSizeInWord(bwt->textLength); - fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); - fclose(bwtFile); + err_fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); + err_fclose(bwtFile); } void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) diff --git a/bwt_lite.c b/bwt_lite.c index 902e0fc..83dafc4 100644 --- a/bwt_lite.c +++ b/bwt_lite.c @@ -2,6 +2,7 @@ #include #include #include "bwt_lite.h" +#include "utils.h" int is_sa(const uint8_t *T, uint32_t *SA, int n); int is_bwt(uint8_t *T, int n); @@ -10,21 +11,21 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) { bwtl_t *b; int i; - b = (bwtl_t*)calloc(1, sizeof(bwtl_t)); + b = (bwtl_t*)xcalloc(1, sizeof(bwtl_t)); b->seq_len = len; { // calculate b->bwt uint8_t *s; - b->sa = (uint32_t*)calloc(len + 1, 4); + b->sa = (uint32_t*)xcalloc(len + 1, 4); is_sa(seq, b->sa, len); - s = (uint8_t*)calloc(len + 1, 1); + s = (uint8_t*)xcalloc(len + 1, 1); for (i = 0; i <= len; ++i) { if (b->sa[i] == 0) b->primary = i; else s[i] = seq[b->sa[i] - 1]; } for (i = b->primary; i < len; ++i) s[i] = s[i + 1]; b->bwt_size = (len + 15) / 16; - b->bwt = (uint32_t*)calloc(b->bwt_size, 4); + b->bwt = (uint32_t*)xcalloc(b->bwt_size, 4); for (i = 0; i < len; ++i) b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 
1); free(s); @@ -32,7 +33,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) { // calculate b->occ uint32_t c[4]; b->n_occ = (len + 15) / 16 * 4; - b->occ = (uint32_t*)calloc(b->n_occ, 4); + b->occ = (uint32_t*)xcalloc(b->n_occ, 4); memset(c, 0, 16); for (i = 0; i < len; ++i) { if (i % 16 == 0) diff --git a/bwtaln.c b/bwtaln.c index efc7f66..109f964 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -19,7 +19,7 @@ gap_opt_t *gap_init_opt() { gap_opt_t *o; - o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); + o = (gap_opt_t*)xcalloc(1, sizeof(gap_opt_t)); /* IMPORTANT: s_mm*10 should be about the average base error rate. Voilating this requirement will break pairing! */ o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; @@ -89,7 +89,7 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); - seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); + seed_w = (bwt_width_t*)xcalloc(opt->seed_len+1, sizeof(bwt_width_t)); w = 0; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; @@ -99,7 +99,7 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; if (max_l < p->len) { max_l = p->len; - w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); + w = (bwt_width_t*)xrealloc(w, (max_l + 1) * sizeof(bwt_width_t)); memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); } bwt_cal_width(bwt, p->len, p->seq, w); @@ -162,7 +162,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT - char *str = (char*)calloc(strlen(prefix) + 10, 1); + char *str = (char*)xcalloc(strlen(prefix) + 10, 1); strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); } @@ -185,8 
+185,8 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + data = (thread_aux_t*)xcalloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { data[j].tid = j; data[j].bwt = bwt; data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; @@ -225,7 +225,7 @@ char *bwa_infer_prefix(const char *hint) int l_hint; FILE *fp; l_hint = strlen(hint); - prefix = malloc(l_hint + 3 + 4 + 1); + prefix = xmalloc(l_hint + 3 + 4 + 1); strcpy(prefix, hint); strcpy(prefix + l_hint, ".64.bwt"); if ((fp = fopen(prefix, "rb")) != 0) { diff --git a/bwtgap.c b/bwtgap.c index 364717c..cef9561 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -3,6 +3,7 @@ #include #include "bwtgap.h" #include "bwtaln.h" +#include "utils.h" #define STATE_M 0 #define STATE_I 1 @@ -13,9 +14,9 @@ gap_stack_t *gap_init_stack2(int max_score) { gap_stack_t *stack; - stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); + stack = (gap_stack_t*)xcalloc(1, sizeof(gap_stack_t)); stack->n_stacks = max_score; - stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); + stack->stacks = (gap_stack1_t*)xcalloc(stack->n_stacks, sizeof(gap_stack1_t)); return stack; } @@ -51,7 +52,7 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, i q = stack->stacks + score; if (q->n_entries == q->m_entries) { q->m_entries = q->m_entries? 
q->m_entries<<1 : 4; - q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); + q->stack = (gap_entry_t*)xrealloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; @@ -110,7 +111,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid bwt_aln1_t *aln; m_aln = 4; n_aln = 0; - aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); + aln = (bwt_aln1_t*)xcalloc(m_aln, sizeof(bwt_aln1_t)); // check whether there are too many N for (j = _j = 0; j < len; ++j) @@ -177,7 +178,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); if (n_aln == m_aln) { m_aln <<= 1; - aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); + aln = (bwt_aln1_t*)xrealloc(aln, m_aln * sizeof(bwt_aln1_t)); memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); } p = aln + n_aln; diff --git a/bwtindex.c b/bwtindex.c index 938e982..f430e62 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -54,7 +54,7 @@ int bwa_index(int argc, char *argv[]) else if (strcmp(optarg, "is") == 0) algo_type = 3; else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; - case 'p': prefix = strdup(optarg); break; + case 'p': prefix = xstrdup(optarg); break; case 'c': is_color = 1; break; case '6': is_64 = 1; break; default: return 1; @@ -75,13 +75,13 @@ int bwa_index(int argc, char *argv[]) return 1; } if (prefix == 0) { - prefix = malloc(strlen(argv[optind]) + 4); + prefix = xmalloc(strlen(argv[optind]) + 4); strcpy(prefix, argv[optind]); if (is_64) strcat(prefix, ".64"); } - str = (char*)calloc(strlen(prefix) + 10, 1); - str2 = (char*)calloc(strlen(prefix) + 10, 1); - str3 = (char*)calloc(strlen(prefix) + 10, 1); + str = (char*)xcalloc(strlen(prefix) + 10, 1); + str2 = (char*)xcalloc(strlen(prefix) + 10, 1); + str3 = (char*)xcalloc(strlen(prefix) + 10, 1); if (is_color == 0) { // 
nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); diff --git a/bwtio.c b/bwtio.c index 7508609..0d4623e 100644 --- a/bwtio.c +++ b/bwtio.c @@ -6,24 +6,24 @@ void bwt_dump_bwt(const char *fn, const bwt_t *bwt) { - FILE *fp; + FILE *fp = NULL; fp = xopen(fn, "wb"); - fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fwrite(bwt->bwt, 4, bwt->bwt_size, fp); - fclose(fp); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); + err_fclose(fp); } void bwt_dump_sa(const char *fn, const bwt_t *bwt) { FILE *fp; fp = xopen(fn, "wb"); - fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); - fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - fclose(fp); + err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + err_fclose(fp); } void bwt_restore_sa(const char *fn, bwt_t *bwt) @@ -33,19 +33,19 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt) bwtint_t primary; fp = xopen(fn, "rb"); - fread(&primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); - fread(skipped, sizeof(bwtint_t), 4, fp); // skip - fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - fread(&primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip + err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); bwt->n_sa = 
(bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; - bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t)); bwt->sa[0] = -1; - fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - fclose(fp); + err_fread_noeof(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + err_fclose(fp); } bwt_t *bwt_restore_bwt(const char *fn) @@ -53,17 +53,17 @@ bwt_t *bwt_restore_bwt(const char *fn) bwt_t *bwt; FILE *fp; - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); fp = xopen(fn, "rb"); - fseek(fp, 0, SEEK_END); - bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; - bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); - fseek(fp, 0, SEEK_SET); - fread(&bwt->primary, sizeof(bwtint_t), 1, fp); - fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fread(bwt->bwt, 4, bwt->bwt_size, fp); + err_fseek(fp, 0, SEEK_END); + bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, 4); + err_fseek(fp, 0, SEEK_SET); + err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); + err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); + err_fread_noeof(bwt->bwt, 4, bwt->bwt_size, fp); bwt->seq_len = bwt->L2[4]; - fclose(fp); + err_fclose(fp); bwt_gen_cnt_table(bwt); return bwt; diff --git a/bwtmisc.c b/bwtmisc.c index c35d684..49aa5aa 100644 --- a/bwtmisc.c +++ b/bwtmisc.c @@ -46,10 +46,10 @@ int64_t bwa_seq_len(const char *fn_pac) int64_t pac_len; ubyte_t c; fp = xopen(fn_pac, "rb"); - fseek(fp, -1, SEEK_END); - pac_len = ftell(fp); - fread(&c, 1, 1, fp); - fclose(fp); + err_fseek(fp, -1, SEEK_END); + pac_len = err_ftell(fp); + err_fread_noeof(&c, 1, 1, fp); + err_fclose(fp); return (pac_len - 1) * 4 + (int)c; } @@ -61,18 +61,18 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) FILE *fp; // initialization - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); bwt->seq_len = bwa_seq_len(fn_pac); bwt->bwt_size = (bwt->seq_len + 
15) >> 4; fp = xopen(fn_pac, "rb"); // prepare sequence pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); - buf2 = (ubyte_t*)calloc(pac_size, 1); - fread(buf2, 1, pac_size, fp); - fclose(fp); + buf2 = (ubyte_t*)xcalloc(pac_size, 1); + err_fread_noeof(buf2, 1, pac_size, fp); + err_fclose(fp); memset(bwt->L2, 0, 5 * 4); - buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1); for (i = 0; i < bwt->seq_len; ++i) { buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; ++bwt->L2[1+buf[i]]; @@ -90,7 +90,7 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) err_fatal_simple("libdivsufsort is not compiled in."); #endif } - bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4); for (i = 0; i < bwt->seq_len; ++i) bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); free(buf); @@ -126,7 +126,7 @@ void bwt_bwtupdate_core(bwt_t *bwt) n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + buf = (uint32_t*)xcalloc(bwt->bwt_size, 4); // will be the new bwt c[0] = c[1] = c[2] = c[3] = 0; for (i = k = 0; i < bwt->seq_len; ++i) { if (i % OCC_INTERVAL == 0) { @@ -167,10 +167,10 @@ uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) uint8_t *pac, *cspac; bwtint_t i; int c1, c2; - pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); - cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); - rewind(bns->fp_pac); + pac = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1); + cspac = (uint8_t*)xcalloc(bns->l_pac/4 + 1, 1); + err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); + err_rewind(bns->fp_pac); c1 = pac[0]>>6; cspac[0] = c1<<6; for (i = 1; i < bns->l_pac; ++i) { c2 = pac[i>>2] >> (~i&3)*2 & 3; @@ -196,13 +196,13 @@ int bwa_pac2cspac(int argc, char *argv[]) cspac = bwa_pac2cspac_core(bns); bns_dump(bns, argv[2]); // now write cspac - str = 
(char*)calloc(strlen(argv[2]) + 5, 1); + str = (char*)xcalloc(strlen(argv[2]) + 5, 1); strcat(strcpy(str, argv[2]), ".pac"); fp = xopen(str, "wb"); - fwrite(cspac, 1, bns->l_pac/4 + 1, fp); + err_fwrite(cspac, 1, bns->l_pac/4 + 1, fp); ct = bns->l_pac % 4; - fwrite(&ct, 1, 1, fp); - fclose(fp); + err_fwrite(&ct, 1, 1, fp); + err_fclose(fp); bns_destroy(bns); free(cspac); return 0; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 5e8161c..ca39919 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -15,7 +15,7 @@ #include "kstring.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_INIT(gzFile, err_gzread) #include "ksort.h" #define __left_lt(a, b) ((a).end > (b).end) @@ -47,7 +47,7 @@ extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); bsw2opt_t *bsw2_init_opt() { - bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); + bsw2opt_t *o = (bsw2opt_t*)xcalloc(1, sizeof(bsw2opt_t)); o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; o->bw = 50; o->max_ins = 20000; @@ -72,11 +72,11 @@ void bsw2_destroy(bwtsw2_t *b) bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) { bwtsw2_t *p; - p = calloc(1, sizeof(bwtsw2_t)); + p = xcalloc(1, sizeof(bwtsw2_t)); p->max = p->n = b->n; if (b->n) { kroundup32(p->max); - p->hits = calloc(p->max, sizeof(bsw2hit_t)); + p->hits = xcalloc(p->max, sizeof(bsw2hit_t)); memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); } return p; @@ -100,10 +100,10 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq par.matrix = matrix; __gen_ap(par, opt); - query = calloc(lq, 1); + query = xcalloc(lq, 1); // sort according to the descending order of query end ks_introsort(hit, b->n, b->hits); - target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + target = xcalloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); // reverse _query for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i]; // core loop @@ -146,7 +146,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int 
lq, par.matrix = matrix; __gen_ap(par, opt); - target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + target = xcalloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; @@ -178,8 +178,8 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 par.matrix = matrix; __gen_ap(par, opt); i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length - target = calloc(i, 1); - path = calloc(i + lq, sizeof(path_t)); + target = xcalloc(i, 1); + path = xcalloc(i + lq, sizeof(path_t)); // generate CIGAR for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; @@ -206,7 +206,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 } #endif if (beg != 0 || end < lq) { // write soft clipping - q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); + q->cigar = xrealloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); q->cigar[0] = beg<<4 | 4; @@ -238,7 +238,7 @@ static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) int i; if (b[0]->n + b[1]->n > b[0]->max) { b[0]->max = b[0]->n + b[1]->n; - b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); + b[0]->hits = xrealloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); } for (i = 0; i < b[1]->n; ++i) { bsw2hit_t *p = b[0]->hits + b[0]->n + i; @@ -266,9 +266,9 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 _b = bsw2_core(bns, opt, query, target, pool); bwtl_destroy(query); for (k = 0; k < 2; ++k) { - bb[k] = calloc(2, sizeof(void*)); - bb[k][0] = calloc(1, sizeof(bwtsw2_t)); - bb[k][1] = calloc(1, sizeof(bwtsw2_t)); + bb[k] = xcalloc(2, sizeof(void*)); + bb[k][0] = xcalloc(1, sizeof(bwtsw2_t)); + bb[k][1] = xcalloc(1, sizeof(bwtsw2_t)); } for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand for (j = 0; j 
< _b[k]->n; ++j) { @@ -276,7 +276,7 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 p = bb[_b[k]->hits[j].is_rev][k]; if (p->n == p->max) { p->max = p->max? p->max<<1 : 8; - p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); + p->hits = xrealloc(p->hits, p->max * sizeof(bsw2hit_t)); } q = &p->hits[p->n++]; *q = _b[k]->hits[j]; @@ -355,7 +355,7 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c uint32_t *cn; bwtint_t kk = 0; nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0; - cn = calloc(n_cigar + 3, 4); + cn = xcalloc(n_cigar + 3, 4); x = coor; y = 0; for (i = j = 0; i < n_cigar; ++i) { int op = cigar[i]&0xf, ln = cigar[i]>>4; @@ -434,9 +434,9 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 if (b->n<<1 < b->max) { b->max = b->n; kroundup32(b->max); - b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t)); + b->hits = xrealloc(b->hits, b->max * sizeof(bsw2hit_t)); } - b->aux = calloc(b->n, sizeof(bsw2aux_t)); + b->aux = xcalloc(b->n, sizeof(bsw2aux_t)); // generate CIGAR gen_cigar(opt, qlen, seq, pac, b, name); // fix CIGAR, generate mapQ, and write chromosomal position @@ -596,7 +596,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t bsw2opt_t opt; bsw2global_t *pool = bsw2_global_init(); bwtsw2_t **buf; - buf = calloc(_seq->n, sizeof(void*)); + buf = xcalloc(_seq->n, sizeof(void*)); for (x = 0; x < _seq->n; ++x) { bsw2seq1_t *p = _seq->seq + x; uint8_t *seq[2], *rseq[2]; @@ -607,10 +607,10 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t if (pool->max_l < l) { // then enlarge working space for aln_extend_core() int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; pool->max_l = l; - pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); + pool->aln_mem = xrealloc(pool->aln_mem, (tmp + 2) * 24); } // set seq[2] and rseq[2] - seq[0] = calloc(l * 4, 1); + seq[0] = xcalloc(l * 4, 1); seq[1] = 
seq[0] + l; rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l; // convert sequences to 2-bit representation @@ -623,7 +623,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases - buf[x] = calloc(1, sizeof(bwtsw2_t)); + buf[x] = xcalloc(1, sizeof(bwtsw2_t)); free(seq[0]); continue; } // alignment @@ -655,7 +655,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t bsw2seq1_t *p = _seq->seq + x; uint8_t *seq[2]; int i; - seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; + seq[0] = xmalloc(p->l * 2); seq[1] = seq[0] + p->l; for (i = 0; i < p->l; ++i) { int c = nst_nt4_table[(int)p->seq[i]]; if (c >= 4) c = (int)(drand48() * 4); @@ -711,16 +711,16 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + data = (thread_aux_t*)xcalloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; p->pac = pac; p->target = target; - p->_seq = calloc(1, sizeof(bsw2seq_t)); + p->_seq = xcalloc(1, sizeof(bsw2seq_t)); p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; p->_seq->n = 0; - p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); + p->_seq->seq = xcalloc(p->_seq->max, sizeof(bsw2seq1_t)); } for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; @@ -760,10 +760,10 @@ static void kseq_to_bsw2seq(const kseq_t *ks, bsw2seq1_t *p) { p->tid = -1; p->l = ks->seq.l; - p->name = strdup(ks->name.s); - p->seq = strdup(ks->seq.s); - p->qual = ks->qual.l? 
strdup(ks->qual.s) : 0; - p->comment = ks->comment.l? strdup(ks->comment.s) : 0; + p->name = xstrdup(ks->name.s); + p->seq = xstrdup(ks->seq.s); + p->qual = ks->qual.l? xstrdup(ks->qual.s) : 0; + p->comment = ks->comment.l? xstrdup(ks->comment.s) : 0; p->sam = 0; } @@ -775,17 +775,13 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c uint8_t *pac; bsw2seq_t *_seq; - pac = calloc(bns->l_pac/4+1, 1); - if (pac == 0) { - fprintf(stderr, "[bsw2_aln] insufficient memory!\n"); - return; - } + pac = xcalloc(bns->l_pac/4+1, 1); for (l = 0; l < bns->n_seqs; ++l) printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); + err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); fp = xzopen(fn, "r"); ks = kseq_init(fp); - _seq = calloc(1, sizeof(bsw2seq_t)); + _seq = xcalloc(1, sizeof(bsw2seq_t)); if (fn2) { fp2 = xzopen(fn2, "r"); ks2 = kseq_init(fp2); @@ -796,7 +792,7 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks->name.l -= 2, ks->name.s[ks->name.l] = 0; if (_seq->n == _seq->max) { _seq->max = _seq->max? 
_seq->max<<1 : 1024; - _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); + _seq->seq = xrealloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]); size += ks->seq.l; diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c index 381d0b7..6bd320f 100644 --- a/bwtsw2_chain.c +++ b/bwtsw2_chain.c @@ -1,5 +1,6 @@ #include #include "bwtsw2.h" +#include "utils.h" typedef struct { uint32_t tbeg, tend; @@ -48,9 +49,9 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) char *flag; // initialization n[0] = b[0]->n; n[1] = b[1]->n; - z[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); + z[0] = xcalloc(n[0] + n[1], sizeof(hsaip_t)); z[1] = z[0] + n[0]; - chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); + chain[0] = xcalloc(n[0] + n[1], sizeof(hsaip_t)); for (k = j = 0; k < 2; ++k) { for (i = 0; i < b[k]->n; ++i) { bsw2hit_t *p = b[k]->hits + i; @@ -73,7 +74,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) } //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); // filtering - flag = calloc(m[0] + m[1], 1); + flag = xcalloc(m[0] + m[1], 1); ks_introsort(hsaip, m[0] + m[1], chain[0]); for (k = 1; k < m[0] + m[1]; ++k) { hsaip_t *p = chain[0] + k; diff --git a/bwtsw2_core.c b/bwtsw2_core.c index 67f126c..d64f74b 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -7,6 +7,7 @@ #include "bwtsw2.h" #include "bwt.h" #include "kvec.h" +#include "utils.h" typedef struct { bwtint_t k, l; @@ -71,7 +72,7 @@ typedef struct __mempool_t { inline static bsw2entry_p mp_alloc(mempool_t *mp) { ++mp->cnt; - if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t)); + if (kv_size(mp->pool) == 0) return (bsw2entry_t*)xcalloc(1, sizeof(bsw2entry_t)); else return kv_pop(mp->pool); } inline static void mp_free(mempool_t *mp, bsw2entry_p e) @@ -133,7 +134,7 @@ static void cut_tail(bsw2entry_t *u, int T, 
bsw2entry_t *aux) if (u->n <= T) return; if (aux->max < u->n) { aux->max = u->n; - aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t)); + aux->array = (bsw2cell_t*)xrealloc(aux->array, aux->max * sizeof(bsw2cell_t)); } a = (int*)aux->array; for (i = n = 0; i != u->n; ++i) @@ -184,7 +185,7 @@ static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2en int i; if (u->n + v->n >= u->max) { u->max = u->n + v->n; - u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t)); + u->array = (bsw2cell_t*)xrealloc(u->array, u->max * sizeof(bsw2cell_t)); } for (i = 0; i != v->n; ++i) { bsw2cell_t *p = v->array + i; @@ -202,7 +203,7 @@ static inline bsw2cell_t *push_array_p(bsw2entry_t *e) { if (e->n == e->max) { e->max = e->max? e->max<<1 : 256; - e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max); + e->array = (bsw2cell_t*)xrealloc(e->array, sizeof(bsw2cell_t) * e->max); } return e->array + e->n; } @@ -250,7 +251,7 @@ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, i if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit if (b1->max == b1->n) { b1->max = b1->max? 
b1->max<<1 : 4; - b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t)); + b1->hits = xrealloc(b1->hits, b1->max * sizeof(bsw2hit_t)); } q = &b1->hits[b1->n++]; q->k = p->qk; q->l = p->ql; @@ -279,7 +280,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int else if (p->G > 0) ++n; } b->n = b->max = n; - b->hits = calloc(b->max, sizeof(bsw2hit_t)); + b->hits = xcalloc(b->max, sizeof(bsw2hit_t)); for (i = j = 0; i < old_n; ++i) { bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive @@ -399,9 +400,9 @@ bsw2global_t *bsw2_global_init() { bsw2global_t *pool; bsw2stack_t *stack; - pool = calloc(1, sizeof(bsw2global_t)); - stack = calloc(1, sizeof(bsw2stack_t)); - stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t)); + pool = xcalloc(1, sizeof(bsw2global_t)); + stack = xcalloc(1, sizeof(bsw2stack_t)); + stack->pool = (mempool_t*)xcalloc(1, sizeof(mempool_t)); pool->stack = (void*)stack; return pool; } @@ -461,13 +462,13 @@ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *ta rhash = kh_init(qintv); init_bwtsw2(target, query, stack); heap_size = opt->z; - heap = calloc(heap_size, sizeof(int)); + heap = xcalloc(heap_size, sizeof(int)); // initialize the return struct - b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); + b = (bwtsw2_t*)xcalloc(1, sizeof(bwtsw2_t)); b->n = b->max = target->seq_len * 2; - b->hits = calloc(b->max, sizeof(bsw2hit_t)); - b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); - b_ret = calloc(2, sizeof(void*)); + b->hits = xcalloc(b->max, sizeof(bsw2hit_t)); + b1 = (bwtsw2_t*)xcalloc(1, sizeof(bwtsw2_t)); + b_ret = xcalloc(2, sizeof(void*)); b_ret[0] = b; b_ret[1] = b1; // initialize timer getrusage(0, &last); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index a6f4d80..d195a09 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -2,6 +2,7 @@ #include #include #include +#include "utils.h" #include "bwt.h" #include "bntseq.h" #include "bwtsw2.h" @@ -30,7 +31,7 @@ 
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) bsw2pestat_t r; memset(&r, 0, sizeof(bsw2pestat_t)); - isize = calloc(n, 8); + isize = xcalloc(n, 8); for (i = k = 0; i < n; i += 2) { bsw2hit_t *t[2]; int l; @@ -113,7 +114,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b if (end > l_pac) end = l_pac; if (end - beg < l_mseq) return; // generate the sequence - seq = malloc(l_mseq + (end - beg)); + seq = xmalloc(l_mseq + (end - beg)); ref = seq + l_mseq; for (k = beg; k < end; ++k) ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; @@ -221,7 +222,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b a[which].flag |= BSW2_FLAG_RESCUED; if (p[1]->max == 0) { p[1]->max = 1; - p[1]->hits = malloc(sizeof(bsw2hit_t)); + p[1]->hits = xmalloc(sizeof(bsw2hit_t)); } p[1]->hits[0] = a[which]; p[1]->n = 1; diff --git a/cs2nt.c b/cs2nt.c index dfbce60..3084f11 100644 --- a/cs2nt.c +++ b/cs2nt.c @@ -3,6 +3,7 @@ #include #include "bwtaln.h" #include "stdaln.h" +#include "utils.h" /* Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. 
If we @@ -118,7 +119,7 @@ void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac) // set temporary arrays if (p->type == BWA_TYPE_NO_MATCH) return; len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space - ta = (uint8_t*)malloc(len * 7); + ta = (uint8_t*)xmalloc(len * 7); nt_ref = ta; cs_read = nt_ref + len; nt_read = cs_read + len; diff --git a/fastmap.c b/fastmap.c index 4d7a675..7ef74a9 100644 --- a/fastmap.c +++ b/fastmap.c @@ -6,7 +6,8 @@ #include "bwt.h" #include "kvec.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +#include "utils.h" +KSEQ_INIT(gzFile, err_gzread) extern unsigned char nst_nt4_table[256]; @@ -20,11 +21,11 @@ typedef struct { smem_i *smem_iter_init(const bwt_t *bwt) { smem_i *iter; - iter = calloc(1, sizeof(smem_i)); + iter = xcalloc(1, sizeof(smem_i)); iter->bwt = bwt; - iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - iter->matches = calloc(1, sizeof(bwtintv_v)); + iter->tmpvec[0] = xcalloc(1, sizeof(bwtintv_v)); + iter->tmpvec[1] = xcalloc(1, sizeof(bwtintv_v)); + iter->matches = xcalloc(1, sizeof(bwtintv_v)); return iter; } @@ -78,7 +79,7 @@ int main_fastmap(int argc, char *argv[]) fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); + char *tmp = xcalloc(strlen(argv[optind]) + 5, 1); strcat(strcpy(tmp, argv[optind]), ".bwt"); bwt = bwt_restore_bwt(tmp); strcat(strcpy(tmp, argv[optind]), ".sa"); diff --git a/is.c b/is.c index 9e50faf..8e94abd 100644 --- a/is.c +++ b/is.c @@ -25,6 +25,7 @@ */ #include +#include "utils.h" typedef unsigned char ubyte_t; #define chr(i) (cs == sizeof(int) ? 
((const int *)T)[i]:((const unsigned char *)T)[i]) @@ -204,8 +205,9 @@ int is_sa(const ubyte_t *T, int *SA, int n) int is_bwt(ubyte_t *T, int n) { int *SA, i, primary = 0; - SA = (int*)calloc(n+1, sizeof(int)); - is_sa(T, SA, n); + SA = (int*)xcalloc(n+1, sizeof(int)); + + if (is_sa(T, SA, n)) err_fatal_simple("is_sa failed"); for (i = 0; i <= n; ++i) { if (SA[i] == 0) primary = i; diff --git a/khash.h b/khash.h index de6be6d..fae5008 100644 --- a/khash.h +++ b/khash.h @@ -95,6 +95,7 @@ int main() { #include #include #include +#include "utils.h" /* compipler specific configuration */ @@ -147,7 +148,7 @@ static const double __ac_HASH_UPPER = 0.77; khval_t *vals; \ } kh_##name##_t; \ static inline kh_##name##_t *kh_init_##name() { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + return (kh_##name##_t*)xcalloc(1, sizeof(kh_##name##_t)); \ } \ static inline void kh_destroy_##name(kh_##name##_t *h) \ { \ @@ -188,12 +189,12 @@ static const double __ac_HASH_UPPER = 0.77; new_n_buckets = __ac_prime_list[t+1]; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ else { \ - new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + new_flags = (khint32_t*)xmalloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ } \ } \ @@ -227,9 +228,9 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ if (h->n_buckets > new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + h->keys = (khkey_t*)xrealloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if 
(kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + h->vals = (khval_t*)xrealloc(h->vals, new_n_buckets * sizeof(khval_t)); \ } \ free(h->flags); \ h->flags = new_flags; \ diff --git a/kseq.h b/kseq.h index ad8937c..0fb1847 100644 --- a/kseq.h +++ b/kseq.h @@ -29,6 +29,7 @@ #include #include #include +#include "utils.h" #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ @@ -43,9 +44,9 @@ #define __KS_BASIC(type_t, __bufsize) \ static inline kstream_t *ks_init(type_t f) \ { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + kstream_t *ks = (kstream_t*)xcalloc(1, sizeof(kstream_t)); \ ks->f = f; \ - ks->buf = (char*)malloc(__bufsize); \ + ks->buf = (char*)xmalloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ @@ -107,7 +108,7 @@ typedef struct __kstring_t { if (str->m - str->l < i - ks->begin + 1) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ + str->s = (char*)xrealloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ @@ -130,7 +131,7 @@ typedef struct __kstring_t { #define __KSEQ_BASIC(type_t) \ static inline kseq_t *kseq_init(type_t fd) \ { \ - kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + kseq_t *s = (kseq_t*)xcalloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ @@ -170,7 +171,7 @@ typedef struct __kstring_t { if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l++] = (char)c; \ } \ @@ -180,7 +181,7 @@ typedef struct __kstring_t { if (c != '+') return seq->seq.l; /* FASTA */ \ if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ seq->qual.m = seq->seq.m; \ - seq->qual.s = 
(char*)realloc(seq->qual.s, seq->qual.m); \ + seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* we should not stop here */ \ diff --git a/ksort.h b/ksort.h index 52812e1..68f9407 100644 --- a/ksort.h +++ b/ksort.h @@ -57,6 +57,7 @@ #include #include +#include "utils.h" typedef struct { void *left, *right; @@ -72,7 +73,7 @@ typedef struct { int curr, shift; \ \ a2[0] = array; \ - a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ + a2[1] = temp? temp : (type_t*)xmalloc(sizeof(type_t) * n); \ for (curr = 0, shift = 0; (1ul< #include #include "kstring.h" +#include "utils.h" int ksprintf(kstring_t *s, const char *fmt, ...) { @@ -12,7 +13,7 @@ int ksprintf(kstring_t *s, const char *fmt, ...) if (l + 1 > s->m - s->l) { s->m = s->l + l + 2; kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); + s->s = (char*)xrealloc(s->s, s->m); va_start(ap, fmt); l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); } diff --git a/kstring.h b/kstring.h index 398901f..88cf93a 100644 --- a/kstring.h +++ b/kstring.h @@ -3,6 +3,7 @@ #include #include +#include "utils.h" #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) @@ -22,7 +23,7 @@ static inline int kputs(const char *p, kstring_t *s) if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); + s->s = (char*)xrealloc(s->s, s->m); } strcpy(s->s + s->l, p); s->l += l; @@ -34,7 +35,7 @@ static inline int kputc(int c, kstring_t *s) if (s->l + 1 >= s->m) { s->m = s->l + 2; kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); + s->s = (char*)xrealloc(s->s, s->m); } s->s[s->l++] = c; s->s[s->l] = 0; diff --git a/ksw.c b/ksw.c index bd29e96..5d17a8f 100644 --- a/ksw.c +++ b/ksw.c @@ -28,6 +28,7 @@ #include #include #include "ksw.h" +#include "utils.h" #ifdef __GNUC__ #define LIKELY(x) 
__builtin_expect((x),1) @@ -51,7 +52,7 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in size = size > 1? 2 : 1; p = 8 * (3 - size); // # values per __m128i slen = (qlen + p - 1) / p; // segmented length - q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q = xmalloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory q->H0 = q->qp + slen * m; q->H1 = q->H0 + slen; @@ -169,7 +170,7 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = realloc(b, 8 * m_b); + b = xrealloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -264,7 +265,7 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { if (n_b == m_b) { m_b = m_b? 
m_b<<1 : 8; - b = realloc(b, 8 * m_b); + b = xrealloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -310,7 +311,7 @@ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) #include #include #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_INIT(gzFile, err_gzread) unsigned char seq_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, diff --git a/kvec.h b/kvec.h index 57204d6..8d83c42 100644 --- a/kvec.h +++ b/kvec.h @@ -60,7 +60,7 @@ int main() { #define kv_size(v) ((v).n) #define kv_max(v) ((v).m) -#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m)) #define kv_copy(type, v1, v0) do { \ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ @@ -71,19 +71,19 @@ int main() { #define kv_push(type, v, x) do { \ if ((v).n == (v).m) { \ (v).m = (v).m? (v).m<<1 : 2; \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ + (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m); \ } \ (v).a[(v).n++] = (x); \ } while (0) #define kv_pushp(type, v) (((v).n == (v).m)? \ ((v).m = ((v).m? (v).m<<1 : 2), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \ : 0), ((v).a + ((v).n++)) #define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \ : (v).n <= (size_t)(i)? 
(v).n = (i) \ : 0), (v).a[(i)] diff --git a/simple_dp.c b/simple_dp.c index 7c078c2..267e77f 100644 --- a/simple_dp.c +++ b/simple_dp.c @@ -8,7 +8,7 @@ #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_INIT(gzFile, err_gzread) typedef struct { int l; @@ -64,20 +64,20 @@ static seqs_t *load_seqs(const char *fn) fp = xzopen(fn, "r"); seq = kseq_init(fp); - s = (seqs_t*)calloc(1, sizeof(seqs_t)); + s = (seqs_t*)xcalloc(1, sizeof(seqs_t)); s->m_seqs = 256; - s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); + s->seqs = (seq1_t*)xcalloc(s->m_seqs, sizeof(seq1_t)); while ((l = kseq_read(seq)) >= 0) { if (s->n_seqs == s->m_seqs) { s->m_seqs <<= 1; - s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); + s->seqs = (seq1_t*)xrealloc(s->seqs, s->m_seqs * sizeof(seq1_t)); } p = s->seqs + (s->n_seqs++); p->l = seq->seq.l; - p->s = (unsigned char*)malloc(p->l + 1); + p->s = (unsigned char*)xmalloc(p->l + 1); memcpy(p->s, seq->seq.s, p->l); p->s[p->l] = 0; - p->n = strdup((const char*)seq->name.s); + p->n = xstrdup((const char*)seq->name.s); } kseq_destroy(seq); gzclose(fp); diff --git a/stdaln.c b/stdaln.c index eb41882..1a8a3d1 100644 --- a/stdaln.c +++ b/stdaln.c @@ -28,6 +28,7 @@ #include #include #include "stdaln.h" +#include "utils.h" /* char -> 17 (=16+1) nucleotides */ unsigned char aln_nt16_table[256] = { @@ -232,7 +233,7 @@ AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; AlnAln *aln_init_AlnAln() { AlnAln *aa; - aa = (AlnAln*)malloc(sizeof(AlnAln)); + aa = (AlnAln*)xmalloc(sizeof(AlnAln)); aa->path = 0; aa->out1 = aa->out2 = aa->outm = 0; aa->path_len = 0; @@ -382,13 +383,13 @@ int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2 /* allocate memory */ end = (b1 + b2 <= len1)? 
(b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); + dpcell = (dpcell_t**)xmalloc(sizeof(dpcell_t*) * (len2 + 1)); for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); + dpcell[j] = (dpcell_t*)xmalloc(sizeof(dpcell_t) * end); for (j = b2 + 1; j <= len2; ++j) dpcell[j] -= j - b2; - curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + curr = (dpscore_t*)xmalloc(sizeof(dpscore_t) * (len1 + 1)); + last = (dpscore_t*)xmalloc(sizeof(dpscore_t) * (len1 + 1)); /* set first row */ SET_INF(*curr); curr->M = 0; @@ -556,11 +557,11 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, if (len1 == 0 || len2 == 0) return -1; /* allocate memory */ - suba = (int*)malloc(sizeof(int) * (len2 + 1)); - eh = (NT_LOCAL_SCORE*)malloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); - s_array = (int**)malloc(sizeof(int*) * N_MATRIX_ROW); + suba = (int*)xmalloc(sizeof(int) * (len2 + 1)); + eh = (NT_LOCAL_SCORE*)xmalloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); + s_array = (int**)xmalloc(sizeof(int*) * N_MATRIX_ROW); for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int*)malloc(sizeof(int) * len1); + s_array[i] = (int*)xmalloc(sizeof(int) * len1); /* initialization */ aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); q = gap_open; @@ -773,9 +774,9 @@ AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, if (len2 < 0) len2 = strlen(seq2); aa = aln_init_AlnAln(); - seq11 = (unsigned char*)malloc(sizeof(unsigned char) * len1); - seq22 = (unsigned char*)malloc(sizeof(unsigned char) * len2); - aa->path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 1)); + seq11 = (unsigned char*)xmalloc(sizeof(unsigned char) * len1); + seq22 = (unsigned char*)xmalloc(sizeof(unsigned char) * len2); + aa->path = (path_t*)xmalloc(sizeof(path_t) * (len1 + len2 + 1)); if (ap->row < 10) { /* 4-nucleotide alignment 
*/ for (i = 0; i < len1; ++i) @@ -805,9 +806,9 @@ AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, aa->score = score; if (thres > 0) { - out1 = aa->out1 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - out2 = aa->out2 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - outm = aa->outm = (char*)malloc(sizeof(char) * (aa->path_len + 1)); + out1 = aa->out1 = (char*)xmalloc(sizeof(char) * (aa->path_len + 1)); + out2 = aa->out2 = (char*)xmalloc(sizeof(char) * (aa->path_len + 1)); + outm = aa->outm = (char*)xmalloc(sizeof(char) * (aa->path_len + 1)); --seq1; --seq2; --seq11; --seq22; @@ -881,10 +882,10 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2 if (len1 == 0 || len2 == 0) return -1; /* allocate memory */ - mem = _mem? _mem : calloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); + mem = _mem? _mem : xcalloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); _p = mem; eh = (uint32_t*)_p, _p += 4 * (len1 + 2); - s_array = calloc(N_MATRIX_ROW, sizeof(void*)); + s_array = xcalloc(N_MATRIX_ROW, sizeof(void*)); for (i = 0; i != N_MATRIX_ROW; ++i) s_array[i] = (int32_t*)_p, _p += 4 * len1; /* initialization */ @@ -1024,7 +1025,7 @@ uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar) last_type = path[i].ctype; } *n_cigar = n; - cigar = (uint32_t*)malloc(*n_cigar * 4); + cigar = (uint32_t*)xmalloc(*n_cigar * 4); cigar[0] = 1u << 4 | path[path_len-1].ctype; last_type = path[path_len-1].ctype; From 55f1b3653492c53bcbc423cb574cfa00e153d903 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 3 Jan 2013 16:57:37 +0000 Subject: [PATCH 136/498] New wrapper for gzclose; added err_fflush calls and made it call fsync too. Added a new utils.c wrapper err_gzclose and changed gzclose calls to use it. Put in some more err_fflush calls before files being written are closed. Made err_fflush call fsync. 
This is useful for remote filesystems where errors may not be reported on fflush or fclose as problems at the server end may only be detected after they have returned. If bwa is being used only to write to local filesystems, calling fsync is not really necessary. To disable it, comment out #define FSYNC_ON_FLUSH in utils.c. --- bamlite.h | 2 +- bntseq.c | 5 ++++- bwaseqio.c | 2 +- bwtindex.c | 6 +++--- bwtio.c | 2 ++ bwtmisc.c | 3 ++- bwtsw2_aux.c | 6 +++--- fastmap.c | 4 ++-- ksw.c | 8 ++++---- simple_dp.c | 4 ++-- utils.c | 39 +++++++++++++++++++++++++++++++++++++++ utils.h | 1 + 12 files changed, 64 insertions(+), 18 deletions(-) diff --git a/bamlite.h b/bamlite.h index 2b65c57..0c080fd 100644 --- a/bamlite.h +++ b/bamlite.h @@ -8,7 +8,7 @@ typedef gzFile bamFile; #define bam_open(fn, mode) xzopen(fn, mode) #define bam_dopen(fd, mode) gzdopen(fd, mode) -#define bam_close(fp) gzclose(fp) +#define bam_close(fp) err_gzclose(fp) #define bam_read(fp, buf, size) err_gzread(fp, buf, size) typedef struct { diff --git a/bntseq.c b/bntseq.c index b795af8..af83211 100644 --- a/bntseq.c +++ b/bntseq.c @@ -73,6 +73,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix) else err_fprintf(fp, "\n"); err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); } + err_fflush(fp); err_fclose(fp); } { // dump .amb @@ -83,6 +84,7 @@ void bns_dump(const bntseq_t *bns, const char *prefix) bntamb1_t *p = bns->ambs + i; err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); } + err_fflush(fp); err_fclose(fp); } } @@ -279,6 +281,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) ct = bns->l_pac % 4; err_fwrite(&ct, 1, 1, fp); // close .pac file + err_fflush(fp); err_fclose(fp); } bns_dump(bns, prefix); @@ -303,7 +306,7 @@ int bwa_fa2pac(int argc, char *argv[]) } fp = xzopen(argv[optind], "r"); bns_fasta2bntseq(fp, (optind+1 < argc)? 
argv[optind+1] : argv[optind], for_only); - gzclose(fp); + err_gzclose(fp); return 0; } diff --git a/bwaseqio.c b/bwaseqio.c index 716f9b2..8d69b37 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -46,7 +46,7 @@ void bwa_seq_close(bwa_seqio_t *bs) if (bs == 0) return; if (bs->is_bam) bam_close(bs->fp); else { - gzclose(bs->ks->f->f); + err_gzclose(bs->ks->f->f); kseq_destroy(bs->ks); } free(bs); diff --git a/bwtindex.c b/bwtindex.c index f430e62..6d0604e 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -89,7 +89,7 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); + err_gzclose(fp); } else { // color indexing gzFile fp = xzopen(argv[optind], "r"); strcat(strcpy(str, prefix), ".nt"); @@ -97,7 +97,7 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... "); l_pac = bns_fasta2bntseq(fp, str, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); + err_gzclose(fp); { char *tmp_argv[3]; tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; @@ -139,7 +139,7 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "[bwa_index] Pack forward-only FASTA... 
"); l_pac = bns_fasta2bntseq(fp, prefix, 1); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); + err_gzclose(fp); } { bwt_t *bwt; diff --git a/bwtio.c b/bwtio.c index 0d4623e..ca5a6c0 100644 --- a/bwtio.c +++ b/bwtio.c @@ -11,6 +11,7 @@ void bwt_dump_bwt(const char *fn, const bwt_t *bwt) err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); + err_fflush(fp); err_fclose(fp); } @@ -23,6 +24,7 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt) err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + err_fflush(fp); err_fclose(fp); } diff --git a/bwtmisc.c b/bwtmisc.c index 49aa5aa..ccc82eb 100644 --- a/bwtmisc.c +++ b/bwtmisc.c @@ -201,7 +201,8 @@ int bwa_pac2cspac(int argc, char *argv[]) fp = xopen(str, "wb"); err_fwrite(cspac, 1, bns->l_pac/4 + 1, fp); ct = bns->l_pac % 4; - err_fwrite(&ct, 1, 1, fp); + err_fwrite(&ct, 1, 1, fp); + err_fflush(fp); err_fclose(fp); bns_destroy(bns); free(cspac); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index ca39919..2228054 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -752,7 +752,7 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * p->tid = -1; p->l = 0; p->name = p->seq = p->qual = p->sam = 0; } - fflush(stdout); + err_fflush(stdout); _seq->n = 0; } @@ -819,9 +819,9 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c free(pac); free(_seq->seq); free(_seq); kseq_destroy(ks); - gzclose(fp); + err_gzclose(fp); if (fn2) { kseq_destroy(ks2); - gzclose(fp2); + err_gzclose(fp2); } } diff --git a/fastmap.c b/fastmap.c index 7ef74a9..504e22e 100644 --- a/fastmap.c +++ b/fastmap.c @@ -76,7 +76,7 @@ int main_fastmap(int argc, char *argv[]) return 1; } - fp = gzopen(argv[optind + 1], "r"); + fp = xzopen(argv[optind + 1], "r"); seq = 
kseq_init(fp); { // load the packed sequences, BWT and SA char *tmp = xcalloc(strlen(argv[optind]) + 5, 1); @@ -123,6 +123,6 @@ int main_fastmap(int argc, char *argv[]) bns_destroy(bns); bwt_destroy(bwt); kseq_destroy(seq); - gzclose(fp); + err_gzclose(fp); return 0; } diff --git a/ksw.c b/ksw.c index 5d17a8f..270ecfb 100644 --- a/ksw.c +++ b/ksw.c @@ -364,8 +364,8 @@ int main(int argc, char *argv[]) } for (j = 0; j < 5; ++j) mat[k++] = 0; // open file - fpt = gzopen(argv[optind], "r"); kst = kseq_init(fpt); - fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); + fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt); + fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); // all-pair alignment while (kseq_read(ksq) > 0) { ksw_query_t *q[2]; @@ -394,8 +394,8 @@ int main(int argc, char *argv[]) } free(q[0]); free(q[1]); } - kseq_destroy(kst); gzclose(fpt); - kseq_destroy(ksq); gzclose(fpq); + kseq_destroy(kst); err_gzclose(fpt); + kseq_destroy(ksq); err_gzclose(fpq); return 0; } #endif // _KSW_MAIN diff --git a/simple_dp.c b/simple_dp.c index 267e77f..4c6a156 100644 --- a/simple_dp.c +++ b/simple_dp.c @@ -80,7 +80,7 @@ static seqs_t *load_seqs(const char *fn) p->n = xstrdup((const char*)seq->name.s); } kseq_destroy(seq); - gzclose(fp); + err_gzclose(fp); fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); return s; } @@ -123,7 +123,7 @@ static void aln_seqs(const seqs_t *ss, const char *fn) } } kseq_destroy(seq); - gzclose(fp); + err_gzclose(fp); } int bwa_stdsw(int argc, char *argv[]) diff --git a/utils.c b/utils.c index bc39bf5..dc16308 100644 --- a/utils.c +++ b/utils.c @@ -24,6 +24,7 @@ */ /* Contact: Heng Li */ +#define FSYNC_ON_FLUSH #include #include @@ -31,6 +32,11 @@ #include #include #include +#ifdef FSYNC_ON_FLUSH +#include +#include +#include +#endif #include #include #include "utils.h" @@ -196,6 +202,28 @@ int err_fflush(FILE *stream) { _err_fatal_simple("fflush", strerror(errno)); } +#ifdef FSYNC_ON_FLUSH + /* Calling fflush() 
ensures that all the data has made it to the + kernel buffers, but this may not be sufficient for remote filesystems + (e.g. NFS, lustre) as an error may still occur while the kernel + is copying the buffered data to the file server. To be sure of + catching these errors, we need to call fsync() on the file + descriptor, but only if it is a regular file. */ + { + struct stat sbuf; + if (0 != fstat(fileno(stream), &sbuf)) + { + _err_fatal_simple("fstat", strerror(errno)); + } + if (S_ISREG(sbuf.st_mode)) + { + if (0 != fsync(fileno(stream))) + { + _err_fatal_simple("fsync", strerror(errno)); + } + } + } +#endif return ret; } @@ -209,6 +237,17 @@ int err_fclose(FILE *stream) return ret; } +int err_gzclose(gzFile file) +{ + int ret = gzclose(file); + if (Z_OK != ret) + { + _err_fatal_simple("gzclose", Z_ERRNO == ret ? strerror(errno) : zError(ret)); + } + + return ret; +} + void *err_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func) { void *p = calloc(nmemb, size); diff --git a/utils.h b/utils.h index c6cfc81..f824245 100644 --- a/utils.h +++ b/utils.h @@ -78,6 +78,7 @@ extern "C" { ATTRIBUTE((format(printf, 1, 2))); int err_fflush(FILE *stream); int err_fclose(FILE *stream); + int err_gzclose(gzFile file); void *err_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func); void *err_malloc(size_t size, const char *file, unsigned int line, const char *func); From 4f4e998d7f1e28595ef5fc36dd96508ae9b34752 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 9 Jan 2013 14:43:36 +0000 Subject: [PATCH 137/498] Added wrappers for fputc and fputs; more efficient sequence printing Added wrappers err_fputc and err_fputs to catch failures in fputc and fputs. Macros err_putchar and err_puts call the new wrappers and can be used in place of putchar and puts. 
To avoid having to make millions of function calls when printing out sequences, the code to print them in bwa_print_sam1 using putchar has been replaced by a new version in bwa_print_seq that puts the sequence into a buffer and then outputs the lot with err_fwrite. In testing, the new code was slightly faster than the old version, with the added benefit that it will stop promptly if IO problems are detected. --- bwase.c | 38 +++++++++++++++++++++++++++++--------- fastmap.c | 12 ++++++------ utils.c | 22 ++++++++++++++++++++++ utils.h | 4 ++++ 4 files changed, 61 insertions(+), 15 deletions(-) diff --git a/bwase.c b/bwase.c index afe8154..84430fe 100644 --- a/bwase.c +++ b/bwase.c @@ -404,6 +404,26 @@ static int64_t pos_5(const bwa_seq_t *p) return -1; } +void bwa_print_seq(FILE *stream, bwa_seq_t *seq) { + char buffer[4096]; + const int bsz = sizeof(buffer); + int i, j, l; + + if (seq->strand == 0) { + for (i = 0; i < seq->full_len; i += bsz) { + l = seq->full_len - i > bsz ? bsz : seq->full_len - i; + for (j = 0; j < l; j++) buffer[j] = "ACGTN"[seq->seq[i + j]]; + err_fwrite(buffer, 1, l, stream); + } + } else { + for (i = seq->full_len - 1; i >= 0; i -= bsz) { + l = i + 1 > bsz ? 
bsz : i + 1; + for (j = 0; j < l; j++) buffer[j] = "TGCAN"[seq->seq[i - j]]; + err_fwrite(buffer, 1, l, stream); + } + } +} + void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) { int j; @@ -455,10 +475,8 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in else err_printf("\t*\t0\t0\t"); // print sequence and quality - if (p->strand == 0) - for (j = 0; j != p->full_len; ++j) putchar("ACGTN"[(int)p->seq[j]]); - else for (j = 0; j != p->full_len; ++j) putchar("TGCAN"[p->seq[p->full_len - 1 - j]]); - putchar('\t'); + bwa_print_seq(stdout, p); + err_putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); @@ -500,14 +518,16 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } } - putchar('\n'); + err_putchar('\n'); } else { // this read has no match - ubyte_t *s = p->strand? p->rseq : p->seq; + //ubyte_t *s = p->strand? p->rseq : p->seq; int flag = p->extra_flag | SAM_FSU; if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); - for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); - putchar('\t'); + //Why did this work differently to the version above?? 
+ //for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); + bwa_print_seq(stdout, p); + err_putchar('\t'); if (p->qual) { if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); @@ -515,7 +535,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); - putchar('\n'); + err_putchar('\n'); } } diff --git a/fastmap.c b/fastmap.c index 504e22e..5cb83fc 100644 --- a/fastmap.c +++ b/fastmap.c @@ -91,9 +91,9 @@ int main_fastmap(int argc, char *argv[]) while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { - putchar('\t'); - puts(seq->seq.s); - } else putchar('\n'); + err_putchar('\t'); + err_puts(seq->seq.s); + } else err_putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); @@ -112,11 +112,11 @@ int main_fastmap(int argc, char *argv[]) bns_cnt_ambi(bns, pos, len, &ref_id); printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); } - } else fputs("\t*", stdout); - putchar('\n'); + } else err_puts("\t*"); + err_putchar('\n'); } } - puts("//"); + err_puts("//"); } smem_iter_destroy(iter); diff --git a/utils.c b/utils.c index dc16308..93c17c9 100644 --- a/utils.c +++ b/utils.c @@ -195,6 +195,28 @@ int err_fprintf(FILE *stream, const char *format, ...) 
return done; } +int err_fputc(int c, FILE *stream) +{ + int ret = putc(c, stream); + if (EOF == ret) + { + _err_fatal_simple("fputc", strerror(errno)); + } + + return ret; +} + +int err_fputs(const char *s, FILE *stream) +{ + int ret = fputs(s, stream); + if (EOF == ret) + { + _err_fatal_simple("fputs", strerror(errno)); + } + + return ret; +} + int err_fflush(FILE *stream) { int ret = fflush(stream); diff --git a/utils.h b/utils.h index f824245..b904701 100644 --- a/utils.h +++ b/utils.h @@ -76,6 +76,10 @@ extern "C" { ATTRIBUTE((format(printf, 2, 3))); int err_printf(const char *format, ...) ATTRIBUTE((format(printf, 1, 2))); + int err_fputc(int c, FILE *stream); +#define err_putchar(C) err_fputc((C), stdout) + int err_fputs(const char *s, FILE *stream); +#define err_puts(S) err_fputs((S), stdout) int err_fflush(FILE *stream); int err_fclose(FILE *stream); int err_gzclose(gzFile file); From 6641788d38ef874d09f8a88a3eac3d9df5ea2aa3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 11:42:31 -0500 Subject: [PATCH 138/498] preparation for further changes --- bwt.c | 27 ++++++++++++++------------- bwt.h | 2 +- fastmap.c | 16 +++++++++------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/bwt.c b/bwt.c index 966b718..e46c125 100644 --- a/bwt.c +++ b/bwt.c @@ -277,7 +277,7 @@ static void bwt_reverse_intvs(bwtintv_v *p) } } -int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]) +int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) { int i, j, c, ret; bwtintv_t ik, ok[4]; @@ -285,37 +285,38 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem mem->n = 0; if (q[x] > 3) return x + 1; + if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 kv_init(a[0]); kv_init(a[1]); - prev = tmpvec[0]? tmpvec[0] : &a[0]; - curr = tmpvec[1]? 
tmpvec[1] : &a[1]; - bwt_set_intv(bwt, q[x], ik); + prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided + curr = tmpvec && tmpvec[1]? tmpvec[1] : &a[1]; + bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base ik.info = x + 1; for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { - c = 3 - q[i]; + if (q[i] < 4) { // an A/C/G/T base + c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); if (ok[c].x[2] != ik.x[2]) // change of the interval size kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] == 0) break; // cannot be extended + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); - break; // cannot be extended; in this case, ia[0].info; // this will be the returned value swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs - if (q[i] > 3) break; + if (i >= 0 && q[i] > 3) break; // always stop at an ambiguous base as the FM-index does not have any. c = i < 0? 0 : q[i]; for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] == 0 || i == -1) { // keep the hit if reaching the beginning or not extended further + if (ok[c].x[2] < min_intv || i == -1) { // keep the hit if reaching the beginning or not extended further if (curr->n == 0) { // curr->n to make sure there is no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; @@ -333,7 +334,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem } bwt_reverse_intvs(mem); // s.t. 
sorted by the start coordinate - if (tmpvec[0] == 0) free(a[0].a); - if (tmpvec[1] == 0) free(a[1].a); + if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); + if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } diff --git a/bwt.h b/bwt.h index 5823f82..1eeaceb 100644 --- a/bwt.h +++ b/bwt.h @@ -121,7 +121,7 @@ extern "C" { * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. * Return the end of the longest exact match starting from _x_. */ - int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index 4d7a675..2c0a823 100644 --- a/fastmap.c +++ b/fastmap.c @@ -13,11 +13,11 @@ extern unsigned char nst_nt4_table[256]; typedef struct { const bwt_t *bwt; const uint8_t *query; - int start, len; + int start, len, min_intv; bwtintv_v *tmpvec[2], *matches; } smem_i; -smem_i *smem_iter_init(const bwt_t *bwt) +smem_i *smem_iter_init(const bwt_t *bwt, int min_intv) { smem_i *iter; iter = calloc(1, sizeof(smem_i)); @@ -25,6 +25,7 @@ smem_i *smem_iter_init(const bwt_t *bwt) iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); iter->matches = calloc(1, sizeof(bwtintv_v)); + iter->min_intv = min_intv > 0? 
min_intv : 1; return iter; } @@ -49,13 +50,13 @@ int smem_next(smem_i *iter) if (iter->start >= iter->len || iter->start < 0) return -1; while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases if (iter->start == iter->len) return -1; - iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->matches, iter->tmpvec); + iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->min_intv, iter->matches, iter->tmpvec); return iter->start; } int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; kseq_t *seq; bwtint_t k; gzFile fp; @@ -63,15 +64,16 @@ int main_fastmap(int argc, char *argv[]) bntseq_t *bns; smem_i *iter; - while ((c = getopt(argc, argv, "w:l:s")) >= 0) { + while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { switch (c) { case 's': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; + case 'm': min_intv = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] [-m minIntv=%d] \n", min_len, min_iwidth, min_intv); return 1; } @@ -86,7 +88,7 @@ int main_fastmap(int argc, char *argv[]) free(tmp); bns = bns_restore(argv[optind]); } - iter = smem_iter_init(bwt); + iter = smem_iter_init(bwt, min_intv); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { From 543c719a54dc3cb13ab85a4a604507368c5e0fa1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 11:53:07 -0500 Subject: [PATCH 139/498] fixed a couple of unimportant bugs in SMEM --- bwt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bwt.c b/bwt.c index e46c125..c4a008b 100644 --- a/bwt.c +++ b/bwt.c @@ 
-311,20 +311,19 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, swap = curr; curr = prev; prev = swap; for (i = x - 1; i >= -1; --i) { // backward search for MEMs - if (i >= 0 && q[i] > 3) break; // always stop at an ambiguous base as the FM-index does not have any. - c = i < 0? 0 : q[i]; + c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base for (j = 0, curr->n = 0; j < prev->n; ++j) { bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); - if (ok[c].x[2] < min_intv || i == -1) { // keep the hit if reaching the beginning or not extended further - if (curr->n == 0) { // curr->n to make sure there is no longer matches + if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough + if (curr->n == 0) { // test curr->n>0 to make sure there is no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; kv_push(bwtintv_t, *mem, ik); } } // otherwise the match is contained in another longer match } - if (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + if (c >= 0 && (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]))) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } From 6de74888fd7b2d922f05a4dcbd0d1dfd125218cc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 12:12:58 -0500 Subject: [PATCH 140/498] bugfix: min_intv not working in SMEM --- bwt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.c b/bwt.c index c4a008b..689d8f8 100644 --- a/bwt.c +++ b/bwt.c @@ -323,7 +323,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, } } // otherwise the match is contained in another longer match } - if (c >= 0 && (ok[c].x[2] && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]))) { + if (c >= 0 && ok[c].x[2] >= min_intv && (curr->n == 0 || 
ok[c].x[2] != curr->a[curr->n-1].x[2])) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } From 5a4a0c4173805169f88a7e8d014176579da1adda Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 12:34:05 -0500 Subject: [PATCH 141/498] a bit refactoring for further changes --- bwt.c | 41 +++++++++++++++++++++++++++++++++++++++ bwt.h | 14 ++++++++++++++ fastmap.c | 58 +++++++------------------------------------------------ 3 files changed, 62 insertions(+), 51 deletions(-) diff --git a/bwt.c b/bwt.c index 689d8f8..fe8007f 100644 --- a/bwt.c +++ b/bwt.c @@ -337,3 +337,44 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } + +/*************************** + * SMEM iterator interface * + ***************************/ + +smem_i *smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); + free(itr->tmpvec[1]->a); + free(itr->matches->a); + free(itr); +} + +void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; + itr->min_intv = min_intv; +} + +int smem_next(smem_i *itr) +{ + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; + if (itr->start >= itr->len || itr->start < 0) return -1; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return -1; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); + return itr->start; +} diff --git a/bwt.h b/bwt.h index 1eeaceb..67a256d 100644 --- a/bwt.h +++ b/bwt.h @@ -60,6 +60,13 @@ typedef struct { typedef struct { size_t n, m; 
bwtintv_t *a; } bwtintv_v; +typedef struct { + const bwt_t *bwt; + const uint8_t *query; + int start, len, min_intv; + bwtintv_v *tmpvec[2], *matches; +} smem_i; + /* For general OCC_INTERVAL, the following is correct: #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) @@ -123,6 +130,13 @@ extern "C" { */ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); + // SMEM iterator interface + + smem_i *smem_itr_init(const bwt_t *bwt); + void smem_itr_destroy(smem_i *itr); + void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); + int smem_next(smem_i *itr); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c index 2c0a823..17db06d 100644 --- a/fastmap.c +++ b/fastmap.c @@ -10,50 +10,6 @@ KSEQ_INIT(gzFile, gzread) extern unsigned char nst_nt4_table[256]; -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len, min_intv; - bwtintv_v *tmpvec[2], *matches; -} smem_i; - -smem_i *smem_iter_init(const bwt_t *bwt, int min_intv) -{ - smem_i *iter; - iter = calloc(1, sizeof(smem_i)); - iter->bwt = bwt; - iter->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - iter->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - iter->matches = calloc(1, sizeof(bwtintv_v)); - iter->min_intv = min_intv > 0? 
min_intv : 1; - return iter; -} - -void smem_iter_destroy(smem_i *iter) -{ - free(iter->tmpvec[0]->a); - free(iter->tmpvec[1]->a); - free(iter->matches->a); - free(iter); -} - -void smem_set_query(smem_i *iter, int len, const uint8_t *query) -{ - iter->query = query; - iter->start = 0; - iter->len = len; -} - -int smem_next(smem_i *iter) -{ - iter->tmpvec[0]->n = iter->tmpvec[1]->n = iter->matches->n = 0; - if (iter->start >= iter->len || iter->start < 0) return -1; - while (iter->start < iter->len && iter->query[iter->start] > 3) ++iter->start; // skip ambiguous bases - if (iter->start == iter->len) return -1; - iter->start = bwt_smem1(iter->bwt, iter->len, iter->query, iter->start, iter->min_intv, iter->matches, iter->tmpvec); - return iter->start; -} - int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; @@ -62,7 +18,7 @@ int main_fastmap(int argc, char *argv[]) gzFile fp; bwt_t *bwt; bntseq_t *bns; - smem_i *iter; + smem_i *itr; while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { switch (c) { @@ -88,7 +44,7 @@ int main_fastmap(int argc, char *argv[]) free(tmp); bns = bns_restore(argv[optind]); } - iter = smem_iter_init(bwt, min_intv); + itr = smem_itr_init(bwt); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { @@ -97,10 +53,10 @@ int main_fastmap(int argc, char *argv[]) } else putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - smem_set_query(iter, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(iter) > 0) { - for (i = 0; i < iter->matches->n; ++i) { - bwtintv_t *p = &iter->matches->a[i]; + smem_set_query(itr, min_intv, seq->seq.l, (uint8_t*)seq->seq.s); + while (smem_next(itr) > 0) { + for (i = 0; i < itr->matches->n; ++i) { + bwtintv_t *p = &itr->matches->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, 
(long)p->x[2]); if (p->x[2] <= min_iwidth) { @@ -120,7 +76,7 @@ int main_fastmap(int argc, char *argv[]) puts("//"); } - smem_iter_destroy(iter); + smem_itr_destroy(itr); bns_destroy(bns); bwt_destroy(bwt); kseq_destroy(seq); From 91debf412b59135bdf5d45c2772e72ce969ca9f1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 13:59:48 -0500 Subject: [PATCH 142/498] move smem iterators to bwamem.{c,h} --- Makefile | 5 ++++- bwamem.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwamem.h | 33 +++++++++++++++++++++++++++++++++ bwt.c | 41 ----------------------------------------- bwt.h | 12 ------------ fastmap.c | 23 +++++++++++++++++++++++ main.c | 2 ++ main.h | 1 + 8 files changed, 117 insertions(+), 54 deletions(-) create mode 100644 bwamem.c create mode 100644 bwamem.h diff --git a/Makefile b/Makefile index 6f388f2..04fd7a0 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o stdaln.o \ +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o stdaln.o \ bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ @@ -45,5 +45,8 @@ bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_main.o:bwtsw2.h +bwamem.o:bwamem.h +fastmap.o:bwt.h bwamem.h + clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a diff --git a/bwamem.c b/bwamem.c new file mode 100644 index 0000000..91931bb --- /dev/null +++ b/bwamem.c @@ -0,0 +1,54 @@ +#include +#include "bwamem.h" + +memopt_t *mem_opt_init() +{ + memopt_t *o; + o = calloc(1, sizeof(memopt_t)); + o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; + o->min_seed_len = 17; + o->max_occ = 10; + return o; +} + +/*************************** + * SMEM iterator interface * + ***************************/ + +smem_i 
*smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); + free(itr->tmpvec[1]->a); + free(itr->matches->a); + free(itr); +} + +void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; + itr->min_intv = min_intv; +} + +int smem_next(smem_i *itr) +{ + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; + if (itr->start >= itr->len || itr->start < 0) return -1; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return -1; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); + return itr->start; +} + diff --git a/bwamem.h b/bwamem.h new file mode 100644 index 0000000..da636a0 --- /dev/null +++ b/bwamem.h @@ -0,0 +1,33 @@ +#ifndef BWAMEM_H_ +#define BWAMEM_H_ + +#include "bwt.h" + +typedef struct { + const bwt_t *bwt; + const uint8_t *query; + int start, len, min_intv; + bwtintv_v *tmpvec[2], *matches; +} smem_i; + +typedef struct { + int a, b, q, r, w; + int min_seed_len, max_occ; +} memopt_t; + +#ifdef __cplusplus +extern "C" { +#endif + +smem_i *smem_itr_init(const bwt_t *bwt); +void smem_itr_destroy(smem_i *itr); +void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); +int smem_next(smem_i *itr); + +memopt_t *mem_opt_init(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwt.c b/bwt.c index fe8007f..689d8f8 100644 --- a/bwt.c +++ b/bwt.c @@ -337,44 +337,3 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } - -/*************************** - 
* SMEM iterator interface * - ***************************/ - -smem_i *smem_itr_init(const bwt_t *bwt) -{ - smem_i *itr; - itr = calloc(1, sizeof(smem_i)); - itr->bwt = bwt; - itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - itr->matches = calloc(1, sizeof(bwtintv_v)); - return itr; -} - -void smem_itr_destroy(smem_i *itr) -{ - free(itr->tmpvec[0]->a); - free(itr->tmpvec[1]->a); - free(itr->matches->a); - free(itr); -} - -void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) -{ - itr->query = query; - itr->start = 0; - itr->len = len; - itr->min_intv = min_intv; -} - -int smem_next(smem_i *itr) -{ - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; - if (itr->start >= itr->len || itr->start < 0) return -1; - while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return -1; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); - return itr->start; -} diff --git a/bwt.h b/bwt.h index 67a256d..2aab9d1 100644 --- a/bwt.h +++ b/bwt.h @@ -60,13 +60,6 @@ typedef struct { typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len, min_intv; - bwtintv_v *tmpvec[2], *matches; -} smem_i; - /* For general OCC_INTERVAL, the following is correct: #define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) #define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) @@ -132,11 +125,6 @@ extern "C" { // SMEM iterator interface - smem_i *smem_itr_init(const bwt_t *bwt); - void smem_itr_destroy(smem_i *itr); - void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); - int smem_next(smem_i *itr); - #ifdef __cplusplus } 
#endif diff --git a/fastmap.c b/fastmap.c index 17db06d..6a41aeb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -4,12 +4,35 @@ #include #include "bntseq.h" #include "bwt.h" +#include "bwamem.h" #include "kvec.h" #include "kseq.h" KSEQ_INIT(gzFile, gzread) extern unsigned char nst_nt4_table[256]; +int main_mem(int argc, char *argv[]) +{ + memopt_t *opt; + bwt_t *bwt; + bntseq_t *bns; + int c; + + opt = mem_opt_init(); + while ((c = getopt(argc, argv, "")) >= 0) { + } + if (optind + 1 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n"); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + free(opt); + return 0; +} + int main_fastmap(int argc, char *argv[]) { int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; diff --git a/main.c b/main.c index 73cbcd9..2718732 100644 --- a/main.c +++ b/main.c @@ -20,6 +20,7 @@ static int usage() fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -59,6 +60,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); + else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h index 026a80b..1a0292a 100644 --- a/main.h +++ b/main.h @@ -22,6 +22,7 @@ extern "C" { int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); + int main_mem(int argc, char *argv[]); #ifdef __cplusplus } From 6c19c9640ce1df655523bbbb1c41676c454a3c20 
Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 15:55:22 -0500 Subject: [PATCH 143/498] code backup --- bwamem.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwamem.h | 17 ++++++++++++- 2 files changed, 91 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 91931bb..52c4a18 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1,4 +1,6 @@ #include +#include +#include #include "bwamem.h" memopt_t *mem_opt_init() @@ -8,6 +10,7 @@ memopt_t *mem_opt_init() o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; o->min_seed_len = 17; o->max_occ = 10; + o->max_chain_gap = 10000; return o; } @@ -52,3 +55,75 @@ int smem_next(smem_i *itr) return itr->start; } +#include "kbtree.h" + +#define chain_lt(a, b) ((a).pos < (b).pos) +KBTREE_INIT(chn, memchain1_t, chain_lt) + +static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p) +{ + int64_t qend, rend, x, y; + const memseed_t *last = &c->seeds[c->n-1]; + qend = last->qbeg + last->len; + rend = last->rbeg + last->len; + if (p->qbeg > c->seeds[0].qbeg && p->qbeg + p->len < qend && p->rbeg > c->seeds[0].rbeg && p->rbeg + p->len < rend) + return 1; // contained seed; do nothing + x = p->qbeg - last->qbeg; // always positive + y = p->rbeg - last->rbeg; + if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { + if (c->n == c->m) { + c->m <<= 1; + c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t)); + } + c->seeds[c->n++] = *p; + return 1; + } + return 0; +} + +void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +{ + while (smem_next(itr) > 0) { + int i; + for (i = 0; i < itr->matches->n; ++i) { + bwtintv_t *p = &itr->matches->a[i]; + int slen = (uint32_t)p->info - (p->info>>32); // seed length + int64_t k; + if (slen >= opt->min_seed_len || p->x[2] > opt->max_occ) continue; + for (k = 0; k < p->x[2]; ++k) { + memchain1_t tmp, *lower, *upper; + memseed_t c1; + int to_add = 0; + 
c1.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); + c1.qbeg = p->info>>32; + c1.len = slen; + if (kb_size(tree)) { + kb_intervalp(chn, tree, &tmp, &lower, &upper); + if (!test_and_merge(opt, lower, &c1)) to_add = 1; + } to_add = 1; + if (to_add) { + tmp.n = 1; tmp.m = 4; + tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); + kb_putp(chn, tree, &tmp); + } + } + } + } +} + +memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +{ + memchain_t chain; + smem_i *itr; + kbtree_t(chn) *tree; + + memset(&chain, 0, sizeof(memchain_t)); + if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match + tree = kb_init(chn, KB_DEFAULT_SIZE); + itr = smem_itr_init(bwt); + smem_set_query(itr, 1, len, seq); + + smem_itr_destroy(itr); + kb_destroy(chn, tree); + return chain; +} diff --git a/bwamem.h b/bwamem.h index da636a0..cc86ef2 100644 --- a/bwamem.h +++ b/bwamem.h @@ -10,11 +10,26 @@ typedef struct { bwtintv_v *tmpvec[2], *matches; } smem_i; +typedef struct { + int64_t qbeg, rbeg, len; +} memseed_t; + typedef struct { int a, b, q, r, w; - int min_seed_len, max_occ; + int min_seed_len, max_occ, max_chain_gap; } memopt_t; +typedef struct { + int n, m; + int64_t pos; + memseed_t *seeds; +} memchain1_t; + +typedef struct { + int n, m; + memchain1_t *chains; +} memchain_t; + #ifdef __cplusplus extern "C" { #endif From 89777374606f3626e88933251f6ca45d703055ca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 16:26:05 -0500 Subject: [PATCH 144/498] basic chaining working Definitely suboptimal in a lot of corner cases... 
--- bwamem.c | 27 ++++++++++++++++++--------- bwamem.h | 5 ++++- fastmap.c | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 52c4a18..2a35fe7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -81,7 +81,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * return 0; } -void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { while (smem_next(itr) > 0) { int i; @@ -89,21 +89,22 @@ void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) bwtintv_t *p = &itr->matches->a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - if (slen >= opt->min_seed_len || p->x[2] > opt->max_occ) continue; + if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; for (k = 0; k < p->x[2]; ++k) { memchain1_t tmp, *lower, *upper; - memseed_t c1; + memseed_t s; int to_add = 0; - c1.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); - c1.qbeg = p->info>>32; - c1.len = slen; + s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); + s.qbeg = p->info>>32; + s.len = slen; if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); - if (!test_and_merge(opt, lower, &c1)) to_add = 1; - } to_add = 1; + if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1; + } else to_add = 1; if (to_add) { tmp.n = 1; tmp.m = 4; tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); + tmp.seeds[0] = s; kb_putp(chn, tree, &tmp); } } @@ -111,7 +112,7 @@ void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) } } -memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { memchain_t chain; smem_i *itr; @@ -122,6 +123,14 @@ memchain_t mem_collect_seed(const memopt_t *opt, const bwt_t *bwt, int len, cons tree = 
kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); smem_set_query(itr, 1, len, seq); + mem_insert_seed(opt, tree, itr); + + chain.m = kb_size(tree); chain.n = 0; + chain.chains = malloc(chain.m * sizeof(memchain1_t)); + + #define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) + __kb_traverse(memchain1_t, tree, traverse_func); + #undef traverse_func smem_itr_destroy(itr); kb_destroy(chn, tree); diff --git a/bwamem.h b/bwamem.h index cc86ef2..72d9557 100644 --- a/bwamem.h +++ b/bwamem.h @@ -11,7 +11,8 @@ typedef struct { } smem_i; typedef struct { - int64_t qbeg, rbeg, len; + int64_t rbeg; + int32_t qbeg, len; } memseed_t; typedef struct { @@ -41,6 +42,8 @@ int smem_next(smem_i *itr); memopt_t *mem_opt_init(void); +memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c index 6a41aeb..85ccd5c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -16,7 +16,9 @@ int main_mem(int argc, char *argv[]) memopt_t *opt; bwt_t *bwt; bntseq_t *bns; - int c; + int i, j, c; + gzFile *fp; + kseq_t *seq; opt = mem_opt_init(); while ((c = getopt(argc, argv, "")) >= 0) { @@ -28,6 +30,38 @@ int main_mem(int argc, char *argv[]) free(opt); return 1; } + fp = gzopen(argv[optind + 1], "r"); + seq = kseq_init(fp); + { // load the packed sequences, BWT and SA + char *tmp = calloc(strlen(argv[optind]) + 5, 1); + strcat(strcpy(tmp, argv[optind]), ".bwt"); + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, argv[optind]), ".sa"); + bwt_restore_sa(tmp, bwt); + free(tmp); + bns = bns_restore(argv[optind]); + } + while (kseq_read(seq) >= 0) { + memchain_t chain; + printf(">%s\n", seq->name.s); + for (i = 0; i < seq->seq.l; ++i) + seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; + chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); + for (i = 0; i < chain.n; ++i) { + memchain1_t *p = &chain.chains[i]; + printf("%d\t%d", i, p->n); + for (j = 0; j < p->n; ++j) { + bwtint_t pos; + int is_rev, 
ref_id; + pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); + if (is_rev) pos -= p->seeds[j].len - 1; + bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); + printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } + putchar('\n'); + } + puts("//"); + } free(opt); return 0; From 5d372cef65ca2a3550629b4a1428b1b256133738 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 31 Jan 2013 16:39:24 -0500 Subject: [PATCH 145/498] bugfix: wrong B-tree comparison --- bwamem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2a35fe7..99b604e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -57,8 +57,8 @@ int smem_next(smem_i *itr) #include "kbtree.h" -#define chain_lt(a, b) ((a).pos < (b).pos) -KBTREE_INIT(chn, memchain1_t, chain_lt) +#define chain_cmp(a, b) ((a).pos - (b).pos) +KBTREE_INIT(chn, memchain1_t, chain_cmp) static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p) { @@ -66,7 +66,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * const memseed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; - if (p->qbeg > c->seeds[0].qbeg && p->qbeg + p->len < qend && p->rbeg > c->seeds[0].rbeg && p->rbeg + p->len < rend) + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing x = p->qbeg - last->qbeg; // always positive y = p->rbeg - last->rbeg; From 86f2e134ba40ff7eb04d3cc878bebbaa892788db Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 12:57:48 -0500 Subject: [PATCH 146/498] no effective changes --- bwt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bwt.c b/bwt.c index 689d8f8..32dfc43 100644 --- a/bwt.c +++ b/bwt.c @@ -296,9 +296,10 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int 
min_intv, if (q[i] < 4) { // an A/C/G/T base c = 3 - q[i]; // complement of q[i] bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] != ik.x[2]) // change of the interval size + if (ok[c].x[2] != ik.x[2]) { // change of the interval size kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further + } ik = ok[c]; ik.info = i + 1; } else { // an ambiguous base kv_push(bwtintv_t, *curr, ik); From abc675f2786d9ba2879698ee36727ccdb461ef19 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 13:14:16 -0500 Subject: [PATCH 147/498] typo in comments --- bwt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.c b/bwt.c index 32dfc43..2903daa 100644 --- a/bwt.c +++ b/bwt.c @@ -317,7 +317,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_t *p = &prev->a[j]; bwt_extend(bwt, p, ok, 1); if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough - if (curr->n == 0) { // test curr->n>0 to make sure there is no longer matches + if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches ik = *p; ik.info |= (uint64_t)(i + 1)<<32; kv_push(bwtintv_t, *mem, ik); From 620ad6e5b9de135accd4bcd97f42033846293ec5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 14:20:38 -0500 Subject: [PATCH 148/498] reseed long SMEMs --- bwamem.c | 40 ++++++++++++++++++++++++++++++++++------ bwamem.h | 9 +++++---- fastmap.c | 14 +++++++------- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/bwamem.c b/bwamem.c index 99b604e..0ecb190 100644 --- a/bwamem.c +++ b/bwamem.c @@ -2,6 +2,7 @@ #include #include #include "bwamem.h" +#include "kvec.h" memopt_t *mem_opt_init() { @@ -26,6 +27,7 @@ smem_i 
*smem_itr_init(const bwt_t *bwt) itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); itr->matches = calloc(1, sizeof(bwtintv_v)); + itr->sub = calloc(1, sizeof(bwtintv_v)); return itr; } @@ -34,24 +36,50 @@ void smem_itr_destroy(smem_i *itr) free(itr->tmpvec[0]->a); free(itr->tmpvec[1]->a); free(itr->matches->a); + free(itr->sub->a); free(itr); } -void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query) +void smem_set_query(smem_i *itr, int len, const uint8_t *query) { itr->query = query; itr->start = 0; itr->len = len; - itr->min_intv = min_intv; } -int smem_next(smem_i *itr) +int smem_next(smem_i *itr, int split_len) { + int i, max, max_i; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; if (itr->start >= itr->len || itr->start < 0) return -1; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return -1; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, itr->min_intv, itr->matches, itr->tmpvec); + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); + if (itr->matches->n == 0) return itr->start; + for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { + bwtintv_t *p = &itr->matches->a[i]; + int len = (uint32_t)p->info - (p->info>>32); + if (max < len) max = len, max_i = i; + } + if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { + int j; + bwtintv_v *a = itr->tmpvec[0]; + bwtintv_t *p = &itr->matches->a[max_i]; + bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest match + i = j = 0; a->n = 0; + while (i < itr->matches->n && j < itr->sub->n) { // ordered merge + if (itr->matches->a[i].info < itr->sub->a[j].info) { + kv_push(bwtintv_t, *a, itr->matches->a[i]); + ++i; + } else { + kv_push(bwtintv_t, *a, itr->sub->a[j]); + ++j; + } 
+ } + for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); + for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]); + kv_copy(bwtintv_t, *itr->matches, *a); + } return itr->start; } @@ -83,7 +111,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { - while (smem_next(itr) > 0) { + while (smem_next(itr, opt->min_seed_len<<1) > 0) { int i; for (i = 0; i < itr->matches->n; ++i) { bwtintv_t *p = &itr->matches->a[i]; @@ -122,7 +150,7 @@ memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8 if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); - smem_set_query(itr, 1, len, seq); + smem_set_query(itr, len, seq); mem_insert_seed(opt, tree, itr); chain.m = kb_size(tree); chain.n = 0; diff --git a/bwamem.h b/bwamem.h index 72d9557..1ad9e77 100644 --- a/bwamem.h +++ b/bwamem.h @@ -6,8 +6,9 @@ typedef struct { const bwt_t *bwt; const uint8_t *query; - int start, len, min_intv; - bwtintv_v *tmpvec[2], *matches; + int start, len; + bwtintv_v *matches; // matches + bwtintv_v *tmpvec[2], *sub; // these are temporary arrays } smem_i; typedef struct { @@ -37,8 +38,8 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); -void smem_set_query(smem_i *itr, int min_intv, int len, const uint8_t *query); -int smem_next(smem_i *itr); +void smem_set_query(smem_i *itr, int len, const uint8_t *query); +int smem_next(smem_i *itr, int split_len); memopt_t *mem_opt_init(void); diff --git a/fastmap.c b/fastmap.c index 85ccd5c..6e8a662 100644 --- a/fastmap.c +++ b/fastmap.c @@ -69,7 +69,7 @@ int main_mem(int argc, char *argv[]) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, min_intv = 1; + int c, i, min_iwidth = 20, 
min_len = 17, print_seq = 0, split_long = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -77,16 +77,16 @@ int main_fastmap(int argc, char *argv[]) bntseq_t *bns; smem_i *itr; - while ((c = getopt(argc, argv, "w:l:sm:")) >= 0) { + while ((c = getopt(argc, argv, "w:l:ps")) >= 0) { switch (c) { - case 's': print_seq = 1; break; + case 's': split_long = 1; break; + case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; - case 'm': min_intv = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-s] [-l minLen=%d] [-w maxSaSize=%d] [-m minIntv=%d] \n", min_len, min_iwidth, min_intv); + fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); return 1; } @@ -110,8 +110,8 @@ int main_fastmap(int argc, char *argv[]) } else putchar('\n'); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - smem_set_query(itr, min_intv, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(itr) > 0) { + smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); + while (smem_next(itr, split_long? 
min_len<<1 : 0) > 0) { for (i = 0; i < itr->matches->n; ++i) { bwtintv_t *p = &itr->matches->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; From f8f3b7577a7112d96682538c2ca8e2428d1469fc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 14:38:44 -0500 Subject: [PATCH 149/498] code cleanup; added a missing file --- bwamem.c | 50 ++++--- bwamem.h | 11 +- fastmap.c | 7 +- kbtree.h | 384 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 421 insertions(+), 31 deletions(-) create mode 100644 kbtree.h diff --git a/bwamem.c b/bwamem.c index 0ecb190..65807fe 100644 --- a/bwamem.c +++ b/bwamem.c @@ -19,6 +19,15 @@ memopt_t *mem_opt_init() * SMEM iterator interface * ***************************/ +struct __smem_i { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + bwtintv_v *matches; // matches; to be returned by smem_next() + bwtintv_v *sub; // sub-matches inside the longest match; temporary + bwtintv_v *tmpvec[2]; // temporary arrays +}; + smem_i *smem_itr_init(const bwt_t *bwt) { smem_i *itr; @@ -47,25 +56,25 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } -int smem_next(smem_i *itr, int split_len) +const bwtintv_v *smem_next(smem_i *itr, int split_len) { int i, max, max_i; - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = 0; - if (itr->start >= itr->len || itr->start < 0) return -1; + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; + if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return -1; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); - if (itr->matches->n == 0) return itr->start; - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { + if (itr->start == itr->len) return 0; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, 
itr->matches, itr->tmpvec); // search for SMEM + if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here + for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; int len = (uint32_t)p->info - (p->info>>32); if (max < len) max = len, max_i = i; } - if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { + if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { // if the longest SMEM is unique and long int j; - bwtintv_v *a = itr->tmpvec[0]; + bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging bwtintv_t *p = &itr->matches->a[max_i]; - bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest match + bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge if (itr->matches->a[i].info < itr->sub->a[j].info) { @@ -80,7 +89,7 @@ int smem_next(smem_i *itr, int split_len) for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]); kv_copy(bwtintv_t, *itr->matches, *a); } - return itr->start; + return itr->matches; } #include "kbtree.h" @@ -98,7 +107,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * return 1; // contained seed; do nothing x = p->qbeg - last->qbeg; // always positive y = p->rbeg - last->rbeg; - if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { + if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t)); @@ -106,30 +115,31 @@ static int 
test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * c->seeds[c->n++] = *p; return 1; } - return 0; + return 0; // request to add a new chain } static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { - while (smem_next(itr, opt->min_seed_len<<1) > 0) { + const bwtintv_v *a; + while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM int i; - for (i = 0; i < itr->matches->n; ++i) { - bwtintv_t *p = &itr->matches->a[i]; + for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start + bwtintv_t *p = &a->a[i]; int slen = (uint32_t)p->info - (p->info>>32); // seed length int64_t k; - if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; + if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive for (k = 0; k < p->x[2]; ++k) { memchain1_t tmp, *lower, *upper; memseed_t s; int to_add = 0; - s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); + s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; if (kb_size(tree)) { - kb_intervalp(chn, tree, &tmp, &lower, &upper); + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1; } else to_add = 1; - if (to_add) { + if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); tmp.seeds[0] = s; diff --git a/bwamem.h b/bwamem.h index 1ad9e77..eb79586 100644 --- a/bwamem.h +++ b/bwamem.h @@ -3,13 +3,8 @@ #include "bwt.h" -typedef struct { - const bwt_t *bwt; - const uint8_t *query; - int start, len; - bwtintv_v *matches; // matches - bwtintv_v *tmpvec[2], *sub; // these are temporary arrays -} smem_i; +struct __smem_i; +typedef struct __smem_i smem_i; typedef struct { int64_t rbeg; @@ -39,7 +34,7 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); 
void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); -int smem_next(smem_i *itr, int split_len); +const bwtintv_v *smem_next(smem_i *itr, int split_len); memopt_t *mem_opt_init(void); diff --git a/fastmap.c b/fastmap.c index 6e8a662..42122b4 100644 --- a/fastmap.c +++ b/fastmap.c @@ -76,6 +76,7 @@ int main_fastmap(int argc, char *argv[]) bwt_t *bwt; bntseq_t *bns; smem_i *itr; + const bwtintv_v *a; while ((c = getopt(argc, argv, "w:l:ps")) >= 0) { switch (c) { @@ -111,9 +112,9 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); - while (smem_next(itr, split_long? min_len<<1 : 0) > 0) { - for (i = 0; i < itr->matches->n; ++i) { - bwtintv_t *p = &itr->matches->a[i]; + while ((a = smem_next(itr, split_long? min_len<<1 : 0)) != 0) { + for (i = 0; i < a->n; ++i) { + bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); if (p->x[2] <= min_iwidth) { diff --git a/kbtree.h b/kbtree.h new file mode 100644 index 0000000..5ed5330 --- /dev/null +++ b/kbtree.h @@ -0,0 +1,384 @@ +/*- + * Copyright 1997-1999, 2001, John-Mark Gurney. + * 2008-2009, Attractive Chaos + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __AC_KBTREE_H +#define __AC_KBTREE_H + +#include +#include +#include + +typedef struct { + int32_t is_internal:1, n:31; +} kbnode_t; + +#define __KB_KEY(type, x) ((type*)((char*)x + 4)) +#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) + +#define __KB_TREE_T(name) \ + typedef struct { \ + kbnode_t *root; \ + int off_key, off_ptr, ilen, elen; \ + int n, t; \ + int n_keys, n_nodes; \ + } kbtree_##name##_t; + +#define __KB_INIT(name, key_t) \ + kbtree_##name##_t *kb_init_##name(int size) \ + { \ + kbtree_##name##_t *b; \ + b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ + b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ + if (b->t < 2) { \ + free(b); return 0; \ + } \ + b->n = 2 * b->t - 1; \ + b->off_ptr = 4 + b->n * sizeof(key_t); \ + b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ + b->elen = (b->off_ptr + 3) >> 2 << 2; \ + b->root = (kbnode_t*)calloc(1, b->ilen); \ + ++b->n_nodes; \ + return b; \ + } + +#define __kb_destroy(b) do { \ + int i, max = 8; \ + kbnode_t *x, **top, **stack = 0; \ + if (b) { \ + top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ + *top++ = 
(b)->root; \ + while (top != stack) { \ + x = *--top; \ + if (x->is_internal == 0) { free(x); continue; } \ + for (i = 0; i <= x->n; ++i) \ + if (__KB_PTR(b, x)[i]) { \ + if (top - stack == max) { \ + max <<= 1; \ + stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ + top = stack + (max>>1); \ + } \ + *top++ = __KB_PTR(b, x)[i]; \ + } \ + free(x); \ + } \ + } \ + free(b); free(stack); \ + } while (0) + +#define __kb_get_first(key_t, b, ret) do { \ + kbnode_t *__x = (b)->root; \ + while (__KB_PTR(b, __x)[0] != 0) \ + __x = __KB_PTR(b, __x)[0]; \ + (ret) = __KB_KEY(key_t, __x)[0]; \ + } while (0) + +#define __KB_GET_AUX0(name, key_t, __cmp) \ + static inline int __kb_get_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin, end, n = x->n >> 1; \ + if (x->n == 0) return -1; \ + if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ + begin = 0; end = n; \ + } else { begin = n; end = x->n - 1; } \ + rr = r? r : &tr; \ + n = end; \ + while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ + return n; \ + } + +#define __KB_GET_AUX1(name, key_t, __cmp) \ + static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ + { \ + int tr, *rr, begin = 0, end = x->n; \ + if (x->n == 0) return -1; \ + rr = r? 
r : &tr; \ + while (begin < end) { \ + int mid = (begin + end) >> 1; \ + if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ + else end = mid; \ + } \ + if (begin == x->n) { *rr = 1; return x->n - 1; } \ + if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ + return begin; \ + } + +#define __KB_GET(name, key_t) \ + static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ + if (x->is_internal == 0) return 0; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + return 0; \ + } \ + static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_getp_##name(b, &k); \ + } + +#define __KB_INTERVAL(name, key_t) \ + static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ + { \ + int i, r = 0; \ + kbnode_t *x = b->root; \ + *lower = *upper = 0; \ + while (x) { \ + i = __kb_getp_aux_##name(x, k, &r); \ + if (i >= 0 && r == 0) { \ + *lower = *upper = &__KB_KEY(key_t, x)[i]; \ + return; \ + } \ + if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ + if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ + if (x->is_internal == 0) return; \ + x = __KB_PTR(b, x)[i + 1]; \ + } \ + } \ + static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ + { \ + kb_intervalp_##name(b, &k, lower, upper); \ + } + +#define __KB_PUT(name, key_t, __cmp) \ + /* x must be an internal node */ \ + static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ + { \ + kbnode_t *z; \ + z = (kbnode_t*)calloc(1, y->is_internal? 
b->ilen : b->elen); \ + ++b->n_nodes; \ + z->is_internal = y->is_internal; \ + z->n = b->t - 1; \ + memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ + if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ + y->n = b->t - 1; \ + memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ + __KB_PTR(b, x)[i + 1] = z; \ + memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ + ++x->n; \ + } \ + static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ + { \ + int i = x->n - 1; \ + if (x->is_internal == 0) { \ + i = __kb_getp_aux_##name(x, k, 0); \ + if (i != x->n - 1) \ + memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + __KB_KEY(key_t, x)[i + 1] = *k; \ + ++x->n; \ + } else { \ + i = __kb_getp_aux_##name(x, k, 0) + 1; \ + if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ + __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ + if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ + } \ + __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ + } \ + } \ + static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *r, *s; \ + ++b->n_keys; \ + r = b->root; \ + if (r->n == 2 * b->t - 1) { \ + ++b->n_nodes; \ + s = (kbnode_t*)calloc(1, b->ilen); \ + b->root = s; s->is_internal = 1; s->n = 0; \ + __KB_PTR(b, s)[0] = r; \ + __kb_split_##name(b, s, 0, r); \ + r = s; \ + } \ + __kb_putp_aux_##name(b, r, k); \ + } \ + static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + kb_putp_##name(b, &k); \ + } + + +#define __KB_DEL(name, key_t) \ + static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ + { \ + int yn, zn, i, r = 0; \ + kbnode_t *xp, *y, *z; \ + key_t kp; \ + if (x == 0) return *k; \ + if (s) { /* 
s can only be 0, 1 or 2 */ \ + r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ + i = s == 1? x->n - 1 : -1; \ + } else i = __kb_getp_aux_##name(x, k, &r); \ + if (x->is_internal == 0) { \ + if (s == 2) ++i; \ + kp = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + --x->n; \ + return kp; \ + } \ + if (r == 0) { \ + if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ + return kp; \ + } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ + xp = __KB_PTR(b, x)[i + 1]; \ + kp = __KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ + return kp; \ + } else if (yn == b->t - 1 && zn == b->t - 1) { \ + y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ + __KB_KEY(key_t, y)[y->n++] = *k; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ + y->n += z->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(z); \ + return __kb_delp_aux_##name(b, y, k, s); \ + } \ + } \ + ++i; \ + if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ + if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ + memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ + __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ + if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ + --y->n; ++xp->n; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ + __KB_KEY(key_t, xp)[xp->n++] = 
__KB_KEY(key_t, x)[i]; \ + __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ + if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ + --y->n; \ + memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ + } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ + __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ + memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ + if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ + y->n += xp->n; \ + memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ + --x->n; \ + free(xp); \ + xp = y; \ + } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ + __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ + memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ + if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ + xp->n += y->n; \ + memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ + memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ + --x->n; \ + free(y); \ + } \ + } \ + return __kb_delp_aux_##name(b, xp, k, s); \ + } \ + static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ + { \ + kbnode_t *x; \ + key_t ret; \ + ret = __kb_delp_aux_##name(b, b->root, k, 0); \ + --b->n_keys; \ + if (b->root->n == 0 && b->root->is_internal) { \ + --b->n_nodes; \ + x = b->root; \ + b->root = __KB_PTR(b, x)[0]; \ + free(x); \ + } \ + return ret; \ + } \ + static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ + { \ + return kb_delp_##name(b, &k); \ + } + +typedef struct { + kbnode_t 
*x; + int i; +} __kbstack_t; + +#define __kb_traverse(key_t, b, __func) do { \ + int __kmax = 8; \ + __kbstack_t *__kstack, *__kp; \ + __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ + __kp->x = (b)->root; __kp->i = 0; \ + for (;;) { \ + while (__kp->x && __kp->i <= __kp->x->n) { \ + if (__kp - __kstack == __kmax - 1) { \ + __kmax <<= 1; \ + __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ + __kp = __kstack + (__kmax>>1) - 1; \ + } \ + (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ + ++__kp; \ + } \ + --__kp; \ + if (__kp >= __kstack) { \ + if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ + ++__kp->i; \ + } else break; \ + } \ + free(__kstack); \ + } while (0) + +#define KBTREE_INIT(name, key_t, __cmp) \ + __KB_TREE_T(name) \ + __KB_INIT(name, key_t) \ + __KB_GET_AUX1(name, key_t, __cmp) \ + __KB_GET(name, key_t) \ + __KB_INTERVAL(name, key_t) \ + __KB_PUT(name, key_t, __cmp) \ + __KB_DEL(name, key_t) + +#define KB_DEFAULT_SIZE 512 + +#define kbtree_t(name) kbtree_##name##_t +#define kb_init(name, s) kb_init_##name(s) +#define kb_destroy(name, b) __kb_destroy(b) +#define kb_get(name, b, k) kb_get_##name(b, k) +#define kb_put(name, b, k) kb_put_##name(b, k) +#define kb_del(name, b, k) kb_del_##name(b, k) +#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) +#define kb_getp(name, b, k) kb_getp_##name(b, k) +#define kb_putp(name, b, k) kb_putp_##name(b, k) +#define kb_delp(name, b, k) kb_delp_##name(b, k) +#define kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) + +#define kb_size(b) ((b)->n_keys) + +#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) +#define kb_str_cmp(a, b) strcmp(a, b) + +#endif From 7ab4b3321fd9f44a0f6f30d82eb9a3e326801ee2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 15:26:34 -0500 Subject: [PATCH 150/498] bugfix: memory leak --- bwamem.c | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 65807fe..62df4e5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -42,10 +42,10 @@ smem_i *smem_itr_init(const bwt_t *bwt) void smem_itr_destroy(smem_i *itr) { - free(itr->tmpvec[0]->a); - free(itr->tmpvec[1]->a); - free(itr->matches->a); - free(itr->sub->a); + free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); + free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); + free(itr->matches->a); free(itr->matches); + free(itr->sub->a); free(itr->sub); free(itr); } From 00e5302219a595c34ffee2b05a0c7e78883ac2a7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Feb 2013 16:39:50 -0500 Subject: [PATCH 151/498] routine to get subsequence from 2-bit pac --- bntseq.c | 27 +++++++++++++++++++++++++++ bntseq.h | 1 + bwamem.c | 48 ++++++++++++++++++++++++++++++++---------------- bwamem.h | 23 +++++++++++++++-------- fastmap.c | 10 +++++++--- 5 files changed, 82 insertions(+), 27 deletions(-) diff --git a/bntseq.c b/bntseq.c index adcd2d7..18abb2b 100644 --- a/bntseq.c +++ b/bntseq.c @@ -321,3 +321,30 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) } return nn; } + +static inline void get_seq_core(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, uint8_t *seq) +{ + int64_t k, l = 0; + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } +} + +uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) +{ + uint8_t *seq; + if (end > l_pac<<1) end = l_pac<<1; + *len = end - beg; + seq = malloc(end - beg); + if (beg < l_pac && end > l_pac) { + get_seq_core(l_pac, pac, beg, l_pac, seq); + get_seq_core(l_pac, pac, l_pac, end, seq + (l_pac - beg)); + } else get_seq_core(l_pac, pac, beg, end, seq); + return seq; +} diff --git a/bntseq.h 
b/bntseq.h index 843db64..d4096b4 100644 --- a/bntseq.h +++ b/bntseq.h @@ -72,6 +72,7 @@ extern "C" { void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); + uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); #ifdef __cplusplus } diff --git a/bwamem.c b/bwamem.c index 62df4e5..ef09fd5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -3,11 +3,12 @@ #include #include "bwamem.h" #include "kvec.h" +#include "bntseq.h" -memopt_t *mem_opt_init() +mem_opt_t *mem_opt_init() { - memopt_t *o; - o = calloc(1, sizeof(memopt_t)); + mem_opt_t *o; + o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; o->min_seed_len = 17; o->max_occ = 10; @@ -95,12 +96,12 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) #include "kbtree.h" #define chain_cmp(a, b) ((a).pos - (b).pos) -KBTREE_INIT(chn, memchain1_t, chain_cmp) +KBTREE_INIT(chn, mem_chain1_t, chain_cmp) -static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t *p) +static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p) { int64_t qend, rend, x, y; - const memseed_t *last = &c->seeds[c->n-1]; + const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) @@ -110,7 +111,7 @@ static int test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; - c->seeds = realloc(c->seeds, c->m * sizeof(memseed_t)); + c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; return 1; @@ -118,7 +119,7 @@ static int 
test_and_merge(const memopt_t *opt, memchain1_t *c, const memseed_t * return 0; // request to add a new chain } -static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM @@ -129,8 +130,8 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it int64_t k; if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive for (k = 0; k < p->x[2]; ++k) { - memchain1_t tmp, *lower, *upper; - memseed_t s; + mem_chain1_t tmp, *lower, *upper; + mem_seed_t s; int to_add = 0; s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; @@ -141,7 +142,7 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; - tmp.seeds = calloc(tmp.m, sizeof(memseed_t)); + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; kb_putp(chn, tree, &tmp); } @@ -150,13 +151,13 @@ static void mem_insert_seed(const memopt_t *opt, kbtree_t(chn) *tree, smem_i *it } } -memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { - memchain_t chain; + mem_chain_t chain; smem_i *itr; kbtree_t(chn) *tree; - memset(&chain, 0, sizeof(memchain_t)); + memset(&chain, 0, sizeof(mem_chain_t)); if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); @@ -164,13 +165,28 @@ memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8 mem_insert_seed(opt, tree, itr); chain.m = 
kb_size(tree); chain.n = 0; - chain.chains = malloc(chain.m * sizeof(memchain1_t)); + chain.chains = malloc(chain.m * sizeof(mem_chain1_t)); #define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) - __kb_traverse(memchain1_t, tree, traverse_func); + __kb_traverse(mem_chain1_t, tree, traverse_func); #undef traverse_func smem_itr_destroy(itr); kb_destroy(chn, tree); return chain; } + +mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) +{ + mem_aln_t a; + int i, j; + int64_t len; + for (i = 0; i < c->n; ++i) { + mem_seed_t *s = &c->seeds[i]; + uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len); + for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n'); + for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); + free(seq); + } + return a; +} diff --git a/bwamem.h b/bwamem.h index eb79586..0ebd2eb 100644 --- a/bwamem.h +++ b/bwamem.h @@ -9,23 +9,29 @@ typedef struct __smem_i smem_i; typedef struct { int64_t rbeg; int32_t qbeg, len; -} memseed_t; +} mem_seed_t; typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; -} memopt_t; +} mem_opt_t; typedef struct { int n, m; int64_t pos; - memseed_t *seeds; -} memchain1_t; + mem_seed_t *seeds; +} mem_chain1_t; typedef struct { int n, m; - memchain1_t *chains; -} memchain_t; + mem_chain1_t *chains; +} mem_chain_t; + +typedef struct { + int64_t pos; + int n_cigar, len, score; + uint32_t *cigar; +} mem_aln_t; #ifdef __cplusplus extern "C" { @@ -36,9 +42,10 @@ void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); const bwtintv_v *smem_next(smem_i *itr, int split_len); -memopt_t *mem_opt_init(void); +mem_opt_t *mem_opt_init(void); -memchain_t mem_chain(const memopt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +mem_aln_t 
mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index 42122b4..d8a0eca 100644 --- a/fastmap.c +++ b/fastmap.c @@ -13,12 +13,13 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { - memopt_t *opt; + mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; int i, j, c; gzFile *fp; kseq_t *seq; + uint8_t *pac = 0; opt = mem_opt_init(); while ((c = getopt(argc, argv, "")) >= 0) { @@ -40,15 +41,18 @@ int main_mem(int argc, char *argv[]) bwt_restore_sa(tmp, bwt); free(tmp); bns = bns_restore(argv[optind]); + pac = calloc(bns->l_pac/4+1, 1); + fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); } while (kseq_read(seq) >= 0) { - memchain_t chain; + mem_chain_t chain; printf(">%s\n", seq->name.s); for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); for (i = 0; i < chain.n; ++i) { - memchain1_t *p = &chain.chains[i]; + mem_chain1_t *p = &chain.chains[i]; + mem_chain2aln(bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos; From d25a87cc500b4cb654a3299818ce6982f6667ed2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 2 Feb 2013 15:14:24 -0500 Subject: [PATCH 152/498] code backup --- bwamem.c | 6 ++++-- bwamem.h | 2 +- fastmap.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index ef09fd5..02f9591 100644 --- a/bwamem.c +++ b/bwamem.c @@ -176,10 +176,10 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin return chain; } -mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) +mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) { mem_aln_t a; - int i, j; + int i, j, max, 
max_i; int64_t len; for (i = 0; i < c->n; ++i) { mem_seed_t *s = &c->seeds[i]; @@ -188,5 +188,7 @@ mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const ui for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); free(seq); } + for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed + if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i; return a; } diff --git a/bwamem.h b/bwamem.h index 0ebd2eb..214d780 100644 --- a/bwamem.h +++ b/bwamem.h @@ -45,7 +45,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len); mem_opt_t *mem_opt_init(void); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -mem_aln_t mem_chain2aln(int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); +mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index d8a0eca..c92311e 100644 --- a/fastmap.c +++ b/fastmap.c @@ -52,7 +52,7 @@ int main_mem(int argc, char *argv[]) chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; - mem_chain2aln(bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); + mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos; From 92b084e553fd03c617ab60640609b40a9f89eaca Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 2 Feb 2013 16:38:21 -0500 Subject: [PATCH 153/498] reimplemented SW extension; not tested yet --- ksw.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 4 deletions(-) diff --git a/ksw.c b/ksw.c index bd29e96..9ee8453 100644 --- a/ksw.c +++ b/ksw.c @@ -23,12 +23,13 @@ SOFTWARE. 
*/ -#ifndef _NO_SSE2 #include #include -#include #include "ksw.h" +#ifndef _NO_SSE2 +#include + #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) #define UNLIKELY(x) __builtin_expect((x),0) @@ -37,6 +38,10 @@ #define UNLIKELY(x) (x) #endif +/*************** + *** SSE2 SW *** + ***************/ + struct _ksw_query_t { int qlen, slen; uint8_t shift, mdiff, max, size; @@ -300,11 +305,82 @@ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) else return ksw_sse2_8(q, tlen, target, a); } +#endif // _NO_SSE2 + +/******************** + *** SW extension *** + ********************/ + +typedef struct { + int32_t h, e; +} eh_t; + +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) +{ + eh_t *eh; + int8_t *qp; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j; + // allocate memory + eh = calloc(qlen + 1, 8); + qp = malloc(qlen * m); + // generate the query profile + for (j = i = 0; j < qlen; ++j) { + const int8_t *p = &mat[query[j] * m]; + for (k = 0; k < m; ++j) qp[i++] = p[k]; + } + // DP loop + eh[0].h = h0; max = 0, max_i = max_j = -1; + beg = 0, end = 1; + for (i = 0; LIKELY(i < tlen); ++i) { + int f = 0, h1 = 0, m = 0, mj = -1, t; + // apply the band and the constraint (if provided) + t = (qw && qw[i] < w)? 
qw[i] : w; // this is the band width at $i + if (beg < i - t) beg = i - t; + if (end > i + t + 1) end = i + t + 1; + if (end > qlen) end = qlen; + for (j = beg; LIKELY(j < end); ++j) { + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Similar to SSE2-SW, cells are computed in the following order: + // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape + eh_t *p = &eh[j]; + int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + p->h = h1; // set H(i,j-1) for the next row + h += qp[j]; + h = h > e? h : e; + h = h > f? h : f; + h1 = h; // save H(i,j) to h1 for the next column + mj = m > h? mj : j; + m = m > h? m : h; // m is stored at eh[mj+1] + h -= gapoe; + h = h > 0? h : 0; + e -= gape; + e = e > h? e : h; // computed E(i+1,j) + p->e = e; // save E(i+1,j) for the next row + f -= gape; + f = f > h? f : h; // computed F(i,j+1) + } + eh[end].h = h1; eh[end].e = 0; + if (m == 0) break; + if (m > max) max = m, max_i = i, max_j = mj; + // update beg and end for the next round + for (j = mj; j > beg && eh[j].h; --j); + beg = j + 1; + for (j = mj + 2; j <= end && eh[j].h; ++j); + end = j; + } + free(eh); free(qp); + if (_qpos) *_qpos = max_i; + if (_tpos) *_tpos = max_j; + return max; +} + /******************************************* * Main function (not compiled by default) * *******************************************/ -#ifdef _KSW_MAIN +#if defined(_KSW_MAIN) && !defined(_NO_SSE2) #include #include @@ -398,4 +474,3 @@ int main(int argc, char *argv[]) return 0; } #endif // _KSW_MAIN -#endif // _NO_SSE2 From e8a1962efe7620114d505599c2e21709b5a46637 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Feb 2013 17:25:40 -0500 Subject: [PATCH 154/498] code backup; it is wrong --- ksw.c | 27 +++++++++++++++++---------- ksw.h | 4 ++++ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/ksw.c b/ksw.c index 9ee8453..440fa50 
100644 --- a/ksw.c +++ b/ksw.c @@ -314,7 +314,7 @@ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) typedef struct { int32_t h, e; } eh_t; - +#include int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) { eh_t *eh; @@ -324,20 +324,24 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh = calloc(qlen + 1, 8); qp = malloc(qlen * m); // generate the query profile - for (j = i = 0; j < qlen; ++j) { - const int8_t *p = &mat[query[j] * m]; - for (k = 0; k < m; ++j) qp[i++] = p[k]; + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; } + // fill the first row + eh[0].h = h0; // DP loop - eh[0].h = h0; max = 0, max_i = max_j = -1; - beg = 0, end = 1; + max = 0, max_i = max_j = -1; + beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1 = 0, m = 0, mj = -1, t; + int8_t *q = &qp[target[i] * qlen]; // apply the band and the constraint (if provided) t = (qw && qw[i] < w)? qw[i] : w; // this is the band width at $i if (beg < i - t) beg = i - t; if (end > i + t + 1) end = i + t + 1; if (end > qlen) end = qlen; + printf("[%d]\t%d,%d", i, beg, end); for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Similar to SSE2-SW, cells are computed in the following order: @@ -347,7 +351,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh_t *p = &eh[j]; int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) p->h = h1; // set H(i,j-1) for the next row - h += qp[j]; + h += q[j]; h = h > e? h : e; h = h > f? 
h : f; h1 = h; // save H(i,j) to h1 for the next column @@ -360,19 +364,22 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, p->e = e; // save E(i+1,j) for the next row f -= gape; f = f > h? f : h; // computed F(i,j+1) + printf("\t%d:%d", j, h1); } + putchar('\n'); eh[end].h = h1; eh[end].e = 0; if (m == 0) break; if (m > max) max = m, max_i = i, max_j = mj; // update beg and end for the next round - for (j = mj; j > beg && eh[j].h; --j); + for (j = mj; j >= beg && eh[j].h; --j); beg = j + 1; for (j = mj + 2; j <= end && eh[j].h; ++j); end = j; + beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); - if (_qpos) *_qpos = max_i; - if (_tpos) *_tpos = max_j; + if (_qpos) *_qpos = max_i + 1; + if (_tpos) *_tpos = max_j + 1; return max; } diff --git a/ksw.h b/ksw.h index d93d6a9..b7b9c40 100644 --- a/ksw.h +++ b/ksw.h @@ -1,6 +1,8 @@ #ifndef __AC_KSW_H #define __AC_KSW_H +#include + struct _ksw_query_t; typedef struct _ksw_query_t ksw_query_t; @@ -47,6 +49,8 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos); + #ifdef __cplusplus } #endif From 20933982318a9fe9cd9740ff283e88a7db714c5b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Feb 2013 17:47:57 -0500 Subject: [PATCH 155/498] bugfix: the first line is wrong --- ksw.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ksw.c b/ksw.c index 440fa50..6277bf6 100644 --- a/ksw.c +++ b/ksw.c @@ -320,6 +320,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh_t *eh; int8_t *qp; int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j; + if (h0 < 0) h0 = 0; // allocate memory eh = calloc(qlen + 1, 8); qp = 
malloc(qlen * m); @@ -329,19 +330,23 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; } // fill the first row - eh[0].h = h0; + eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0; + for (j = 2; j <= qlen && eh[j-1].h > gape; ++j) + eh[j].h = eh[j-1].h - gape; // DP loop - max = 0, max_i = max_j = -1; + max = h0, max_i = max_j = 0; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { - int f = 0, h1 = 0, m = 0, mj = -1, t; + int f = 0, h1, m = 0, mj = -1, t; int8_t *q = &qp[target[i] * qlen]; + // compute the first column + h1 = h0 - (gapo + gape * (i + 1)); + if (h1 < 0) h1 = 0; // apply the band and the constraint (if provided) t = (qw && qw[i] < w)? qw[i] : w; // this is the band width at $i if (beg < i - t) beg = i - t; if (end > i + t + 1) end = i + t + 1; if (end > qlen) end = qlen; - printf("[%d]\t%d,%d", i, beg, end); for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Similar to SSE2-SW, cells are computed in the following order: @@ -364,9 +369,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, p->e = e; // save E(i+1,j) for the next row f -= gape; f = f > h? 
f : h; // computed F(i,j+1) - printf("\t%d:%d", j, h1); } - putchar('\n'); eh[end].h = h1; eh[end].e = 0; if (m == 0) break; if (m > max) max = m, max_i = i, max_j = mj; @@ -375,7 +378,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, beg = j + 1; for (j = mj + 2; j <= end && eh[j].h; ++j); end = j; - beg = 0; end = qlen; // uncomment this line for debugging + //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); if (_qpos) *_qpos = max_i + 1; From f83dea36d85fd8c1c8e7d3f84638c77202896bbd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 3 Feb 2013 18:16:43 -0500 Subject: [PATCH 156/498] no effective changes --- ksw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ksw.c b/ksw.c index 6277bf6..437f563 100644 --- a/ksw.c +++ b/ksw.c @@ -314,7 +314,7 @@ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) typedef struct { int32_t h, e; } eh_t; -#include + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) { eh_t *eh; From ba18db1a9fe753e81dde187a4432a9795e44fdb0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 12:37:38 -0500 Subject: [PATCH 157/498] sw extension works for the simplest case --- bwamem.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++--------- bwamem.h | 2 ++ fastmap.c | 1 + ksw.c | 17 +++++++++----- ksw.h | 2 +- 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 02f9591..b9e7f68 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1,9 +1,22 @@ #include #include #include +#include #include "bwamem.h" #include "kvec.h" #include "bntseq.h" +#include "ksw.h" + +void mem_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 5; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} mem_opt_t *mem_opt_init() { @@ -13,6 +26,7 @@ mem_opt_t *mem_opt_init() o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; + mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -176,19 +190,52 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin return chain; } +static inline int cal_max_gap(const mem_opt_t *opt, int qlen) +{ + int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); + return l > 1? l : 1; +} + mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) { mem_aln_t a; - int i, j, max, max_i; - int64_t len; - for (i = 0; i < c->n; ++i) { - mem_seed_t *s = &c->seeds[i]; - uint8_t *seq = bns_get_seq(l_pac, pac, s->rbeg, s->rbeg + s->len, &len); - for (j = 0; j < len; ++j) putchar("ACGTN"[seq[j]]); putchar('\n'); - for (j = 0; j < s->len; ++j) putchar("ACGTN"[query[j+s->qbeg]]); putchar('\n'); - free(seq); + int i, j, qbeg, qend, score; + int64_t k, rlen, rbeg, rend, rmax[2], tmp; + mem_seed_t *s; + uint8_t *rseq = 0; + // get the start and end of the seeded region + rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; + s = &c->seeds[c->n-1]; + rend = s->rbeg + s->len; qend = s->qbeg + s->len; + // get the max possible span + rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg)); + rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend)); + if (rmax[0] < 0) rmax[0] = 0; + if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1; + // retrieve the reference sequence + rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); + + if (qbeg) { // left extension of the first seed + uint8_t *rs, *qs; + int qle, tle; + qs = malloc(qbeg); + for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i]; + tmp = rbeg - rmax[0]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; + score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, 
opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + free(qs); free(rs); + } else score = c->seeds[0].len * opt->a; + + if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed + int qle, tle, qe, re; + s = &c->seeds[0]; + qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; + for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); + for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); + score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle); + printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle); } - for (i = max = 0, max_i = -1; i < c->n; ++i) // find the longest seed - if (max < c->seeds[i].len) max = c->seeds[i].len, max_i = i; + free(rseq); return a; } diff --git a/bwamem.h b/bwamem.h index 214d780..b026de4 100644 --- a/bwamem.h +++ b/bwamem.h @@ -14,6 +14,7 @@ typedef struct { typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { @@ -43,6 +44,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query); const bwtintv_v *smem_next(smem_i *itr, int split_len); mem_opt_t *mem_opt_init(void); +void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); diff --git a/fastmap.c b/fastmap.c index c92311e..4a677c3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -31,6 +31,7 @@ int main_mem(int argc, char *argv[]) free(opt); return 1; } + mem_fill_scmat(opt->a, opt->b, opt->mat); fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); { // load the packed sequences, BWT and SA diff --git a/ksw.c b/ksw.c index 437f563..763c774 
100644 --- a/ksw.c +++ b/ksw.c @@ -315,11 +315,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle) { eh_t *eh; int8_t *qp; - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; if (h0 < 0) h0 = 0; // allocate memory eh = calloc(qlen + 1, 8); @@ -333,8 +333,15 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0; for (j = 2; j <= qlen && eh[j-1].h > gape; ++j) eh[j].h = eh[j-1].h - gape; + // adjust $w if it is too large + k = m * m; + for (i = 0, max = 0; i < k; ++i) // get the max score + max = max > mat[i]? max : mat[i]; + max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = w < max_gap? 
w : max_gap; // DP loop - max = h0, max_i = max_j = 0; + max = h0, max_i = max_j = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1, t; @@ -381,8 +388,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); - if (_qpos) *_qpos = max_i + 1; - if (_tpos) *_tpos = max_j + 1; + if (_qle) *_qle = max_i + 1; + if (_tle) *_tle = max_j + 1; return max; } diff --git a/ksw.h b/ksw.h index b7b9c40..3c9b959 100644 --- a/ksw.h +++ b/ksw.h @@ -49,7 +49,7 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qpos, int *_tpos); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle); #ifdef __cplusplus } From 666638a953a8033913f8e8b8a5da7e24c5ec4c45 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 14:51:51 -0500 Subject: [PATCH 158/498] changed the default scoring --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index b9e7f68..54ee3f1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -22,7 +22,7 @@ mem_opt_t *mem_opt_init() { mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); - o->a = 1; o->b = 9; o->q = 16; o->r = 1; o->w = 100; + o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; From 5bfa45a69b7c9bf8532ec6086e1e6e4ba4b31710 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 15:02:56 -0500 Subject: [PATCH 159/498] write the mem_aln_t struct --- bwamem.c | 24 +++++++++++++++--------- bwamem.h | 4 ++-- 2 files changed, 17 
insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 54ee3f1..5fbe937 100644 --- a/bwamem.c +++ b/bwamem.c @@ -199,10 +199,12 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) { mem_aln_t a; - int i, j, qbeg, qend, score; - int64_t k, rlen, rbeg, rend, rmax[2], tmp; + int i, j, qbeg, qend; + int64_t rlen, rbeg, rend, rmax[2], tmp; mem_seed_t *s; uint8_t *rseq = 0; + + memset(&a, 0, sizeof(mem_aln_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; s = &c->seeds[c->n-1]; @@ -223,19 +225,23 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, tmp = rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a.score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a.qb = qbeg - qle; a.rb = rbeg - tle; free(qs); free(rs); - } else score = c->seeds[0].len * opt->a; + } else a.score = c->seeds[0].len * opt->a, a.qb = 0, a.rb = rbeg; - if (c->seeds[0].qbeg + c->seeds[0].len != l_query) { // right extension of the first seed + s = &c->seeds[0]; + if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; - s = &c->seeds[0]; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); - score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, score, 0, &qle, &tle); - printf("[%d] score=%d\tqle=%d\trle=%d\n", c->n, score, qle, tle); - } + a.score = 
ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, 0, &qle, &tle); + a.qe = qe + qle; a.re = rmax[0] + re + tle; + } else a.qe = l_query, a.re = s->rbeg + s->len; + + printf("[%d] score=%d\t[%d,%d)\t[%lld,%lld)\n", c->n, a.score, a.qb, a.qe, a.rb, a.re); + free(rseq); return a; } diff --git a/bwamem.h b/bwamem.h index b026de4..7bea0ad 100644 --- a/bwamem.h +++ b/bwamem.h @@ -29,8 +29,8 @@ typedef struct { } mem_chain_t; typedef struct { - int64_t pos; - int n_cigar, len, score; + int64_t pos, rb, re; + int n_cigar, len, score, qb, qe; uint32_t *cigar; } mem_aln_t; From f27bd18f2025598bd73b1aee4858ca5f4c43e72f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 15:09:47 -0500 Subject: [PATCH 160/498] check if every seed is included; not used for now --- bwamem.c | 8 +++++++- bwamem.h | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5fbe937..f373ef0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -240,7 +240,13 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, a.qe = qe + qle; a.re = rmax[0] + re + tle; } else a.qe = l_query, a.re = s->rbeg + s->len; - printf("[%d] score=%d\t[%d,%d)\t[%lld,%lld)\n", c->n, a.score, a.qb, a.qe, a.rb, a.re); + a.is_all = 1; + if (c->n > 1) { // check if all the seeds have been included + s = &c->seeds[c->n - 1]; + if (s->qbeg + s->len > a.qe) a.is_all = 0; + } + + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a.score, a.qb, a.qe, a.rb, a.re, a.is_all); free(rseq); return a; diff --git a/bwamem.h b/bwamem.h index 7bea0ad..fae4529 100644 --- a/bwamem.h +++ b/bwamem.h @@ -30,7 +30,7 @@ typedef struct { typedef struct { int64_t pos, rb, re; - int n_cigar, len, score, qb, qe; + int n_cigar, len, score, qb, qe, is_all; uint32_t *cigar; } mem_aln_t; From 788e9d1e3dad7c5477d075371af81f45f1ff55b9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 15:40:26 -0500 
Subject: [PATCH 161/498] fixed a couple of leaks; buggy atm --- bwamem.c | 21 +++++++++++++++++++-- fastmap.c | 10 ++++++++-- ksw.c | 4 ++-- ksw.h | 2 +- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index f373ef0..032a54e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -10,7 +10,7 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; - for (i = k = 0; i < 5; ++i) { + for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? a : -b; mat[k++] = 0; // ambiguous base @@ -233,11 +233,28 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, s = &c->seeds[0]; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; + int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); - a.score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, 0, &qle, &tle); + if (c->n > 1) { // generate $qw + int l = rmax[1] - (s->rbeg + s->len); + assert(l >= 0 && l < 1000); + qw = malloc(l * 2); + for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default + for (i = 1; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + for (j = 0; j < t->len; ++j) { + int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); + assert(x < l); + if (qw[x] == -1) qw[x] = x > y? 
x - y : y - x; + else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint + } + } + } + a.score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, qw, &qle, &tle); a.qe = qe + qle; a.re = rmax[0] + re + tle; + free(qw); } else a.qe = l_query, a.re = s->rbeg + s->len; a.is_all = 1; diff --git a/fastmap.c b/fastmap.c index 4a677c3..f3100c7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -17,7 +17,7 @@ int main_mem(int argc, char *argv[]) bwt_t *bwt; bntseq_t *bns; int i, j, c; - gzFile *fp; + gzFile fp; kseq_t *seq; uint8_t *pac = 0; @@ -66,9 +66,15 @@ int main_mem(int argc, char *argv[]) putchar('\n'); } puts("//"); + for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); + free(chain.chains); } - free(opt); + free(pac); free(opt); + bns_destroy(bns); + bwt_destroy(bwt); + kseq_destroy(seq); + gzclose(fp); return 0; } diff --git a/ksw.c b/ksw.c index 763c774..05f597d 100644 --- a/ksw.c +++ b/ksw.c @@ -315,7 +315,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle) { eh_t *eh; int8_t *qp; @@ -350,7 +350,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h1 = h0 - (gapo + gape * (i + 1)); if (h1 < 0) h1 = 0; // apply the band and the constraint (if provided) - t = (qw && qw[i] < w)? qw[i] : w; // this is the band width at $i + t = (qw && qw[i] >= 0 && qw[i] < w)? 
qw[i] : w; // this is the band width at $i if (beg < i - t) beg = i - t; if (end > i + t + 1) end = i + t + 1; if (end > qlen) end = qlen; diff --git a/ksw.h b/ksw.h index 3c9b959..220a8d7 100644 --- a/ksw.h +++ b/ksw.h @@ -49,7 +49,7 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int *qw, int *_qle, int *_tle); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle); #ifdef __cplusplus } From 29c8546679ac986141c70b4f7351e766c0e385f3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 16:08:00 -0500 Subject: [PATCH 162/498] better ref extraction --- bwamem.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/bwamem.c b/bwamem.c index 032a54e..d7241b6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -197,23 +197,26 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) } mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) -{ +{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds mem_aln_t a; - int i, j, qbeg, qend; - int64_t rlen, rbeg, rend, rmax[2], tmp; - mem_seed_t *s; + int i, j, qbeg; + int64_t rlen, rbeg, rmax[2], tmp; + const mem_seed_t *s; uint8_t *rseq = 0; memset(&a, 0, sizeof(mem_aln_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; - s = &c->seeds[c->n-1]; - rend = s->rbeg + s->len; qend = s->qbeg + s->len; // get the max possible span - rmax[0] = rbeg - (qbeg + cal_max_gap(opt, qbeg)); - rmax[1] = rend + ((l_query - qend) + cal_max_gap(opt, l_query - qend)); 
- if (rmax[0] < 0) rmax[0] = 0; - if (rmax[1] > l_pac<<1) rmax[1] = l_pac<<1; + rmax[0] = l_pac<<1; rmax[1] = 0; + for (i = 0; i < c->n; ++i) { + int64_t b, e; + const mem_seed_t *t = &c->seeds[i]; + b = t->rbeg - (t->qbeg + cal_max_gap(opt, t->qbeg)); + e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); + rmax[0] = rmax[0] < b? rmax[0] : b; + rmax[1] = rmax[1] > e? rmax[1] : e; + } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); @@ -239,14 +242,13 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); if (c->n > 1) { // generate $qw int l = rmax[1] - (s->rbeg + s->len); - assert(l >= 0 && l < 1000); qw = malloc(l * 2); for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default for (i = 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; for (j = 0; j < t->len; ++j) { int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); - assert(x < l); + if (x < 0) continue; // overlap with the first seed if (qw[x] == -1) qw[x] = x > y? 
x - y : y - x; else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint } From c589b42fb5c7deda8f843b85ae6f8ecfb77b1ae9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 16:48:11 -0500 Subject: [PATCH 163/498] minor tuning for fewer identical hits --- bwamem.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index d7241b6..3f14d71 100644 --- a/bwamem.c +++ b/bwamem.c @@ -92,7 +92,9 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge - if (itr->matches->a[i].info < itr->sub->a[j].info) { + int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); + int64_t xj = itr->matches->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[j].info); + if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; } else { @@ -120,9 +122,9 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_ rend = last->rbeg + last->len; if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing - x = p->qbeg - last->qbeg; // always positive + x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; - if (y > 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain + if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); @@ -190,6 +192,14 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, 
const uin return chain; } +/******************** + * Filtering chains * + ********************/ + +/**************************************** + * Construct the alignment from a chain * + ****************************************/ + static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); @@ -197,7 +207,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) } mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) -{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds +{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds mem_aln_t a; int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; @@ -238,8 +248,8 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; - for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); - for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); +// for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); +// for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); if (c->n > 1) { // generate $qw int l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); From 9d0cdb2d3cceadcfffb6483d59ed47fd94aa9ae7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Feb 2013 17:23:06 -0500 Subject: [PATCH 164/498] unfinished chain filter --- bwamem.c | 74 ++++++++++++++++++++++++++++++++++++++++++++----------- bwamem.h | 2 +- fastmap.c | 3 ++- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/bwamem.c b/bwamem.c index 3f14d71..ae70af7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -6,6 +6,7 @@ #include "kvec.h" #include 
"bntseq.h" #include "ksw.h" +#include "ksort.h" void mem_fill_scmat(int a, int b, int8_t mat[25]) { @@ -109,6 +110,10 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) return itr->matches; } +/******************************** + * Chaining while finding SMEMs * + ********************************/ + #include "kbtree.h" #define chain_cmp(a, b) ((a).pos - (b).pos) @@ -196,6 +201,49 @@ mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin * Filtering chains * ********************/ +typedef struct { + int beg, end, w; + void *p, *p2; +} flt_aux_t; + +#define flt_lt(a, b) ((a).w > (b).w) +KSORT_INIT(mem_flt, flt_aux_t, flt_lt) + +void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) +{ + flt_aux_t *a; + int i, j, n; + if (chn->n <= 1) return; // no need to filter + a = malloc(sizeof(flt_aux_t) * chn->n); + for (i = 0; i < chn->n; ++i) { + mem_chain1_t *c = &chn->chains[i]; + int w = 0; + for (j = 0; j < c->n; ++j) w += c->len; + a[i].beg = c->seeds[0].qbeg; + a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; + a[i].w = w; + a[i].p = c; + a[i].w2 = 0; a[i].p2 = 0; + } + ks_introsort(mem_flt, chn->n, a); + for (i = 1, n = 1; i < chn->n; ++i) { + for (j = 0; j < n; ++j) { + int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; + int e_min = e[j].end < a[i].end? a[j].end : a[i].end; + if (e_min > b_max) { // have overlap + int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; + if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap + if (a[j].p2 == 0) a[j].p2 = a[i].p; + if (a[i].w < a[j].w * opt->chain_drop_ratio) + break; + } + } + } + if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. + } + free(a); +} + /**************************************** * Construct the alignment from a chain * ****************************************/ @@ -206,15 +254,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? 
l : 1; } -mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - mem_aln_t a; int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; - memset(&a, 0, sizeof(mem_aln_t)); + memset(a, 0, sizeof(mem_aln_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; // get the max possible span @@ -238,18 +285,16 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, tmp = rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a.score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); - a.qb = qbeg - qle; a.rb = rbeg - tle; + a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a->qb = qbeg - qle; a->rb = rbeg - tle; free(qs); free(rs); - } else a.score = c->seeds[0].len * opt->a, a.qb = 0, a.rb = rbeg; + } else a->score = c->seeds[0].len * opt->a, a->qb = 0, a->rb = rbeg; s = &c->seeds[0]; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; -// for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[j+qe]]); putchar('\n'); -// for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[j+re]]); putchar('\n'); if (c->n > 1) { // generate $qw int l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); @@ -264,19 +309,18 @@ mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, } } } - 
a.score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a.score, qw, &qle, &tle); - a.qe = qe + qle; a.re = rmax[0] + re + tle; + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); + a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); - } else a.qe = l_query, a.re = s->rbeg + s->len; + } else a->qe = l_query, a->re = s->rbeg + s->len; - a.is_all = 1; + a->is_all = 1; if (c->n > 1) { // check if all the seeds have been included s = &c->seeds[c->n - 1]; - if (s->qbeg + s->len > a.qe) a.is_all = 0; + if (s->qbeg + s->len > a->qe) a->is_all = 0; } - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a.score, a.qb, a.qe, a.rb, a.re, a.is_all); + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); free(rseq); - return a; } diff --git a/bwamem.h b/bwamem.h index fae4529..0484edf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -47,7 +47,7 @@ mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -mem_aln_t mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index f3100c7..797a22f 100644 --- a/fastmap.c +++ b/fastmap.c @@ -53,7 +53,8 @@ int main_mem(int argc, char *argv[]) chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; - mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p); + mem_aln_t a; + mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, 
(uint8_t*)seq->seq.s, p, &a); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos; From d6a73c9171c14ac4dbfa1c9a2194c0d945ea41eb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 00:17:20 -0500 Subject: [PATCH 165/498] chain filtering apparently working --- bwamem.c | 28 +++++++++++++++++++++++++--- bwamem.h | 2 ++ fastmap.c | 1 + 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index ae70af7..33c911d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -27,6 +27,8 @@ mem_opt_t *mem_opt_init() o->min_seed_len = 17; o->max_occ = 10; o->max_chain_gap = 10000; + o->mask_level = 0.50; + o->chain_drop_ratio = 0.33; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -218,18 +220,18 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) for (i = 0; i < chn->n; ++i) { mem_chain1_t *c = &chn->chains[i]; int w = 0; - for (j = 0; j < c->n; ++j) w += c->len; + for (j = 0; j < c->n; ++j) w += c->seeds[j].len; a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; a[i].p = c; - a[i].w2 = 0; a[i].p2 = 0; + a[i].p2 = 0; } ks_introsort(mem_flt, chn->n, a); for (i = 1, n = 1; i < chn->n; ++i) { for (j = 0; j < n; ++j) { int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; - int e_min = e[j].end < a[i].end? a[j].end : a[i].end; + int e_min = a[j].end < a[i].end? a[j].end : a[i].end; if (e_min > b_max) { // have overlap int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap @@ -241,7 +243,27 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) } if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. 
} + for (i = 0; i < n; ++i) { // mark chains to be kept + mem_chain1_t *c = (mem_chain1_t*)a[i].p; + if (c->n > 0) c->n = -c->n; + c = (mem_chain1_t*)a[i].p2; + if (c && c->n > 0) c->n = -c->n; + } free(a); + for (i = 0; i < chn->n; ++i) { // free discarded chains + mem_chain1_t *c = &chn->chains[i]; + if (c->n >= 0) { + free(c->seeds); + c->n = c->m = 0; + } else c->n = -c->n; + } + for (i = n = 0; i < chn->n; ++i) { // squeeze out discarded chains + if (chn->chains[i].n > 0) { + if (n != i) chn->chains[n++] = chn->chains[i]; + else ++n; + } + } + chn->n = n; } /**************************************** diff --git a/bwamem.h b/bwamem.h index 0484edf..adf57dd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,6 +15,7 @@ typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + float mask_level, chain_drop_ratio; } mem_opt_t; typedef struct { @@ -47,6 +48,7 @@ mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn); void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a); #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 797a22f..f02224a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -51,6 +51,7 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); + mem_chain_flt(opt, &chain); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; mem_aln_t a; From 7067af833d01f9dbe3ca18295dd11a73fdd87b87 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 00:41:07 -0500 Subject: [PATCH 166/498] fixed a silly bug on sorted merge --- bwamem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/bwamem.c b/bwamem.c index 33c911d..fd7caa2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -28,7 +28,7 @@ mem_opt_t *mem_opt_init() o->max_occ = 10; o->max_chain_gap = 10000; o->mask_level = 0.50; - o->chain_drop_ratio = 0.33; + o->chain_drop_ratio = 0.50; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -96,7 +96,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); - int64_t xj = itr->matches->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[j].info); + int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info); if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; From d91e3209724515feb9fdee69f2222e20f0c63a71 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 12:06:56 -0500 Subject: [PATCH 167/498] towards reimplementing banded NW alignment --- ksw.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ksw.h | 1 + 2 files changed, 57 insertions(+) diff --git a/ksw.c b/ksw.c index 05f597d..6282915 100644 --- a/ksw.c +++ b/ksw.c @@ -393,6 +393,62 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, return max; } +/******************** + * Global alignment * + ********************/ + +#define MINUS_INF -0x40000000 + +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar) +{ + eh_t *eh; + int8_t *qp; + int i, j, k, gapoe = gapo + gape, score; + // allocate memory + eh = calloc(qlen + 1, 8); + qp = malloc(qlen * m); + // generate the query profile + for (k = i = 0; k < m; ++k) { + const int8_t *p = &mat[k * m]; + for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; + } + // fill the first row + eh[0].h = 0; eh[0].e = MINUS_INF; + for (j = 1; j <= qlen && j <= w; 
++j) + eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF; + for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; + // DP loop + for (i = 0; LIKELY(i < tlen); ++i) { + int32_t f = MINUS_INF, h1, beg, end; + int8_t *q = &qp[target[i] * qlen]; + beg = i > w? i - w : 0; + end = i + w + 1 < qlen? i + w + 1 : qlen; + h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; + printf("%d\t%d", i, end); + for (j = beg; LIKELY(j < end); ++j) { + eh_t *p = &eh[j]; + int32_t h = p->h, e = p->e; + p->h = h1; + h += q[j]; + h = h > e? h : e; + h = h > f? h : f; + h1 = h; + printf("\t%d:%d", j, h); + h -= gapoe; + e -= gape; + e = e > h? e : h; + p->e = e; + f -= gape; + f = f > h? f : h; + } + putchar('\n'); + eh[end].h = h1; eh[end].e = MINUS_INF; + } + score = eh[qlen].h; + free(eh); free(qp); + return score; +} + /******************************************* * Main function (not compiled by default) * *******************************************/ diff --git a/ksw.h b/ksw.h index 220a8d7..d58f423 100644 --- a/ksw.h +++ b/ksw.h @@ -50,6 +50,7 @@ extern "C" { int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle); + int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); #ifdef __cplusplus } From 7e1466c8856b4490d4f962766d28ae4fc6742bfe Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 16:05:53 -0500 Subject: [PATCH 168/498] implemented NW backtrack --- ksw.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/ksw.c b/ksw.c index 6282915..7d97b2c 100644 --- a/ksw.c +++ b/ksw.c @@ -27,6 +27,10 @@ #include #include "ksw.h" +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, 
(x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + #ifndef _NO_SSE2 #include @@ -322,8 +326,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; if (h0 < 0) h0 = 0; // allocate memory - eh = calloc(qlen + 1, 8); qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -399,14 +403,35 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, #define MINUS_INF -0x40000000 -int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar) +typedef struct { + uint8_t h:2, e:1, f:1; +} btmat_t; + +static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) +{ + if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { + if (*n_cigar == *m_cigar) { + *m_cigar = *m_cigar? (*m_cigar)<<1 : 4; + cigar = realloc(cigar, (*m_cigar) << 4); + } + cigar[(*n_cigar)++] = len<<4 | op; + } else cigar[(*n_cigar)-1] += len<<4; + return cigar; +} + +#define cal_j_(i_, k_, w_) ((k_) - ((i_) > (w_)? (i_) - (w_) : 0)) + +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; int8_t *qp; - int i, j, k, gapoe = gapo + gape, score; + int i, j, k, gapoe = gapo + gape, score, n_col; + btmat_t *z; // allocate memory - eh = calloc(qlen + 1, 8); + n_col = qlen < w? 
qlen : w; + z = malloc(n_col * tlen * sizeof(btmat_t)); qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -421,31 +446,63 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, for (i = 0; LIKELY(i < tlen); ++i) { int32_t f = MINUS_INF, h1, beg, end; int8_t *q = &qp[target[i] * qlen]; + btmat_t *zi = &z[i * n_col]; beg = i > w? i - w : 0; end = i + w + 1 < qlen? i + w + 1 : qlen; h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; - printf("%d\t%d", i, end); + printf("%d", i); for (j = beg; LIKELY(j < end); ++j) { eh_t *p = &eh[j]; + btmat_t *zij = &zi[j - beg]; int32_t h = p->h, e = p->e; p->h = h1; h += q[j]; + zij->h = h > e? 0 : 1; h = h > e? h : e; + zij->h = h > f? zij->h : 2; h = h > f? h : f; + printf("\t%d:%d:%d", h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); h1 = h; - printf("\t%d:%d", j, h); h -= gapoe; e -= gape; + zij->e = (e > h); // NB: zij->e keeps the direction for the NEXT row, not the current one e = e > h? e : h; p->e = e; f -= gape; + zij->f = (f > h); f = f > h? f : h; + printf(",%d:%d:%d", zij->h, zij->e, zij->f); } putchar('\n'); eh[end].h = h1; eh[end].e = MINUS_INF; } score = eh[qlen].h; - free(eh); free(qp); + if (n_cigar_ && cigar_) { // backtrack + int n_cigar = 0, m_cigar = 0, which; + uint32_t *cigar = 0, tmp; + i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell + which = z[i * n_col + cal_j_(i, k, w)].h; + while (i >= 0 && k >= 0) { + printf("(%d,%d)\t%d\n", i, k, which); + if (which == 0) { + cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1); --i, --k; + if (i >= 0 && k >= 0) which = z[i * n_col + cal_j_(i, k, w)].h; + } else if (which == 1) { + cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1); --i; + if (i >= 0) which = z[i * n_col + cal_j_(i, k, w)].e? 
1 : 0; + } else { // which == 2 + cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1); --k; + if (k >= 0) which = z[i * n_col + cal_j_(i, k, w)].f? 2 : 0; + } + } + printf("(%d,%d)\t%d\n", i, k, which); + if (i > 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i); + if (k > 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k); + for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR + tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; + *n_cigar_ = n_cigar, *cigar_ = cigar; + } + free(eh); free(qp); free(z); return score; } From 1bc9712cd827244159c4bde528fb05bebef5abf9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 16:28:15 -0500 Subject: [PATCH 169/498] explicitly use bit to keep bt matrix This also simplifies backtracking. --- ksw.c | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/ksw.c b/ksw.c index 7d97b2c..654ef3e 100644 --- a/ksw.c +++ b/ksw.c @@ -403,10 +403,6 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, #define MINUS_INF -0x40000000 -typedef struct { - uint8_t h:2, e:1, f:1; -} btmat_t; - static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) { if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { @@ -419,17 +415,15 @@ static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, return cigar; } -#define cal_j_(i_, k_, w_) ((k_) - ((i_) > (w_)? (i_) - (w_) : 0)) - int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; int8_t *qp; int i, j, k, gapoe = gapo + gape, score, n_col; - btmat_t *z; + uint8_t *z; // allocate memory - n_col = qlen < w? qlen : w; - z = malloc(n_col * tlen * sizeof(btmat_t)); + n_col = qlen < 2*w+1? 
qlen : 2*w+1; + z = malloc(n_col * tlen); qp = malloc(qlen * m); eh = calloc(qlen + 1, 8); // generate the query profile @@ -446,58 +440,51 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, for (i = 0; LIKELY(i < tlen); ++i) { int32_t f = MINUS_INF, h1, beg, end; int8_t *q = &qp[target[i] * qlen]; - btmat_t *zi = &z[i * n_col]; + uint8_t *zi = &z[i * n_col]; beg = i > w? i - w : 0; end = i + w + 1 < qlen? i + w + 1 : qlen; h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; printf("%d", i); for (j = beg; LIKELY(j < end); ++j) { eh_t *p = &eh[j]; - btmat_t *zij = &zi[j - beg]; int32_t h = p->h, e = p->e; + uint8_t d; // direction p->h = h1; h += q[j]; - zij->h = h > e? 0 : 1; + d = h > e? 0 : 1; h = h > e? h : e; - zij->h = h > f? zij->h : 2; + d = h > f? d : 2; h = h > f? h : f; - printf("\t%d:%d:%d", h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); + printf("\t[%d],%d:%d:%d", j, h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); h1 = h; h -= gapoe; e -= gape; - zij->e = (e > h); // NB: zij->e keeps the direction for the NEXT row, not the current one + d |= e > h? 1<<2 : 0; e = e > h? e : h; p->e = e; f -= gape; - zij->f = (f > h); + d |= f > h? 2<<4 : 0; f = f > h? f : h; - printf(",%d:%d:%d", zij->h, zij->e, zij->f); + zi[j - beg] = d; + printf(",%d:%d:%d", d>>0&3, d>>2&3, d>>4&3); } putchar('\n'); eh[end].h = h1; eh[end].e = MINUS_INF; } score = eh[qlen].h; if (n_cigar_ && cigar_) { // backtrack - int n_cigar = 0, m_cigar = 0, which; + int n_cigar = 0, m_cigar = 0, which = 0; uint32_t *cigar = 0, tmp; i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell - which = z[i * n_col + cal_j_(i, k, w)].h; while (i >= 0 && k >= 0) { + which = z[i * n_col + (k - (i > w? 
i - w : 0))] >> (which<<1) & 3; printf("(%d,%d)\t%d\n", i, k, which); - if (which == 0) { - cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1); --i, --k; - if (i >= 0 && k >= 0) which = z[i * n_col + cal_j_(i, k, w)].h; - } else if (which == 1) { - cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1); --i; - if (i >= 0) which = z[i * n_col + cal_j_(i, k, w)].e? 1 : 0; - } else { // which == 2 - cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1); --k; - if (k >= 0) which = z[i * n_col + cal_j_(i, k, w)].f? 2 : 0; - } + if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; + else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; + else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; } - printf("(%d,%d)\t%d\n", i, k, which); - if (i > 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i); - if (k > 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k); + if (i >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); + if (k >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; *n_cigar_ = n_cigar, *cigar_ = cigar; From 86caae811e6d2a256b19990eb58c469f16aae60c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 16:58:35 -0500 Subject: [PATCH 170/498] added comments --- ksw.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ksw.c b/ksw.c index 654ef3e..66728d5 100644 --- a/ksw.c +++ b/ksw.c @@ -321,8 +321,8 @@ typedef struct { int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle) { - eh_t *eh; - int8_t *qp; + eh_t *eh; // score array + int8_t *qp; // query profile int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; if (h0 < 0) h0 = 0; // allocate memory @@ -418,11 +418,11 @@ static inline uint32_t 
*push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; - int8_t *qp; + int8_t *qp; // query profile int i, j, k, gapoe = gapo + gape, score, n_col; - uint8_t *z; + uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex // allocate memory - n_col = qlen < 2*w+1? qlen : 2*w+1; + n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix z = malloc(n_col * tlen); qp = malloc(qlen * m); eh = calloc(qlen + 1, 8); @@ -435,17 +435,18 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, eh[0].h = 0; eh[0].e = MINUS_INF; for (j = 1; j <= qlen && j <= w; ++j) eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF; - for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; + for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band // DP loop - for (i = 0; LIKELY(i < tlen); ++i) { + for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop int32_t f = MINUS_INF, h1, beg, end; int8_t *q = &qp[target[i] * qlen]; uint8_t *zi = &z[i * n_col]; beg = i > w? i - w : 0; - end = i + w + 1 < qlen? i + w + 1 : qlen; + end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; - printf("%d", i); for (j = beg; LIKELY(j < end); ++j) { + // This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except: + // 1) not checking h>0; 2) recording direction for backtracking eh_t *p = &eh[j]; int32_t h = p->h, e = p->e; uint8_t d; // direction @@ -455,7 +456,6 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h = h > e? h : e; d = h > f? d : 2; h = h > f? 
h : f; - printf("\t[%d],%d:%d:%d", j, h<-99?-99:h, e<-99?-99:e, f<-99?-99:f); h1 = h; h -= gapoe; e -= gape; @@ -463,12 +463,10 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, e = e > h? e : h; p->e = e; f -= gape; - d |= f > h? 2<<4 : 0; + d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two f = f > h? f : h; - zi[j - beg] = d; - printf(",%d:%d:%d", d>>0&3, d>>2&3, d>>4&3); + zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell } - putchar('\n'); eh[end].h = h1; eh[end].e = MINUS_INF; } score = eh[qlen].h; @@ -478,7 +476,6 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell while (i >= 0 && k >= 0) { which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3; - printf("(%d,%d)\t%d\n", i, k, which); if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; From 1e16f3e701b670508b890407cbf9ce2995b4c091 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 17:13:12 -0500 Subject: [PATCH 171/498] calling ksw_global(); ksw_extend() is buggy! --- bwamem.c | 13 +++++++++++-- fastmap.c | 1 + ksw.c | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index fd7caa2..fd164be 100644 --- a/bwamem.c +++ b/bwamem.c @@ -278,7 +278,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds - int i, j, qbeg; + int i, j, qbeg, w, nw_score; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; @@ -342,7 +342,16 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\n", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); + w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); + w = w < opt->w? w : opt->w; + w += abs((a->re - a->rb) - (a->qe - a->qb)); + nw_score = ksw_global(a->qe - a->qb, query + a->qb, a->re - a->rb, rseq + (a->rb - rmax[0]), 5, opt->mat, opt->q, opt->r, w, &a->n_cigar, &a->cigar); + + printf("[%d] ", c->n); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + printf("[%d] ", c->n); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); + printf("[%d] score=%d,%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, nw_score, a->qb, a->qe, a->rb, a->re, a->is_all); + for (i = 0; i < a->n_cigar; ++i) printf("%d%c", a->cigar[i]>>4, "MIDS"[a->cigar[i]&0xf]); + putchar('\n'); free(rseq); } diff --git a/fastmap.c b/fastmap.c index f02224a..811149f 100644 --- a/fastmap.c +++ b/fastmap.c @@ -66,6 +66,7 @@ int main_mem(int argc, char *argv[]) printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); } putchar('\n'); + free(a.cigar); } puts("//"); for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); diff --git a/ksw.c b/ksw.c index 66728d5..6708e40 100644 --- a/ksw.c +++ b/ksw.c @@ -421,6 +421,7 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t *qp; // query profile int i, j, k, gapoe = gapo + gape, score, n_col; uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will 
be a little more complex + if (n_cigar_) *n_cigar_ = 0; // allocate memory n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix z = malloc(n_col * tlen); From 14e6a7bdb90014e9602fecd733d5e7745986aab8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 17:29:03 -0500 Subject: [PATCH 172/498] fixed a silly bug in ksw_extend() Query return value is assigned to the target variable and vice versa... --- bwamem.c | 6 ++++-- ksw.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index fd164be..1880512 100644 --- a/bwamem.c +++ b/bwamem.c @@ -331,6 +331,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } } } + //printf("[Q] "); for (i = qe; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = re; i < rmax[1] - rmax[0]; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); @@ -347,8 +349,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int w += abs((a->re - a->rb) - (a->qe - a->qb)); nw_score = ksw_global(a->qe - a->qb, query + a->qb, a->re - a->rb, rseq + (a->rb - rmax[0]), 5, opt->mat, opt->q, opt->r, w, &a->n_cigar, &a->cigar); - printf("[%d] ", c->n); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - printf("[%d] ", c->n); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); + //printf("[Q] "); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); printf("[%d] score=%d,%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, nw_score, a->qb, a->qe, a->rb, a->re, a->is_all); for (i = 0; i 
< a->n_cigar; ++i) printf("%d%c", a->cigar[i]>>4, "MIDS"[a->cigar[i]&0xf]); putchar('\n'); diff --git a/ksw.c b/ksw.c index 6708e40..405bd86 100644 --- a/ksw.c +++ b/ksw.c @@ -392,8 +392,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, //beg = 0; end = qlen; // uncomment this line for debugging } free(eh); free(qp); - if (_qle) *_qle = max_i + 1; - if (_tle) *_tle = max_j + 1; + if (_qle) *_qle = max_j + 1; + if (_tle) *_tle = max_i + 1; return max; } From a61288c7683e011fb8f8750043e6fecfb535b256 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 21:49:19 -0500 Subject: [PATCH 173/498] separate CIGAR generation --- bwamem.c | 42 ++++++++++++++++++++++++++++++++++-------- bwamem.h | 3 +-- fastmap.c | 1 - 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 1880512..54355e3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -278,7 +278,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, j, qbeg, w, nw_score; + int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; @@ -344,16 +344,42 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } - w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); - w = w < opt->w? 
w : opt->w; - w += abs((a->re - a->rb) - (a->qe - a->qb)); - nw_score = ksw_global(a->qe - a->qb, query + a->qb, a->re - a->rb, rseq + (a->rb - rmax[0]), 5, opt->mat, opt->q, opt->r, w, &a->n_cigar, &a->cigar); - //printf("[Q] "); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); //printf("[R] "); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); - printf("[%d] score=%d,%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, nw_score, a->qb, a->qe, a->rb, a->re, a->is_all); - for (i = 0; i < a->n_cigar; ++i) printf("%d%c", a->cigar[i]>>4, "MIDS"[a->cigar[i]&0xf]); + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); putchar('\n'); free(rseq); } + +uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i, w; + int64_t rlen; + *n_cigar = 0; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], query[rlen - 1 - i] = tmp; + } + // set the band-width + w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); + w = w < 1? w : 1; + w = w < opt->w? 
w : opt->w; + w += abs(rlen - l_query); + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, opt->mat, opt->q, opt->r, w, n_cigar, &cigar); + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} diff --git a/bwamem.h b/bwamem.h index adf57dd..74eb70a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,8 +31,7 @@ typedef struct { typedef struct { int64_t pos, rb, re; - int n_cigar, len, score, qb, qe, is_all; - uint32_t *cigar; + int len, score, qb, qe, is_all; } mem_aln_t; #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 811149f..f02224a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -66,7 +66,6 @@ int main_mem(int argc, char *argv[]) printf("\t%d,%d,%s:%c%ld", p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); } putchar('\n'); - free(a.cigar); } puts("//"); for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); From 797a8c147e266458d1ea8c1790ea4e1b5666ffc3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Feb 2013 21:58:33 -0500 Subject: [PATCH 174/498] sorting chains while filtering chains --- bwamem.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 54355e3..69f9e81 100644 --- a/bwamem.c +++ b/bwamem.c @@ -220,7 +220,7 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) for (i = 0; i < chn->n; ++i) { mem_chain1_t *c = &chn->chains[i]; int w = 0; - for (j = 0; j < c->n; ++j) w += c->seeds[j].len; + for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; @@ -228,6 +228,16 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) a[i].p2 = 0; } ks_introsort(mem_flt, chn->n, a); + { // reorder chains such that the best chain appears 
first + mem_chain1_t *swap; + swap = malloc(sizeof(mem_chain1_t) * chn->n); + for (i = 0; i < chn->n; ++i) { + swap[i] = *((mem_chain1_t*)a[i].p); + a[i].p = &chn->chains[i]; // as we will memcpy() below, a[i].p is changed + } + memcpy(chn->chains, swap, sizeof(mem_chain1_t) * chn->n); + free(swap); + } for (i = 1, n = 1; i < chn->n; ++i) { for (j = 0; j < n; ++j) { int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; From e65b2096f7eb2ce275202507286cd5af988510e5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 12:25:49 -0500 Subject: [PATCH 175/498] removed useless members --- bwamem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.h b/bwamem.h index 74eb70a..6d8049b 100644 --- a/bwamem.h +++ b/bwamem.h @@ -30,8 +30,8 @@ typedef struct { } mem_chain_t; typedef struct { - int64_t pos, rb, re; - int len, score, qb, qe, is_all; + int64_t rb, re; + int score, qb, qe, is_all; } mem_aln_t; #ifdef __cplusplus From a9292d674d5dcbdccd560f48dc3c55a1e99342d1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 13:59:32 -0500 Subject: [PATCH 176/498] a bit code cleanup --- bwamem.c | 51 +++++++++++++++++++++++---------------------------- bwamem.h | 9 +++++---- fastmap.c | 4 ++-- 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/bwamem.c b/bwamem.c index 69f9e81..c5c6366 100644 --- a/bwamem.c +++ b/bwamem.c @@ -211,34 +211,32 @@ typedef struct { #define flt_lt(a, b) ((a).w > (b).w) KSORT_INIT(mem_flt, flt_aux_t, flt_lt) -void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) { flt_aux_t *a; int i, j, n; - if (chn->n <= 1) return; // no need to filter - a = malloc(sizeof(flt_aux_t) * chn->n); - for (i = 0; i < chn->n; ++i) { - mem_chain1_t *c = &chn->chains[i]; + if (n_chn <= 1) return n_chn; // no need to filter + a = malloc(sizeof(flt_aux_t) * n_chn); + for (i = 0; i < n_chn; ++i) { + mem_chain1_t *c = &chains[i]; int w = 0; for (j = 
0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; - a[i].w = w; - a[i].p = c; - a[i].p2 = 0; + a[i].w = w; a[i].p = c; a[i].p2 = 0; } - ks_introsort(mem_flt, chn->n, a); + ks_introsort(mem_flt, n_chn, a); { // reorder chains such that the best chain appears first mem_chain1_t *swap; - swap = malloc(sizeof(mem_chain1_t) * chn->n); - for (i = 0; i < chn->n; ++i) { + swap = malloc(sizeof(mem_chain1_t) * n_chn); + for (i = 0; i < n_chn; ++i) { swap[i] = *((mem_chain1_t*)a[i].p); - a[i].p = &chn->chains[i]; // as we will memcpy() below, a[i].p is changed + a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed } - memcpy(chn->chains, swap, sizeof(mem_chain1_t) * chn->n); + memcpy(chains, swap, sizeof(mem_chain1_t) * n_chn); free(swap); } - for (i = 1, n = 1; i < chn->n; ++i) { + for (i = 1, n = 1; i < n_chn; ++i) { for (j = 0; j < n; ++j) { int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; int e_min = a[j].end < a[i].end? a[j].end : a[i].end; @@ -260,20 +258,20 @@ void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn) if (c && c->n > 0) c->n = -c->n; } free(a); - for (i = 0; i < chn->n; ++i) { // free discarded chains - mem_chain1_t *c = &chn->chains[i]; + for (i = 0; i < n_chn; ++i) { // free discarded chains + mem_chain1_t *c = &chains[i]; if (c->n >= 0) { free(c->seeds); c->n = c->m = 0; } else c->n = -c->n; } - for (i = n = 0; i < chn->n; ++i) { // squeeze out discarded chains - if (chn->chains[i].n > 0) { - if (n != i) chn->chains[n++] = chn->chains[i]; + for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains + if (chains[i].n > 0) { + if (n != i) chains[n++] = chains[i]; else ++n; } } - chn->n = n; + return n; } /**************************************** @@ -286,14 +284,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? 
l : 1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; - memset(a, 0, sizeof(mem_aln_t)); + memset(a, 0, sizeof(mem_alnreg_t)); // get the start and end of the seeded region rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; // get the max possible span @@ -347,17 +345,14 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); } else a->qe = l_query, a->re = s->rbeg + s->len; - + /* a->is_all = 1; if (c->n > 1) { // check if all the seeds have been included s = &c->seeds[c->n - 1]; if (s->qbeg + s->len > a->qe) a->is_all = 0; } - - //printf("[Q] "); for (i = a->qb; i < a->qe; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = a->rb; i < a->re; ++i) putchar("ACGTN"[(int)rseq[i - rmax[0]]]); putchar('\n'); - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\tis_all=%d\t", c->n, a->score, a->qb, a->qe, a->rb, a->re, a->is_all); - putchar('\n'); + */ + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); free(rseq); } diff --git a/bwamem.h b/bwamem.h index 6d8049b..ef951c3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,8 +31,8 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe, is_all; -} mem_aln_t; + int score, qb, qe; +} mem_alnreg_t; #ifdef __cplusplus extern "C" { @@ -47,8 +47,9 @@ mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t 
*bwt, int len, const uint8_t *seq); -void mem_chain_flt(const mem_opt_t *opt, mem_chain_t *chn); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_aln_t *a); +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a); +uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); #ifdef __cplusplus } diff --git a/fastmap.c b/fastmap.c index f02224a..c5667e7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -51,10 +51,10 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); - mem_chain_flt(opt, &chain); + chain.n = mem_chain_flt(opt, chain.n, chain.chains); for (i = 0; i < chain.n; ++i) { mem_chain1_t *p = &chain.chains[i]; - mem_aln_t a; + mem_alnreg_t a; mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p, &a); printf("%d\t%d", i, p->n); for (j = 0; j < p->n; ++j) { From 5a0b32bfd24e89f70460c85bdb03d7dd1d87045a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 14:38:40 -0500 Subject: [PATCH 177/498] updated to the latest kseq.h --- Makefile | 2 +- bntseq.c | 2 +- bseq.c | 4 ++ bseq.h | 15 ++++++ bwamem.c | 30 +++++++++++ bwamem.h | 2 +- bwaseqio.c | 2 +- bwtsw2_aux.c | 2 +- fastmap.c | 2 +- kseq.h | 137 ++++++++++++++++++++++++++++++--------------------- simple_dp.c | 2 +- 11 files changed, 138 insertions(+), 62 deletions(-) create mode 100644 bseq.c create mode 100644 bseq.h diff --git a/Makefile b/Makefile index 04fd7a0..46e0b80 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 
#-D_FILE_OFFSET_BITS=64 LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o stdaln.o \ - bwaseqio.o bwase.o kstring.o + bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ bwape.o cs2nt.o \ diff --git a/bntseq.c b/bntseq.c index 18abb2b..06d82a0 100644 --- a/bntseq.c +++ b/bntseq.c @@ -35,7 +35,7 @@ #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, diff --git a/bseq.c b/bseq.c new file mode 100644 index 0000000..0ec57fa --- /dev/null +++ b/bseq.c @@ -0,0 +1,4 @@ +#include +#include "bseq.h" +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) diff --git a/bseq.h b/bseq.h new file mode 100644 index 0000000..73afb63 --- /dev/null +++ b/bseq.h @@ -0,0 +1,15 @@ +#ifndef BATCHSEQ_H_ +#define BATCHSEQ_H_ + +typedef struct { + char *name, *comment, *seq, *qual; +} bseq1_t; + +typedef struct { + int n, m; + bseq1_t *seqs; +} bseq_t; + +int bseq_read(int chunk_size, bseq_t *bs); + +#endif diff --git a/bwamem.c b/bwamem.c index c5c6366..6d08bb8 100644 --- a/bwamem.c +++ b/bwamem.c @@ -274,6 +274,32 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) return n; } +#define alnreg_lt(a, b) ((a).score > (b).score) +KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) + +int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) +{ // similar to the loop in mem_chain_flt() + int i, j, m; + if (n <= 1) return n; + ks_introsort(mem_ar, n, a); + for (i = 0; i < n; ++i) a[i].sub = 0; + for (i = 1, m = 1; i < n; ++i) { + for (j = 0; j < m; ++j) { + int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; + int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; + if (e_min > b_max) { // have overlap + int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? 
a[i].qe - a[i].qb : a[j].qe - a[j].qb; + if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap + if (a[j].sub == 0) a[j].sub = a[i].score; + break; + } + } + } + if (j == m) a[m++] = a[i]; + } + return m; +} + /**************************************** * Construct the alignment from a chain * ****************************************/ @@ -388,3 +414,7 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, free(rseq); return cigar; } + +/**************** + * Sequence I/O * + ****************/ diff --git a/bwamem.h b/bwamem.h index ef951c3..d69f15a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,7 +31,7 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe; + int score, qb, qe, sub; } mem_alnreg_t; #ifdef __cplusplus diff --git a/bwaseqio.c b/bwaseqio.c index e22d4cd..c1e9f97 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -5,7 +5,7 @@ #include "bamlite.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 5e8161c..619930b 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -15,7 +15,7 @@ #include "kstring.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) #include "ksort.h" #define __left_lt(a, b) ((a).end > (b).end) diff --git a/fastmap.c b/fastmap.c index c5667e7..475667b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -7,7 +7,7 @@ #include "bwamem.h" #include "kvec.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) +KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; diff --git a/kseq.h b/kseq.h index ad8937c..a5cec7c 100644 --- a/kseq.h +++ b/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, by Heng Li + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,6 
+23,8 @@ SOFTWARE. */ +/* Last Modified: 05MAR2012 */ + #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -30,9 +32,14 @@ #include #include +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ - char *buf; \ + unsigned char *buf; \ int begin, end, is_eof; \ type_t f; \ } kstream_t; @@ -45,7 +52,7 @@ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; \ - ks->buf = (char*)malloc(__bufsize); \ + ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ @@ -82,10 +89,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ - str->l = 0; \ + str->l = append? str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -97,14 +104,20 @@ typedef struct __kstring_t { if (ks->end == 0) break; \ } else break; \ } \ - if (delimiter) { \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ - } else { \ + } else if (delimiter == KS_SEP_SPACE) { \ for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i])) break; \ - } \ - if (str->m - str->l < i - ks->begin + 1) { \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! 
*/ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ @@ -117,9 +130,15 @@ typedef struct __kstring_t { break; \ } \ } \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ - } + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -127,19 +146,16 @@ typedef struct __kstring_t { __KS_GETC(__read, __bufsize) \ __KS_GETUNTIL(__read, __bufsize) -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ { \ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ + SCOPE void kseq_destroy(kseq_t *ks) \ { \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ @@ -152,44 +168,46 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char 
has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ 
\ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \ - return seq->seq.l; \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@ -199,10 +217,19 @@ typedef struct __kstring_t { kstream_t *f; \ } kseq_t; -#define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); #endif diff --git a/simple_dp.c b/simple_dp.c index 7c078c2..d2b4b71 100644 --- a/simple_dp.c +++ b/simple_dp.c @@ -8,7 +8,7 @@ #include "utils.h" #include "kseq.h" -KSEQ_INIT(gzFile, gzread) 
+KSEQ_DECLARE(gzFile) typedef struct { int l; From 901d28d5f54c6e58a966c233b471c315df453580 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 15:03:09 -0500 Subject: [PATCH 178/498] code backup --- bseq.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ bseq.h | 8 ++------ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/bseq.c b/bseq.c index 0ec57fa..0889851 100644 --- a/bseq.c +++ b/bseq.c @@ -1,4 +1,55 @@ #include +#include +#include +#include #include "bseq.h" #include "kseq.h" KSEQ_INIT2(, gzFile, gzread) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(s->comment) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? 
m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n++]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + *n_ = n; + if (size < chunk_size) { // test if the 2nd file is finished + if (kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + return seqs; +} diff --git a/bseq.h b/bseq.h index 73afb63..b54a268 100644 --- a/bseq.h +++ b/bseq.h @@ -2,14 +2,10 @@ #define BATCHSEQ_H_ typedef struct { + int l_seq; char *name, *comment, *seq, *qual; } bseq1_t; -typedef struct { - int n, m; - bseq1_t *seqs; -} bseq_t; - -int bseq_read(int chunk_size, bseq_t *bs); +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); #endif From a09db6903736bd42933847277c1a734190b0e3ab Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Feb 2013 17:12:27 -0500 Subject: [PATCH 179/498] In bwtsw, replace the batch seq-reader with bseq --- bseq.c | 2 +- bwtsw2_aux.c | 54 ++++++++++++++++++---------------------------------- 2 files changed, 19 insertions(+), 37 deletions(-) diff --git a/bseq.c b/bseq.c index 0889851..54a25f6 100644 --- a/bseq.c +++ b/bseq.c @@ -48,7 +48,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) } *n_ = n; if (size < chunk_size) { // test if the 2nd file is finished - if (kseq_read(ks2) >= 0) + if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } return seqs; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 619930b..a18ffc8 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,6 +13,7 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" +#include "bseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -756,24 +757,14 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * _seq->n = 0; } -static void kseq_to_bsw2seq(const kseq_t *ks, 
bsw2seq1_t *p) -{ - p->tid = -1; - p->l = ks->seq.l; - p->name = strdup(ks->name.s); - p->seq = strdup(ks->seq.s); - p->qual = ks->qual.l? strdup(ks->qual.s) : 0; - p->comment = ks->comment.l? strdup(ks->comment.s) : 0; - p->sam = 0; -} - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) { gzFile fp, fp2; kseq_t *ks, *ks2; - int l, size = 0, is_pe = 0; + int l, is_pe = 0, i, n; uint8_t *pac; bsw2seq_t *_seq; + bseq1_t *bseq; pac = calloc(bns->l_pac/4+1, 1); if (pac == 0) { @@ -791,34 +782,25 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c ks2 = kseq_init(fp2); is_pe = 1; } else fp2 = 0, ks2 = 0, is_pe = 0; - while (kseq_read(ks) >= 0) { - if (ks->name.l > 2 && ks->name.s[ks->name.l-2] == '/') - ks->name.l -= 2, ks->name.s[ks->name.l] = 0; - if (_seq->n == _seq->max) { - _seq->max = _seq->max? _seq->max<<1 : 1024; + while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { + int size = 0; + if (n > _seq->max) { + _seq->max = n; + kroundup32(_seq->max); _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } - kseq_to_bsw2seq(ks, &_seq->seq[_seq->n++]); - size += ks->seq.l; - if (ks2) { - if (kseq_read(ks2) >= 0) { - if (ks2->name.l > 2 && ks2->name.s[ks2->name.l-2] == '/') - ks2->name.l -= 2, ks2->name.s[ks2->name.l] = 0; - kseq_to_bsw2seq(ks2, &_seq->seq[_seq->n++]); // for PE, _seq->n here must be odd and we do not need to enlarge - size += ks->seq.l; - } else { - fprintf(stderr, "[%s] The second query file has fewer reads. 
Switched to the single-end mode for the following batches.\n", __func__); - is_pe = 0; - } - } - if (size > opt->chunk_size * opt->n_threads) { - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); - size = 0; + _seq->n = n; + for (i = 0; i < n; ++i) { + bseq1_t *b = &bseq[i]; + bsw2seq1_t *p = &_seq->seq[i]; + p->tid = -1; p->l = b->l_seq; + p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; + size += p->l; } + fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); + free(bseq); + process_seqs(_seq, opt, bns, pac, target, is_pe); } - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp)...\n", _seq->n, size); - process_seqs(_seq, opt, bns, pac, target, is_pe); // free free(pac); free(_seq->seq); free(_seq); From 5dc398cdef5324a0e9535dcdd59a602007067134 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 13:13:43 -0500 Subject: [PATCH 180/498] start to write CLI --- bwamem.c | 55 ++++++++++++++++++++++++++++++++----------------------- bwamem.h | 23 ++++++++++++++--------- fastmap.c | 35 ++++++++++++++++++++++++++++------- 3 files changed, 74 insertions(+), 39 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6d08bb8..69c085d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -29,6 +29,10 @@ mem_opt_t *mem_opt_init() o->max_chain_gap = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; + o->chunk_size = 10000000; + o->n_threads = 1; + o->pe_dir = 0<<1|1; + o->is_pe = 0; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -119,9 +123,9 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) #include "kbtree.h" #define chain_cmp(a, b) ((a).pos - (b).pos) -KBTREE_INIT(chn, mem_chain1_t, chain_cmp) +KBTREE_INIT(chn, mem_chain_t, chain_cmp) -static int test_and_merge(const mem_opt_t *opt, mem_chain1_t *c, const mem_seed_t *p) +static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p) { 
int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; @@ -153,7 +157,7 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i int64_t k; if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive for (k = 0; k < p->x[2]; ++k) { - mem_chain1_t tmp, *lower, *upper; + mem_chain_t tmp, *lower, *upper; mem_seed_t s; int to_add = 0; s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference @@ -174,24 +178,23 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i } } -mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { - mem_chain_t chain; + mem_chain_v chain; smem_i *itr; kbtree_t(chn) *tree; - memset(&chain, 0, sizeof(mem_chain_t)); + kv_init(chain); if (len < opt->min_seed_len) return chain; // if the query is shorter than the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); smem_set_query(itr, len, seq); mem_insert_seed(opt, tree, itr); - chain.m = kb_size(tree); chain.n = 0; - chain.chains = malloc(chain.m * sizeof(mem_chain1_t)); + kv_resize(mem_chain_t, chain, kb_size(tree)); - #define traverse_func(p_) (chain.chains[chain.n++] = *(p_)) - __kb_traverse(mem_chain1_t, tree, traverse_func); + #define traverse_func(p_) (chain.a[chain.n++] = *(p_)) + __kb_traverse(mem_chain_t, tree, traverse_func); #undef traverse_func smem_itr_destroy(itr); @@ -211,14 +214,14 @@ typedef struct { #define flt_lt(a, b) ((a).w > (b).w) KSORT_INIT(mem_flt, flt_aux_t, flt_lt) -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) { flt_aux_t *a; int i, j, n; if (n_chn <= 1) return n_chn; // no need to filter a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i 
< n_chn; ++i) { - mem_chain1_t *c = &chains[i]; + mem_chain_t *c = &chains[i]; int w = 0; for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps a[i].beg = c->seeds[0].qbeg; @@ -227,13 +230,13 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) } ks_introsort(mem_flt, n_chn, a); { // reorder chains such that the best chain appears first - mem_chain1_t *swap; - swap = malloc(sizeof(mem_chain1_t) * n_chn); + mem_chain_t *swap; + swap = malloc(sizeof(mem_chain_t) * n_chn); for (i = 0; i < n_chn; ++i) { - swap[i] = *((mem_chain1_t*)a[i].p); + swap[i] = *((mem_chain_t*)a[i].p); a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed } - memcpy(chains, swap, sizeof(mem_chain1_t) * n_chn); + memcpy(chains, swap, sizeof(mem_chain_t) * n_chn); free(swap); } for (i = 1, n = 1; i < n_chn; ++i) { @@ -252,14 +255,14 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains) if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. } for (i = 0; i < n; ++i) { // mark chains to be kept - mem_chain1_t *c = (mem_chain1_t*)a[i].p; + mem_chain_t *c = (mem_chain_t*)a[i].p; if (c->n > 0) c->n = -c->n; - c = (mem_chain1_t*)a[i].p2; + c = (mem_chain_t*)a[i].p2; if (c && c->n > 0) c->n = -c->n; } free(a); for (i = 0; i < n_chn; ++i) { // free discarded chains - mem_chain1_t *c = &chains[i]; + mem_chain_t *c = &chains[i]; if (c->n >= 0) { free(c->seeds); c->n = c->m = 0; @@ -310,7 +313,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? l : 1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds int i, j, qbeg; int64_t rlen, rbeg, rmax[2], tmp; @@ -415,6 +418,12 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, return cigar; } -/**************** - * Sequence I/O * - ****************/ +static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +{ +} + +int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) +{ + int i; + return 0; +} diff --git a/bwamem.h b/bwamem.h index d69f15a..7d921fd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -2,6 +2,9 @@ #define BWAMEM_H_ #include "bwt.h" +#include "bntseq.h" +#include "bseq.h" +#include "kvec.h" struct __smem_i; typedef struct __smem_i smem_i; @@ -14,19 +17,16 @@ typedef struct { typedef struct { int a, b, q, r, w; int min_seed_len, max_occ, max_chain_gap; - int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + int n_threads, chunk_size; + int pe_dir, is_pe; float mask_level, chain_drop_ratio; + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { int n, m; int64_t pos; mem_seed_t *seeds; -} mem_chain1_t; - -typedef struct { - int n, m; - mem_chain1_t *chains; } mem_chain_t; typedef struct { @@ -34,6 +34,9 @@ typedef struct { int score, qb, qe, sub; } mem_alnreg_t; +typedef kvec_t(mem_chain_t) mem_chain_v; +typedef kvec_t(mem_alnreg_t) mem_alnreg_v; + #ifdef __cplusplus extern "C" { #endif @@ -46,11 +49,13 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len); mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); -mem_chain_t mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain1_t *chains); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain1_t *c, mem_alnreg_t *a); 
+mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); +int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a); uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); +int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + #ifdef __cplusplus } #endif diff --git a/fastmap.c b/fastmap.c index 475667b..32f8db0 100644 --- a/fastmap.c +++ b/fastmap.c @@ -6,6 +6,7 @@ #include "bwt.h" #include "bwamem.h" #include "kvec.h" +#include "bseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -16,10 +17,11 @@ int main_mem(int argc, char *argv[]) mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; - int i, j, c; - gzFile fp; - kseq_t *seq; + int i, c, n; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; uint8_t *pac = 0; + bseq1_t *seqs; opt = mem_opt_init(); while ((c = getopt(argc, argv, "")) >= 0) { @@ -32,8 +34,6 @@ int main_mem(int argc, char *argv[]) return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - fp = gzopen(argv[optind + 1], "r"); - seq = kseq_init(fp); { // load the packed sequences, BWT and SA char *tmp = calloc(strlen(argv[optind]) + 5, 1); strcat(strcpy(tmp, argv[optind]), ".bwt"); @@ -45,6 +45,22 @@ int main_mem(int argc, char *argv[]) pac = calloc(bns->l_pac/4+1, 1); fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); } + + fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); + ks = kseq_init(fp); + if (optind + 2 < argc) { + fp2 = gzopen(argv[optind + 2], "r"); + ks2 = kseq_init(fp); + opt->is_pe = 1; + } + while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + mem_process_seqs(opt, bwt, bns, pac, n, seqs); + for (i = 0; i < n; ++i) { + free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); + } + free(seqs); + } + /* while (kseq_read(seq) >= 0) { mem_chain_t chain; printf(">%s\n", seq->name.s); @@ -71,12 +87,17 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); free(chain.chains); } + */ - free(pac); free(opt); + free(opt); free(pac); bns_destroy(bns); bwt_destroy(bwt); - kseq_destroy(seq); + kseq_destroy(ks); gzclose(fp); + if (ks2) { + kseq_destroy(ks2); + gzclose(fp2); + } return 0; } From bfeb37c4dedd344a81da30af7bad0a37c033fe81 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 13:29:01 -0500 Subject: [PATCH 181/498] code backup --- bwamem.c | 44 +++++++++++++++++++++++++++++++++++++++++++- bwamem.h | 2 +- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 69c085d..51bcd3c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -2,8 +2,10 @@ #include #include #include +#ifdef HAVE_PTHREAD +#include +#endif #include "bwamem.h" -#include "kvec.h" #include "bntseq.h" #include "ksw.h" #include "ksort.h" @@ -422,8 +424,48 @@ static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t { } +typedef struct { + int start, step, n; + const mem_opt_t *opt; + const bwt_t *bwt; + const bntseq_t *bns; + const uint8_t *pac; + bseq1_t *seqs; +} worker1_t; + +static void *worker1(void *data) +{ + worker1_t *w = (worker1_t*)data; + int i; + for (i = w->start; i < w->n; i += w->step) + process_seq1(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + return 0; +} + int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t 
*bns, const uint8_t *pac, int n, bseq1_t *seqs) { int i; + worker1_t *w1; + w1 = calloc(opt->n_threads, sizeof(worker1_t)); + for (i = 0; i < opt->n_threads; ++i) { + worker1_t *w = &w1[i]; + w->start = i; w->step = opt->n_threads; w->n = n; + w->opt = opt; w->bwt = bwt; w->bns = bns; w->pac = pac; + w->seqs = seqs; + } +#ifdef HAVE_PTHREAD + if (opt->n_threads == 1) { + worker1(w1); + } else { + pthread_t *tid; + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w1[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + free(tid); + } +#else + worker1(w1); +#endif + free(w1); return 0; } diff --git a/bwamem.h b/bwamem.h index 7d921fd..7215ad3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -34,7 +34,7 @@ typedef struct { int score, qb, qe, sub; } mem_alnreg_t; -typedef kvec_t(mem_chain_t) mem_chain_v; +typedef kvec_t(mem_chain_t) mem_chain_v; typedef kvec_t(mem_alnreg_t) mem_alnreg_v; #ifdef __cplusplus From 1fd51fc3f7ac5887e27f6f2be356bfd295729bcb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 14:36:18 -0500 Subject: [PATCH 182/498] code backup --- bseq.h | 2 +- bwamem.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++------- fastmap.c | 5 +--- kstring.h | 45 ++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 19 deletions(-) diff --git a/bseq.h b/bseq.h index b54a268..978312a 100644 --- a/bseq.h +++ b/bseq.h @@ -3,7 +3,7 @@ typedef struct { int l_seq; - char *name, *comment, *seq, *qual; + char *name, *comment, *seq, *qual, *sam; } bseq1_t; bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); diff --git a/bwamem.c b/bwamem.c index 51bcd3c..6b8d365 100644 --- a/bwamem.c +++ b/bwamem.c @@ -5,6 +5,7 @@ #ifdef HAVE_PTHREAD #include #endif +#include "kstring.h" #include "bwamem.h" #include "bntseq.h" #include "ksw.h" @@ -420,8 +421,51 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, return 
cigar; } -static void process_seq1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +/************************ + * Integrated interface * + ************************/ + +void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { + int k, n_cigar = 0, score, is_rev, nn, rid, i; + uint32_t *cigar = 0; + int64_t pos; + kstring_t str; + mem_alnreg_t *p; + + str.l = str.m = 0; str.s = 0; + k = mem_choose_alnreg_se(opt, a->n, a->a); + p = &a->a[k]; + cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + pos = bns_depos(bns, p->rb, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + kputs(s->name, &str); kputc('\t', &str); kputw(is_rev? 16 : 0, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset, &str); kputc('\t', &str); + kputw(0, &str); kputc('\t', &str); + for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; + kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); + if (s->qual) kputsn(s->qual, s->l_seq, &str); + free(cigar); + s->sam = str.s; +} + +static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +{ + int i; + mem_chain_v chn; + mem_alnreg_v regs; + for (i = 0; i < s->l_seq; ++i) + s->seq[i] = nst_nt4_table[(int)s->seq[i]]; + chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); + chn.n = mem_chain_flt(opt, chn.n, chn.a); + regs.n = regs.m = chn.n; + regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); + for (i = 0; i < chn.n; ++i) { + mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], ®s.a[i]); + free(chn.a[i].seeds); + } + free(chn.a); + return regs; } typedef struct { @@ -431,41 +475,65 @@ typedef struct { const bntseq_t *bns; const uint8_t *pac; bseq1_t *seqs; -} worker1_t; + mem_alnreg_v *regs; +} worker_t; static void 
*worker1(void *data) { - worker1_t *w = (worker1_t*)data; + worker_t *w = (worker_t*)data; int i; for (i = w->start; i < w->n; i += w->step) - process_seq1(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + return 0; +} + +static void *worker2(void *data) +{ + worker_t *w = (worker_t*)data; + int i; + if (!w->opt->is_pe) { + for (i = 0; i < w->n; i += w->step) { + mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + free(w->regs[i].a); + } + } else { + for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet + free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); + } + } return 0; } int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) { int i; - worker1_t *w1; - w1 = calloc(opt->n_threads, sizeof(worker1_t)); + worker_t *w; + w = calloc(opt->n_threads, sizeof(worker_t)); for (i = 0; i < opt->n_threads; ++i) { - worker1_t *w = &w1[i]; + worker_t *w = &w[i]; w->start = i; w->step = opt->n_threads; w->n = n; w->opt = opt; w->bwt = bwt; w->bns = bns; w->pac = pac; w->seqs = seqs; } #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { - worker1(w1); + worker1(w); worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w1[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else - worker1(w1); + worker1(w); worker2(w); #endif - free(w1); + for (i = 0; i < n; ++i) { + puts(seqs[i].sam); + free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); + } + free(w); return 0; } diff --git a/fastmap.c b/fastmap.c index 
32f8db0..812c2db 100644 --- a/fastmap.c +++ b/fastmap.c @@ -17,7 +17,7 @@ int main_mem(int argc, char *argv[]) mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; - int i, c, n; + int c, n; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; uint8_t *pac = 0; @@ -55,9 +55,6 @@ int main_mem(int argc, char *argv[]) } while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { mem_process_seqs(opt, bwt, bns, pac, n, seqs); - for (i = 0; i < n; ++i) { - free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); - } free(seqs); } /* diff --git a/kstring.h b/kstring.h index 398901f..cf14e39 100644 --- a/kstring.h +++ b/kstring.h @@ -16,19 +16,24 @@ typedef struct __kstring_t { } kstring_t; #endif -static inline int kputs(const char *p, kstring_t *s) +static inline int kputsn(const char *p, int l, kstring_t *s) { - int l = strlen(p); if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } - strcpy(s->s + s->l, p); + memcpy(s->s + s->l, p, l); s->l += l; + s->s[s->l] = 0; return l; } +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + static inline int kputc(int c, kstring_t *s) { if (s->l + 1 >= s->m) { @@ -41,6 +46,40 @@ static inline int kputc(int c, kstring_t *s) return c; } +static inline int kputw(int c, kstring_t *s) +{ + char buf[16]; + int l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + +static inline int kputuw(unsigned c, kstring_t *s) +{ + char buf[16]; + int l, i; + unsigned x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; + s->s[s->l] = 0; + return 0; +} + int ksprintf(kstring_t *s, const char *fmt, ...); #endif From 49f2bcc01570d2744d1b7c4387fc495698b20fcf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 14:57:22 -0500 Subject: [PATCH 183/498] CIGAR is wrong, but the rest is okay --- bwamem.c | 58 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6b8d365..e3f09c1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -427,25 +427,37 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int k, n_cigar = 0, score, is_rev, nn, rid, i; - uint32_t *cigar = 0; - int64_t pos; + int k, m; kstring_t str; - mem_alnreg_t *p; str.l = str.m = 0; str.s = 0; - k = mem_choose_alnreg_se(opt, a->n, a->a); - p = &a->a[k]; - cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); - pos = bns_depos(bns, p->rb, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - kputs(s->name, &str); kputc('\t', &str); kputw(is_rev? 
16 : 0, &str); kputc('\t', &str); - kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset, &str); kputc('\t', &str); - kputw(0, &str); kputc('\t', &str); - for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; - kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); - if (s->qual) kputsn(s->qual, s->l_seq, &str); - free(cigar); + m = mem_choose_alnreg_se(opt, a->n, a->a); + for (k = 0; k < m; ++k) { + uint32_t *cigar = 0; + int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0; + int64_t pos, end; + mem_alnreg_t *p = &a->a[k]; + cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + flag |= is_rev? 16 : 0; + kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); + kputw(0, &str); kputc('\t', &str); + if (n_cigar) { + for (i = 0; i < n_cigar; ++i) { + kputw(cigar[i]>>4, &str); kputc("MIDSH"[cigar[i]&0xf], &str); + } + } else kputc('*', &str); + kputsn("\t*\t0\t0\t", 7, &str); + for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; + kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); + if (s->qual) kputsn(s->qual, s->l_seq, &str); + kputsn("\tAS:i:", 6, &str); kputw(score, &str); + kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); + kputc('\n', &str); + free(cigar); + } s->sam = str.s; } @@ -508,12 +520,14 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns { int i; worker_t *w; + mem_alnreg_v *regs; w = calloc(opt->n_threads, sizeof(worker_t)); + regs = malloc(n * sizeof(mem_alnreg_v)); for (i = 0; i < opt->n_threads; ++i) { - worker_t *w = &w[i]; - w->start = i; w->step = opt->n_threads; w->n = n; - w->opt = opt; w->bwt = bwt; w->bns = bns; w->pac = 
pac; - w->seqs = seqs; + worker_t *p = &w[i]; + p->start = i; p->step = opt->n_threads; p->n = n; + p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; + p->seqs = seqs; p->regs = regs; } #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { @@ -531,9 +545,9 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns worker1(w); worker2(w); #endif for (i = 0; i < n; ++i) { - puts(seqs[i].sam); + fputs(seqs[i].sam, stdout); free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); } - free(w); + free(regs); free(w); return 0; } From 27fdf6397db0568a72cbc598f9292d2f75fa93fb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 15:52:36 -0500 Subject: [PATCH 184/498] single-end working! no mapQ, though --- bwamem.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/bwamem.c b/bwamem.c index e3f09c1..0d864d3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -371,8 +371,6 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } } } - //printf("[Q] "); for (i = qe; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = re; i < rmax[1] - rmax[0]; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); @@ -384,8 +382,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } */ - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); - + //printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); free(rseq); } @@ -403,8 +400,10 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, for (i = 0; i < l_query>>1; ++i) tmp = query[i], 
query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; for (i = 0; i < rlen>>1; ++i) - tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], query[rlen - 1 - i] = tmp; + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); // set the band-width w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); w = w < 1? w : 1; @@ -429,18 +428,21 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b { int k, m; kstring_t str; + char *seq; str.l = str.m = 0; str.s = 0; m = mem_choose_alnreg_se(opt, a->n, a->a); + seq = malloc(s->l_seq); for (k = 0; k < m; ++k) { uint32_t *cigar = 0; int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0; - int64_t pos, end; + int64_t pos; mem_alnreg_t *p = &a->a[k]; cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); flag |= is_rev? 
16 : 0; + if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); kputw(0, &str); kputc('\t', &str); @@ -450,14 +452,17 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b } } else kputc('*', &str); kputsn("\t*\t0\t0\t", 7, &str); - for (i = 0; i < s->l_seq; ++i) s->seq[i] = "ACGTN"[(int)s->seq[i]]; - kputsn(s->seq, s->l_seq, &str); kputc('\t', &str); + if (is_rev) for (i = s->l_seq - 1; i >= 0; --i) seq[i] = "TGCAN"[(int)s->seq[i]]; + else for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; + kputsn(seq, s->l_seq, &str); kputc('\t', &str); if (s->qual) kputsn(s->qual, s->l_seq, &str); - kputsn("\tAS:i:", 6, &str); kputw(score, &str); + kputsn("\tAS:i:", 6, &str); kputw(p->score, &str); kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); + kputsn("\tnw:i:", 6, &str); kputw(score, &str); kputc('\n', &str); free(cigar); } + free(seq); s->sam = str.s; } From ff3fea115cefc4b84909720ad89c24127893816c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 16:27:11 -0500 Subject: [PATCH 185/498] write soft clip; added debugging code --- bwamem.c | 30 ++++++++++++++++++++++++++++-- fastmap.c | 31 ++----------------------------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0d864d3..f85d5e3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -181,6 +181,24 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i } } +void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) +{ + int i, j; + for (i = 0; i < chn->n; ++i) { + mem_chain_t *p = &chn->a[i]; + printf("%d", p->n); + for (j = 0; j < p->n; ++j) { + bwtint_t pos; + int is_rev, ref_id; + pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); + if (is_rev) pos -= p->seeds[j].len - 1; + bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); + 
printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + } + putchar('\n'); + } +} + mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) { mem_chain_v chain; @@ -318,7 +336,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, j, qbeg; + int i, qbeg; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; @@ -357,8 +375,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; +#if 0 if (c->n > 1) { // generate $qw - int l = rmax[1] - (s->rbeg + s->len); + int j, l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default for (i = 1; i < c->n; ++i) { @@ -371,6 +390,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } } } +#endif a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; free(qw); @@ -447,9 +467,14 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); kputw(0, &str); kputc('\t', &str); if (n_cigar) { + int clip5, clip3; + clip5 = is_rev? s->l_seq - p->qe : p->qb; + clip3 = is_rev? 
p->qb : s->l_seq - p->qe; + if (clip5) { kputw(clip5, &str); kputc('S', &str); } for (i = 0; i < n_cigar; ++i) { kputw(cigar[i]>>4, &str); kputc("MIDSH"[cigar[i]&0xf], &str); } + if (clip3) { kputw(clip3, &str); kputc('S', &str); } } else kputc('*', &str); kputsn("\t*\t0\t0\t", 7, &str); if (is_rev) for (i = s->l_seq - 1; i >= 0; --i) seq[i] = "TGCAN"[(int)s->seq[i]]; @@ -475,6 +500,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); + //mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { diff --git a/fastmap.c b/fastmap.c index 812c2db..b27b1df 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,7 +24,8 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "")) >= 0) { + while ((c = getopt(argc, argv, "k:")) >= 0) { + if (c == 'k') opt->min_seed_len = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); @@ -57,34 +58,6 @@ int main_mem(int argc, char *argv[]) mem_process_seqs(opt, bwt, bns, pac, n, seqs); free(seqs); } - /* - while (kseq_read(seq) >= 0) { - mem_chain_t chain; - printf(">%s\n", seq->name.s); - for (i = 0; i < seq->seq.l; ++i) - seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - chain = mem_chain(opt, bwt, seq->seq.l, (uint8_t*)seq->seq.s); - chain.n = mem_chain_flt(opt, chain.n, chain.chains); - for (i = 0; i < chain.n; ++i) { - mem_chain1_t *p = &chain.chains[i]; - mem_alnreg_t a; - mem_chain2aln(opt, bns->l_pac, pac, seq->seq.l, (uint8_t*)seq->seq.s, p, &a); - printf("%d\t%d", i, p->n); - for (j = 0; j < p->n; ++j) { - bwtint_t pos; - int is_rev, ref_id; - pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); - if (is_rev) pos -= p->seeds[j].len - 1; - bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); - printf("\t%d,%d,%s:%c%ld", 
p->seeds[j].len, p->seeds[j].qbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); - } - putchar('\n'); - } - puts("//"); - for (i = 0; i < chain.n; ++i) free(chain.chains[i].seeds); - free(chain.chains); - } - */ free(opt); free(pac); bns_destroy(bns); From 6ba11ab68cd42763e4fe2911210f5f04211b785d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 16:42:01 -0500 Subject: [PATCH 186/498] no effective changes --- bwamem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index f85d5e3..e3805d3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -375,7 +375,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int qle, tle, qe, re; int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; -#if 0 +#if 0 // FIXME: I am not sure if the following block works. Comment it out if SW extension gives unexpected result. if (c->n > 1) { // generate $qw int j, l = rmax[1] - (s->rbeg + s->len); qw = malloc(l * 2); @@ -385,10 +385,11 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (j = 0; j < t->len; ++j) { int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); if (x < 0) continue; // overlap with the first seed - if (qw[x] == -1) qw[x] = x > y? x - y : y - x; + if (qw[x] == -1) qw[x] = (x > y? 
x - y : y - x) + 1; // FIXME: in principle, we should not need +1 else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint } } +// for (i = 0; i < l; ++i) printf("%d:%d\t", i, qw[i]); putchar('\n'); } #endif a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); @@ -500,7 +501,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); - //mem_print_chain(bns, &chn); +// mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { From 83a49f32100868c7ca91d17c08f9bf59e4c489e1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 17:15:45 -0500 Subject: [PATCH 187/498] compute mapQ; extend from the longest seed --- bwamem.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/bwamem.c b/bwamem.c index e3805d3..6b914b5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef HAVE_PTHREAD #include #endif @@ -11,6 +12,8 @@ #include "ksw.h" #include "ksort.h" +#define MAPQ_COEF 40. + void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; @@ -336,14 +339,12 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds - int i, qbeg; + int i, qbeg, max = 0, max_i = -1; int64_t rlen, rbeg, rmax[2], tmp; const mem_seed_t *s; uint8_t *rseq = 0; memset(a, 0, sizeof(mem_alnreg_t)); - // get the start and end of the seeded region - rbeg = c->seeds[0].rbeg; qbeg = c->seeds[0].qbeg; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -353,7 +354,10 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); rmax[0] = rmax[0] < b? rmax[0] : b; rmax[1] = rmax[1] > e? rmax[1] : e; + if (max < t->len) max = t->len, max_i = i; } + // get the start and end of the seeded region + rbeg = c->seeds[max_i].rbeg; qbeg = c->seeds[max_i].qbeg; // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); @@ -365,12 +369,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int tmp = rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[0].len * opt->a, 0, &qle, &tle); + a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[max_i].len * opt->a, 0, &qle, &tle); a->qb = qbeg - qle; a->rb = rbeg - tle; free(qs); free(rs); - } else a->score = c->seeds[0].len * opt->a, a->qb = 0, a->rb = rbeg; + } else a->score = c->seeds[max_i].len * opt->a, a->qb = 0, a->rb = rbeg; - s = &c->seeds[0]; + s = &c->seeds[max_i]; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; int16_t *qw = 0; @@ -456,7 +460,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b seq = malloc(s->l_seq); for (k = 0; k < m; ++k) { uint32_t *cigar = 0; - int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0; + int score, is_rev, nn, rid, i, 
flag = 0, n_cigar = 0, mapq = 0; int64_t pos; mem_alnreg_t *p = &a->a[k]; cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); @@ -466,7 +470,9 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - kputw(0, &str); kputc('\t', &str); + mapq = p->score? (int)(MAPQ_COEF * (1. - (float)(p->sub? p->sub : opt->min_seed_len * opt->a) / p->score) * log(p->score / opt->a) + .499) : 0; + if (mapq > 60) mapq = 60; + kputw(mapq, &str); kputc('\t', &str); if (n_cigar) { int clip5, clip3; clip5 = is_rev? s->l_seq - p->qe : p->qb; From cd6bd524d48e83a338c8566c2343827a036188c5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 19:50:37 -0500 Subject: [PATCH 188/498] discard internal seeds shorter than half --- bwamem.c | 20 +++++++++++++------- fastmap.c | 5 ++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6b914b5..456edef 100644 --- a/bwamem.c +++ b/bwamem.c @@ -14,6 +14,8 @@ #define MAPQ_COEF 40. 
+int mem_debug = 0; + void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; @@ -30,8 +32,8 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; - o->min_seed_len = 17; - o->max_occ = 10; + o->min_seed_len = 19; + o->max_occ = 50; o->max_chain_gap = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; @@ -84,6 +86,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } + const bwtintv_v *smem_next(smem_i *itr, int split_len) { int i, max, max_i; @@ -110,13 +113,15 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; - } else { + } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) { kv_push(bwtintv_t, *a, itr->sub->a[j]); ++j; - } + } else ++j; } for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); - for (; j < itr->sub->n; ++j) kv_push(bwtintv_t, *a, itr->sub->a[j]); + for (; j < itr->sub->n; ++j) + if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) + kv_push(bwtintv_t, *a, itr->sub->a[j]); kv_copy(bwtintv_t, *itr->matches, *a); } return itr->matches; @@ -407,7 +412,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg + s->len > a->qe) a->is_all = 0; } */ - //printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); + if (mem_debug >= 2) + printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); free(rseq); } @@ -507,7 +513,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); -// mem_print_chain(bns, &chn); + if (mem_debug >= 1) mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * 
sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { diff --git a/fastmap.c b/fastmap.c index b27b1df..0d2354a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -11,6 +11,7 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; +extern int mem_debug; int main_mem(int argc, char *argv[]) { @@ -24,8 +25,10 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:D:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'c') opt->max_occ = atoi(optarg); + else if (c == 'D') mem_debug = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); From 45b0d3423aba34dfab8128cd250d7246a6e791fd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 20:07:31 -0500 Subject: [PATCH 189/498] bugfix: when no seed hits found --- bwamem.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 456edef..15a903c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -457,16 +457,24 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int k, m; + int i, k, m; kstring_t str; char *seq; str.l = str.m = 0; str.s = 0; m = mem_choose_alnreg_se(opt, a->n, a->a); seq = malloc(s->l_seq); + if (m == 0) { // no seeds found + for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; + kputs(s->name, &str); kputs("\t8\t*\t0\t0\t*\t*\t0\t0\t", &str); + kputsn(seq, s->l_seq, &str); + if (s->qual) kputsn(s->qual, s->l_seq, &str); + else kputc('*', &str); + kputc('\n', &str); + } for (k = 0; k < m; ++k) { uint32_t *cigar = 0; - int score, is_rev, nn, rid, i, flag = 0, n_cigar = 0, mapq = 0; + int score, is_rev, nn, rid, flag = 0, n_cigar = 0, mapq = 0; int64_t pos; mem_alnreg_t *p = &a->a[k]; cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], 
p->rb, p->re, &score, &n_cigar); @@ -494,6 +502,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b else for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; kputsn(seq, s->l_seq, &str); kputc('\t', &str); if (s->qual) kputsn(s->qual, s->l_seq, &str); + else kputc('*', &str); kputsn("\tAS:i:", 6, &str); kputw(p->score, &str); kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); kputsn("\tnw:i:", 6, &str); kputw(score, &str); From d890c7997cc5ab075b123cec42c0e656420ceefd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 21:20:36 -0500 Subject: [PATCH 190/498] better treatment for micro-repeat --- bwamem.c | 102 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 49 deletions(-) diff --git a/bwamem.c b/bwamem.c index 15a903c..0e6f3a2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -344,12 +344,13 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, qbeg, max = 0, max_i = -1; - int64_t rlen, rbeg, rmax[2], tmp; + int i, k; + int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; const mem_seed_t *s; uint8_t *rseq = 0; + mem_alnreg_t best; - memset(a, 0, sizeof(mem_alnreg_t)); + memset(&best, 0, sizeof(mem_alnreg_t)); // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -359,61 +360,64 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); rmax[0] = rmax[0] < b? rmax[0] : b; rmax[1] = rmax[1] > e? 
rmax[1] : e; - if (max < t->len) max = t->len, max_i = i; + if (t->len > max) max = t->len, max_i = i; } - // get the start and end of the seeded region - rbeg = c->seeds[max_i].rbeg; qbeg = c->seeds[max_i].qbeg; // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); - if (qbeg) { // left extension of the first seed - uint8_t *rs, *qs; - int qle, tle; - qs = malloc(qbeg); - for (i = 0; i < qbeg; ++i) qs[i] = query[qbeg - 1 - i]; - tmp = rbeg - rmax[0]; - rs = malloc(tmp); - for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, c->seeds[max_i].len * opt->a, 0, &qle, &tle); - a->qb = qbeg - qle; a->rb = rbeg - tle; - free(qs); free(rs); - } else a->score = c->seeds[max_i].len * opt->a, a->qb = 0, a->rb = rbeg; - - s = &c->seeds[max_i]; - if (s->qbeg + s->len != l_query) { // right extension of the first seed - int qle, tle, qe, re; - int16_t *qw = 0; - qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; + for (k = 0; k < c->n;) { + s = &c->seeds[k]; + memset(a, 0, sizeof(mem_alnreg_t)); + if (s->qbeg) { // left extension + uint8_t *rs, *qs; + int qle, tle; + qs = malloc(s->qbeg); + for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; + tmp = s->rbeg - rmax[0]; + rs = malloc(tmp); + for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, 0, &qle, &tle); + a->qb = s->qbeg - qle; a->rb = s->rbeg - tle; + free(qs); free(rs); + } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; + + if (s->qbeg + s->len != l_query) { // right extension of the first seed + int qle, tle, qe, re; + int16_t *qw = 0; + qe = s->qbeg + s->len; + re = s->rbeg + s->len - rmax[0]; #if 0 // FIXME: I am not sure if the following block works. Comment it out if SW extension gives unexpected result. 
- if (c->n > 1) { // generate $qw - int j, l = rmax[1] - (s->rbeg + s->len); - qw = malloc(l * 2); - for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default - for (i = 1; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - for (j = 0; j < t->len; ++j) { - int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); - if (x < 0) continue; // overlap with the first seed - if (qw[x] == -1) qw[x] = (x > y? x - y : y - x) + 1; // FIXME: in principle, we should not need +1 - else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint + if (c->n > 1) { // generate $qw + int j, l = rmax[1] - (s->rbeg + s->len); + qw = malloc(l * 2); + for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default + for (i = 1; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + for (j = 0; j < t->len; ++j) { + int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); + if (x < 0) continue; // overlap with the first seed + if (qw[x] == -1) qw[x] = (x > y? 
x - y : y - x) + 1; // FIXME: in principle, we should not need +1 + else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint + } } } -// for (i = 0; i < l; ++i) printf("%d:%d\t", i, qw[i]); putchar('\n'); - } #endif - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); - a->qe = qe + qle; a->re = rmax[0] + re + tle; - free(qw); - } else a->qe = l_query, a->re = s->rbeg + s->len; - /* - a->is_all = 1; - if (c->n > 1) { // check if all the seeds have been included - s = &c->seeds[c->n - 1]; - if (s->qbeg + s->len > a->qe) a->is_all = 0; + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); + a->qe = qe + qle; a->re = rmax[0] + re + tle; + free(qw); + } else a->qe = l_query, a->re = s->rbeg + s->len; + if (mem_debug >= 2) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + // check how many seeds have been covered + for (i = k + 1; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) + break; + } + if (i >= c->n) break; // all seeds are included; no need to proceed + if (a->score > best.score) best = *a; + k = i; } - */ - if (mem_debug >= 2) - printf("[%d] score=%d\t[%d,%d) <=> [%lld,%lld)\n", c->n, a->score, a->qb, a->qe, a->rb, a->re); + if (a->score < best.score) *a = best; free(rseq); } From d8e4d57956d61ae26ea8f6b116ac765cb53e52d3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 21:22:54 -0500 Subject: [PATCH 191/498] Don't use narrow band. I may retry this feature if the profiler indicates that this greatly helps.
--- bwamem.c | 22 ++-------------------- ksw.c | 9 ++++----- ksw.h | 2 +- 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0e6f3a2..b6cafc7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -376,35 +376,17 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, 0, &qle, &tle); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle); a->qb = s->qbeg - qle; a->rb = s->rbeg - tle; free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension of the first seed int qle, tle, qe, re; - int16_t *qw = 0; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; -#if 0 // FIXME: I am not sure if the following block works. Comment it out if SW extension gives unexpected result. - if (c->n > 1) { // generate $qw - int j, l = rmax[1] - (s->rbeg + s->len); - qw = malloc(l * 2); - for (i = 0; i < l; ++i) qw[i] = -1; // no constraint by default - for (i = 1; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - for (j = 0; j < t->len; ++j) { - int x = t->rbeg + j - (s->rbeg + s->len), y = t->qbeg + j - (s->qbeg + s->len); - if (x < 0) continue; // overlap with the first seed - if (qw[x] == -1) qw[x] = (x > y? 
x - y : y - x) + 1; // FIXME: in principle, we should not need +1 - else if (qw[x] >= 0) qw[x] = -2; // in a seed overlap, do not set any constraint - } - } - } -#endif - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, qw, &qle, &tle); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; - free(qw); } else a->qe = l_query, a->re = s->rbeg + s->len; if (mem_debug >= 2) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); // check how many seeds have been covered diff --git a/ksw.c b/ksw.c index 405bd86..08cdf56 100644 --- a/ksw.c +++ b/ksw.c @@ -319,7 +319,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle) { eh_t *eh; // score array int8_t *qp; // query profile @@ -348,15 +348,14 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max = h0, max_i = max_j = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { - int f = 0, h1, m = 0, mj = -1, t; + int f = 0, h1, m = 0, mj = -1; int8_t *q = &qp[target[i] * qlen]; // compute the first column h1 = h0 - (gapo + gape * (i + 1)); if (h1 < 0) h1 = 0; // apply the band and the constraint (if provided) - t = (qw && qw[i] >= 0 && qw[i] < w)? 
qw[i] : w; // this is the band width at $i - if (beg < i - t) beg = i - t; - if (end > i + t + 1) end = i + t + 1; + if (beg < i - w) beg = i - w; + if (end > i + w + 1) end = i + w + 1; if (end > qlen) end = qlen; for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) diff --git a/ksw.h b/ksw.h index d58f423..c7eaabb 100644 --- a/ksw.h +++ b/ksw.h @@ -49,7 +49,7 @@ extern "C" { /** Unified interface for ksw_sse2_8() and ksw_sse2_16() */ int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, const int16_t *qw, int *_qle, int *_tle); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle); int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); #ifdef __cplusplus From 245505deedfa6020b3e44cb619ee5c9821c16d01 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Feb 2013 22:09:58 -0500 Subject: [PATCH 192/498] minor improvement to mapQ approx. That is not good enough, but I am tired and need rest... 
--- bwamem.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index b6cafc7..a806a8b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -457,6 +457,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (s->qual) kputsn(s->qual, s->l_seq, &str); else kputc('*', &str); kputc('\n', &str); + goto ret_sam_se; } for (k = 0; k < m; ++k) { uint32_t *cigar = 0; @@ -470,8 +471,14 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - mapq = p->score? (int)(MAPQ_COEF * (1. - (float)(p->sub? p->sub : opt->min_seed_len * opt->a) / p->score) * log(p->score / opt->a) + .499) : 0; - if (mapq > 60) mapq = 60; + { // approximate mapQ + int sub = p->sub? p->sub : opt->min_seed_len * opt->a; + double identity; + mapq = p->score? (int)(MAPQ_COEF * (1. - (float)sub / p->score) * log(p->score / opt->a) + .499) : 0; + identity = (double)p->score / opt->a / (p->qe - p->qb); + mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; + if (mapq > 60) mapq = 60; + } kputw(mapq, &str); kputc('\t', &str); if (n_cigar) { int clip5, clip3; @@ -495,6 +502,8 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kputc('\n', &str); free(cigar); } + +ret_sam_se: free(seq); s->sam = str.s; } From 1bf1a674a821731367f966d8a6bc780a9d63366d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 13:43:15 -0500 Subject: [PATCH 193/498] minor improvement to mapQ --- bwamem.c | 30 ++++++++++++++++++++++-------- bwamem.h | 2 +- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/bwamem.c b/bwamem.c index a806a8b..cb064f8 100644 --- a/bwamem.c +++ b/bwamem.c @@ -401,6 +401,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } if (a->score < best.score) *a = best; free(rseq); + + // compute seedcov + if (c->n > 1) { + for (i = 0, a->seedcov = 0; i < c->n; ++i) { + s = &c->seeds[i]; + if (s->qbeg >= a->qb && s->qbeg + s->len <= a->qe && s->rbeg >= a->rb && s->rbeg + s->len <= a->re) // seed fully contained + a->seedcov += s->len; // this is not very accurate, but for approx. mapQ, this is good enough + } + } else a->seedcov = c->seeds[0].len; } uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) @@ -441,6 +450,18 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, * Integrated interface * ************************/ +static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) +{ + int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; + double identity; + l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; + mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; + identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; + mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; + if (mapq > 60) mapq = 60; + return mapq; +} + void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { int i, k, m; @@ -471,14 +492,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (n_cigar == 0) flag |= 8; kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - { // approximate mapQ - int sub = p->sub? p->sub : opt->min_seed_len * opt->a; - double identity; - mapq = p->score? (int)(MAPQ_COEF * (1. - (float)sub / p->score) * log(p->score / opt->a) + .499) : 0; - identity = (double)p->score / opt->a / (p->qe - p->qb); - mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - if (mapq > 60) mapq = 60; - } + mapq = approx_mapq_se(opt, p); kputw(mapq, &str); kputc('\t', &str); if (n_cigar) { int clip5, clip3; diff --git a/bwamem.h b/bwamem.h index 7215ad3..ce3a221 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,7 +31,7 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe, sub; + int score, qb, qe, seedcov, sub; // sub: suboptimal score } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 057b292dde7bca16645c9139c3d9aaccefcd5928 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 14:18:39 -0500 Subject: [PATCH 194/498] exclude identical hits --- bwamem.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index cb064f8..e5bb792 100644 --- a/bwamem.c +++ b/bwamem.c @@ -306,7 +306,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) return n; } -#define alnreg_lt(a, b) ((a).score > (b).score) +#define alnreg_lt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) 
int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) @@ -314,6 +314,13 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) int i, j, m; if (n <= 1) return n; ks_introsort(mem_ar, n, a); + for (i = 1; i < n; ++i) { // mark identical hits + if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) + a[i].score = 0; + } + for (i = 1, m = 1; i < n; ++i) // exclude identical hits + if (a[i].score > 0) a[m++] = a[i]; + n = m; for (i = 0; i < n; ++i) a[i].sub = 0; for (i = 1, m = 1; i < n; ++i) { for (j = 0; j < m; ++j) { From fdb0a7405fc6eb51100efb546f359a2e16d48450 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 14:46:57 -0500 Subject: [PATCH 195/498] better dealing with microrepeat --- bwamem.c | 6 ++++-- bwamem.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index e5bb792..b2f61a4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -351,7 +351,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds - int i, k; + int i, k, csub = 0; int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; const mem_seed_t *s; uint8_t *rseq = 0; @@ -403,10 +403,11 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int break; } if (i >= c->n) break; // all seeds are included; no need to proceed - if (a->score > best.score) best = *a; + if (a->score > best.score) csub = best.score, best = *a; k = i; } if (a->score < best.score) *a = best; + a->csub = csub; free(rseq); // compute seedcov @@ -461,6 +462,7 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) { int mapq, l, sub = a->sub? 
a->sub : opt->min_seed_len * opt->a; double identity; + sub = a->csub > sub? a->csub : sub; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; diff --git a/bwamem.h b/bwamem.h index ce3a221..4b30daf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,7 +31,7 @@ typedef struct { typedef struct { int64_t rb, re; - int score, qb, qe, seedcov, sub; // sub: suboptimal score + int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 220fc39e9daf3569ca328a76f5075da50f85c968 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 14:51:24 -0500 Subject: [PATCH 196/498] the previous change does not work... Fixed. --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index b2f61a4..b4283fd 100644 --- a/bwamem.c +++ b/bwamem.c @@ -402,8 +402,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; } + if (a->score >= best.score) csub = best.score, best = *a; if (i >= c->n) break; // all seeds are included; no need to proceed - if (a->score > best.score) csub = best.score, best = *a; k = i; } if (a->score < best.score) *a = best; From 2848d3045a30bd2e5af704d68017f08926354287 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 15:34:25 -0500 Subject: [PATCH 197/498] more accurate chain weight --- bwamem.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index b4283fd..4643ea7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -251,8 +251,22 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i < n_chn; ++i) { mem_chain_t *c = 
&chains[i]; - int w = 0; - for (j = 0; j < c->n; ++j) w += c->seeds[j].len; // FIXME: take care of seed overlaps + int64_t end; + int w = 0, tmp; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->qbeg >= end) w += s->len; + else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + tmp = w; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->rbeg >= end) w += s->len; + else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + w = w < tmp? w : tmp; a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; a[i].p = c; a[i].p2 = 0; From 39607065e04c92099a8239f21be8f4911913bc77 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 16:56:28 -0500 Subject: [PATCH 198/498] allow more seeds to be seen (thus slower..) --- bwamem.c | 20 +++++++++++--------- bwamem.h | 3 ++- fastmap.c | 13 +++++++------ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4643ea7..e1ef4e7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -33,7 +33,8 @@ mem_opt_t *mem_opt_init() o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; o->min_seed_len = 19; - o->max_occ = 50; + o->split_width = 10; + o->max_occ = 10000; o->max_chain_gap = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; @@ -87,25 +88,26 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) } -const bwtintv_v *smem_next(smem_i *itr, int split_len) +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) { - int i, max, max_i; + int i, max, max_i, ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; if (itr->start >= itr->len || itr->start < 0) return 0; while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if 
(itr->start == itr->len) return 0; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, itr->start, 1, itr->matches, itr->tmpvec); // search for SMEM + ori_start = itr->start; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; int len = (uint32_t)p->info - (p->info>>32); if (max < len) max = len, max_i = i; } - if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] == 1) { // if the longest SMEM is unique and long + if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long int j; bwtintv_v *a = itr->tmpvec[0]; // reuse tmpvec[0] for merging bwtintv_t *p = &itr->matches->a[max_i]; - bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, 2, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM + bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM i = j = 0; a->n = 0; while (i < itr->matches->n && j < itr->sub->n) { // ordered merge int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); @@ -113,14 +115,14 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len) if (xi < xj) { kv_push(bwtintv_t, *a, itr->matches->a[i]); ++i; - } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) { + } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) { kv_push(bwtintv_t, *a, itr->sub->a[j]); ++j; } else ++j; } for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); for (; j < itr->sub->n; 
++j) - if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1) + if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) kv_push(bwtintv_t, *a, itr->sub->a[j]); kv_copy(bwtintv_t, *itr->matches, *a); } @@ -160,7 +162,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; - while ((a = smem_next(itr, opt->min_seed_len<<1)) != 0) { // to find all SMEM and some internal MEM + while ((a = smem_next(itr, opt->min_seed_len<<1, opt->split_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start bwtintv_t *p = &a->a[i]; diff --git a/bwamem.h b/bwamem.h index 4b30daf..d415ccf 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct { typedef struct { int a, b, q, r, w; + int split_width; int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; int pe_dir, is_pe; @@ -44,7 +45,7 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); -const bwtintv_v *smem_next(smem_i *itr, int split_len); +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); diff --git a/fastmap.c b/fastmap.c index 0d2354a..e31b0a5 100644 --- a/fastmap.c +++ b/fastmap.c @@ -25,10 +25,11 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:D:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:D:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'D') mem_debug = atoi(optarg); + else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { 
fprintf(stderr, "\n"); @@ -76,7 +77,7 @@ int main_mem(int argc, char *argv[]) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_long = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, split_width = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -85,16 +86,16 @@ int main_fastmap(int argc, char *argv[]) smem_i *itr; const bwtintv_v *a; - while ((c = getopt(argc, argv, "w:l:ps")) >= 0) { + while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { switch (c) { - case 's': split_long = 1; break; + case 's': split_width = atoi(optarg); break; case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-ps] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] \n", split_width, min_len, min_iwidth); return 1; } @@ -119,7 +120,7 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); - while ((a = smem_next(itr, split_long? min_len<<1 : 0)) != 0) { + while ((a = smem_next(itr, min_len<<1, split_width)) != 0) { for (i = 0; i < a->n; ++i) { bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; From b2c7148dc93c713a0db428b7357ba1b91fa3b0b0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 17:20:44 -0500 Subject: [PATCH 199/498] consider the number of suboptimal hits --- bwamem.c | 3 +++ bwamem.h | 1 + 2 files changed, 4 insertions(+) diff --git a/bwamem.c b/bwamem.c index e1ef4e7..48a2431 100644 --- a/bwamem.c +++ b/bwamem.c @@ -346,6 +346,7 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? 
a[i].qe - a[i].qb : a[j].qe - a[j].qb; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].sub == 0) a[j].sub = a[i].score; + a[j].sub_n += (double)a[i].score / a[j].sub; break; } } @@ -483,7 +484,9 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; + if (a->sub) mapq -= (int)(4.343 * log(a->sub_n) + .499); if (mapq > 60) mapq = 60; + if (mapq < 0) mapq = 0; return mapq; } diff --git a/bwamem.h b/bwamem.h index d415ccf..f524c8e 100644 --- a/bwamem.h +++ b/bwamem.h @@ -33,6 +33,7 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain + double sub_n; } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 829664d6b5edbaadcc67f9b8a2475dea91b40b69 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 17:55:35 -0500 Subject: [PATCH 200/498] missing identical hits; improved sub_n --- bwamem.c | 11 ++++++----- bwamem.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index 48a2431..92be602 100644 --- a/bwamem.c +++ b/bwamem.c @@ -327,17 +327,18 @@ KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) { // similar to the loop in mem_chain_flt() - int i, j, m; + int i, j, m, tmp; if (n <= 1) return n; ks_introsort(mem_ar, n, a); for (i = 1; i < n; ++i) { // mark identical hits if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) - a[i].score = 0; + a[i].qe = a[i].qb; } for (i = 1, m = 1; i < n; ++i) // exclude identical hits - if (a[i].score > 0) a[m++] = a[i]; + if (a[i].qe > a[i].qb) a[m++] = a[i]; n = m; for (i = 0; i < n; ++i) a[i].sub = 0; + 
tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; for (i = 1, m = 1; i < n; ++i) { for (j = 0; j < m; ++j) { int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; @@ -346,7 +347,7 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) int min_l = a[i].qe - a[i].qb < a[j].qe - a[j].qb? a[i].qe - a[i].qb : a[j].qe - a[j].qb; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].sub == 0) a[j].sub = a[i].score; - a[j].sub_n += (double)a[i].score / a[j].sub; + if (a[j].score - a[i].score <= tmp) ++a[j].sub_n; break; } } @@ -484,7 +485,7 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - if (a->sub) mapq -= (int)(4.343 * log(a->sub_n) + .499); + if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; return mapq; diff --git a/bwamem.h b/bwamem.h index f524c8e..c26893a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -33,7 +33,7 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - double sub_n; + int sub_n; } mem_alnreg_t; typedef kvec_t(mem_chain_t) mem_chain_v; From 95a79afe719011c14c11736b7a239b2e78e848b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 22:11:44 -0500 Subject: [PATCH 201/498] command-line prompt --- fastmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index e31b0a5..990f442 100644 --- a/fastmap.c +++ b/fastmap.c @@ -33,7 +33,10 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n"); + fprintf(stderr, "Usage: bwa mem 
[options] \n"); + fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, "\n"); free(opt); return 1; From cb55617f50ca20cf231bf4012dca5dee26715091 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Feb 2013 22:12:18 -0500 Subject: [PATCH 202/498] added a new line --- fastmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index 990f442..5093897 100644 --- a/fastmap.c +++ b/fastmap.c @@ -33,7 +33,7 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] \n"); + fprintf(stderr, "Usage: bwa mem [options] \n\n"); fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); From c310fb74242fb6e5f74414d60ac99172375d1ed0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 10 Feb 2013 12:24:33 -0500 Subject: [PATCH 203/498] a little refactoring for PE support --- bwamem.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 92be602..004274e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -322,21 +322,31 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) return n; } -#define alnreg_lt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) -KSORT_INIT(mem_ar, mem_alnreg_t, alnreg_lt) +/****************************** + * De-overlap single-end hits * + ******************************/ -int mem_choose_alnreg_se(const mem_opt_t *opt, 
int n, mem_alnreg_t *a) -{ // similar to the loop in mem_chain_flt() - int i, j, m, tmp; +#define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) +KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) + +int mem_sort_and_dedup(int n, mem_alnreg_t *a) +{ + int m, i; if (n <= 1) return n; - ks_introsort(mem_ar, n, a); + ks_introsort(mem_ars, n, a); for (i = 1; i < n; ++i) { // mark identical hits if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) a[i].qe = a[i].qb; } for (i = 1, m = 1; i < n; ++i) // exclude identical hits if (a[i].qe > a[i].qb) a[m++] = a[i]; - n = m; + return m; +} + +int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function +{ // similar to the loop in mem_chain_flt() + int i, j, m, tmp; + if (n <= 1) return n; for (i = 0; i < n; ++i) a[i].sub = 0; tmp = opt->a + opt->b > opt->q + opt->r? 
opt->a + opt->b : opt->q + opt->r; for (i = 1, m = 1; i < n; ++i) { @@ -357,6 +367,10 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) return m; } +/************************ + * Pick paired-end hits * + ************************/ + /**************************************** * Construct the alignment from a chain * ****************************************/ @@ -493,14 +507,15 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int i, k, m; + int i, k; kstring_t str; char *seq; str.l = str.m = 0; str.s = 0; - m = mem_choose_alnreg_se(opt, a->n, a->a); + a->n = mem_sort_and_dedup(a->n, a->a); + a->n = mem_choose_alnreg_se(opt, a->n, a->a); seq = malloc(s->l_seq); - if (m == 0) { // no seeds found + if (a->n == 0) { // no seeds found for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; kputs(s->name, &str); kputs("\t8\t*\t0\t0\t*\t*\t0\t0\t", &str); kputsn(seq, s->l_seq, &str); @@ -509,7 +524,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kputc('\n', &str); goto ret_sam_se; } - for (k = 0; k < m; ++k) { + for (k = 0; k < a->n; ++k) { uint32_t *cigar = 0; int score, is_rev, nn, rid, flag = 0, n_cigar = 0, mapq = 0; int64_t pos; From f4c0672800f98a8e58e54c2c76f068c60b0bd124 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 10 Feb 2013 12:55:19 -0500 Subject: [PATCH 204/498] move sort_and_dedup() to worker1() --- bwamem.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 004274e..7dd55b4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -512,8 +512,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b char *seq; str.l = str.m = 0; str.s = 0; - a->n = mem_sort_and_dedup(a->n, a->a); - a->n = mem_choose_alnreg_se(opt, a->n, a->a); + a->n = mem_choose_alnreg_se(opt, a->n, a->a); // NOTE: 
mem_sort_and_dedup() called in worker1() seq = malloc(s->l_seq); if (a->n == 0) { // no seeds found for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; @@ -600,8 +599,10 @@ static void *worker1(void *data) { worker_t *w = (worker_t*)data; int i; - for (i = w->start; i < w->n; i += w->step) + for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); + } return 0; } From 59eaf650ac86620b03539264591e2681d4b55ad4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 10:59:38 -0500 Subject: [PATCH 205/498] code backup --- Makefile | 2 +- bntseq.h | 1 + bwamem.c | 20 +++++++++++++++----- bwamem.h | 10 ++++++++++ bwamem_pair.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ fastmap.c | 6 +++--- 6 files changed, 74 insertions(+), 9 deletions(-) create mode 100644 bwamem_pair.c diff --git a/Makefile b/Makefile index 46e0b80..c97fbcf 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o stdaln.o \ +LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ diff --git a/bntseq.h b/bntseq.h index d4096b4..0425540 100644 --- a/bntseq.h +++ b/bntseq.h @@ -29,6 +29,7 @@ #define BWT_BNTSEQ_H #include +#include #include #ifndef BWA_UBYTE diff --git a/bwamem.c b/bwamem.c index 7dd55b4..b17d23b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -14,7 +14,7 @@ #define MAPQ_COEF 40. 
-int mem_debug = 0; +int mem_verbose = 3; // 1: error only; 2: error+warning; 3: message+error+warning; >=4: debugging void mem_fill_scmat(int a, int b, int8_t mat[25]) { @@ -36,6 +36,7 @@ mem_opt_t *mem_opt_init() o->split_width = 10; o->max_occ = 10000; o->max_chain_gap = 10000; + o->max_ins = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; o->chunk_size = 10000000; @@ -427,7 +428,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (mem_debug >= 2) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); // check how many seeds have been covered for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; @@ -574,7 +575,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); - if (mem_debug >= 1) mem_print_chain(bns, &chn); + if (mem_verbose >= 4) mem_print_chain(bns, &chn); regs.n = regs.m = chn.n; regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); for (i = 0; i < chn.n; ++i) { @@ -593,6 +594,7 @@ typedef struct { const uint8_t *pac; bseq1_t *seqs; mem_alnreg_v *regs; + mem_pestat_t *pes; } worker_t; static void *worker1(void *data) @@ -628,6 +630,8 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns int i; worker_t *w; mem_alnreg_v *regs; + mem_pestat_t pes[4]; + w = calloc(opt->n_threads, sizeof(worker_t)); regs = malloc(n * sizeof(mem_alnreg_v)); for (i = 0; i < opt->n_threads; ++i) { @@ -635,21 +639,27 @@ 
int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns p->start = i; p->step = opt->n_threads; p->n = n; p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; p->seqs = seqs; p->regs = regs; + p->pes = &pes[0]; } #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { - worker1(w); worker2(w); + worker1(w); + mem_pestat(opt, bns->l_pac, n, regs, pes); + worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + mem_pestat(opt, bns->l_pac, n, regs, pes); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else - worker1(w); worker2(w); + worker1(w); + mem_pestat(opt, bns->l_pac, n, regs, pes); + worker2(w); #endif for (i = 0; i < n; ++i) { fputs(seqs[i].sam, stdout); diff --git a/bwamem.h b/bwamem.h index c26893a..c89abf6 100644 --- a/bwamem.h +++ b/bwamem.h @@ -21,6 +21,7 @@ typedef struct { int n_threads, chunk_size; int pe_dir, is_pe; float mask_level, chain_drop_ratio; + int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; @@ -36,9 +37,16 @@ typedef struct { int sub_n; } mem_alnreg_t; +typedef struct { + int low, high, failed; + double avg, std; +} mem_pestat_t; + typedef kvec_t(mem_chain_t) mem_chain_v; typedef kvec_t(mem_alnreg_t) mem_alnreg_v; +extern int mem_verbose; + #ifdef __cplusplus extern "C" { #endif @@ -58,6 +66,8 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + #ifdef __cplusplus } #endif diff --git a/bwamem_pair.c b/bwamem_pair.c 
new file mode 100644 index 0000000..2a0079b --- /dev/null +++ b/bwamem_pair.c @@ -0,0 +1,44 @@ +#include +#include "kstring.h" +#include "bwamem.h" +#include "kvec.h" + +#define MIN_RATIO 0.8 + +static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) +{ + int j; + for (j = 1; j < r->n; ++j) { // choose unique alignment + int b_max = r->a[j].qb > r->a[0].qb? r->a[j].qb : r->a[0].qb; + int e_min = r->a[j].qe < r->a[0].qe? r->a[j].qe : r->a[0].qe; + if (e_min > b_max) { // have overlap + int min_l = r->a[j].qe - r->a[j].qb < r->a[0].qe - r->a[0].qb? r->a[j].qe - r->a[j].qb : r->a[0].qe - r->a[0].qb; + if (e_min - b_max >= min_l * opt->mask_level) break; // significant overlap + } + } + return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; +} + +void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) +{ + int i; + kvec_t(int) isize[4]; + memset(isize, 0, sizeof(kvec_t(int)) * 4); + for (i = 0; i < n>>1; i += 2) { + int dir; + int64_t is, pos[2]; + mem_alnreg_v *r[2]; + r[0] = (mem_alnreg_v*)®s[i<<1|0]; + r[1] = (mem_alnreg_v*)®s[i<<1|1]; + if (r[0]->n == 0 || r[1]->n == 0) continue; + if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; + if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; + pos[0] = r[0]->a[0].rb < l_pac? r[0]->a[0].rb : (l_pac<<1) - 1 - r[0]->a[0].rb; // forward coordinate + pos[1] = r[1]->a[0].rb < l_pac? 
r[1]->a[0].rb : (l_pac<<1) - 1 - r[1]->a[0].rb; + if (pos[0] < pos[1]) dir = (r[0]->a[0].rb >= l_pac)<<1 | (r[1]->a[0].rb >= l_pac); + else dir = (r[1]->a[0].rb >= l_pac)<<1 | (r[0]->a[0].rb >= l_pac); + is = abs(pos[0] - pos[1]); + if (is <= opt->max_ins) kv_push(int, isize[dir], is); + } + if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidates unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); +} diff --git a/fastmap.c b/fastmap.c index 5093897..698b3e1 100644 --- a/fastmap.c +++ b/fastmap.c @@ -11,7 +11,6 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; -extern int mem_debug; int main_mem(int argc, char *argv[]) { @@ -25,10 +24,10 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:D:s:")) >= 0) { + while ((c = getopt(argc, argv, "k:c:v:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'c') opt->max_occ = atoi(optarg); - else if (c == 'D') mem_debug = atoi(optarg); + else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { @@ -37,6 +36,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); return 1; From 987d4b4205c382d6c6c2c1c06af5a52a742d51a4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 11:27:35 -0500 Subject: [PATCH 206/498] fixed a stupid bug in fastq reading --- bseq.c | 6 +++--- bwamem.c | 6 +++--- fastmap.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bseq.c b/bseq.c index 54a25f6..d20b983 100644 --- 
a/bseq.c +++ b/bseq.c @@ -41,15 +41,15 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) size += seqs[n++].l_seq; if (ks2) { trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n++]); + kseq2bseq1(ks2, &seqs[n]); size += seqs[n++].l_seq; } if (size >= chunk_size) break; } - *n_ = n; - if (size < chunk_size) { // test if the 2nd file is finished + if (size == 0) { // test if the 2nd file is finished if (ks2 && kseq_read(ks2) >= 0) fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); } + *n_ = n; return seqs; } diff --git a/bwamem.c b/bwamem.c index b17d23b..f617fcf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -644,21 +644,21 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { worker1(w); - mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else worker1(w); - mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); #endif for (i = 0; i < n; ++i) { diff --git a/fastmap.c b/fastmap.c index 698b3e1..56674f9 100644 --- a/fastmap.c +++ b/fastmap.c @@ -58,7 +58,7 @@ int main_mem(int argc, char *argv[]) ks = kseq_init(fp); if (optind + 2 < argc) { fp2 = gzopen(argv[optind + 2], "r"); - ks2 = kseq_init(fp); + ks2 = kseq_init(fp2); opt->is_pe = 1; } while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { From 
4431e359e2bca378ece303233fe54bbf55c64ffa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 12:15:12 -0500 Subject: [PATCH 207/498] analyze isize distribution --- bwamem_pair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++---- bwtsw2_pair.c | 4 ++-- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 2a0079b..99c5c6e 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -1,9 +1,16 @@ #include +#include #include "kstring.h" #include "bwamem.h" #include "kvec.h" #define MIN_RATIO 0.8 +#define MIN_DIR_CNT 10 +#define MIN_DIR_RATIO 0.1 +#define OUTLIER_BOUND 2.0 +#define MAPPING_BOUND 3.0 +#define MAX_STDDEV 4.0 +#define EXT_STDDEV 4.0 static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { @@ -19,12 +26,16 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) return j < r->n? r->a[j].score : opt->min_seed_len * opt->a; } +typedef kvec_t(uint64_t) vec64_t; + void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { - int i; - kvec_t(int) isize[4]; + extern void ks_introsort_uint64_t(size_t n, uint64_t *a); + int i, d; + vec64_t isize[4]; + memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); - for (i = 0; i < n>>1; i += 2) { + for (i = 0; i < n>>1; ++i) { int dir; int64_t is, pos[2]; mem_alnreg_v *r[2]; @@ -38,7 +49,41 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (pos[0] < pos[1]) dir = (r[0]->a[0].rb >= l_pac)<<1 | (r[1]->a[0].rb >= l_pac); else dir = (r[1]->a[0].rb >= l_pac)<<1 | (r[0]->a[0].rb >= l_pac); is = abs(pos[0] - pos[1]); - if (is <= opt->max_ins) kv_push(int, isize[dir], is); + if (is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidates unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); + for (d = 0; d < 4; ++d) { // TODO: this block is 
nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. + mem_pestat_t *r = &pes[d]; + vec64_t *q = &isize[d]; + int p25, p50, p75, tmp, x; + if (q->n < MIN_DIR_CNT) { + fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + r->failed = 1; + continue; + } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + ks_introsort_uint64_t(q->n, q->a); + p25 = q->a[(int)(.25 * q->n + .499)]; + p50 = q->a[(int)(.50 * q->n + .499)]; + p75 = q->a[(int)(.75 * q->n + .499)]; + r->low = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); + if (r->low < 1) r->low = 1; + r->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + fprintf(stderr, "[M::%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); + fprintf(stderr, "[M::%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r->low, r->high); + for (i = x = 0, r->avg = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->avg += q->a[i], ++x; + r->avg /= x; + for (i = 0, r->std = 0; i < q->n; ++i) + if (q->a[i] >= r->low && q->a[i] <= r->high) + r->std += (q->a[i] - r->avg) * (q->a[i] - r->avg); + r->std = sqrt(r->std / x); + fprintf(stderr, "[M::%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r->avg, r->std); + r->low = (int)(p25 - MAPPING_BOUND * (p75 - p25) + .499); + r->high = (int)(p75 + MAPPING_BOUND * (p75 - p25) + .499); + if (r->low > r->avg - MAX_STDDEV * r->std) r->low = (int)(r->avg - MAX_STDDEV * r->std + .499); + if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); + if (r->low < 1) r->low = 1; + fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); + } } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index a6f4d80..633641e 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -74,9 +74,9 @@ 
bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) r.low = tmp > max_len? tmp : max_len; if (r.low < 1) r.low = 1; r.high = (int)(p75 + 3. * (p75 - p25) + .499); - if (r.low > r.avg - MAX_STDDEV * 4.) r.low = (int)(r.avg - MAX_STDDEV * 4. + .499); + if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499); r.low = tmp > max_len? tmp : max_len; - if (r.high < r.avg - MAX_STDDEV * 4.) r.high = (int)(r.avg + MAX_STDDEV * 4. + .499); + if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499); ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); free(isize); return r; From b6006cbe9d98b0682701f2f0e6ebe857547f8588 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 13:44:39 -0500 Subject: [PATCH 208/498] skip orientations that are much smaller than best --- bwamem_pair.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 99c5c6e..d537718 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -6,7 +6,7 @@ #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 -#define MIN_DIR_RATIO 0.1 +#define MIN_DIR_RATIO 0.05 #define OUTLIER_BOUND 2.0 #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 @@ -31,7 +31,7 @@ typedef kvec_t(uint64_t) vec64_t; void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); - int i, d; + int i, d, max; vec64_t isize[4]; memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); @@ -51,11 +51,11 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * is = abs(pos[0] - pos[1]); if (is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } - if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidates unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, 
isize[2].n, isize[3].n); + if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. mem_pestat_t *r = &pes[d]; vec64_t *q = &isize[d]; - int p25, p50, p75, tmp, x; + int p25, p50, p75, x; if (q->n < MIN_DIR_CNT) { fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); r->failed = 1; @@ -85,5 +85,13 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (r->high < r->avg - MAX_STDDEV * r->std) r->high = (int)(r->avg + MAX_STDDEV * r->std + .499); if (r->low < 1) r->low = 1; fprintf(stderr, "[M::%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r->low, r->high); + free(q->a); } + for (d = 0, max = 0; d < 4; ++d) + max = max > isize[d].n? max : isize[d].n; + for (d = 0; d < 4; ++d) + if (pes[d].failed == 0 && isize[d].n < max * MIN_DIR_RATIO) { + pes[d].failed = 1; + fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); + } } From 99907c98fb99844bd7c9f2023474b67b1924545c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 15:29:03 -0500 Subject: [PATCH 209/498] separated and improved SAM printing code This is for the PE mode. The routines may also be useful for bwa-sw, but probably I won't change the old code. 
--- bwamem.c | 158 ++++++++++++++++++++++++++++++++------------------ bwamem.h | 9 +++ bwamem_pair.c | 24 ++++++++ kstring.h | 9 +++ 4 files changed, 142 insertions(+), 58 deletions(-) diff --git a/bwamem.c b/bwamem.c index f617fcf..99e8938 100644 --- a/bwamem.c +++ b/bwamem.c @@ -453,7 +453,11 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->seedcov = c->seeds[0].len; } -uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +/***************************** + * Basic hit->SAM conversion * + *****************************/ + +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) { uint32_t *cigar = 0; uint8_t tmp, *rseq; @@ -472,12 +476,12 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); // set the band-width - w = (int)((double)(l_query * opt->a - opt->q) / opt->r + 1.); + w = (int)((double)(l_query * mat[0] - q) / r + 1.); w = w < 1? w : 1; - w = w < opt->w? w : opt->w; + w = w < w_? 
w : w_; w += abs(rlen - l_query); // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, opt->mat, opt->q, opt->r, w, n_cigar, &cigar); + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; @@ -487,6 +491,82 @@ uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, return cigar; } +void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard) +{ + int score, n_cigar, is_rev, nn, rid, mid, is_unmapped = 0; + uint32_t *cigar = 0; + int64_t pos; + + kputs(s->name, str); + if (p && p->rb >= 0 && p->re < bns->l_pac<<1) { + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + p->flag |= is_rev? 16 : 0; // reverse + p->flag |= p->mb >= 0? 1 : 0; // paired in sequencing + p->flag |= n_cigar == 0? 8 : 0; // FIXME: check why this may happen (this has already happened) + kputc('\t', str); kputw(p->flag, str); kputc('\t', str); + kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); + kputw(p->qual, str); kputc('\t', str); + if (n_cigar) { + int i, clip5, clip3; + clip5 = is_rev? s->l_seq - p->qe : p->qb; + clip3 = is_rev? 
p->qb : s->l_seq - p->qe; + if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); } + for (i = 0; i < n_cigar; ++i) { + kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str); + } + if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); } + } else kputc('*', str); + if (p->mb >= 0 && p->mb < bns->l_pac<<1) { // then print mate pos and isize + pos = bns_depos(bns, p->mb < bns->l_pac? p->mb : p->me - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, p->me - p->mb, &mid); + kputc('\t', str); + if (mid == rid) kputc('=', str); + else kputs(bns->anns[mid].name, str); + kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); + kputc('\t', str); + if (mid != rid) { + int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; + int64_t p1 = p->mb < bns->l_pac? p->mb : (bns->l_pac<<1) - 1 - p->mb; + kputw(abs(p0 - p1), str); + } + kputc('\t', str); + } else kputsn("\t*\t0\t0\t", 7, str); + } else { // unaligned + is_unmapped = 1; + kputw(p? p->flag : 0, str); + kputs("\t*\t0\t0\t*\t*\t0\t0\t", str); + } + if (!is_rev) { // print SEQ and QUAL, the forward strand + int i, qb = 0, qe = s->l_seq; + if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } else { // the reverse strand + int i, qb = 0, qe = s->l_seq; + if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } + if (!is_unmapped && p->score >= 0) { 
kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (!is_unmapped && p->sub >= 0) { kputsn("\tss:i:", 6, str); kputw(p->sub, str); } + kputc('\n', str); + free(cigar); +} + /************************ * Integrated interface * ************************/ @@ -508,61 +588,23 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { - int i, k; + int k; kstring_t str; - char *seq; - str.l = str.m = 0; str.s = 0; a->n = mem_choose_alnreg_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() - seq = malloc(s->l_seq); - if (a->n == 0) { // no seeds found - for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; - kputs(s->name, &str); kputs("\t8\t*\t0\t0\t*\t*\t0\t0\t", &str); - kputsn(seq, s->l_seq, &str); - if (s->qual) kputsn(s->qual, s->l_seq, &str); - else kputc('*', &str); - kputc('\n', &str); - goto ret_sam_se; - } - for (k = 0; k < a->n; ++k) { - uint32_t *cigar = 0; - int score, is_rev, nn, rid, flag = 0, n_cigar = 0, mapq = 0; - int64_t pos; - mem_alnreg_t *p = &a->a[k]; - cigar = mem_gen_cigar(opt, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); - pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - flag |= is_rev? 16 : 0; - if (n_cigar == 0) flag |= 8; - kputs(s->name, &str); kputc('\t', &str); kputw(flag, &str); kputc('\t', &str); - kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputuw(pos - bns->anns[rid].offset + 1, &str); kputc('\t', &str); - mapq = approx_mapq_se(opt, p); - kputw(mapq, &str); kputc('\t', &str); - if (n_cigar) { - int clip5, clip3; - clip5 = is_rev? s->l_seq - p->qe : p->qb; - clip3 = is_rev? 
p->qb : s->l_seq - p->qe; - if (clip5) { kputw(clip5, &str); kputc('S', &str); } - for (i = 0; i < n_cigar; ++i) { - kputw(cigar[i]>>4, &str); kputc("MIDSH"[cigar[i]&0xf], &str); - } - if (clip3) { kputw(clip3, &str); kputc('S', &str); } - } else kputc('*', &str); - kputsn("\t*\t0\t0\t", 7, &str); - if (is_rev) for (i = s->l_seq - 1; i >= 0; --i) seq[i] = "TGCAN"[(int)s->seq[i]]; - else for (i = 0; i < s->l_seq; ++i) seq[i] = "ACGTN"[(int)s->seq[i]]; - kputsn(seq, s->l_seq, &str); kputc('\t', &str); - if (s->qual) kputsn(s->qual, s->l_seq, &str); - else kputc('*', &str); - kputsn("\tAS:i:", 6, &str); kputw(p->score, &str); - kputsn("\tss:i:", 6, &str); kputw(p->sub, &str); - kputsn("\tnw:i:", 6, &str); kputw(score, &str); - kputc('\n', &str); - free(cigar); - } - -ret_sam_se: - free(seq); + if (a->n > 0) { + for (k = 0; k < a->n; ++k) { + bwahit_t h; + mem_alnreg_t *p = &a->a[k]; + h.rb = p->rb; h.re = p->re; + h.qb = p->qb; h.qe = p->qe; + h.score = p->score; h.sub = p->sub; + h.flag = 0; + h.qual = approx_mapq_se(opt, p); + h.mb = h.me = -2; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); + } + } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->is_hard); s->sam = str.s; } @@ -592,9 +634,9 @@ typedef struct { const bwt_t *bwt; const bntseq_t *bns; const uint8_t *pac; + const mem_pestat_t *pes; bseq1_t *seqs; mem_alnreg_v *regs; - mem_pestat_t *pes; } worker_t; static void *worker1(void *data) @@ -603,7 +645,7 @@ static void *worker1(void *data) int i; for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); - mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); + w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); } return 0; } diff --git a/bwamem.h b/bwamem.h index c89abf6..b95c96d 100644 --- a/bwamem.h +++ b/bwamem.h @@ -20,6 +20,7 @@ typedef struct { int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; int pe_dir, 
is_pe; + int is_hard; // if to use hard clip float mask_level, chain_drop_ratio; int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset @@ -42,6 +43,14 @@ typedef struct { double avg, std; } mem_pestat_t; +typedef struct { + int64_t rb, re; + int qb, qe, flag, qual; + // optional info + int score, sub; + int64_t mb, me; // mb: mate start; -1 if single-end; -2 if mate unmapped +} bwahit_t; + typedef kvec_t(mem_chain_t) mem_chain_v; typedef kvec_t(mem_alnreg_t) mem_alnreg_v; diff --git a/bwamem_pair.c b/bwamem_pair.c index d537718..f7c1ef8 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -27,6 +27,7 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) } typedef kvec_t(uint64_t) vec64_t; +extern void ks_introsort_uint64_t(size_t n, uint64_t *a); void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { @@ -95,3 +96,26 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * fprintf(stderr, "[M::%s] skip orientation %c%c\n", __func__, "FR"[d>>1&1], "FR"[d&1]); } } + +void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +{ + vec64_t v; + int r, i; + kv_init(v); + for (r = 0; r < 2; ++r) { + for (i = 0; i < a[r].n; ++i) { + int64_t pos; + mem_alnreg_t *e = &a[r].a[i]; + pos = (e->rb < bns->l_pac? 
e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r; + kv_push(uint64_t, v, pos); + } + } + ks_introsort_uint64_t(v.n, v.a); + free(v.a); +} + +void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) +{ + bwahit_t h[2]; + mem_pair(opt, bns, pac, pes, s, a, h); +} diff --git a/kstring.h b/kstring.h index cf14e39..81d7d60 100644 --- a/kstring.h +++ b/kstring.h @@ -16,6 +16,15 @@ typedef struct __kstring_t { } kstring_t; #endif +static inline void ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + s->m = size; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } +} + static inline int kputsn(const char *p, int l, kstring_t *s) { if (s->l + l + 1 >= s->m) { From dcb190069ac9ce249d9985d27ea792eeebc73457 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Feb 2013 16:10:14 -0500 Subject: [PATCH 210/498] PE NOT working, yet --- bwamem_pair.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index f7c1ef8..b465602 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -1,4 +1,5 @@ #include +#include #include #include "kstring.h" #include "bwamem.h" @@ -12,6 +13,11 @@ #define MAX_STDDEV 4.0 #define EXT_STDDEV 4.0 +typedef kvec_t(uint64_t) vec64_t; + +extern void ks_introsort_uint64_t(size_t n, uint64_t *a); +void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); + static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { int j; @@ -26,9 +32,6 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) return j < r->n? 
r->a[j].score : opt->min_seed_len * opt->a; } -typedef kvec_t(uint64_t) vec64_t; -extern void ks_introsort_uint64_t(size_t n, uint64_t *a); - void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { extern void ks_introsort_uint64_t(size_t n, uint64_t *a); @@ -116,6 +119,12 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) { + kstring_t str; bwahit_t h[2]; + str.l = str.m = 0; str.s = 0; mem_pair(opt, bns, pac, pes, s, a, h); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); + s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); + s[1].sam = str.s; } From 13288e2dcde6437ba9fad8713f17bf3db047f59f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 09:22:47 -0500 Subject: [PATCH 211/498] code backup --- bwamem.c | 4 +++- bwamem_pair.c | 33 +++++++++++++++++++++++++++++---- ksort.h | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index 99e8938..7557af6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -493,7 +493,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard) { - int score, n_cigar, is_rev, nn, rid, mid, is_unmapped = 0; + int score, n_cigar, is_rev = 0, nn, rid, mid, is_unmapped = 0; uint32_t *cigar = 0; int64_t pos; @@ -652,6 +652,7 @@ static void *worker1(void *data) static void *worker2(void *data) { + extern void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; int i; if 
(!w->opt->is_pe) { @@ -661,6 +662,7 @@ static void *worker2(void *data) } } else { for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } diff --git a/bwamem_pair.c b/bwamem_pair.c index b465602..019f570 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -103,17 +103,40 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { vec64_t v; - int r, i; + int r, i, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { for (i = 0; i < a[r].n; ++i) { - int64_t pos; + uint64_t key; mem_alnreg_t *e = &a[r].a[i]; - pos = (e->rb < bns->l_pac? e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r; - kv_push(uint64_t, v, pos); + key = ((e->rb < bns->l_pac? e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r) << 30 | e->score; + kv_push(uint64_t, v, key); } } ks_introsort_uint64_t(v.n, v.a); + y[0] = y[1] = y[2] = y[3] = -1; + printf("**** %ld\n", v.n); + for (i = 0; i < v.n; ++i) { + printf("%lld\t%c\t%lld\t%lld\n", v.a[i]>>32, "+-"[v.a[i]>>31&1], v.a[i]>>30&1, v.a[i]<<34>>34); + for (r = 0; r < 2; ++r) { + int dir = r<<1 | (v.a[i]>>31&1), which, k; + if (pes[dir].failed) continue; // invalid orientation + which = r<<1 | ((v.a[i]>>30&1)^1); + if (y[which] < 0) continue; // no previous hits + for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) + int dist; + double ns; + if ((v.a[k]>>30&3) != which) continue; + dist = (v.a[i]>>32) - (v.a[k]>>32); + printf("%d\t%d\t%d\n", r, which, dist); + if (dist > pes[dir].high) break; + if (dist < pes[dir].low) continue; + ns = (dist - pes[dir].avg) / pes[dir].std; + printf("%f\n", ns); + } + } + 
y[v.a[i]>>30&3] = i; + } free(v.a); } @@ -123,8 +146,10 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c bwahit_t h[2]; str.l = str.m = 0; str.s = 0; mem_pair(opt, bns, pac, pes, s, a, h); + /* bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); s[1].sam = str.s; + */ } diff --git a/ksort.h b/ksort.h index 52812e1..ad66a17 100644 --- a/ksort.h +++ b/ksort.h @@ -139,7 +139,7 @@ typedef struct { tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ } \ } \ - inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ { \ type_t *i, *j, swap_tmp; \ for (i = s + 1; i < t; ++i) \ From e5ab59db5327f628bf688952e1c16f4a4f038b4f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 09:50:28 -0500 Subject: [PATCH 212/498] Multiple changes: 1. Removed bwa.{h,c}. I am not going to finish them anyway. 2. Updated to the latest khash.h, which should be faster. 3. Define 64-bit vector and 128-bit integer/vector in utils.h. 
--- Makefile | 5 +- bwa.c | 272 ------------------------------------------------ bwa.h | 107 ------------------- bwamem_pair.c | 15 ++- bwape.c | 33 +++--- bwtsw2_pair.c | 4 +- khash.h | 282 ++++++++++++++++++++++++++++++++++---------------- utils.c | 79 +++++++------- utils.h | 13 ++- 9 files changed, 261 insertions(+), 549 deletions(-) delete mode 100644 bwa.c delete mode 100644 bwa.h diff --git a/Makefile b/Makefile index c97fbcf..6a3fc1e 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,9 @@ -CC= gcc -CXX= g++ +CC= clang CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ +LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ diff --git a/bwa.c b/bwa.c deleted file mode 100644 index 8e99f18..0000000 --- a/bwa.c +++ /dev/null @@ -1,272 +0,0 @@ -#include -#include -#include -#include -#include "bwa.h" -#include "bwt.h" -#include "bwtgap.h" -#include "bntseq.h" - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -extern unsigned char nst_nt4_table[256]; -extern void seq_reverse(int len, uint8_t *seq, int is_comp); - -bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 }; - -struct bwa_idx_t { - bwt_t *bwt; - bntseq_t *bns; - uint8_t *pac; -}; - -struct bwa_buf_t { - int max_buf; - bwa_pestat_t pes; - gap_stack_t *stack; - gap_opt_t *opt; - int *diff_tab; - uint8_t *buf; - int *logn; -}; - -bwa_idx_t *bwa_idx_load(const char *prefix) -{ - bwa_idx_t *p; - int l; - char *str; - l = strlen(prefix); - p = calloc(1, sizeof(bwa_idx_t)); - str = malloc(l + 10); - strcpy(str, prefix); - p->bns = bns_restore(str); - strcpy(str + l, ".bwt"); - p->bwt = bwt_restore_bwt(str); - 
str[l] = 0; - strcpy(str + l, ".sa"); - bwt_restore_sa(str, p->bwt); - free(str); - p->pac = calloc(p->bns->l_pac/4+1, 1); - fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac); - fclose(p->bns->fp_pac); - p->bns->fp_pac = 0; - return p; -} - -void bwa_idx_destroy(bwa_idx_t *p) -{ - bns_destroy(p->bns); - bwt_destroy(p->bwt); - free(p->pac); - free(p); -} - -bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score) -{ - extern gap_opt_t *gap_init_opt(void); - extern int bwa_cal_maxdiff(int l, double err, double thres); - int i; - bwa_buf_t *p; - p = malloc(sizeof(bwa_buf_t)); - p->stack = gap_init_stack2(max_score); - p->opt = gap_init_opt(); - p->opt->s_gapo = opt->s_gapo; - p->opt->s_gape = opt->s_gape; - p->opt->max_diff = opt->max_diff; - p->opt->max_gapo = opt->max_gapo; - p->opt->max_gape = opt->max_gape; - p->opt->seed_len = opt->seed_len; - p->opt->max_seed_diff = opt->max_seed_diff; - p->opt->fnr = opt->fnr; - p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int)); - for (i = 1; i < BWA_MAX_QUERY_LEN; ++i) - p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); - p->logn = calloc(256, sizeof(int)); - for (i = 1; i != 256; ++i) - p->logn[i] = (int)(4.343 * log(i) + 0.499); - return p; -} - -void bwa_buf_destroy(bwa_buf_t *p) -{ - gap_destroy_stack(p->stack); - free(p->diff_tab); free(p->logn); free(p->opt); - free(p); -} - -bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq) -{ - extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width); - int i, seq_len, buf_len; - bwt_width_t *w, *seed_w; - uint8_t *s; - gap_opt_t opt2 = *buf->opt; - bwa_sai_t sai; - - seq_len = strlen(seq); - // estimate the buffer length - buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len; - if (buf_len > buf->max_buf) { - buf->max_buf = buf_len; - kroundup32(buf->max_buf); - buf->buf = realloc(buf->buf, buf->max_buf); - } - memset(buf->buf, 0, buf_len); - seed_w = (bwt_width_t*)buf->buf; - w = 
seed_w + buf->opt->seed_len; - s = (uint8_t*)(w + seq_len + 1); - if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len]; - // copy the sequence - for (i = 0; i < seq_len; ++i) - s[i] = nst_nt4_table[(int)seq[i]]; - seq_reverse(seq_len, s, 0); - // mapping - bwt_cal_width(idx->bwt, seq_len, s, w); - if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff; - if (seq_len > buf->opt->seed_len) - bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w); - for (i = 0; i < seq_len; ++i) // complement; I forgot why... - s[i] = s[i] > 3? 4 : 3 - s[i]; - sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack); - return sai; -} - -static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps) -{ - uint64_t x = pos, z; - int k, y = 0; - *n_mm = *n_gaps = 0; - for (k = 0; k < n_cigar; ++k) { - int l = cigar[k]>>4; - int op = cigar[k]&0xf; - if (op == 0) { // match/mismatch - for (z = 0; z < l && x + z < l_pac; ++z) { - int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; - if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm); - } - } - if (op == 1 || op == 2) (*n_gaps) += l; - if (op == 0 || op == 2) x += l; - if (op == 0 || op == 1 || op == 4) y += l; - } -} - -void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln) -{ - extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); - extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct); - int strand, seq_len, i, n_gap, n_mm; - uint64_t pos3, pac_pos; - uint8_t *s[2]; - - memset(aln, 0, sizeof(bwa_aln_t)); - seq_len = strlen(seq); - if (seq_len<<1 > buf->max_buf) { - buf->max_buf = seq_len<<1; - kroundup32(buf->max_buf); - buf->buf = 
realloc(buf->buf, buf->max_buf); - } - s[0] = buf->buf; - s[1] = s[0] + seq_len; - for (i = 0; i < seq_len; ++i) - s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]]; - seq_reverse(seq_len, s[1], 1); - pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand); - if (strand) aln->flag |= 16; - if (n_gaps) { // only for gapped alignment - int n_cigar; - bwa_cigar_t *cigar16; - cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1); - aln->n_cigar = n_cigar; - aln->cigar = malloc(n_cigar * 4); - for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) { - int op = cigar16[i]>>14; - int len = cigar16[i]&0x3fff; - if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR - aln->cigar[i] = len<<4 | op; - if (op == 0 || op == 2) pos3 += len; - } - free(cigar16); - } else { // ungapped - aln->n_cigar = 1; - aln->cigar = malloc(4); - aln->cigar[0] = seq_len<<4 | 0; - pos3 = pac_pos + seq_len; - } - aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id); - aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset; - if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence - aln->flag |= 4; // read unmapped - compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap); - aln->n_mm = n_mm; - aln->n_gap = n_gap; -} - -/************************ - * Single-end alignment * - ************************/ - -bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar) -{ - bwa_one_t *one; - int best, cnt, i, seq_len; - - seq_len = strlen(seq); - one = calloc(1, sizeof(bwa_one_t)); - one->sai = bwa_sai(idx, buf, seq); - if (one->sai.n == 0) return one; - // count number of hits; randomly select one alignment - best = one->sai.sai[0].score; - for (i = cnt = 0; i < one->sai.n; ++i) { - bwa_sai1_t *p = &one->sai.sai[i]; - if (p->score > best) break; - if 
(drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { - one->which = p; - one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); - } - cnt += p->l - p->k + 1; - } - one->c1 = cnt; - for (; i < one->sai.n; ++i) - cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1; - one->c2 = cnt - one->c1; - // estimate single-end mapping quality - one->mapQs = -1; - if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible? - else if (one->c1 > 1) one->mapQs = 0; - else { - int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape; - if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25; - else if (one->c2 == 0) one->mapQs = 37; - } - if (one->mapQs < 0) { - cnt = (one->c2 >= 255)? 255 : one->c2; - one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt]; - } - one->mapQ = one->mapQs; - // compute CIGAR on request - one->one.ref_id = -1; - if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one); - return one; -} - -void bwa_one_destroy(bwa_one_t *one) -{ - free(one->sai.sai); - free(one->one.cigar); - free(one); -} - -/************************ - * Paired-end alignment * - ************************/ - -void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2]) -{ -} - -void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2]) -{ -} diff --git a/bwa.h b/bwa.h deleted file mode 100644 index e8172da..0000000 --- a/bwa.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef BWA_H_ -#define BWA_H_ - -#include - -#define BWA_DEF_MAX_SCORE 2048 -#define BWA_MAX_QUERY_LEN 1024 - -// BWA index -struct bwa_idx_t; -typedef struct bwa_idx_t bwa_idx_t; - -// Buffer for BWA alignment -struct bwa_buf_t; -typedef struct bwa_buf_t bwa_buf_t; - -// BWA alignment options -typedef struct { - int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3 - int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions - int seed_len, 
max_seed_diff; // seed length and max differences allowed in the seed - float fnr; // parameter for automatic length-adjusted max differences -} bwa_opt_t; - -// default BWA alignment options -extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 } - -// an interval hit in the SA coordinate; basic unit in .sai files -typedef struct { - uint32_t n_mm:16, n_gapo:8, n_gape:8; - int score; - uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits -} bwa_sai1_t; - -// all interval hits in the SA coordinate -typedef struct { - int n; // number of interval hits - bwa_sai1_t *sai; -} bwa_sai_t; - -// an alignment -typedef struct { - uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment - int32_t ref_id; // referece sequence index (the first seq is indexed by 0) - uint32_t offset; // coordinate on the reference; zero-based - uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag - uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations -} bwa_aln_t; - -typedef struct { - int mapQs, mapQ, c1, c2; - uint64_t sa; - bwa_sai1_t *which; - bwa_sai_t sai; - bwa_aln_t one; -} bwa_one_t; - -typedef struct { - double avg, std, ap_prior; - uint64_t low, high, high_bayesian; -} bwa_pestat_t; - -#ifdef __cplusplus -extern "C" { -#endif - - // load a BWA index - bwa_idx_t *bwa_idx_load(const char *prefix); - void bwa_idx_destroy(bwa_idx_t *p); - - // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE - bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score); - void bwa_buf_destroy(bwa_buf_t *p); - - /** - * Find all the SA intervals - * - * @param idx BWA index; multiple threads can share the same index - * @param buf BWA alignment buffer; each thread should have its own buffer - * @param seq NULL terminated C string, consisting of A/C/G/T/N only - * - * @return SA intervals seq is matched to - */ - bwa_sai_t 
bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq); - - /** - * Construct an alignment in the base-pair coordinate - * - * @param idx BWA index - * @param buf BWA alignment buffer - * @param seq NULL terinated C string - * @param sa Suffix array value - * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape - * - * @return An alignment - */ - void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln); - - bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar); - - void bwa_one_destroy(bwa_one_t *one); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/bwamem_pair.c b/bwamem_pair.c index 019f570..845051c 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -4,6 +4,7 @@ #include "kstring.h" #include "bwamem.h" #include "kvec.h" +#include "utils.h" #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 @@ -13,9 +14,6 @@ #define MAX_STDDEV 4.0 #define EXT_STDDEV 4.0 -typedef kvec_t(uint64_t) vec64_t; - -extern void ks_introsort_uint64_t(size_t n, uint64_t *a); void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) @@ -34,9 +32,8 @@ static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]) { - extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, d, max; - vec64_t isize[4]; + uint64_v isize[4]; memset(pes, 0, 4 * sizeof(mem_pestat_t)); memset(isize, 0, sizeof(kvec_t(int)) * 4); for (i = 0; i < n>>1; ++i) { @@ -58,14 +55,14 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, 
isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. mem_pestat_t *r = &pes[d]; - vec64_t *q = &isize[d]; + uint64_v *q = &isize[d]; int p25, p50, p75, x; if (q->n < MIN_DIR_CNT) { fprintf(stderr, "[M::%s] skip orientation %c%c as there are not enough pairs\n", __func__, "FR"[d>>1&1], "FR"[d&1]); r->failed = 1; continue; } else fprintf(stderr, "[M::%s] analyzing insert size distribution for orientation %c%c...\n", __func__, "FR"[d>>1&1], "FR"[d&1]); - ks_introsort_uint64_t(q->n, q->a); + ks_introsort_64(q->n, q->a); p25 = q->a[(int)(.25 * q->n + .499)]; p50 = q->a[(int)(.50 * q->n + .499)]; p75 = q->a[(int)(.75 * q->n + .499)]; @@ -102,7 +99,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { - vec64_t v; + uint64_v v; int r, i, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { @@ -113,7 +110,7 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con kv_push(uint64_t, v, key); } } - ks_introsort_uint64_t(v.n, v.a); + ks_introsort_64(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; printf("**** %ld\n", v.n); for (i = 0; i < v.n; ++i) { diff --git a/bwape.c b/bwape.c index 779670f..644b5bd 100644 --- a/bwape.c +++ b/bwape.c @@ -21,24 +21,15 @@ typedef struct { bwtint_t low, high, high_bayesian; } isize_info_t; -typedef struct { - uint64_t x, y; -} b128_t; - -#define b128_lt(a, b) ((a).x < (b).x) #define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) #define b128_hash(a) ((uint32_t)(a).x) #include "khash.h" -KHASH_INIT(b128, b128_t, poslist_t, 1, b128_hash, b128_eq) - -#include "ksort.h" -KSORT_INIT(b128, b128_t, b128_lt) -KSORT_INIT_GENERIC(uint64_t) +KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq) typedef struct { - kvec_t(b128_t) 
arr; - kvec_t(b128_t) pos[2]; + pair64_v arr; + pair64_v pos[2]; kvec_t(bwt_aln1_t) aln[2]; } pe_data_t; @@ -120,7 +111,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double free(isizes); return -1; } - ks_introsort(uint64_t, tot, isizes); + ks_introsort_64(tot, isizes); p25 = isizes[(int)(tot*0.25 + 0.5)]; p50 = isizes[(int)(tot*0.50 + 0.5)]; p75 = isizes[(int)(tot*0.75 + 0.5)]; @@ -170,7 +161,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, { int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; uint64_t o_score, subo_score; - b128_t last_pos[2][2], o_pos[2]; + pair64_t last_pos[2][2], o_pos[2]; max_len = p[0]->full_len; if (max_len < p[1]->full_len) max_len = p[1]->full_len; if (low_bound < max_len) low_bound = max_len; @@ -206,11 +197,11 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, o_score = subo_score = (uint64_t)-1; o_n = subo_n = 0; - ks_introsort(b128, d->arr.n, d->arr.a); + ks_introsort_128(d->arr.n, d->arr.a); for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; if (opt->type == BWA_PET_STD) { for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; + pair64_t x = d->arr.a[i]; int strand = x.y>>1&1; if (strand == 1) { // reverse strand, then check int y = 1 - (x.y&1); @@ -223,7 +214,7 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, } } else if (opt->type == BWA_PET_SOLID) { for (i = 0; i < d->arr.n; ++i) { - b128_t x = d->arr.a[i]; + pair64_t x = d->arr.a[i]; int strand = x.y>>1&1; if ((strand^x.y)&1) { // push int y = 1 - (x.y&1); @@ -345,7 +336,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) { // only when both ends mapped - b128_t x; + pair64_t x; int j, k; long 
long n_occ[2]; for (j = 0; j < 2; ++j) { @@ -360,7 +351,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw bwt_aln1_t *r = d->aln[j].a + k; bwtint_t l; if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table - b128_t key; + pair64_t key; int ret; key.x = r->k; key.y = r->l; khint_t iter = kh_put(b128, g_hash, key, &ret); @@ -377,14 +368,14 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw for (l = 0; l < kh_val(g_hash, iter).n; ++l) { x.x = kh_val(g_hash, iter).a[l]>>1; x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } else { // then calculate on the fly for (l = r->k; l <= r->l; ++l) { int strand; x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); x.y = k<<2 | strand<<1 | j; - kv_push(b128_t, d->arr, x); + kv_push(pair64_t, d->arr, x); } } } diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 633641e..8a8287b 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -6,6 +6,7 @@ #include "bntseq.h" #include "bwtsw2.h" #include "kstring.h" +#include "utils.h" #ifndef _NO_SSE2 #include "ksw.h" #else @@ -24,7 +25,6 @@ typedef struct { bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) { - extern void ks_introsort_uint64_t(size_t n, uint64_t *a); int i, k, x, p25, p50, p75, tmp, max_len = 0; uint64_t *isize; bsw2pestat_t r; @@ -44,7 +44,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) max_len = max_len > t[1]->end - t[1]->beg? 
max_len : t[1]->end - t[1]->beg; isize[k++] = l; } - ks_introsort_uint64_t(k, isize); + ks_introsort_64(k, isize); p25 = isize[(int)(.25 * k + .499)]; p50 = isize[(int)(.50 * k + .499)]; p75 = isize[(int)(.75 * k + .499)]; diff --git a/khash.h b/khash.h index de6be6d..2422044 100644 --- a/khash.h +++ b/khash.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2009 by attractor + Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -33,7 +33,6 @@ int main() { khiter_t k; khash_t(32) *h = kh_init(32); k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); kh_value(h, k) = 10; k = kh_get(32, h, 10); is_missing = (k == kh_end(h)); @@ -47,6 +46,29 @@ int main() { */ /* + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + 2009-09-26 (0.2.4): * Improve portability @@ -86,11 +108,9 @@ int main() { @header Generic hash table library. 
- - @copyright Heng Li */ -#define AC_VERSION_KHASH_H "0.2.4" +#define AC_VERSION_KHASH_H "0.2.6" #include #include @@ -111,24 +131,14 @@ typedef unsigned long long khint64_t; #endif #ifdef _MSC_VER -#define inline __inline +#define kh_inline __inline +#else +#define kh_inline inline #endif typedef khint32_t khint_t; typedef khint_t khiter_t; -#define __ac_HASH_PRIME_SIZE 32 -static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = -{ - 0ul, 3ul, 11ul, 23ul, 53ul, - 97ul, 193ul, 389ul, 769ul, 1543ul, - 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, - 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, - 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, - 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, - 3221225473ul, 4294967291ul -}; - #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) @@ -137,88 +147,128 @@ static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#endif + +#define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + static const double __ac_HASH_UPPER = 0.77; -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - static inline kh_##name##_t *kh_init_##name() { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ - static inline void kh_destroy_##name(kh_##name##_t *h) \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - free(h->keys); free(h->flags); \ - free(h->vals); \ - free(h); \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ } \ } \ - static inline void 
kh_clear_##name(kh_##name##_t *h) \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khint_t inc, k, i, last; \ - k = __hash_func(key); i = k % h->n_buckets; \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. 
*/ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ - khint_t t = __ac_HASH_PRIME_SIZE - 1; \ - while (__ac_prime_list[t] > new_n_buckets) --t; \ - new_n_buckets = __ac_prime_list[t+1]; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ - else { \ - new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) return -1; \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) return -1; \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ } \ } \ - if (j) { \ + if (j) { /* rehashing is needed */ \ for (j = 0; j != h->n_buckets; ++j) { \ if (__ac_iseither(h->flags, j) == 0) { \ khkey_t key = h->keys[j]; \ khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ if (kh_is_map) val = h->vals[j]; \ __ac_set_isdel_true(h->flags, j); \ - while (1) { \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ khint_t inc, k, i; \ k = __hash_func(key); \ - i = k % new_n_buckets; \ - inc = 1 + k % 
(new_n_buckets - 1); \ - while (!__ac_isempty(new_flags, i)) { \ - if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ - else i += inc; \ - } \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); \ - } else { \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ h->keys[i] = key; \ if (kh_is_map) h->vals[i] = val; \ break; \ @@ -226,35 +276,39 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (h->n_buckets > new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - free(h->flags); \ + kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ + return 0; \ } \ - static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ - if (h->n_occupied >= h->upper_bound) { \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ - else kh_resize_##name(h, h->n_buckets + 1); \ - 
} \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khint_t inc, k, i, site, last; \ - x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ - if (__ac_isempty(h->flags, i)) x = i; \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ + inc = __ac_inc(k, mask); last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ + i = (i + inc) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ @@ -263,20 +317,20 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ } \ - if (__ac_isempty(h->flags, x)) { \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; ++h->n_occupied; \ *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ h->keys[x] = key; \ __ac_set_isboth_false(h->flags, x); \ ++h->size; \ *ret = 2; \ - } else *ret = 0; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ } \ - static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ 
__ac_set_isdel_true(h->flags, x); \ @@ -284,6 +338,17 @@ static const double __ac_HASH_UPPER = 0.77; } \ } +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @@ -311,10 +376,10 @@ static const double __ac_HASH_UPPER = 0.77; @param s Pointer to a null terminated string @return The hash value */ -static inline khint_t __ac_X31_hash_string(const char *s) +static kh_inline khint_t __ac_X31_hash_string(const char *s) { - khint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! @function @@ -328,9 +393,21 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) + /* --- END OF HASH FUNCTIONS --- */ -/* Other necessary macros... */ +/* Other convenient macros... */ /*! @abstract Type of the hash table. @@ -396,7 +473,6 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_del(name, h, k) kh_del_##name(h, k) - /*! @function @abstract Test whether a bucket contains data. 
@param h Pointer to the hash table [khash_t(name)*] @@ -455,6 +531,34 @@ static inline khint_t __ac_X31_hash_string(const char *s) */ #define kh_n_buckets(h) ((h)->n_buckets) +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + /* More conenient interfaces */ /*! 
@function diff --git a/utils.c b/utils.c index 8c1ad7e..41594c3 100644 --- a/utils.c +++ b/utils.c @@ -35,6 +35,12 @@ #include #include "utils.h" +#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) + +#include "ksort.h" +KSORT_INIT(128, pair64_t, pair64_lt) +KSORT_INIT(64, uint64_t, ks_lt_generic) + FILE *err_xopen_core(const char *func, const char *fn, const char *mode) { FILE *fp = 0; @@ -46,6 +52,7 @@ FILE *err_xopen_core(const char *func, const char *fn, const char *mode) } return fp; } + FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) { if (freopen(fn, mode, fp) == 0) { @@ -56,6 +63,7 @@ FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE } return fp; } + gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) { gzFile fp; @@ -67,6 +75,7 @@ gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) } return fp; } + void err_fatal(const char *header, const char *fmt, ...) { va_list args; @@ -86,66 +95,48 @@ void err_fatal_simple_core(const char *func, const char *msg) size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { - size_t ret = fwrite(ptr, size, nmemb, stream); - if (ret != nmemb) - { - err_fatal_simple_core("fwrite", strerror(errno)); - } - return ret; + size_t ret = fwrite(ptr, size, nmemb, stream); + if (ret != nmemb) + err_fatal_simple_core("fwrite", strerror(errno)); + return ret; } int err_printf(const char *format, ...) 
{ - va_list arg; - int done; - - va_start(arg, format); - done = vfprintf(stdout, format, arg); - int saveErrno = errno; - va_end(arg); - - if (done < 0) - { - err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); - } - return done; + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stdout, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) err_fatal_simple_core("vfprintf(stdout)", strerror(saveErrno)); + return done; } int err_fprintf(FILE *stream, const char *format, ...) { - va_list arg; - int done; - - va_start(arg, format); - done = vfprintf(stream, format, arg); - int saveErrno = errno; - va_end(arg); - - if (done < 0) - { - err_fatal_simple_core("vfprintf", strerror(saveErrno)); - } - return done; + va_list arg; + int done; + va_start(arg, format); + done = vfprintf(stream, format, arg); + int saveErrno = errno; + va_end(arg); + if (done < 0) err_fatal_simple_core("vfprintf", strerror(saveErrno)); + return done; } int err_fflush(FILE *stream) { - int ret = fflush(stream); - if (ret != 0) - { - err_fatal_simple_core("fflush", strerror(errno)); - } - return ret; + int ret = fflush(stream); + if (ret != 0) err_fatal_simple_core("fflush", strerror(errno)); + return ret; } int err_fclose(FILE *stream) { - int ret = fclose(stream); - if (ret != 0) - { - err_fatal_simple_core("fclose", strerror(errno)); - } - return ret; + int ret = fclose(stream); + if (ret != 0) err_fatal_simple_core("fclose", strerror(errno)); + return ret; } double cputime() diff --git a/utils.h b/utils.h index b6839e9..5abab41 100644 --- a/utils.h +++ b/utils.h @@ -28,6 +28,7 @@ #ifndef LH3_UTILS_H #define LH3_UTILS_H +#include #include #include @@ -38,14 +39,19 @@ #define ATTRIBUTE(list) #endif - - #define err_fatal_simple(msg) err_fatal_simple_core(__func__, msg) #define xopen(fn, mode) err_xopen_core(__func__, fn, mode) #define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) #define xzopen(fn, mode) err_xzopen_core(__func__, fn, 
mode) #define xassert(cond, msg) if ((cond) == 0) err_fatal_simple_core(__func__, msg) +typedef struct { + uint64_t x, y; +} pair64_t; + +typedef struct { size_t n, m; uint64_t *a; } uint64_v; +typedef struct { size_t n, m; pair64_t *a; } pair64_v; + #ifdef __cplusplus extern "C" { #endif @@ -66,6 +72,9 @@ extern "C" { double cputime(); double realtime(); + void ks_introsort_64 (size_t n, uint64_t *a); + void ks_introsort_128(size_t n, pair64_t *a); + #ifdef __cplusplus } #endif From 6ad5a3c086c82139f6f30f191fbfcc3cae2ec542 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 10:21:17 -0500 Subject: [PATCH 213/498] removed color-space support which has been broken since 0.6.x --- Makefile | 3 +- bwape.c | 38 ++--------- bwase.c | 45 ++----------- bwase.h | 2 +- bwtaln.c | 4 +- bwtaln.h | 1 - bwtindex.c | 24 +------ cs2nt.c | 191 ---------------------------------------------------- main.c | 3 - main.h | 2 - simple_dp.c | 162 -------------------------------------------- 11 files changed, 17 insertions(+), 458 deletions(-) delete mode 100644 cs2nt.c delete mode 100644 simple_dp.c diff --git a/Makefile b/Makefile index 6a3fc1e..8cf767a 100644 --- a/Makefile +++ b/Makefile @@ -6,8 +6,7 @@ DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ bseq.o bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ - is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \ - bwape.o cs2nt.o \ + is.o bwtmisc.o bwtindex.o ksw.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwape.c b/bwape.c index 644b5bd..4201cf2 100644 --- a/bwape.c +++ b/bwape.c @@ -212,19 +212,6 @@ static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, last_pos[x.y&1][1] = x; } } - } else if (opt->type == BWA_PET_SOLID) { - for (i = 0; i < d->arr.n; ++i) { - pair64_t x = d->arr.a[i]; - int strand = 
x.y>>1&1; - if ((strand^x.y)&1) { // push - int y = 1 - (x.y&1); - __pairing_aux(last_pos[y][1], x); - __pairing_aux(last_pos[y][0], x); - } else { // check - last_pos[x.y&1][0] = last_pos[x.y&1][1]; - last_pos[x.y&1][1] = x; - } - } } else { fprintf(stderr, "[paring] not implemented yet!\n"); exit(1); @@ -567,11 +554,11 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, ++n_tot[is_singleton]; cigar[0] = cigar[1] = 0; n_cigar[0] = n_cigar[1] = 0; - if (popt->type != BWA_PET_STD && popt->type != BWA_PET_SOLID) continue; // other types of pairing is not considered + if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified ubyte_t *seq; if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip - if (popt->type == BWA_PET_STD) { + { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate __set_rght_coor(beg[k], end[k], p[1-k], p[k]); seq = p[k]->rseq; @@ -580,17 +567,6 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, seq = p[k]->seq; seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly } - } else { // BWA_PET_SOLID - if (p[1-k]->strand == 0) { // R3-F3 pairing - if (k == 0) __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->rseq; - seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed - } else { // F3-R3 pairing - if (k == 0) __set_rght_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is R3 - else __set_left_coor(beg[k], end[k], p[1-k], p[k]); // p[k] is F3 - seq = p[k]->seq; - } } // perform SW alignment cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, 
&beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); @@ -654,7 +630,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f bwa_seq_t *seqs[2]; bwa_seqio_t *ks[2]; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa[2]; gap_opt_t opt, opt0; khint_t iter; @@ -679,10 +655,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f opt0 = opt; fread(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); - if (!(opt.mode & BWA_MODE_COMPREAD)) { - popt->type = BWA_PET_SOLID; - ntbns = bwa_open_nt(prefix); - } else { // for Illumina alignment only + { // for Illumina alignment only if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); @@ -715,7 +688,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... 
"); for (j = 0; j < 2; ++j) - bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); if (pac == 0) free(pacseq); @@ -740,7 +713,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // destroy bns_destroy(bns); - if (ntbns) bns_destroy(ntbns); for (i = 0; i < 2; ++i) { bwa_seq_close(ks[i]); fclose(fp_sa[i]); diff --git a/bwase.c b/bwase.c index 35744e7..8fa79ac 100644 --- a/bwase.c +++ b/bwase.c @@ -296,18 +296,12 @@ void bwa_correct_trimmed(bwa_seq_t *s) s->len = s->full_len; } -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns) +void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { - ubyte_t *pacseq, *ntpac = 0; + ubyte_t *pacseq; int i, j; kstring_t *str; - if (ntbns) { // in color space - ntpac = (ubyte_t*)calloc(ntbns->l_pac/4+1, 1); - rewind(ntbns->fp_pac); - fread(ntpac, 1, ntbns->l_pac/4 + 1, ntbns->fp_pac); - } - if (!_pacseq) { pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); rewind(bns->fp_pac); @@ -328,28 +322,6 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); } -#if 0 - if (ntbns) { // in color space - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - bwa_cs2nt_core(s, bns->l_pac, ntpac); - for (j = 0; j < s->n_multi; ++j) { - bwt_multi1_t *q = s->multi + j; - int n_cigar; - if (q->gap == 0) continue; - free(q->cigar); - q->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand? 
1 : -1) * q->gap, &n_cigar, 0); - q->n_cigar = n_cigar; - } - if (s->type != BWA_TYPE_NO_MATCH && s->cigar) { // update cigar again - free(s->cigar); - s->cigar = bwa_refine_gapped_core(bns->l_pac, ntpac, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 0); - } - } - } -#endif // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { @@ -357,18 +329,16 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t if (s->type != BWA_TYPE_NO_MATCH) { int nm; s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, ntbns? ntpac : pacseq, str, &nm); + bns->l_pac, pacseq, str, &nm); s->nm = nm; } } free(str->s); free(str); // correct for trimmed reads - if (!ntbns) // trimming is only enabled for Illumina reads - for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); + for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); if (!_pacseq) free(pacseq); - free(ntpac); } int64_t pos_end(const bwa_seq_t *p) @@ -587,7 +557,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; - bntseq_t *bns, *ntbns = 0; + bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; @@ -599,8 +569,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f m_aln = 0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac - ntbns = bwa_open_nt(prefix); bwa_print_sam_SQ(bns); //bwa_print_sam_PG(); // set ks @@ -628,7 +596,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] refine gapped alignments... 
"); - bwa_refine_gapped(bns, n_seqs, seqs, 0, ntbns); + bwa_refine_gapped(bns, n_seqs, seqs, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); fprintf(stderr, "[bwa_aln_core] print alignments... "); @@ -642,7 +610,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f // destroy bwa_seq_close(ks); - if (ntbns) bns_destroy(ntbns); bns_destroy(bns); fclose(fp_sa); free(aln); diff --git a/bwase.h b/bwase.h index f8e9b0a..26a9f68 100644 --- a/bwase.h +++ b/bwase.h @@ -14,7 +14,7 @@ extern "C" { // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); // Refine the approximate position of the sequence to an actual placement for the sequence. - void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); + void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); // Backfill certain alignment properties mainly centering around number of matches. void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. 
diff --git a/bwtaln.c b/bwtaln.c index efc7f66..84be510 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -252,7 +252,7 @@ int bwa_aln(int argc, char *argv[]) char *prefix; opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:cLR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { + while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -272,7 +272,6 @@ int bwa_aln(int argc, char *argv[]) case 'L': opt->mode |= BWA_MODE_LOGGAP; break; case 'R': opt->max_top2 = atoi(optarg); break; case 'q': opt->trim_qual = atoi(optarg); break; - case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; case 'f': xreopen(optarg, "wb", stdout); break; case 'b': opt->mode |= BWA_MODE_BAM; break; @@ -310,7 +309,6 @@ int bwa_aln(int argc, char *argv[]) fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); fprintf(stderr, " -B INT length of barcode\n"); -// fprintf(stderr, " -c input sequences are in the color space\n"); fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); diff --git a/bwtaln.h b/bwtaln.h index 39eaf4b..412cc04 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -107,7 +107,6 @@ typedef struct { } gap_opt_t; #define BWA_PET_STD 1 -#define BWA_PET_SOLID 2 typedef struct { int max_isize, force_isize; diff --git a/bwtindex.c b/bwtindex.c index 938e982..c01fa95 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -42,11 +42,11 @@ void bwa_pac_rev_core(const char *fn, const char *fn_rev); int bwa_index(int argc, char *argv[]) { char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_color = 0, 
is_64 = 0; + int c, algo_type = 0, is_64 = 0; clock_t t; int64_t l_pac; - while ((c = getopt(argc, argv, "6ca:p:")) >= 0) { + while ((c = getopt(argc, argv, "6a:p:")) >= 0) { switch (c) { case 'a': // if -a is not set, algo_type will be determined later if (strcmp(optarg, "div") == 0) algo_type = 1; @@ -55,7 +55,6 @@ int bwa_index(int argc, char *argv[]) else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; case 'p': prefix = strdup(optarg); break; - case 'c': is_color = 1; break; case '6': is_64 = 1; break; default: return 1; } @@ -67,7 +66,6 @@ int bwa_index(int argc, char *argv[]) fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); fprintf(stderr, " -6 index files named as .64.* instead of .* \n"); -// fprintf(stderr, " -c build color-space index\n"); fprintf(stderr, "\n"); fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); @@ -83,29 +81,13 @@ int bwa_index(int argc, char *argv[]) str2 = (char*)calloc(strlen(prefix) + 10, 1); str3 = (char*)calloc(strlen(prefix) + 10, 1); - if (is_color == 0) { // nucleotide indexing + { // nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); t = clock(); fprintf(stderr, "[bwa_index] Pack FASTA... "); l_pac = bns_fasta2bntseq(fp, prefix, 0); fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); gzclose(fp); - } else { // color indexing - gzFile fp = xzopen(argv[optind], "r"); - strcat(strcpy(str, prefix), ".nt"); - t = clock(); - fprintf(stderr, "[bwa_index] Pack nucleotide FASTA... 
"); - l_pac = bns_fasta2bntseq(fp, str, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - gzclose(fp); - { - char *tmp_argv[3]; - tmp_argv[0] = argv[0]; tmp_argv[1] = str; tmp_argv[2] = prefix; - t = clock(); - fprintf(stderr, "[bwa_index] Convert nucleotide PAC to color PAC... "); - bwa_pac2cspac(3, tmp_argv); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } } if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT { diff --git a/cs2nt.c b/cs2nt.c deleted file mode 100644 index dfbce60..0000000 --- a/cs2nt.c +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include -#include -#include "bwtaln.h" -#include "stdaln.h" - -/* - Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we - decode as ATTGAC(RBGOG), there are one color change and one nt change; - if we decode as ATTAAC(RBRBG), there are two color changes. - - In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM - as the penalty; otherwise, we will use color quality as the - penalty. This means we always prefer two consistent color changes over - a nt change, but if a color has high quality, we may prefer one nt - change. - - In the above example, the penalties of the two types of decoding are - q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first; - otherwise the second. Note that no matter what we choose, the fourth - base will get a low nt quality. 
- */ - -#define COLOR_MM 19 -#define NUCL_MM 25 - -static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 }; - -/* - {A,C,G,T,N} -> {0,1,2,3,4} - nt_ref[0..size]: nucleotide reference: 0/1/2/3/4 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned) - btarray[0..4*size]: backtrack array (working space) - */ -void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray) -{ - int h[8], curr, last; - int x, y, xmin, hmin, k; - - // h[0..3] and h[4..7] are the current and last best score array, depending on curr and last - - // recursion: initial value - if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2); - else { - for (x = 0; x != 4; ++x) h[x] = NUCL_MM; - h[nt_ref[0]] = 0; - } - // recursion: main loop - curr = 1; last = 0; - for (k = 1; k <= size; ++k) { - for (x = 0; x != 4; ++x) { - int min = 0x7fffffff, ymin = 0; - for (y = 0; y != 4; ++y) { - int s = h[last<<2|y]; - if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<= 0; --k) - nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]]; -} -/* - nt_read[0..size]: nucleotide read sequence: 0/1/2/3 - cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N - tarray[0..size*2-1]: temporary array - */ -uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray) -{ - int k, c1, c2; - uint8_t *t2array = tarray + size; - // get the color sequence of nt_read - c1 = nt_read[0]; - for (k = 1; k <= size; ++k) { - c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case - tarray[k-1] = (c1 >= 4 || c2 >= 4)? 
4 : nst_ntnt2cs_table[1<>6 && tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10; - } else if (tarray[k-1] == cs_read[k-1]>>6) { - q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f); - } else if (tarray[k] == cs_read[k]>>6) { - q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f); - } // else, q = 0 - if (q < 0) q = 0; - if (q > 60) q = 60; - t2array[k] = nt_read[k]<<6 | q; - if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0; - } - return t2array + 1; // of size-2 -} - -// this function will be called when p->seq has been reversed by refine_gapped() -void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac) -{ - uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read; - int i, len; - uint8_t *seq; - - // set temporary arrays - if (p->type == BWA_TYPE_NO_MATCH) return; - len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space - ta = (uint8_t*)malloc(len * 7); - nt_ref = ta; - cs_read = nt_ref + len; - nt_read = cs_read + len; - btarray = nt_read + len; - tarray = nt_read + len; - -#define __gen_csbase(_cs, _i, _seq) do { \ - int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \ - if (q > 60) q = 60; \ - if (_seq[_i] > 3) q = 63; \ - (_cs) = _seq[_i]<<6 | q; \ - } while (0) - - // generate len, nt_ref[] and cs_read - seq = p->strand? p->rseq : p->seq; - nt_ref[0] = p->pos? 
bns_pac(pac, p->pos-1) : 4; - if (p->cigar == 0) { // no gap or clipping - len = p->len; - for (i = 0; i < p->len; ++i) { - __gen_csbase(cs_read[i], i, seq); - nt_ref[i+1] = bns_pac(pac, p->pos + i); - } - } else { - int k, z; - bwtint_t x, y; - x = p->pos; y = 0; - for (k = z = 0; k < p->n_cigar; ++k) { - int l = __cigar_len(p->cigar[k]); - if (__cigar_op(p->cigar[k]) == FROM_M) { - for (i = 0; i < l; ++i, ++x, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = bns_pac(pac, x); - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_I) { - for (i = 0; i < l; ++i, ++y) { - __gen_csbase(cs_read[z], y, seq); - nt_ref[z+1] = 4; - ++z; - } - } else if (__cigar_op(p->cigar[k]) == FROM_S) y += l; - else x += l; - } - len = z; - } - - cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray); - new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray); - - // update p - p->len = p->full_len = len - 1; - for (i = 0; i < p->len; ++i) { - if ((new_nt_read[i]&0x3f) == 63) { - p->qual[i] = 33; seq[i] = 4; - } else { - p->qual[i] = (new_nt_read[i]&0x3f) + 33; - seq[i] = new_nt_read[i]>>6; - } - } - p->qual[p->len] = seq[p->len] = 0; - if (p->strand) { - memcpy(p->seq, seq, p->len); - seq_reverse(p->len, p->seq, 1); - seq_reverse(p->len, p->qual, 0); - } else { - memcpy(p->rseq, seq, p->len); - seq_reverse(p->len, p->rseq, 1); - } - free(ta); -} diff --git a/main.c b/main.c index 2718732..fc63c2e 100644 --- a/main.c +++ b/main.c @@ -28,7 +28,6 @@ static int usage() fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); - fprintf(stderr, " stdsw standard SW/NW alignment\n"); fprintf(stderr, "\n"); return 1; } @@ -51,11 +50,9 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); else if (strcmp(argv[1], "aln") == 0) ret = 
bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "sw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); - else if (strcmp(argv[1], "stdsw") == 0) ret = bwa_stdsw(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); diff --git a/main.h b/main.h index 1a0292a..7b638ca 100644 --- a/main.h +++ b/main.h @@ -17,8 +17,6 @@ extern "C" { int bwa_sai2sam_se(int argc, char *argv[]); int bwa_sai2sam_pe(int argc, char *argv[]); - int bwa_stdsw(int argc, char *argv[]); - int bwa_bwtsw2(int argc, char *argv[]); int main_fastmap(int argc, char *argv[]); diff --git a/simple_dp.c b/simple_dp.c deleted file mode 100644 index d2b4b71..0000000 --- a/simple_dp.c +++ /dev/null @@ -1,162 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "stdaln.h" -#include "utils.h" - -#include "kseq.h" -KSEQ_DECLARE(gzFile) - -typedef struct { - int l; - unsigned char *s; - char *n; -} seq1_t; - -typedef struct { - int n_seqs, m_seqs; - seq1_t *seqs; -} seqs_t; - -unsigned char aln_rev_table[256] = { - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', - 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', - 'N','t','v','g', 'h','N','N','c', 'd','N','N','m', 'N','k','N','N', - 'N','N','y','s', 'a','N','b','w', 'x','r}; - -static int g_is_global = 0, g_thres = 
1, g_strand = 0, g_aa = 0; -static AlnParam g_aln_param; - -static void revseq(int len, uint8_t *seq) -{ - int i; - for (i = 0; i < len>>1; ++i) { - uint8_t tmp = aln_rev_table[seq[len-1-i]]; - seq[len-1-i] = aln_rev_table[seq[i]]; - seq[i] = tmp; - } - if (len&1) seq[i] = aln_rev_table[seq[i]]; -} - -static seqs_t *load_seqs(const char *fn) -{ - seqs_t *s; - seq1_t *p; - gzFile fp; - int l; - kseq_t *seq; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - s = (seqs_t*)calloc(1, sizeof(seqs_t)); - s->m_seqs = 256; - s->seqs = (seq1_t*)calloc(s->m_seqs, sizeof(seq1_t)); - while ((l = kseq_read(seq)) >= 0) { - if (s->n_seqs == s->m_seqs) { - s->m_seqs <<= 1; - s->seqs = (seq1_t*)realloc(s->seqs, s->m_seqs * sizeof(seq1_t)); - } - p = s->seqs + (s->n_seqs++); - p->l = seq->seq.l; - p->s = (unsigned char*)malloc(p->l + 1); - memcpy(p->s, seq->seq.s, p->l); - p->s[p->l] = 0; - p->n = strdup((const char*)seq->name.s); - } - kseq_destroy(seq); - gzclose(fp); - fprintf(stderr, "[load_seqs] %d sequences are loaded.\n", s->n_seqs); - return s; -} - -static void aln_1seq(const seqs_t *ss, const char *name, int l, const char *s, char strand) -{ - int i; - for (i = 0; i < ss->n_seqs; ++i) { - AlnAln *aa; - seq1_t *p = ss->seqs + i; - g_aln_param.band_width = l + p->l; - aa = aln_stdaln_aux(s, (const char*)p->s, &g_aln_param, g_is_global, g_thres, l, p->l); - if (aa->score >= g_thres || g_is_global) { - printf(">%s\t%d\t%d\t%s\t%c\t%d\t%d\t%d\t%d\t", p->n, aa->start1? aa->start1 : 1, aa->end1, name, strand, - aa->start2? aa->start2 : 1, aa->end2, aa->score, aa->subo); - // NB: I put the short sequence as the first sequence in SW, an insertion to - // the reference becomes a deletion from the short sequence. Therefore, I use - // "MDI" here rather than "MID", and print ->out2 first rather than ->out1. 
- for (i = 0; i != aa->n_cigar; ++i) - printf("%d%c", aa->cigar32[i]>>4, "MDI"[aa->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aa->out2, aa->outm, aa->out1); - } - aln_free_AlnAln(aa); - } -} - -static void aln_seqs(const seqs_t *ss, const char *fn) -{ - gzFile fp; - kseq_t *seq; - int l; - - fp = xzopen(fn, "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - if (g_strand&1) aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '+'); - if (g_strand&2) { - revseq(l, (uint8_t*)seq->seq.s); - aln_1seq(ss, (char*)seq->name.s, l, seq->seq.s, '-'); - } - } - kseq_destroy(seq); - gzclose(fp); -} - -int bwa_stdsw(int argc, char *argv[]) -{ - int c; - seqs_t *ss; - - while ((c = getopt(argc, argv, "gT:frp")) >= 0) { - switch (c) { - case 'g': g_is_global = 1; break; - case 'T': g_thres = atoi(optarg); break; - case 'f': g_strand |= 1; break; - case 'r': g_strand |= 2; break; - case 'p': g_aa = 1; break; - } - } - if (g_strand == 0) g_strand = 3; - if (g_aa) g_strand = 1; - if (optind + 1 >= argc) { - fprintf(stderr, "\nUsage: bwa stdsw [options] \n\n"); - fprintf(stderr, "Options: -T INT minimum score [%d]\n", g_thres); - fprintf(stderr, " -p protein alignment (suppressing -r)\n"); - fprintf(stderr, " -f forward strand only\n"); - fprintf(stderr, " -r reverse strand only\n"); - fprintf(stderr, " -g global alignment\n\n"); - fprintf(stderr, "Note: This program is specifically designed for alignment between multiple short\n"); - fprintf(stderr, " sequences and ONE long sequence. It outputs the suboptimal score on the long\n"); - fprintf(stderr, " sequence.\n\n"); - return 1; - } - g_aln_param = g_aa? aln_param_aa2aa : aln_param_blast; - g_aln_param.gap_end = 0; - ss = load_seqs(argv[optind]); - aln_seqs(ss, argv[optind+1]); - return 0; -} From 95d18449b385d75bd8e5ffebfbdb39bfeb526e8e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 10:36:15 -0500 Subject: [PATCH 214/498] merge bseq.{h,c} to utils.{h,c} I do not like many small files. 
--- Makefile | 2 +- bseq.c | 55 ---------------------------------------------- bseq.h | 11 ---------- bwamem.c | 1 + bwamem.h | 7 +++--- bwtsw2_aux.c | 1 - fastmap.c | 2 +- utils.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++-- utils.h | 7 ++++++ 9 files changed, 73 insertions(+), 75 deletions(-) delete mode 100644 bseq.c delete mode 100644 bseq.h diff --git a/Makefile b/Makefile index 8cf767a..334616c 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ - bseq.o bwaseqio.o bwase.o kstring.o + bwaseqio.o bwase.o kstring.o AOBJS= QSufSort.o bwt_gen.o \ is.o bwtmisc.o bwtindex.o ksw.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/bseq.c b/bseq.c deleted file mode 100644 index d20b983..0000000 --- a/bseq.c +++ /dev/null @@ -1,55 +0,0 @@ -#include -#include -#include -#include -#include "bseq.h" -#include "kseq.h" -KSEQ_INIT2(, gzFile, gzread) - -static inline void trim_readno(kstring_t *s) -{ - if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) - s->l -= 2, s->s[s->l] = 0; -} - -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) -{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = strdup(ks->name.s); - s->comment = ks->comment.l? strdup(s->comment) : 0; - s->seq = strdup(ks->seq.s); - s->qual = ks->qual.l? strdup(ks->qual.s) : 0; - s->l_seq = strlen(s->seq); -} - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) -{ - kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; - int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; - while (kseq_read(ks) >= 0) { - if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads - fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); - break; - } - if (n >= m) { - m = m? 
m<<1 : 256; - seqs = realloc(seqs, m * sizeof(bseq1_t)); - } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - size += seqs[n++].l_seq; - if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - size += seqs[n++].l_seq; - } - if (size >= chunk_size) break; - } - if (size == 0) { // test if the 2nd file is finished - if (ks2 && kseq_read(ks2) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); - } - *n_ = n; - return seqs; -} diff --git a/bseq.h b/bseq.h deleted file mode 100644 index 978312a..0000000 --- a/bseq.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef BATCHSEQ_H_ -#define BATCHSEQ_H_ - -typedef struct { - int l_seq; - char *name, *comment, *seq, *qual, *sam; -} bseq1_t; - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - -#endif diff --git a/bwamem.c b/bwamem.c index 7557af6..8d0494d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -10,6 +10,7 @@ #include "bwamem.h" #include "bntseq.h" #include "ksw.h" +#include "kvec.h" #include "ksort.h" #define MAPQ_COEF 40. 
diff --git a/bwamem.h b/bwamem.h index b95c96d..4e2e5ce 100644 --- a/bwamem.h +++ b/bwamem.h @@ -3,8 +3,7 @@ #include "bwt.h" #include "bntseq.h" -#include "bseq.h" -#include "kvec.h" +#include "utils.h" struct __smem_i; typedef struct __smem_i smem_i; @@ -51,8 +50,8 @@ typedef struct { int64_t mb, me; // mb: mate start; -1 if single-end; -2 if mate unmapped } bwahit_t; -typedef kvec_t(mem_chain_t) mem_chain_v; -typedef kvec_t(mem_alnreg_t) mem_alnreg_v; +typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; +typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; extern int mem_verbose; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index a18ffc8..55c7c64 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,7 +13,6 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" -#include "bseq.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/fastmap.c b/fastmap.c index 56674f9..f2677eb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -6,7 +6,7 @@ #include "bwt.h" #include "bwamem.h" #include "kvec.h" -#include "bseq.h" +#include "utils.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/utils.c b/utils.c index 41594c3..127c8fe 100644 --- a/utils.c +++ b/utils.c @@ -35,9 +35,8 @@ #include #include "utils.h" -#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) - #include "ksort.h" +#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) @@ -139,6 +138,10 @@ int err_fclose(FILE *stream) return ret; } +/********* + * Timer * + *********/ + double cputime() { struct rusage r; @@ -153,3 +156,58 @@ double realtime() gettimeofday(&tp, &tzp); return tp.tv_sec + tp.tv_usec * 1e-6; } + +/************************ + * Batch FASTA/Q reader * + ************************/ + +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, 
s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(s->comment) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} diff --git a/utils.h b/utils.h index 5abab41..6c065c1 100644 --- a/utils.h +++ b/utils.h @@ -52,6 +52,11 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; +typedef struct { + int l_seq; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + #ifdef __cplusplus extern "C" { #endif @@ -75,6 +80,8 @@ extern "C" { void ks_introsort_64 (size_t n, uint64_t *a); void ks_introsort_128(size_t n, pair64_t *a); + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + #ifdef __cplusplus } #endif From cfdc938fc316e0e4f9764636f87a87f349513549 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 10:39:16 -0500 Subject: [PATCH 215/498] to 
exclude "test64" --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 16d123a..57cb318 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.[oa] bwa test +test64 .*.swp From 2fc469d0c9b459c28caf6618520eff948e886a58 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 12:09:36 -0500 Subject: [PATCH 216/498] code backup --- Makefile | 2 +- bwamem.c | 18 +++++++++++------- bwamem_pair.c | 42 +++++++++++++++++++++++++----------------- bwape.c | 13 ------------- utils.h | 13 +++++++++++++ 5 files changed, 50 insertions(+), 38 deletions(-) diff --git a/Makefile b/Makefile index 334616c..2c060e9 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ SUBDIRS= . all:$(PROG) bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) diff --git a/bwamem.c b/bwamem.c index 8d0494d..b44e54e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -587,6 +587,15 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) return mapq; } +void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) +{ + h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; + h->score = a->score; + h->sub = a->sub > a->csub? 
a->sub : a->csub; + h->qual = h->flag = 0; // these are unset + h->mb = h->me = -2; // mate positions are unset +} + void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) { int k; @@ -596,13 +605,8 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (a->n > 0) { for (k = 0; k < a->n; ++k) { bwahit_t h; - mem_alnreg_t *p = &a->a[k]; - h.rb = p->rb; h.re = p->re; - h.qb = p->qb; h.qe = p->qe; - h.score = p->score; h.sub = p->sub; - h.flag = 0; - h.qual = approx_mapq_se(opt, p); - h.mb = h.me = -2; + mem_alnreg2hit(&a->a[k], &h); + h.qual = approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->is_hard); diff --git a/bwamem_pair.c b/bwamem_pair.c index 845051c..6b44f78 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -99,40 +99,48 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { - uint64_v v; + pair64_v v; + pair64_t o, subo; // score<<32 | raw_score<<8 | hash int r, i, y[4]; // y[] keeps the last hit kv_init(v); - for (r = 0; r < 2; ++r) { + for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { - uint64_t key; + pair64_t key; mem_alnreg_t *e = &a[r].a[i]; - key = ((e->rb < bns->l_pac? e->rb<<1 : ((bns->l_pac<<1) - 1 - e->rb)<<1 | 1)<<1 | r) << 30 | e->score; - kv_push(uint64_t, v, key); + key.x = e->rb < bns->l_pac? 
e->rb : (bns->l_pac<<1) - 1 - e->rb; // forward position + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= bns->l_pac)<<1 | r; + kv_push(pair64_t, v, key); } } - ks_introsort_64(v.n, v.a); + ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - printf("**** %ld\n", v.n); + o.x = o.y = subo.x = subo.y = 0; for (i = 0; i < v.n; ++i) { - printf("%lld\t%c\t%lld\t%lld\n", v.a[i]>>32, "+-"[v.a[i]>>31&1], v.a[i]>>30&1, v.a[i]<<34>>34); - for (r = 0; r < 2; ++r) { - int dir = r<<1 | (v.a[i]>>31&1), which, k; + for (r = 0; r < 2; ++r) { // loop through direction + int dir = r<<1 | (v.a[i].y>>1&1), which, k; if (pes[dir].failed) continue; // invalid orientation - which = r<<1 | ((v.a[i]>>30&1)^1); + which = r<<1 | ((v.a[i].y&1)^1); if (y[which] < 0) continue; // no previous hits for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) - int dist; + int64_t dist; + int raw_score, score; double ns; - if ((v.a[k]>>30&3) != which) continue; - dist = (v.a[i]>>32) - (v.a[k]>>32); - printf("%d\t%d\t%d\n", r, which, dist); + uint64_t x, pair; + if ((v.a[k].y&3) != which) continue; + dist = (int64_t)v.a[i].x - v.a[k].x; if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; + raw_score = (v.a[i].y>>32) + (v.a[i].y>>32); + if (raw_score + 20 * opt->a < (subo.x>>8&0xffffff)) continue; // skip the following if the score is too small ns = (dist - pes[dir].avg) / pes[dir].std; - printf("%f\n", ns); + score = (int)(23. 
* raw_score / (opt->a + opt->b) - 4.343 * log(.5 * erfc(fabs(ns) * M_SQRT1_2)) + .499); + pair = (uint64_t)k<<32 | i; + x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair)&0xff); + if (x > o.x) subo = o, o.x = x, o.y = pair; + else if (x > subo.x) subo.x = x, subo.y = pair; } } - y[v.a[i]>>30&3] = i; + y[v.a[i].y&3] = i; } free(v.a); } diff --git a/bwape.c b/bwape.c index 4201cf2..77ae1fa 100644 --- a/bwape.c +++ b/bwape.c @@ -60,19 +60,6 @@ pe_opt_t *bwa_init_pe_opt() po->ap_prior = 1e-5; return po; } - -static inline uint64_t hash_64(uint64_t key) -{ - key += ~(key << 32); - key ^= (key >> 22); - key += ~(key << 13); - key ^= (key >> 8); - key += (key << 3); - key ^= (key >> 15); - key += ~(key << 27); - key ^= (key >> 31); - return key; -} /* static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); { diff --git a/utils.h b/utils.h index 6c065c1..70f4e11 100644 --- a/utils.h +++ b/utils.h @@ -86,4 +86,17 @@ extern "C" { } #endif +static inline uint64_t hash_64(uint64_t key) +{ + key += ~(key << 32); + key ^= (key >> 22); + key += ~(key << 13); + key ^= (key >> 8); + key += (key << 3); + key ^= (key >> 15); + key += ~(key << 27); + key ^= (key >> 31); + return key; +} + #endif From 22b79b3475700160b557c7db0bc770371e85c21f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 15:34:44 -0500 Subject: [PATCH 217/498] mark primary, instead of dropping secondary --- bwamem.c | 24 +++++++++++++++--------- bwamem.h | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/bwamem.c b/bwamem.c index b44e54e..68078d0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -345,14 +345,18 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a) return m; } -int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function +void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this 
function { // similar to the loop in mem_chain_flt() - int i, j, m, tmp; - if (n <= 1) return n; - for (i = 0; i < n; ++i) a[i].sub = 0; + int i, k, tmp; + kvec_t(int) z; + if (n == 0) return; + kv_init(z); + for (i = 0; i < n; ++i) a[i].sub = a[i].is_primary = 0; tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; - for (i = 1, m = 1; i < n; ++i) { - for (j = 0; j < m; ++j) { + kv_push(int, z, 0); + for (i = 1; i < n; ++i) { + for (k = 0; k < z.n; ++k) { + int j = z.a[k]; int b_max = a[j].qb > a[i].qb? a[j].qb : a[i].qb; int e_min = a[j].qe < a[i].qe? a[j].qe : a[i].qe; if (e_min > b_max) { // have overlap @@ -364,9 +368,10 @@ int mem_choose_alnreg_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT } } } - if (j == m) a[m++] = a[i]; + if (k == z.n) kv_push(int, z, i); } - return m; + for (k = 0; k < z.n; ++k) a[z.a[k]].is_primary = 1; + free(z.a); } /************************ @@ -601,10 +606,11 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b int k; kstring_t str; str.l = str.m = 0; str.s = 0; - a->n = mem_choose_alnreg_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() if (a->n > 0) { + mem_mark_primary_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() for (k = 0; k < a->n; ++k) { bwahit_t h; + if (!a->a[k].is_primary) continue; mem_alnreg2hit(&a->a[k], &h); h.qual = approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); diff --git a/bwamem.h b/bwamem.h index 4e2e5ce..d511254 100644 --- a/bwamem.h +++ b/bwamem.h @@ -34,7 +34,7 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - int sub_n; + int sub_n, is_primary; } mem_alnreg_t; typedef struct { From cd0969332f6804db96357871321263f0196e9e6a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 15:52:23 -0500 Subject: [PATCH 218/498] keep 
track of the "parent" of a secondary --- bwamem.c | 9 +++++---- bwamem.h | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 68078d0..c2b0eed 100644 --- a/bwamem.c +++ b/bwamem.c @@ -351,7 +351,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT kvec_t(int) z; if (n == 0) return; kv_init(z); - for (i = 0; i < n; ++i) a[i].sub = a[i].is_primary = 0; + for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1; tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; kv_push(int, z, 0); for (i = 1; i < n; ++i) { @@ -369,8 +369,8 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT } } if (k == z.n) kv_push(int, z, i); + else a[i].secondary = z.a[k]; } - for (k = 0; k < z.n; ++k) a[z.a[k]].is_primary = 1; free(z.a); } @@ -597,7 +597,8 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; h->score = a->score; h->sub = a->sub > a->csub? a->sub : a->csub; - h->qual = h->flag = 0; // these are unset + h->qual = 0; // quality unset + h->flag = a->secondary? 
0x100 : 0; // only the "secondary" bit is set h->mb = h->me = -2; // mate positions are unset } @@ -610,7 +611,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_mark_primary_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() for (k = 0; k < a->n; ++k) { bwahit_t h; - if (!a->a[k].is_primary) continue; + if (a->a[k].secondary >= 0) continue; mem_alnreg2hit(&a->a[k], &h); h.qual = approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); diff --git a/bwamem.h b/bwamem.h index d511254..3ac15d0 100644 --- a/bwamem.h +++ b/bwamem.h @@ -34,7 +34,8 @@ typedef struct { typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - int sub_n, is_primary; + int sub_n; // approximate number of suboptimal hits + int secondary; // non-negative if the hit is secondary } mem_alnreg_t; typedef struct { From 325ba8213b1865ff28d794c6beb17741c7bc75ed Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 15:54:55 -0500 Subject: [PATCH 219/498] move mark primary to worker1() --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index c2b0eed..97ec661 100644 --- a/bwamem.c +++ b/bwamem.c @@ -608,7 +608,6 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kstring_t str; str.l = str.m = 0; str.s = 0; if (a->n > 0) { - mem_mark_primary_se(opt, a->n, a->a); // NOTE: mem_sort_and_dedup() called in worker1() for (k = 0; k < a->n; ++k) { bwahit_t h; if (a->a[k].secondary >= 0) continue; @@ -658,6 +657,7 @@ static void *worker1(void *data) for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); } return 0; } From 
604e3d8da10d96ccb5495634d9ea03999d58e7fa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 16:15:26 -0500 Subject: [PATCH 220/498] code backup; to upgrade ksw.{c,h} --- bwamem.c | 4 +--- bwamem.h | 2 ++ bwamem_pair.c | 29 ++++++++++++++++++----------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/bwamem.c b/bwamem.c index 97ec661..a639109 100644 --- a/bwamem.c +++ b/bwamem.c @@ -13,8 +13,6 @@ #include "kvec.h" #include "ksort.h" -#define MAPQ_COEF 40. - int mem_verbose = 3; // 1: error only; 2: error+warning; 3: message+error+warning; >=4: debugging void mem_fill_scmat(int a, int b, int8_t mat[25]) @@ -583,7 +581,7 @@ static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) double identity; sub = a->csub > sub? a->csub : sub; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; - mapq = a->score? (int)(MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; + mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499); diff --git a/bwamem.h b/bwamem.h index 3ac15d0..ebfb8cd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -5,6 +5,8 @@ #include "bntseq.h" #include "utils.h" +#define MEM_MAPQ_COEF 40.0 + struct __smem_i; typedef struct __smem_i smem_i; diff --git a/bwamem_pair.c b/bwamem_pair.c index 6b44f78..9129663 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,11 +97,12 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { + extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; pair64_t o, subo; // score<<32 | raw_score<<8 | hash - int r, i, y[4]; // y[] keeps the last hit + int r, i, k, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { @@ -117,7 +118,7 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con o.x = o.y = subo.x = subo.y = 0; for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction - int dir = r<<1 | (v.a[i].y>>1&1), which, k; + int dir = r<<1 | (v.a[i].y>>1&1), which; if (pes[dir].failed) continue; // invalid orientation which = r<<1 | ((v.a[i].y&1)^1); if (y[which] < 0) continue; // no previous hits @@ -133,7 +134,7 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con raw_score = (v.a[i].y>>32) + (v.a[i].y>>32); if (raw_score + 20 * opt->a < (subo.x>>8&0xffffff)) continue; // skip the following if the score is too small ns = (dist - pes[dir].avg) / pes[dir].std; - score = (int)(23. 
* raw_score / (opt->a + opt->b) - 4.343 * log(.5 * erfc(fabs(ns) * M_SQRT1_2)) + .499); + score = (int)(raw_score - 4.343 / 23. * (opt->a + opt->b) * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair)&0xff); if (x > o.x) subo = o, o.x = x, o.y = pair; @@ -142,7 +143,13 @@ void mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, con } y[v.a[i].y&3] = i; } + if (o.x > 0) { + i = o.y >> 32; k = o.y << 32 >> 32; + mem_alnreg2hit(&a[v.a[i].y&1].a[v.a[i].y<<32>>34], &h[v.a[i].y&1]); + mem_alnreg2hit(&a[v.a[k].y&1].a[v.a[k].y<<32>>34], &h[v.a[k].y&1]); + } free(v.a); + return o.x == 0? -1 : 0; } void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) @@ -150,11 +157,11 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c kstring_t str; bwahit_t h[2]; str.l = str.m = 0; str.s = 0; - mem_pair(opt, bns, pac, pes, s, a, h); - /* - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); - s[0].sam = strdup(str.s); str.l = 0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); - s[1].sam = str.s; - */ + if (mem_pair(opt, bns, pac, pes, s, a, h) == 0) { // successful + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); + s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); + s[1].sam = str.s; + } else { + } } From 28a7d501f2f911866ad925f0636d1e7f07b6667f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Feb 2013 16:35:05 -0500 Subject: [PATCH 221/498] updated to the latest ksw; NOT TESTED YET!!! 
--- bwtsw2_pair.c | 38 +++------ ksw.c | 209 ++++++++++++++++++++++++++++++-------------------- ksw.h | 77 +++++++++++-------- 3 files changed, 182 insertions(+), 142 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 8a8287b..85ba1eb 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -127,35 +127,18 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b seq[i] = nst_nt4_table[(int)mseq[i]]; } #ifndef _NO_SSE2 - { - ksw_query_t *q; - ksw_aux_t aux[2]; - // forward Smith-Waterman - aux[0].T = opt->t; aux[0].gapo = opt->q; aux[0].gape = opt->r; aux[1] = aux[0]; - q = ksw_qinit(l_mseq * g_mat[0] < 250? 1 : 2, l_mseq, seq, 5, g_mat); - ksw_sse2(q, end - beg, ref, &aux[0]); - free(q); - if (aux[0].score < opt->t) { - free(seq); - return; - } - ++aux[0].qe; ++aux[0].te; - // reverse Smith-Waterman - seq_reverse(aux[0].qe, seq, 0); - seq_reverse(aux[0].te, ref, 0); - q = ksw_qinit(aux[0].qe * g_mat[0] < 250? 1 : 2, aux[0].qe, seq, 5, g_mat); - ksw_sse2(q, aux[0].te, ref, &aux[1]); - free(q); - ++aux[1].qe; ++aux[1].te; - // write output - a->G = aux[0].score; - a->G2 = aux[0].score2 > aux[1].score2? aux[0].score2 : aux[1].score2; + { // FIXME!!! The following block has not been tested since the update of the ksw library + int flag = KSW_XSUBO | KSW_XSTOP | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0); + kswr_t aln; + aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); + a->G = aln.score; + a->G2 = aln.score2; if (a->G2 < opt->t) a->G2 = 0; if (a->G2) a->flag |= BSW2_FLAG_TANDEM; - a->k = beg + (aux[0].te - aux[1].te); - a->len = aux[1].te; - a->beg = aux[0].qe - aux[1].qe; - a->end = aux[0].qe; + a->k = beg + aln.tb; + a->len = aln.te - aln.tb; + a->beg = aln.qb; + a->end = aln.qe; } #else { @@ -168,6 +151,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2); if (a->G < opt->t) a->G = 0; if (a->G2 < opt->t) a->G2 = 0; + if (a->G2) a->flag |= BSW2_FLAG_TANDEM; a->k = beg + path[0].i - 1; a->len = path[1].i - path[0].i + 1; a->beg = path[0].j - 1; diff --git a/ksw.c b/ksw.c index 08cdf56..4599c6b 100644 --- a/ksw.c +++ b/ksw.c @@ -25,14 +25,8 @@ #include #include -#include "ksw.h" - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef _NO_SSE2 #include +#include "ksw.h" #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) @@ -42,26 +36,35 @@ #define UNLIKELY(x) (x) #endif -/*************** - *** SSE2 SW *** - ***************/ +const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; -struct _ksw_query_t { +struct _kswq_t { int qlen, slen; uint8_t shift, mdiff, max, size; __m128i *qp, *H0, *H1, *E, *Hmax; }; -ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) +/** + * Initialize the query data structure + * + * @param size Number of bytes used to store a score; valid valures are 1 or 2 + * @param qlen Length of the query sequence + * @param query Query sequence + * @param m Size of the alphabet + * @param mat Scoring matrix in a one-dimension array + * + * @return Query data structure + */ +kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) { - 
ksw_query_t *q; + kswq_t *q; int slen, a, tmp, p; size = size > 1? 2 : 1; p = 8 * (3 - size); // # values per __m128i slen = (qlen + p - 1) / p; // segmented length - q = malloc(sizeof(ksw_query_t) + 256 + 16 * slen * (m + 4)); // a single block of memory - q->qp = (__m128i*)(((size_t)q + sizeof(ksw_query_t) + 15) >> 4 << 4); // align memory + q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory q->H0 = q->qp + slen * m; q->H1 = q->H0 + slen; q->E = q->H1 + slen; @@ -100,11 +103,12 @@ ksw_query_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const in return q; } -int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) { - int slen, i, m_b, n_b, te = -1, gmax = 0; + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + kswr_t r; #define __max_16(ret, xx) do { \ (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ @@ -115,10 +119,13 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / } while (0) // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi8(a->gapo + a->gape); - gape = _mm_set1_epi8(a->gape); + gapoe = _mm_set1_epi8(_gapo + _gape); + gape = _mm_set1_epi8(_gape); shift = _mm_set1_epi8(q->shift); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; @@ -174,11 +181,11 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / end_loop16: //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); __max_16(imax, max); // imax is the maximum number in max - if (imax >= a->T) { // write the b array; this condition adds branching unfornately + if (imax >= minsc) { // write the b array; this condition adds branching unfornately if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = realloc(b, 8 * m_b); + b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -187,34 +194,38 @@ int ksw_sse2_16(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) / gmax = imax; te = i; // te is the end position on the target for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax + q->shift >= 255) break; + if (gmax + q->shift >= 255 || gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; // swap H0 and H1 } - a->score = gmax; a->te = te; - { // get a->qe, the end of query match; find the 2nd best score + r.score = gmax + q->shift < 255? 
gmax : 255; + r.te = te; + if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score int max = -1, low, high, qlen = slen * 16; uint8_t *t = (uint8_t*)Hmax; - for (i = 0, a->qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, a->qe = i / 16 + i % 16 * slen; + for (i = 0; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; //printf("%d,%d\n", max, gmax); - i = (a->score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0, a->score2 = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) - a->score2 = b[i]>>32, a->te2 = e; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } } } free(b); - return a->score + q->shift >= 255? 255 : a->score; + return r; } -int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // the first gap costs -(_o+_e) +kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) { - int slen, i, m_b, n_b, te = -1, gmax = 0; + int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + kswr_t r; #define __max_8(ret, xx) do { \ (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ @@ -224,10 +235,13 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // } while (0) // initialization + r = g_defr; + minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; + endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi16(a->gapo + a->gape); - gape = _mm_set1_epi16(a->gape); + gapoe = _mm_set1_epi16(_gapo + _gape); + gape = _mm_set1_epi16(_gape); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; for (i = 0; i < slen; ++i) { @@ -269,11 +283,11 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // } end_loop8: __max_8(imax, max); - if (imax >= a->T) { + if (imax >= minsc) { if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = realloc(b, 8 * m_b); + b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -282,34 +296,60 @@ int ksw_sse2_8(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) // gmax = imax; te = i; for (j = 0; LIKELY(j < slen); ++j) _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); + if (gmax >= endsc) break; } S = H1; H1 = H0; H0 = S; } - a->score = gmax; a->te = te; + r.score = gmax; r.te = te; { int max = -1, low, high, qlen = slen * 8; uint16_t *t = (uint16_t*)Hmax; - for (i = 0, a->qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, a->qe = i / 8 + i % 8 * slen; - i = (a->score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0, a->score2 = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)a->score2) - a->score2 = b[i]>>32, a->te2 = e; + for (i = 0, r.qe = -1; i < qlen; ++i, ++t) + if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + if (b) { + i = (r.score + q->max - 1) / q->max; + low = te - i; high = te + i; + for (i = 0; i < n_b; ++i) { + int e = (int32_t)b[i]; + if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + r.score2 = b[i]>>32, r.te2 = e; + } } } free(b); - return a->score; + return r; } -int ksw_sse2(ksw_query_t *q, int tlen, const uint8_t *target, ksw_aux_t *a) 
+static void revseq(int l, uint8_t *s) { - if (q->size == 1) return ksw_sse2_16(q, tlen, target, a); - else return ksw_sse2_8(q, tlen, target, a); + int i, t; + for (i = 0; i < l>>1; ++i) + t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; } -#endif // _NO_SSE2 +kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +{ + int size; + kswq_t *q; + kswr_t r, rr; + kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); + + q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); + if (qry && *qry == 0) *qry = q; + func = q->size == 2? ksw_i16 : ksw_u8; + size = q->size; + r = func(q, tlen, target, gapo, gape, xtra); + if (qry == 0) free(q); + if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; + revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end + q = ksw_qinit(size, r.qe + 1, query, m, mat); + rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); + revseq(r.qe + 1, query); revseq(r.te + 1, target); + free(q); + if (r.score == rr.score) + r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; + return r; +} /******************** *** SW extension *** @@ -494,7 +534,7 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, * Main function (not compiled by default) * *******************************************/ -#if defined(_KSW_MAIN) && !defined(_NO_SSE2) +#ifdef _KSW_MAIN #include #include @@ -523,30 +563,33 @@ unsigned char seq_nt4_table[256] = { int main(int argc, char *argv[]) { - int c, sa = 1, sb = 3, i, j, k, forward_only = 0, size = 2; + int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; int8_t mat[25]; - ksw_aux_t a; + int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; + uint8_t *rseq = 0; gzFile fpt, fpq; kseq_t *kst, *ksq; + // parse command line - a.gapo = 5; a.gape = 2; a.T = 10; - while ((c = 
getopt(argc, argv, "a:b:q:r:ft:s:")) >= 0) { + while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { switch (c) { case 'a': sa = atoi(optarg); break; case 'b': sb = atoi(optarg); break; - case 'q': a.gapo = atoi(optarg); break; - case 'r': a.gape = atoi(optarg); break; - case 't': a.T = atoi(optarg); break; + case 'q': gapo = atoi(optarg); break; + case 'r': gape = atoi(optarg); break; + case 't': minsc = atoi(optarg); break; case 'f': forward_only = 1; break; - case 's': size = atoi(optarg); break; + case '1': xtra |= KSW_XBYTE; break; } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: ksw [-s%d] [-a%d] [-b%d] [-q%d] [-r%d] \n", size, sa, sb, a.gapo, a.gape); + fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); return 1; } + if (minsc > 0xffff) minsc = 0xffff; + if (minsc > 0) xtra |= KSW_XSUBO | minsc; // initialize scoring matrix - for (i = k = 0; i < 5; ++i) { + for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? sa : -sb; mat[k++] = 0; // ambiguous base @@ -557,34 +600,34 @@ int main(int argc, char *argv[]) fpq = gzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); // all-pair alignment while (kseq_read(ksq) > 0) { - ksw_query_t *q[2]; - for (i = 0; i < ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; - q[0] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); + kswq_t *q[2] = {0, 0}; + kswr_t r; + for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; if (!forward_only) { // reverse - for (i = 0; i < ksq->seq.l/2; ++i) { - int t = ksq->seq.s[i]; - ksq->seq.s[i] = ksq->seq.s[ksq->seq.l-1-i]; - ksq->seq.s[ksq->seq.l-1-i] = t; + if ((int)ksq->seq.m > max_rseq) { + max_rseq = ksq->seq.m; + rseq = (uint8_t*)realloc(rseq, max_rseq); } - for (i = 0; i < ksq->seq.l; ++i) - ksq->seq.s[i] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; - q[1] = ksw_qinit(size, ksq->seq.l, (uint8_t*)ksq->seq.s, 5, mat); - } else q[1] = 0; + for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) + rseq[j] = ksq->seq.s[i] == 4? 4 : 3 - ksq->seq.s[i]; + } gzrewind(fpt); kseq_rewind(kst); while (kseq_read(kst) > 0) { - int s; - for (i = 0; i < kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; - s = ksw_sse2(q[0], kst->seq.l, (uint8_t*)kst->seq.s, &a); - printf("%s\t%s\t+\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); - if (q[1]) { - s = ksw_sse2(q[1], kst->seq.l, (uint8_t*)kst->seq.s, &a); - printf("%s\t%s\t-\t%d\t%d\t%d\n", ksq->name.s, kst->name.s, s, a.te+1, a.qe+1); + for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; + r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); + if (r.score >= minsc) + printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); + if (rseq) { + r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); + if (r.score >= minsc) + printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); } } free(q[0]); free(q[1]); } + free(rseq); kseq_destroy(kst); gzclose(fpt); kseq_destroy(ksq); gzclose(fpq); return 0; } -#endif // _KSW_MAIN +#endif diff --git a/ksw.h b/ksw.h index c7eaabb..5162dc0 100644 --- a/ksw.h +++ b/ksw.h @@ -3,51 +3,64 @@ #include -struct _ksw_query_t; -typedef struct _ksw_query_t ksw_query_t; +#define KSW_XBYTE 0x10000 +#define KSW_XSTOP 0x20000 +#define KSW_XSUBO 0x40000 +#define KSW_XSTART 0x80000 + +struct _kswq_t; +typedef struct _kswq_t kswq_t; typedef struct { - // input - unsigned gapo, gape; // the first gap costs gapo+gape - unsigned T; // threshold - // output - int score, te, qe, score2, te2; 
-} ksw_aux_t; + int score; // best score + int te, qe; // target end and query end + int score2, te2; // second best score and ending position on the target + int tb, qb; // target start and query start +} kswr_t; #ifdef __cplusplus extern "C" { #endif /** - * Initialize the query data structure + * Aligning two sequences * - * @param size Number of bytes used to store a score; valid valures are 1 or 2 - * @param qlen Length of the query sequence - * @param query Query sequence - * @param m Size of the alphabet - * @param mat Scoring matrix in a one-dimension array + * @param qlen length of the query sequence (typically Date: Tue, 12 Feb 2013 17:48:46 -0500 Subject: [PATCH 222/498] bugfix: bug in the new ksw.c On my test data, one alignment is different, caused by polyA --- bwtsw2_pair.c | 12 +++++++++--- ksw.c | 6 +++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index 85ba1eb..cf29087 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -128,17 +128,23 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b } #ifndef _NO_SSE2 { // FIXME!!! The following block has not been tested since the update of the ksw library - int flag = KSW_XSUBO | KSW_XSTOP | KSW_XSTART | (l_mseq * g_mat[0] < 250? KSW_XBYTE : 0); + int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0) | opt->t; kswr_t aln; aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); a->G = aln.score; a->G2 = aln.score2; + if (a->G < opt->t) a->G = 0; if (a->G2 < opt->t) a->G2 = 0; if (a->G2) a->flag |= BSW2_FLAG_TANDEM; a->k = beg + aln.tb; - a->len = aln.te - aln.tb; + a->len = aln.te - aln.tb + 1; a->beg = aln.qb; - a->end = aln.qe; + a->end = aln.qe + 1; + /* + printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n'); + printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n'); + printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len); + */ } #else { diff --git a/ksw.c b/ksw.c index 4599c6b..8d741a6 100644 --- a/ksw.c +++ b/ksw.c @@ -211,7 +211,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, low = te - i; high = te + i; for (i = 0; i < n_b; ++i) { int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) r.score2 = b[i]>>32, r.te2 = e; } } @@ -311,7 +311,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, low = te - i; high = te + i; for (i = 0; i < n_b; ++i) { int e = (int32_t)b[i]; - if ((e < low || e > high) && b[i]>>32 > (uint32_t)r.score2) + if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) r.score2 = b[i]>>32, r.te2 = e; } } @@ -587,7 +587,7 @@ int main(int argc, char *argv[]) return 1; } if (minsc > 0xffff) minsc = 0xffff; - if (minsc > 0) xtra |= KSW_XSUBO | minsc; + xtra |= KSW_XSUBO | minsc; // initialize scoring matrix for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) From 87d619a21f67561ecd4efdde2dc6b648f4e6f0a6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 13 Feb 2013 23:16:16 -0500 Subject: [PATCH 223/498] minor code simplification --- bwamem_pair.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git 
a/bwamem_pair.c b/bwamem_pair.c index 9129663..9cb41c2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,7 +97,12 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +{ + int is_rev = a->rb >= l_pac? 1 : 0; +} + +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; @@ -108,8 +113,8 @@ int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, cons for (i = 0; i < a[r].n; ++i) { pair64_t key; mem_alnreg_t *e = &a[r].a[i]; - key.x = e->rb < bns->l_pac? e->rb : (bns->l_pac<<1) - 1 - e->rb; // forward position - key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= bns->l_pac)<<1 | r; + key.x = e->rb < l_pac? 
e->rb : (l_pac<<1) - 1 - e->rb; // forward position + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } } @@ -157,7 +162,7 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c kstring_t str; bwahit_t h[2]; str.l = str.m = 0; str.s = 0; - if (mem_pair(opt, bns, pac, pes, s, a, h) == 0) { // successful + if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); From 688b524cdfcaa9c04783570ae29f441807cd4d07 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 13 Feb 2013 23:55:56 -0500 Subject: [PATCH 224/498] code backup; tired.. --- bwamem_pair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 9cb41c2..08af1d3 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,9 +97,16 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { - int is_rev = a->rb >= l_pac? 1 : 0; + int r; + rn = !!rn; // either 0 or 1 + for (r = 0; r < 4; ++r) { + int is_rev, is_larger; + if (pes[r].failed) continue; + is_rev = r>>1 == (r&1)? 
0 : 1; // whether to reverse complement the mate + is_larger = r>>(!rn)&1; // whether the mate has larger coordinate + } } int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) From df1ff2b36e86c7fe1f3a170f7ffa37b243aeeac7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 14 Feb 2013 12:59:32 -0500 Subject: [PATCH 225/498] better and proper way to infer orinentation --- Makefile | 1 + bwamem.h | 9 +++++++++ bwamem_pair.c | 22 ++++++++++++++-------- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 2c060e9..e11a04d 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,7 @@ bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_main.o:bwtsw2.h bwamem.o:bwamem.h +bwamem_pair.o:bwamem.h fastmap.o:bwt.h bwamem.h clean: diff --git a/bwamem.h b/bwamem.h index ebfb8cd..1f9605d 100644 --- a/bwamem.h +++ b/bwamem.h @@ -83,4 +83,13 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } #endif +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) +{ + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1? p2 - b1 : b1 - p2; + return (r1 == r2? 0 : 1) ^ (p2 > b1? 
0 : 3); +} + #endif diff --git a/bwamem_pair.c b/bwamem_pair.c index 08af1d3..cc0e8f0 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "kstring.h" #include "bwamem.h" @@ -38,19 +39,15 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * memset(isize, 0, sizeof(kvec_t(int)) * 4); for (i = 0; i < n>>1; ++i) { int dir; - int64_t is, pos[2]; + int64_t is; mem_alnreg_v *r[2]; r[0] = (mem_alnreg_v*)®s[i<<1|0]; r[1] = (mem_alnreg_v*)®s[i<<1|1]; if (r[0]->n == 0 || r[1]->n == 0) continue; if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; - pos[0] = r[0]->a[0].rb < l_pac? r[0]->a[0].rb : (l_pac<<1) - 1 - r[0]->a[0].rb; // forward coordinate - pos[1] = r[1]->a[0].rb < l_pac? r[1]->a[0].rb : (l_pac<<1) - 1 - r[1]->a[0].rb; - if (pos[0] < pos[1]) dir = (r[0]->a[0].rb >= l_pac)<<1 | (r[1]->a[0].rb >= l_pac); - else dir = (r[1]->a[0].rb >= l_pac)<<1 | (r[0]->a[0].rb >= l_pac); - is = abs(pos[0] - pos[1]); - if (is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); + dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); + if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. @@ -99,8 +96,17 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { - int r; + int i, r, skip[4]; rn = !!rn; // either 0 or 1 + for (r = 0; r < 4; ++r) + skip[r] = pes[r].failed? 
1 : 0; + for (i = 0; i < ma->n; ++i) { // check which orinentation has been found + int64_t dist; + r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); + if (dist >= pes[r].low && dist <= pes[r].high) + skip[r] = 1; + } + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; if (pes[r].failed) continue; From 5f8c6efbc3f4677371c5fd13ab837a98aa6aade0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 09:48:44 -0500 Subject: [PATCH 226/498] forbid x-bounary bns_get_seq(); code backup --- bntseq.c | 38 +++++++++++++++++--------------------- bwamem.c | 2 ++ bwamem_pair.c | 37 ++++++++++++++++++++++++++++++++----- 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/bntseq.c b/bntseq.c index 06d82a0..0286c19 100644 --- a/bntseq.c +++ b/bntseq.c @@ -322,29 +322,25 @@ int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) return nn; } -static inline void get_seq_core(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, uint8_t *seq) -{ - int64_t k, l = 0; - if (beg >= l_pac) { // reverse strand - int64_t beg_f = (l_pac<<1) - 1 - end; - int64_t end_f = (l_pac<<1) - 1 - beg; - for (k = end_f; k > beg_f; --k) - seq[l++] = 3 - _get_pac(pac, k); - } else { // forward strand - for (k = beg; k < end; ++k) - seq[l++] = _get_pac(pac, k); - } -} - uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) { - uint8_t *seq; + uint8_t *seq = 0; + if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap if (end > l_pac<<1) end = l_pac<<1; - *len = end - beg; - seq = malloc(end - beg); - if (beg < l_pac && end > l_pac) { - get_seq_core(l_pac, pac, beg, l_pac, seq); - get_seq_core(l_pac, pac, l_pac, end, seq + (l_pac - beg)); - } else get_seq_core(l_pac, pac, beg, end, seq); + if (beg < 0) beg = 0; + if (beg >= l_pac || end <= l_pac) { + int64_t k, l = 0; + *len = end - beg; + seq = 
malloc(end - beg); + if (beg >= l_pac) { // reverse strand + int64_t beg_f = (l_pac<<1) - 1 - end; + int64_t end_f = (l_pac<<1) - 1 - beg; + for (k = end_f; k > beg_f; --k) + seq[l++] = 3 - _get_pac(pac, k); + } else { // forward strand + for (k = beg; k < end; ++k) + seq[l++] = _get_pac(pac, k); + } + } else *len = 0; // if bridging the forward-reverse boundary, return nothing return seq; } diff --git a/bwamem.c b/bwamem.c index a639109..320df8d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -395,6 +395,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int mem_alnreg_t best; memset(&best, 0, sizeof(mem_alnreg_t)); + memset(a, 0, sizeof(mem_alnreg_t)); // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -408,6 +409,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); + if (rlen != rmax[1] - rmax[0]) return; for (k = 0; k < c->n;) { s = &c->seeds[k]; diff --git a/bwamem_pair.c b/bwamem_pair.c index cc0e8f0..dd9b3cd 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -97,7 +97,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { int i, r, skip[4]; - rn = !!rn; // either 0 or 1 + rn = !!rn; // either 0 or 1; $rn is the read number of $a for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 1 : 0; for (i = 0; i < ma->n; ++i) { // check which orinentation has been found @@ -109,9 +109,28 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; - if (pes[r].failed) continue; - is_rev = r>>1 == (r&1)? 
0 : 1; // whether to reverse complement the mate - is_larger = r>>(!rn)&1; // whether the mate has larger coordinate + uint8_t *seq, *rev = 0, *ref; + int64_t rb, re, len; + if (skip[r]) continue; + is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate + is_larger = r>>rn&1; // whether the mate has larger coordinate + if (is_rev) { + rev = malloc(l_ms); // this is the reverse complement of $ms + for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? ms[i] : 4; + seq = rev; + } else seq = (uint8_t*)ms; + if (!is_rev) { + rb = is_larger? a->rb + pes[r].low : a->rb - pes[r].high; + re = (is_larger? a->rb + pes[r].high: a->rb - pes[r].low) + l_ms; // if on the same strand, end position should be larger to make room for the seq length + } else { + rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands + re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low; + } + ref = bns_get_seq(l_pac, pac, rb, re, &len); + if (len == re - rb) { + } + if (rev == 0) free(rev); + free(ref); } } @@ -174,8 +193,16 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c { kstring_t str; bwahit_t h[2]; + mem_alnreg_t a0[2]; str.l = str.m = 0; str.s = 0; - if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful + // perform SW for the best alignment + a0[0].score = a0[1].score = -1; + if (a[0].n) a0[0] = a[0].a[0]; + if (a[1].n) a0[1] = a[1].a[0]; + if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 0, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); + if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 1, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + // pairing single-end hits + if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], 
opt->is_hard); From fe2236f6feca5b5230e6de75934371bc244434e5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 10:09:30 -0500 Subject: [PATCH 227/498] code backup --- bwamem_pair.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index dd9b3cd..3979341 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -6,6 +6,7 @@ #include "bwamem.h" #include "kvec.h" #include "utils.h" +#include "ksw.h" #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 @@ -126,8 +127,28 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m rb = (is_larger? a->rb + pes[r].low : a->rb - pes[r].high) - l_ms; // similarly on opposite strands re = is_larger? a->rb + pes[r].high: a->rb - pes[r].low; } + if (rb < 0) rb = 0; + if (re > l_pac) re = l_pac; ref = bns_get_seq(l_pac, pac, rb, re, &len); - if (len == re - rb) { + if (len == re - rb) { // no funny things happening + kswr_t aln; + mem_alnreg_t b; + int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len; + aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); + memset(&b, 0, sizeof(mem_alnreg_t)); + b.qb = aln.qb; b.qe = aln.qe + 1; + b.rb = rb + aln.tb; + b.re = rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + // move b s.t. 
ma is sorted + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; + ma->a[i] = b; } if (rev == 0) free(rev); free(ref); From 8ee464478aa5dd5f89c186b9824f28151a6a574d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 10:48:50 -0500 Subject: [PATCH 228/498] matesw working; for testing only --- bwamem_pair.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 3979341..05f0547 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -95,18 +95,19 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], int rn, const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { int i, r, skip[4]; - rn = !!rn; // either 0 or 1; $rn is the read number of $a for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 
1 : 0; +#if 0 for (i = 0; i < ma->n; ++i) { // check which orinentation has been found int64_t dist; r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); if (dist >= pes[r].low && dist <= pes[r].high) skip[r] = 1; } +#endif if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; @@ -114,10 +115,10 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m int64_t rb, re, len; if (skip[r]) continue; is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate - is_larger = r>>rn&1; // whether the mate has larger coordinate + is_larger = !(r>>1); // whether the mate has larger coordinate if (is_rev) { rev = malloc(l_ms); // this is the reverse complement of $ms - for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? ms[i] : 4; + for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4; seq = rev; } else seq = (uint8_t*)ms; if (!is_rev) { @@ -128,7 +129,7 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m re = is_larger? 
a->rb + pes[r].high: a->rb - pes[r].low; } if (rb < 0) rb = 0; - if (re > l_pac) re = l_pac; + if (re > l_pac<<1) re = l_pac<<1; ref = bns_get_seq(l_pac, pac, rb, re, &len); if (len == re - rb) { // no funny things happening kswr_t aln; @@ -137,11 +138,17 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); b.qb = aln.qb; b.qe = aln.qe + 1; - b.rb = rb + aln.tb; - b.re = rb + aln.te + 1; + if (is_rev) { + b.rb = (l_pac<<1) - (rb + aln.te + 1); + b.re = (l_pac<<1) - (rb + aln.tb); + } else { + b.rb = rb + aln.tb; + b.re = rb + aln.te + 1; + } b.score = aln.score; b.csub = aln.score2; b.secondary = -1; + printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element // move b s.t. 
ma is sorted for (i = 0; i < ma->n - 1; ++i) // find the insertion point @@ -220,8 +227,8 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c a0[0].score = a0[1].score = -1; if (a[0].n) a0[0] = a[0].a[0]; if (a[1].n) a0[1] = a[1].a[0]; - if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 0, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); - if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, 1, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); + if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); // pairing single-end hits if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); From ea9fc7df48e9219613d3549884e9d597079cc6d4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 11:03:27 -0500 Subject: [PATCH 229/498] keep the number of SW performed --- bwamem.c | 6 ++++-- bwamem_pair.c | 29 +++++++++++++---------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/bwamem.c b/bwamem.c index 320df8d..ae4992f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -664,7 +664,7 @@ static void *worker1(void *data) static void *worker2(void *data) { - extern void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; int i; if (!w->opt->is_pe) { @@ -673,10 +673,12 @@ static void *worker2(void *data) free(w->regs[i].a); } } else { + int n = 0; for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); + n += 
mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } + fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n); } return 0; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 05f0547..df27ef1 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -95,20 +95,18 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { - int i, r, skip[4]; + int i, r, skip[4], n = 0; for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 1 : 0; -#if 0 for (i = 0; i < ma->n; ++i) { // check which orinentation has been found int64_t dist; r = mem_infer_dir(l_pac, a->rb, ma->a[i].rb, &dist); if (dist >= pes[r].low && dist <= pes[r].high) skip[r] = 1; } -#endif - if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return; // consistent pair exist; no need to perform SW + if (skip[0] + skip[1] + skip[2] + skip[3] == 4) return 0; // consistent pair exist; no need to perform SW for (r = 0; r < 4; ++r) { int is_rev, is_larger; uint8_t *seq, *rev = 0, *ref; @@ -138,17 +136,12 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); b.qb = aln.qb; b.qe = aln.qe + 1; - if (is_rev) { - b.rb = (l_pac<<1) - (rb + aln.te + 1); - b.re = (l_pac<<1) - (rb + aln.tb); - } else { - b.rb = rb + aln.tb; - b.re = rb + aln.te + 1; - } + b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev? 
(l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; b.score = aln.score; b.csub = aln.score2; b.secondary = -1; - printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); +// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element // move b s.t. ma is sorted for (i = 0; i < ma->n - 1; ++i) // find the insertion point @@ -156,10 +149,12 @@ void mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const m tmp = i; for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; ma->a[i] = b; + ++n; } if (rev == 0) free(rev); free(ref); } + return n; } int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) @@ -217,8 +212,9 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ return o.x == 0? 
-1 : 0; } -void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) { + int n = 0; kstring_t str; bwahit_t h[2]; mem_alnreg_t a0[2]; @@ -227,8 +223,8 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c a0[0].score = a0[1].score = -1; if (a[0].n) a0[0] = a[0].a[0]; if (a[1].n) a0[1] = a[1].a[0]; - if (a0[0].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); - if (a0[1].score > 0) mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + if (a0[0].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); + if (a0[1].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); // pairing single-end hits if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); @@ -237,4 +233,5 @@ void mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, c s[1].sam = str.s; } else { } + return n; } From f0a6285abad5010e1916f2109db73ac44371954a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 16 Feb 2013 11:52:04 -0500 Subject: [PATCH 230/498] perform mate-SW for some suboptimal alignments --- bwamem_pair.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index df27ef1..fe6f697 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -135,20 +135,22 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? 
KSW_XBYTE : 0) | opt->min_seed_len; aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); - b.qb = aln.qb; b.qe = aln.qe + 1; - b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; - b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; - b.score = aln.score; - b.csub = aln.score2; - b.secondary = -1; -// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); - kv_push(mem_alnreg_t, *ma, b); // make room for a new element - // move b s.t. ma is sorted - for (i = 0; i < ma->n - 1; ++i) // find the insertion point - if (ma->a[i].score < b.score) break; - tmp = i; - for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; - ma->a[i] = b; + if (aln.score >= opt->min_seed_len) { + b.qb = aln.qb; b.qe = aln.qe + 1; + b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; + b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; + b.score = aln.score; + b.csub = aln.score2; + b.secondary = -1; +// printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); + kv_push(mem_alnreg_t, *ma, b); // make room for a new element + // move b s.t. 
ma is sorted + for (i = 0; i < ma->n - 1; ++i) // find the insertion point + if (ma->a[i].score < b.score) break; + tmp = i; + for (i = ma->n - 1; i > tmp; --i) ma->a[i] = ma->a[i-1]; + ma->a[i] = b; + } ++n; } if (rev == 0) free(rev); @@ -214,17 +216,22 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) { - int n = 0; + int n = 0, i, j; kstring_t str; bwahit_t h[2]; - mem_alnreg_t a0[2]; + mem_alnreg_t b[2][2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment - a0[0].score = a0[1].score = -1; - if (a[0].n) a0[0] = a[0].a[0]; - if (a[1].n) a0[1] = a[1].a[0]; - if (a0[0].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[0], s[1].l_seq, (uint8_t*)s[1].seq, &a[1]); - if (a0[1].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &a0[1], s[0].l_seq, (uint8_t*)s[0].seq, &a[0]); + for (i = 0; i < 2; ++i) + for (j = 0; j < 2; ++j) b[i][j].score = -1; + for (i = 0; i < 2; ++i) { + for (j = 0; j < a[i].n && j < 2; ++j) b[i][j] = a[i].a[j]; + if (b[i][0].score > 0 && b[i][1].score > 0 && b[i][1].score < b[i][0].score * 0.8) + b[i][1].score = -1; + } + for (i = 0; i < 2; ++i) + for (j = 0; j < 2; ++j) + if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); // pairing single-end hits if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); From 66585b7982ea5820511ee1701234c50a94d98067 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 18 Feb 2013 16:33:06 -0500 Subject: [PATCH 231/498] code backup --- bwamem.c | 27 ++++++++++++++------------- bwamem.h | 8 ++++++-- bwamem_pair.c | 32 ++++++++++++++++++-------------- fastmap.c | 6 ++++-- 4 files changed, 42 insertions(+), 31 deletions(-) diff --git a/bwamem.c 
b/bwamem.c index ae4992f..397422f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -31,6 +31,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; + o->flag = 0; o->min_seed_len = 19; o->split_width = 10; o->max_occ = 10000; @@ -41,7 +42,6 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; - o->is_pe = 0; mem_fill_scmat(o->a, o->b, o->mat); return o; } @@ -598,11 +598,11 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) h->score = a->score; h->sub = a->sub > a->csub? a->sub : a->csub; h->qual = 0; // quality unset - h->flag = a->secondary? 0x100 : 0; // only the "secondary" bit is set + h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set h->mb = h->me = -2; // mate positions are unset } -void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) +void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag) { int k; kstring_t str; @@ -612,10 +612,11 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b bwahit_t h; if (a->a[k].secondary >= 0) continue; mem_alnreg2hit(&a->a[k], &h); + h.flag |= extra_flag; h.qual = approx_mapq_se(opt, &a->a[k]); - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->is_hard); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP); } - } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->is_hard); + } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP); s->sam = str.s; } @@ -657,25 +658,25 @@ static void *worker1(void *data) for (i = w->start; i < w->n; i += w->step) { w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); - 
mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); } return 0; } static void *worker2(void *data) { - extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]); + extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; int i; - if (!w->opt->is_pe) { + if (!(w->opt->flag&MEM_F_PE)) { for (i = 0; i < w->n; i += w->step) { - mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); + mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0); free(w->regs[i].a); } } else { int n = 0; for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet - n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, &w->seqs[i<<1], &w->regs[i<<1]); + n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n); @@ -702,21 +703,21 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { worker1(w); - if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - if (opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } #else worker1(w); - if 
(opt->is_pe) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); worker2(w); #endif for (i = 0; i < n; ++i) { diff --git a/bwamem.h b/bwamem.h index 1f9605d..5fa49e4 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,13 +15,17 @@ typedef struct { int32_t qbeg, len; } mem_seed_t; +#define MEM_F_HARDCLIP 0x1 +#define MEM_F_PE 0x2 +#define MEM_F_NOPAIRING 0x4 + typedef struct { int a, b, q, r, w; + int flag; int split_width; int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; - int pe_dir, is_pe; - int is_hard; // if to use hard clip + int pe_dir; float mask_level, chain_drop_ratio; int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/bwamem_pair.c b/bwamem_pair.c index fe6f697..9d4d590 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -159,11 +159,11 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], bwahit_t h[2]) +uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, uint64_t *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; - pair64_t o, subo; // score<<32 | raw_score<<8 | hash + pair64_t o, subo; // .x: score<<32 | raw_score<<8 | hash; .y: pair int r, i, k, y[4]; // y[] keeps the last hit kv_init(v); for (r = 0; r < 2; ++r) { // loop through read number @@ -198,7 +198,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ ns = (dist - pes[dir].avg) / pes[dir].std; score = (int)(raw_score - 4.343 / 23. 
* (opt->a + opt->b) * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; - x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair)&0xff); + x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair ^ id<<8)&0xff); if (x > o.x) subo = o, o.x = x, o.y = pair; else if (x > subo.x) subo.x = x, subo.y = pair; } @@ -207,19 +207,24 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } if (o.x > 0) { i = o.y >> 32; k = o.y << 32 >> 32; - mem_alnreg2hit(&a[v.a[i].y&1].a[v.a[i].y<<32>>34], &h[v.a[i].y&1]); - mem_alnreg2hit(&a[v.a[k].y&1].a[v.a[k].y<<32>>34], &h[v.a[k].y&1]); + z[v.a[i].y&1] = v.a[i].y<<32>>34; + z[v.a[k].y&1] = v.a[k].y<<32>>34; } free(v.a); - return o.x == 0? -1 : 0; + *sub = subo.x; + return o.x; } -int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2]) +int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { - int n = 0, i, j; + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); + extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); + int n = 0, i, j, z[2]; kstring_t str; bwahit_t h[2]; mem_alnreg_t b[2][2]; + uint64_t o, subo; + str.l = str.m = 0; str.s = 0; // perform SW for the best alignment for (i = 0; i < 2; ++i) @@ -233,12 +238,11 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co for (j = 0; j < 2; ++j) if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); // pairing single-end hits - if (mem_pair(opt, bns->l_pac, pac, pes, s, a, h) == 0) { // successful pairing - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->is_hard); - s[0].sam = strdup(str.s); str.l = 0; - 
bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->is_hard); - s[1].sam = str.s; - } else { + o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); + if (0&&o) { // with proper pairing + } else { // no proper pairing + mem_mark_primary_se(opt, a[0].n, a[0].a); mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); + mem_mark_primary_se(opt, a[1].n, a[1].a); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); } return n; } diff --git a/fastmap.c b/fastmap.c index f2677eb..a2d7d94 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,8 +24,10 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "k:c:v:s:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; + else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); @@ -59,7 +61,7 @@ int main_mem(int argc, char *argv[]) if (optind + 2 < argc) { fp2 = gzopen(argv[optind + 2], "r"); ks2 = kseq_init(fp2); - opt->is_pe = 1; + opt->flag |= MEM_F_PE; } while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { mem_process_seqs(opt, bwt, bns, pac, n, seqs); From 688872fb1bf1796102b3517d55ad48122b81e9ff Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 19 Feb 2013 00:50:39 -0500 Subject: [PATCH 232/498] code backup --- bwamem.c | 4 ++-- bwamem.h | 1 + bwamem_pair.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/bwamem.c b/bwamem.c index 397422f..3d1b9c5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -577,7 +577,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons * Integrated interface * ************************/ -static inline int approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) 
+int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) { int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; double identity; @@ -613,7 +613,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (a->a[k].secondary >= 0) continue; mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; - h.qual = approx_mapq_se(opt, &a->a[k]); + h.qual = mem_approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP); diff --git a/bwamem.h b/bwamem.h index 5fa49e4..d6e9f01 100644 --- a/bwamem.h +++ b/bwamem.h @@ -6,6 +6,7 @@ #include "utils.h" #define MEM_MAPQ_COEF 40.0 +#define MEM_MAPQ_MAX 60 struct __smem_i; typedef struct __smem_i smem_i; diff --git a/bwamem_pair.c b/bwamem_pair.c index 9d4d590..92b8842 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -159,6 +159,12 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } +static inline double approx_match(const mem_opt_t *opt, const mem_alnreg_v *a) +{ + int l = a->qe - a->qb < a->re - a->rb? 
a->qe - a->qb : a->re - a->rb; + return l - (double)(l * opt->a - a->score) / (opt->a + opt->b); +} + uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, uint64_t *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); @@ -219,9 +225,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); + extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); + int n = 0, i, j, z[2]; kstring_t str; - bwahit_t h[2]; mem_alnreg_t b[2][2]; uint64_t o, subo; @@ -237,12 +244,40 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co for (i = 0; i < 2; ++i) for (j = 0; j < 2; ++j) if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + mem_mark_primary_se(opt, a[0].n, a[0].a); + mem_mark_primary_se(opt, a[1].n, a[1].a); // pairing single-end hits o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); - if (0&&o) { // with proper pairing - } else { // no proper pairing - mem_mark_primary_se(opt, a[0].n, a[0].a); mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); - mem_mark_primary_se(opt, a[1].n, a[1].a); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); - } + if (o && !(opt->flag&MEM_F_NOPAIRING)) { // with proper pairing + int is_multi[2], q_se[2], q_pe, is_tandem[2]; + // check if an end has multiple hits even after mate-SW + for (i = 0; i < 2; ++i) { + for (j = 1; j < a[i].n; ++j) + if (a[i].a[j].secondary < 0) break; + is_multi[i] = j < a[i].n? 
1 : 0; + } + if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score + // compute mapQ for the best SE hit + for (i = 0; i < 2; ++i) { + q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); + is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); + } + q_pe = (int)(MEM_MAPQ_COEF * (1. - (double)(subo>>32) / (o>>32)) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499); + // the following assumes no split hits + if (z[0] == 0 && z[1] == 0) { // the best hit + q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; + q_se[0] = is_tabdem[0]? q_se[0] : q_pe; + q_se[1] = is_tabdem[1]? q_se[1] : q_pe; + } else { + double m[2]; + m[0] = approx_match(opt, a[0].a[0]) + approx_match(opt, a[1].a[0]); + m[1] = approx_match(opt, a[0].a[z[0]]) + approx_match(opt, a[1].a[z[1]]); + } + } else goto no_pairing; + return n; + +no_pairing: + mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); + mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); return n; } From a7d574d125bd99bfb299bf7686a1306215c03ab3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 01:11:38 -0500 Subject: [PATCH 233/498] backup comments --- bwamem.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 3d1b9c5..0c9c7b9 100644 --- a/bwamem.c +++ b/bwamem.c @@ -26,11 +26,32 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]) for (j = 0; j < 5; ++j) mat[k++] = 0; } +/* Theory on probability and scoring *ungapped* alignment + * + * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution + * s'(a,a) = log(4), s'(a,b) = log(4e/3), where e is the error rate + * + * Scale s'(a,b) to s(a,a) s.t. s(a,a)=x. 
Then s(a,b) = x*s'(a,b)/log(4), or conversely: s'(a,b)=s(a,b)*log(4)/x + * + * If the matching score is x and mismatch penalty is -y, we can compute error rate e: + * e = .75 * exp[-log(4) * y/x] + * + * log P(seq) = \sum_i log P(b_i|a_i) = \sum_i {s'(a,b) - log(4)} + * = \sum_i { s(a,b)*log(4)/x - log(4) } = log(4) * (S/x - l) + * + * where S=\sum_i s(a,b) is the alignment score. Converting to the phred scale: + * Q(seq) = -10/log(10) * log P(seq) = 10*log(4)/log(10) * (l - S/x) = 6.02 * (l - S/x) + * + * + * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) + * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) + */ + mem_opt_t *mem_opt_init() { mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); - o->a = 1; o->b = 5; o->q = 8; o->r = 1; o->w = 100; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->flag = 0; o->min_seed_len = 19; o->split_width = 10; From 5626fe29b7b07cf9efc4ee625d75ceca8f319de3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 19:11:44 -0500 Subject: [PATCH 234/498] Well, at least output sth --- bwamem.c | 3 +++ bwamem.h | 1 + bwamem_pair.c | 52 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0c9c7b9..f9415b2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -45,6 +45,8 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]) * * Gap open (zero gap): q' = log[P(gap-open)], r' = log[P(gap-ext)] (see Durbin et al. (1998) Section 4.1) * Then q = x*log[P(gap-open)]/log(4), r = x*log[P(gap-ext)]/log(4) + * + * When there are gaps, l should be the length of alignment matches (i.e. 
the M operator in CIGAR) */ mem_opt_t *mem_opt_init() @@ -63,6 +65,7 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; + o->pen_unpaired = 50; mem_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/bwamem.h b/bwamem.h index d6e9f01..43a5401 100644 --- a/bwamem.h +++ b/bwamem.h @@ -28,6 +28,7 @@ typedef struct { int n_threads, chunk_size; int pe_dir; float mask_level, chain_drop_ratio; + int pen_unpaired; // phred-scaled penalty for unpaired reads int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; diff --git a/bwamem_pair.c b/bwamem_pair.c index 92b8842..dc46f44 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -159,13 +159,13 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -static inline double approx_match(const mem_opt_t *opt, const mem_alnreg_v *a) +static inline double aln_q(const mem_opt_t *opt, const mem_alnreg_t *a) { int l = a->qe - a->qb < a->re - a->rb? a->qe - a->qb : a->re - a->rb; - return l - (double)(l * opt->a - a->score) / (opt->a + opt->b); + return (int)(6.02 * (l - (double)a->score / opt->a) + .499); } -uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, uint64_t *sub, int z[2]) +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v; @@ -177,7 +177,7 @@ uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const pair64_t key; mem_alnreg_t *e = &a[r].a[i]; key.x = e->rb < l_pac? 
e->rb : (l_pac<<1) - 1 - e->rb; // forward position - key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; + key.y = (uint64_t)aln_q(opt, e) << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } } @@ -192,19 +192,17 @@ uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const if (y[which] < 0) continue; // no previous hits for (k = y[which]; k >= 0; --k) { // TODO: this is a O(n^2) solution in the worst case; remember to check if this loop takes a lot of time (I doubt) int64_t dist; - int raw_score, score; + int q; double ns; uint64_t x, pair; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; - raw_score = (v.a[i].y>>32) + (v.a[i].y>>32); - if (raw_score + 20 * opt->a < (subo.x>>8&0xffffff)) continue; // skip the following if the score is too small ns = (dist - pes[dir].avg) / pes[dir].std; - score = (int)(raw_score - 4.343 / 23. * (opt->a + opt->b) * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); + q = (int)((v.a[i].y>>32) + (v.a[i].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; - x = (uint64_t)score<<32 | (int64_t)raw_score<<8 | (hash_64(pair ^ id<<8)&0xff); + x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); if (x > o.x) subo = o, o.x = x, o.y = pair; else if (x > subo.x) subo.x = x, subo.y = pair; } @@ -217,8 +215,8 @@ uint64_t mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const z[v.a[k].y&1] = v.a[k].y<<32>>34; } free(v.a); - *sub = subo.x; - return o.x; + *sub = subo.x>>32; + return o.x>>32; } int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) @@ -226,11 +224,12 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); extern void mem_sam_se(const 
mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); + extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); + extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); - int n = 0, i, j, z[2]; + int n = 0, i, j, z[2], o, subo; kstring_t str; mem_alnreg_t b[2][2]; - uint64_t o, subo; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment @@ -249,7 +248,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // pairing single-end hits o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); if (o && !(opt->flag&MEM_F_NOPAIRING)) { // with proper pairing - int is_multi[2], q_se[2], q_pe, is_tandem[2]; + int is_multi[2], q_se[2], q_pe, is_tandem[2], extra_flag = 1, un; + bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) @@ -262,17 +262,29 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); } - q_pe = (int)(MEM_MAPQ_COEF * (1. - (double)(subo>>32) / (o>>32)) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499); + un = aln_q(opt, &a[0].a[0]) + aln_q(opt, &a[1].a[0]) + opt->pen_unpaired; + subo = subo < un? subo : un; + q_pe = subo - o; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; - q_se[0] = is_tabdem[0]? q_se[0] : q_pe; - q_se[1] = is_tabdem[1]? q_se[1] : q_pe; + q_se[0] = is_tandem[0]? q_se[0] : q_pe; + q_se[1] = is_tandem[1]? 
q_se[1] : q_pe; + extra_flag |= 2; } else { - double m[2]; - m[0] = approx_match(opt, a[0].a[0]) + approx_match(opt, a[1].a[0]); - m[1] = approx_match(opt, a[0].a[z[0]]) + approx_match(opt, a[1].a[z[1]]); + if (o > un) { // then move the pair + q_se[0] = z[0] == 0? q_se[0] : 0; + q_se[1] = z[1] == 0? q_se[1] : 0; + if (q_se[0] == 0) q_se[0] = q_se[1]; + if (q_se[1] == 0) q_se[1] = q_se[0]; + } else { // the unpaired alignment is much better + z[0] = z[1] = 0; + } } + mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; + mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP); s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP); s[1].sam = str.s; } else goto no_pairing; return n; From ea8f4f4d34b05be0c3a821c9eede57fae39c5477 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 20:26:57 -0500 Subject: [PATCH 235/498] clean bill from valgrind --- bwamem.c | 5 ++++- bwamem_pair.c | 2 +- ksw.c | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index f9415b2..f0e1a29 100644 --- a/bwamem.c +++ b/bwamem.c @@ -363,7 +363,10 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a) a[i].qe = a[i].qb; } for (i = 1, m = 1; i < n; ++i) // exclude identical hits - if (a[i].qe > a[i].qb) a[m++] = a[i]; + if (a[i].qe > a[i].qb) { + if (m != i) a[m++] = a[i]; + else ++m; + } return m; } diff --git a/bwamem_pair.c b/bwamem_pair.c index dc46f44..092eee2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -153,7 +153,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me } ++n; } - if (rev == 0) free(rev); + if (rev) free(rev); free(ref); } return n; diff --git a/ksw.c b/ksw.c index 8d741a6..742fec9 100644 --- a/ksw.c +++ b/ksw.c @@ -447,7 +447,7 @@ 
static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { if (*n_cigar == *m_cigar) { *m_cigar = *m_cigar? (*m_cigar)<<1 : 4; - cigar = realloc(cigar, (*m_cigar) << 4); + cigar = realloc(cigar, (*m_cigar) << 2); } cigar[(*n_cigar)++] = len<<4 | op; } else cigar[(*n_cigar)-1] += len<<4; @@ -520,8 +520,8 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; } - if (i >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); - if (k >= 0) push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); + if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); + if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; *n_cigar_ = n_cigar, *cigar_ = cigar; From 41624fb347fcf6dfeb850b920e79dfa4a5781871 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Feb 2013 20:43:22 -0500 Subject: [PATCH 236/498] bugfix: choosing the worse instead of the best --- bwamem_pair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 092eee2..1302414 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -203,8 +203,8 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ q = (int)((v.a[i].y>>32) + (v.a[i].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); - if (x > o.x) subo = o, o.x = x, o.y = pair; - else if (x > subo.x) subo.x = x, subo.y = pair; + if (x < o.x) subo = o, o.x = x, o.y = pair; + else if (x < subo.x) subo.x = x, subo.y = pair; } } y[v.a[i].y&3] = i; From a9cae8c9af8a3e8b9983f146b14b7672a832f463 Mon Sep 17 00:00:00 2001 From: 
Heng Li Date: Thu, 21 Feb 2013 10:39:17 -0500 Subject: [PATCH 237/498] minor changes --- bwamem_pair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 1302414..7dc67fe 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -268,13 +268,16 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; + if (q_pe > 60) q_pe = 60; q_se[0] = is_tandem[0]? q_se[0] : q_pe; q_se[1] = is_tandem[1]? q_se[1] : q_pe; extra_flag |= 2; } else { if (o > un) { // then move the pair - q_se[0] = z[0] == 0? q_se[0] : 0; - q_se[1] = z[1] == 0? q_se[1] : 0; + int tmp[2]; + tmp[0] = q_se[0]; tmp[1] = q_se[1]; + q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; + q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; if (q_se[0] == 0) q_se[0] = q_se[1]; if (q_se[1] == 0) q_se[1] = q_se[0]; } else { // the unpaired alignment is much better From 84a328764a4f48a9b8686353c947e4e9edabc41c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 11:42:30 -0500 Subject: [PATCH 238/498] bugfix: mis-chaining caused by integer overflow I really need to rewrite kbtree some time. 
--- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index f0e1a29..2a9de82 100644 --- a/bwamem.c +++ b/bwamem.c @@ -159,7 +159,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) #include "kbtree.h" -#define chain_cmp(a, b) ((a).pos - (b).pos) +#define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) KBTREE_INIT(chn, mem_chain_t, chain_cmp) static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p) From f8829318cf4dee5d05b538bac58f66562e25f3a2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 12:25:20 -0500 Subject: [PATCH 239/498] weakened the chain filter --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 2a9de82..8122f70 100644 --- a/bwamem.c +++ b/bwamem.c @@ -316,7 +316,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap if (a[j].p2 == 0) a[j].p2 = a[i].p; - if (a[i].w < a[j].w * opt->chain_drop_ratio) + if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) break; } } From 54da54ffd4aaf39f88c6051025561c7cf3d44b76 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 12:52:00 -0500 Subject: [PATCH 240/498] extend more seeds (and thus slower...) 
--- bwamem.c | 4 +++- bwamem.h | 4 +++- fastmap.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 8122f70..9bb5ad0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -62,6 +62,7 @@ mem_opt_t *mem_opt_init() o->max_ins = 10000; o->mask_level = 0.50; o->chain_drop_ratio = 0.50; + o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; @@ -186,7 +187,8 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; - while ((a = smem_next(itr, opt->min_seed_len<<1, opt->split_width)) != 0) { // to find all SMEM and some internal MEM + int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start bwtintv_t *p = &a->a[i]; diff --git a/bwamem.h b/bwamem.h index 43a5401..6b191ae 100644 --- a/bwamem.h +++ b/bwamem.h @@ -27,7 +27,9 @@ typedef struct { int min_seed_len, max_occ, max_chain_gap; int n_threads, chunk_size; int pe_dir; - float mask_level, chain_drop_ratio; + float mask_level; + float chain_drop_ratio; + float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor int pen_unpaired; // phred-scaled penalty for unpaired reads int max_ins; // maximum insert size int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/fastmap.c b/fastmap.c index a2d7d94..91a4ecb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,12 +24,13 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:r:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= 
MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') mem_verbose = atoi(optarg); + else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { @@ -38,6 +39,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); From cfbc4c89e32a74a47cd25c695058beb31ed517f4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 14:34:10 -0500 Subject: [PATCH 241/498] perform extension when there are, say, 20bp tandem --- bwamem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 9bb5ad0..ec6aeff 100644 --- a/bwamem.c +++ b/bwamem.c @@ -463,15 +463,14 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; + if (a->score >= best.score) csub = best.score, best = *a; if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); - // check how many seeds have been covered + // jump to the next seed that: 1) has no overlap with the previous seed; 2) is not fully contained in the alignment for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; - if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) - break; + 
if ((t-1)->rbeg + (t-1)->len >= t->rbeg || (t-1)->qbeg + (t-1)->len >= t->qbeg) break; + if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; } - if (a->score >= best.score) csub = best.score, best = *a; - if (i >= c->n) break; // all seeds are included; no need to proceed k = i; } if (a->score < best.score) *a = best; From a578688fa80212b800f6f82842269095dba22f4e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 14:58:51 -0500 Subject: [PATCH 242/498] generate multiple alignments from one chain --- bwamem.c | 46 ++++++++++++++++++++-------------------------- bwamem.h | 2 +- kvec.h | 12 ++++++------ 3 files changed, 27 insertions(+), 33 deletions(-) diff --git a/bwamem.c b/bwamem.c index ec6aeff..2df9c53 100644 --- a/bwamem.c +++ b/bwamem.c @@ -415,16 +415,14 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l > 1? l : 1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a) +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds - int i, k, csub = 0; + int i, k; int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; const mem_seed_t *s; uint8_t *rseq = 0; - mem_alnreg_t best; - memset(&best, 0, sizeof(mem_alnreg_t)); - memset(a, 0, sizeof(mem_alnreg_t)); + av->n = 0; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -441,6 +439,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (rlen != rmax[1] - rmax[0]) return; for (k = 0; k < c->n;) { + mem_alnreg_t *a; + a = kv_pushp(mem_alnreg_t, *av); s = &c->seeds[k]; memset(a, 0, sizeof(mem_alnreg_t)); if (s->qbeg) { // left extension @@ -463,9 +463,14 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (a->score >= best.score) csub = best.score, best = *a; if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); - // jump to the next seed that: 1) has no overlap with the previous seed; 2) is not fully contained in the alignment + // compute seedcov + for (i = 0, a->seedcov = 0; i < c->n; ++i) { + const mem_seed_t *t = &c->seeds[i]; + if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained + a->seedcov += t->len; // this is not very accurate, but for approx. 
mapQ, this is good enough + } + // jump to the next seed that: 1) has no overlap with the previous seed, or 2) is not fully contained in the alignment for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; if ((t-1)->rbeg + (t-1)->len >= t->rbeg || (t-1)->qbeg + (t-1)->len >= t->qbeg) break; @@ -473,18 +478,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } k = i; } - if (a->score < best.score) *a = best; - a->csub = csub; free(rseq); - - // compute seedcov - if (c->n > 1) { - for (i = 0, a->seedcov = 0; i < c->n; ++i) { - s = &c->seeds[i]; - if (s->qbeg >= a->qb && s->qbeg + s->len <= a->qe && s->rbeg >= a->rb && s->rbeg + s->len <= a->re) // seed fully contained - a->seedcov += s->len; // this is not very accurate, but for approx. mapQ, this is good enough - } - } else a->seedcov = c->seeds[0].len; } /***************************** @@ -650,21 +644,23 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) { - int i; + int i, j; mem_chain_v chn; - mem_alnreg_v regs; + mem_alnreg_v regs, tmp; for (i = 0; i < s->l_seq; ++i) s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (mem_verbose >= 4) mem_print_chain(bns, &chn); - regs.n = regs.m = chn.n; - regs.a = malloc(regs.n * sizeof(mem_alnreg_t)); + kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { - mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &regs.a[i]); + mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp); + for (j = 0; j < tmp.n; ++j) + kv_push(mem_alnreg_t, regs, tmp.a[j]); free(chn.a[i].seeds); } free(chn.a); + regs.n = mem_sort_and_dedup(regs.n, regs.a); return regs; } @@ -683,10 +679,8 @@ static void *worker1(void *data) { worker_t *w = (worker_t*)data;
int i; - for (i = w->start; i < w->n; i += w->step) { + for (i = w->start; i < w->n; i += w->step) w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); - w->regs[i].n = mem_sort_and_dedup(w->regs[i].n, w->regs[i].a); - } return 0; } diff --git a/bwamem.h b/bwamem.h index 6b191ae..f20663e 100644 --- a/bwamem.h +++ b/bwamem.h @@ -80,7 +80,7 @@ void mem_fill_scmat(int a, int b, int8_t mat[25]); mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_t *a); +void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *a); uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); diff --git a/kvec.h b/kvec.h index 57204d6..9c9ca6e 100644 --- a/kvec.h +++ b/kvec.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, by Attractive Chaos + Copyright (c) 2008, by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -76,15 +76,15 @@ int main() { (v).a[(v).n++] = (x); \ } while (0) -#define kv_pushp(type, v) (((v).n == (v).m)? \ +#define kv_pushp(type, v) ((((v).n == (v).m)? \ ((v).m = ((v).m? (v).m<<1 : 2), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : 0), ((v).a + ((v).n++)) + : 0), &(v).a[(v).n++]) -#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ +#define kv_a(type, v, i) (((v).m <= (size_t)(i)? 
\ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : (v).n <= (size_t)(i)? (v).n = (i) \ - : 0), (v).a[(i)] + : (v).n <= (size_t)(i)? (v).n = (i) + 1 \ + : 0), (v).a[(i)]) #endif From d4cf6d97a66c407d060735282cb884d7571ea6be Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 15:04:31 -0500 Subject: [PATCH 243/498] bugfix: memory leak --- bwamem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2df9c53..412235a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -470,10 +470,10 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } - // jump to the next seed that: 1) has no overlap with the previous seed, or 2) is not fully contained in the alignment + // jump to the next seed that: 1) has no >7bp overlap with the previous seed, or 2) is not fully contained in the alignment for (i = k + 1; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; - if ((t-1)->rbeg + (t-1)->len >= t->rbeg || (t-1)->qbeg + (t-1)->len >= t->qbeg) break; + if ((t-1)->rbeg + (t-1)->len >= t->rbeg + 7 || (t-1)->qbeg + (t-1)->len >= t->qbeg + 7) break; if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; } k = i; @@ -659,7 +659,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn kv_push(mem_alnreg_t, regs, tmp.a[j]); free(chn.a[i].seeds); } - free(chn.a); + free(chn.a); free(tmp.a); regs.n = mem_sort_and_dedup(regs.n, regs.a); return regs; } From 58e4cc207fefe85ec8ac32f86f2d75885563b1f3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Feb 2013 21:26:01 -0500 Subject: [PATCH 244/498] bugfix: 1) fill seedcov; 2) pairing not working --- bwamem_pair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 
deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 7dc67fe..3db6882 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -142,6 +142,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me b.score = aln.score; b.csub = aln.score2; b.secondary = -1; + b.seedcov = (b.re - b.rb < b.qe - b.qb? b.re - b.rb : b.qe - b.qb) >> 1; // printf("*** %d, [%lld,%lld], %d:%d, (%lld,%lld), (%lld,%lld) == (%lld,%lld)\n", aln.score, rb, re, is_rev, is_larger, a->rb, a->re, ma->a[0].rb, ma->a[0].re, b.rb, b.re); kv_push(mem_alnreg_t, *ma, b); // make room for a new element // move b s.t. ma is sorted @@ -183,7 +184,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = o.y = subo.x = subo.y = 0; + o.x = subo.x = (uint64_t)-1; o.y = subo.y = 0; for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction int dir = r<<1 | (v.a[i].y>>1&1), which; @@ -245,9 +246,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); mem_mark_primary_se(opt, a[0].n, a[0].a); mem_mark_primary_se(opt, a[1].n, a[1].a); + if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z); - if (o && !(opt->flag&MEM_F_NOPAIRING)) { // with proper pairing + if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { int is_multi[2], q_se[2], q_pe, is_tandem[2], extra_flag = 1, un; bwahit_t h[2]; // check if an end has multiple hits even after mate-SW From 81fe6f8e382bf038dd402673c8a6ec5746b77f9d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 10:57:07 -0500 Subject: [PATCH 245/498] bugfix: a typo leading to wrong pairing --- bwamem_pair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/bwamem_pair.c b/bwamem_pair.c index 3db6882..cab7c2f 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -201,7 +201,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; ns = (dist - pes[dir].avg) / pes[dir].std; - q = (int)((v.a[i].y>>32) + (v.a[i].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); pair = (uint64_t)k<<32 | i; x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); if (x < o.x) subo = o, o.x = x, o.y = pair; From d5820177c63497aae8f1f83ffc0230924de47f85 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 11:02:14 -0500 Subject: [PATCH 246/498] bugfix: wrong mate-sw qry coor for rev --- bwamem_pair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index cab7c2f..d926857 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -136,7 +136,8 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len) { - b.qb = aln.qb; b.qe = aln.qe + 1; + b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; + b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; b.re = is_rev? (l_pac<<1) - (rb + aln.tb) : rb + aln.te + 1; b.score = aln.score; From dfc63acc11dd86da41523dd190816260889fe396 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 11:06:36 -0500 Subject: [PATCH 247/498] bugfix: another ">" vs. "<" bug That hurts, as I am going to reverse all these again! 
--- bwamem_pair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index d926857..6b9c490 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -275,7 +275,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = is_tandem[1]? q_se[1] : q_pe; extra_flag |= 2; } else { - if (o > un) { // then move the pair + if (o < un) { // then move the pair int tmp[2]; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; From ed08d08f364be17dde6c6a7cdda264181e825a07 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 11:17:31 -0500 Subject: [PATCH 248/498] fixed bugs caused by interger overflow --- bwamem_pair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 6b9c490..7340b05 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -185,7 +185,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = subo.x = (uint64_t)-1; o.y = subo.y = 0; + o.x = subo.x = o.x = subo.x = 0x7fffffffULL<<32; o.y = subo.y = 0; for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction int dir = r<<1 | (v.a[i].y>>1&1), which; @@ -267,6 +267,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co un = aln_q(opt, &a[0].a[0]) + aln_q(opt, &a[1].a[0]) + opt->pen_unpaired; subo = subo < un? subo : un; q_pe = subo - o; + if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; @@ -282,6 +283,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? 
tmp[0] : q_pe; if (q_se[0] == 0) q_se[0] = q_se[1]; if (q_se[1] == 0) q_se[1] = q_se[0]; + a[0].a[z[0]].secondary = a[1].a[z[1]].secondary = -2; } else { // the unpaired alignment is much better z[0] = z[1] = 0; } From c5ce72f5936828513c6f87f04723dfe5a22c56c4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:10:20 -0500 Subject: [PATCH 249/498] scoring pairs by score, not by errors This is important for bwa-mem which does local alignment. A short exact match is worse than a long inexact match. Also fixed a bug in approximating mapping quality. --- bwamem.c | 2 +- bwamem_pair.c | 35 +++++++++++++++++------------------ 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index 412235a..431590f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -66,7 +66,7 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->pe_dir = 0<<1|1; - o->pen_unpaired = 50; + o->pen_unpaired = 9; mem_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 7340b05..9fa3505 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -161,12 +161,6 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -static inline double aln_q(const mem_opt_t *opt, const mem_alnreg_t *a) -{ - int l = a->qe - a->qb < a->re - a->rb? a->qe - a->qb : a->re - a->rb; - return (int)(6.02 * (l - (double)a->score / opt->a) + .499); -} - int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); @@ -179,13 +173,14 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ pair64_t key; mem_alnreg_t *e = &a[r].a[i]; key.x = e->rb < l_pac? 
e->rb : (l_pac<<1) - 1 - e->rb; // forward position - key.y = (uint64_t)aln_q(opt, e) << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; + key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = subo.x = o.x = subo.x = 0x7fffffffULL<<32; o.y = subo.y = 0; + o.x = subo.x = o.y = subo.y = 0; + //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction int dir = r<<1 | (v.a[i].y>>1&1), which; @@ -199,14 +194,17 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ uint64_t x, pair; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; + //printf("%d: %lld\n", k, dist); if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; ns = (dist - pes[dir].avg) / pes[dir].std; - q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) - 4.343 * log(erfc(fabs(ns) * M_SQRT1_2)) + .499); + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) + if (q < 0) q = 0; pair = (uint64_t)k<<32 | i; x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); - if (x < o.x) subo = o, o.x = x, o.y = pair; - else if (x < subo.x) subo.x = x, subo.y = pair; + //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); + if (x > o.x) subo = o, o.x = x, o.y = pair; + else if (x > subo.x) subo.x = x, subo.y = pair; } } y[v.a[i].y&3] = i; @@ -264,9 +262,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); } - un = aln_q(opt, &a[0].a[0]) + aln_q(opt, &a[1].a[0]) + opt->pen_unpaired; - subo = subo < un? 
subo : un; - q_pe = subo - o; + un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + if (un < 0) un = 0; + subo = subo > un? subo : un; + q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit @@ -276,14 +275,14 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = is_tandem[1]? q_se[1] : q_pe; extra_flag |= 2; } else { - if (o < un) { // then move the pair + if (o > un) { // then move the pair int tmp[2]; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; - if (q_se[0] == 0) q_se[0] = q_se[1]; - if (q_se[1] == 0) q_se[1] = q_se[0]; - a[0].a[z[0]].secondary = a[1].a[z[1]].secondary = -2; + for (i = 0; i < 2; ++i) + if (a[i].a[z[i]].secondary >= 0) + a[i].a[z[i]].sub = a[i].a[a[i].a[z[i]].secondary].score, a[i].a[z[i]].secondary = -2; } else { // the unpaired alignment is much better z[0] = z[1] = 0; } From c0093264de2e8979f5b98318c57ae567ba3db0bf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:34:46 -0500 Subject: [PATCH 250/498] wrong logic: paired mapQ should 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit - q_pe = q_pe > q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; + q_pe = q_pe < q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; if (q_pe > 60) q_pe = 60; q_se[0] = is_tandem[0]? q_se[0] : q_pe; q_se[1] = is_tandem[1]? 
q_se[1] : q_pe; From 6a16edc15effa0ba813984066e5d2539c227e0a3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:47:26 -0500 Subject: [PATCH 251/498] tuning PE mapQ --- bwamem_pair.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 5dd5927..4c0b908 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -248,7 +248,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { - int is_multi[2], q_se[2], q_pe, is_tandem[2], extra_flag = 1, un; + int is_multi[2], q_se[2], q_pe, extra_flag = 1; bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { @@ -258,31 +258,26 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score // compute mapQ for the best SE hit - for (i = 0; i < 2; ++i) { + for (i = 0; i < 2; ++i) q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); - is_tandem[i] = (a[i].a[0].csub > a[i].a[0].sub); - } - un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; - if (un < 0) un = 0; - subo = subo > un? subo : un; q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (z[0] == 0 && z[1] == 0) { // the best hit - q_pe = q_pe < q_se[0] + q_se[1]? q_pe : q_se[0] + q_se[1]; - if (q_pe > 60) q_pe = 60; - q_se[0] = is_tandem[0]? q_se[0] : q_pe; - q_se[1] = is_tandem[1]? q_se[1] : q_pe; + q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe; + q_se[1] = q_se[1] > q_pe? 
q_se[1] : q_pe; extra_flag |= 2; } else { - if (o > un) { // then move the pair + if (o > a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired) { // then move the pair int tmp[2]; + q_pe = q_pe > 7? q_pe - 7 : 0; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; for (i = 0; i < 2; ++i) if (a[i].a[z[i]].secondary >= 0) a[i].a[z[i]].sub = a[i].a[a[i].a[z[i]].secondary].score, a[i].a[z[i]].secondary = -2; + extra_flag |= 2; } else { // the unpaired alignment is much better z[0] = z[1] = 0; } From 38fc5c88223cd700e8b5fb3370baa2709acff88c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 12:54:42 -0500 Subject: [PATCH 252/498] reduce mapQ when a read is moved --- bwamem_pair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 4c0b908..f5ab495 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -268,9 +268,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe; extra_flag |= 2; } else { - if (o > a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired) { // then move the pair - int tmp[2]; - q_pe = q_pe > 7? q_pe - 7 : 0; + int un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + if (o > un) { // then move the pair + int tmp[2], q_un = (o - un) * 6; + q_pe = q_pe < q_un? q_pe : q_un; tmp[0] = q_se[0]; tmp[1] = q_se[1]; q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? 
tmp[0] : q_pe; From 6c1a1137539ca806bebe147c9be0effdf5f4ba4c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 13:26:23 -0500 Subject: [PATCH 253/498] mate-SW for all high-scoring hits --- bwamem_pair.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index f5ab495..21d31ad 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -229,20 +229,19 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co int n = 0, i, j, z[2], o, subo; kstring_t str; - mem_alnreg_t b[2][2]; + mem_alnreg_v b[2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment + kv_init(b[0]); kv_init(b[1]); for (i = 0; i < 2; ++i) - for (j = 0; j < 2; ++j) b[i][j].score = -1; - for (i = 0; i < 2; ++i) { - for (j = 0; j < a[i].n && j < 2; ++j) b[i][j] = a[i].a[j]; - if (b[i][0].score > 0 && b[i][1].score > 0 && b[i][1].score < b[i][0].score * 0.8) - b[i][1].score = -1; - } + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); for (i = 0; i < 2; ++i) - for (j = 0; j < 2; ++j) - if (b[i][j].score > 0) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i][j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + for (j = 0; j < b[i].n; ++j) + n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); mem_mark_primary_se(opt, a[0].n, a[0].a); mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; From ba15b787cb59be38a82cc3314569e65aec2061cf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 14:47:57 -0500 Subject: [PATCH 254/498] rework PE mapq; don't know if better --- bwamem.c | 1 + bwamem_pair.c | 45 ++++++++++++++++++++++----------------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 431590f..931e685 100644 --- a/bwamem.c +++ b/bwamem.c @@ -604,6 +604,7 @@ int 
mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) int mapq, l, sub = a->sub? a->sub : opt->min_seed_len * opt->a; double identity; sub = a->csub > sub? a->csub : sub; + if (sub >= a->score) return 0; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; diff --git a/bwamem_pair.c b/bwamem_pair.c index 21d31ad..46d1dde 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -14,7 +14,6 @@ #define OUTLIER_BOUND 2.0 #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 -#define EXT_STDDEV 4.0 void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); @@ -247,7 +246,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { - int is_multi[2], q_se[2], q_pe, extra_flag = 1; + int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { @@ -257,30 +256,30 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score // compute mapQ for the best SE hit - for (i = 0; i < 2; ++i) - q_se[i] = mem_approx_mapq_se(opt, &a[i].a[0]); - q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; + //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; + subo = subo > score_un? 
subo : score_un; + q_pe = (o - subo) * 6; if (q_pe > 60) q_pe = 60; // the following assumes no split hits - if (z[0] == 0 && z[1] == 0) { // the best hit - q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe; - q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe; - extra_flag |= 2; - } else { - int un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; - if (o > un) { // then move the pair - int tmp[2], q_un = (o - un) * 6; - q_pe = q_pe < q_un? q_pe : q_un; - tmp[0] = q_se[0]; tmp[1] = q_se[1]; - q_se[0] = z[0] == 0? q_se[0] : tmp[1] < q_pe? tmp[1] : q_pe; - q_se[1] = z[1] == 0? q_se[1] : tmp[0] < q_pe? tmp[0] : q_pe; - for (i = 0; i < 2; ++i) - if (a[i].a[z[i]].secondary >= 0) - a[i].a[z[i]].sub = a[i].a[a[i].a[z[i]].secondary].score, a[i].a[z[i]].secondary = -2; - extra_flag |= 2; - } else { // the unpaired alignment is much better - z[0] = z[1] = 0; + if (o > score_un) { // paired alignment is preferred + mem_alnreg_t *c[2]; + c[0] = &a[0].a[z[0]]; c[1] = &a[1].a[z[1]]; + for (i = 0; i < 2; ++i) { + if (c[i]->secondary >= 0) + c[i]->sub = a[i].a[c[i]->secondary].score, c[i]->secondary = -2; + q_se[i] = mem_approx_mapq_se(opt, c[i]); } + q_se[0] = q_se[0] > q_pe? q_se[0] : q_pe < q_se[0] + 40? q_pe : q_se[0] + 40; + q_se[1] = q_se[1] > q_pe? q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40; + extra_flag |= 2; + // cap at the tandem repeat score + q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6; + q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? 
q_se[1] : (c[1]->score - c[1]->csub) * 6; + } else { // the unpaired alignment is preferred + z[0] = z[1] = 0; + q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); + q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; From 17c123d65a4ac81752a86c049f578f18166ebf38 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 16:38:48 -0500 Subject: [PATCH 255/498] pring paired-end SAM --- bwamem.c | 94 ++++++++++++++++++++++++++++++--------------------- bwamem.h | 1 - bwamem_pair.c | 22 ++++++------ 3 files changed, 67 insertions(+), 50 deletions(-) diff --git a/bwamem.c b/bwamem.c index 931e685..b52d20b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -519,21 +519,37 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa return cigar; } -void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard) + +void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { - int score, n_cigar, is_rev = 0, nn, rid, mid, is_unmapped = 0; +#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) + int score, n_cigar, is_rev = 0, nn, rid, mid, copy_mate = 0; uint32_t *cigar = 0; int64_t pos; - - kputs(s->name, str); - if (p && p->rb >= 0 && p->re < bns->l_pac<<1) { - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + bwahit_t ptmp, *p = &ptmp; + + if (!p_) { // in this case, generate an unmapped alignment + memset(&ptmp, 0, sizeof(bwahit_t)); + ptmp.rb = ptmp.re = -1; + } else ptmp = *p_; + p->flag |= m? 1 : 0; // is paired in sequencing + p->flag |= !is_mapped(p)? 
4 : 0; // is mapped + p->flag |= m && !is_mapped(m)? 8 : 0; // is mate mapped + if (m && !is_mapped(p) && is_mapped(m)) { + p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq; + copy_mate = 1; + } + p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand + p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand + kputs(s->name, str); kputc('\t', str); + if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate + if (!copy_mate) { + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) + } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - p->flag |= is_rev? 16 : 0; // reverse - p->flag |= p->mb >= 0? 1 : 0; // paired in sequencing - p->flag |= n_cigar == 0? 8 : 0; // FIXME: check why this may happen (this has already happened) - kputc('\t', str); kputw(p->flag, str); kputc('\t', str); + kputw(p->flag, str); kputc('\t', str); kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); kputw(p->qual, str); kputc('\t', str); if (n_cigar) { @@ -546,29 +562,29 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); } } else kputc('*', str); - if (p->mb >= 0 && p->mb < bns->l_pac<<1) { // then print mate pos and isize - pos = bns_depos(bns, p->mb < bns->l_pac? p->mb : p->me - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->me - p->mb, &mid); - kputc('\t', str); - if (mid == rid) kputc('=', str); - else kputs(bns->anns[mid].name, str); - kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); - kputc('\t', str); - if (mid != rid) { - int64_t p0 = p->rb < bns->l_pac? 
p->rb : (bns->l_pac<<1) - 1 - p->rb; - int64_t p1 = p->mb < bns->l_pac? p->mb : (bns->l_pac<<1) - 1 - p->mb; - kputw(abs(p0 - p1), str); - } - kputc('\t', str); - } else kputsn("\t*\t0\t0\t", 7, str); - } else { // unaligned - is_unmapped = 1; - kputw(p? p->flag : 0, str); - kputs("\t*\t0\t0\t*\t*\t0\t0\t", str); + } else { // no coordinate + kputw(p->flag, str); + kputs("\t*\t0\t0\t*", str); + rid = -1; } - if (!is_rev) { // print SEQ and QUAL, the forward strand + if (m && is_mapped(m)) { // then print mate pos and isize + pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev); + nn = bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); + kputc('\t', str); + if (mid == rid) kputc('=', str); + else kputs(bns->anns[mid].name, str); + kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); + kputc('\t', str); + if (mid == rid) { + int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; + int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; + kputw(p0 - p1, str); + } else kputw(0, str); + kputc('\t', str); + } else kputsn("\t*\t0\t0\t", 7, str); + if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand int i, qb = 0, qe = s->l_seq; - if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; ks_resize(str, str->l + (qe - qb) + 1); for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; kputc('\t', str); @@ -579,7 +595,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } else kputc('*', str); } else { // the reverse strand int i, qb = 0, qe = s->l_seq; - if (!is_unmapped && is_hard) qb = p->qb, qe = p->qe; + if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; ks_resize(str, str->l + (qe - qb) + 1); for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; kputc('\t', str); @@ -589,10 +605,11 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons str->s[str->l] = 0; } else 
kputc('*', str); } - if (!is_unmapped && p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } - if (!is_unmapped && p->sub >= 0) { kputsn("\tss:i:", 6, str); kputw(p->sub, str); } + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } kputc('\n', str); free(cigar); +#undef is_mapped } /************************ @@ -622,10 +639,9 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) h->sub = a->sub > a->csub? a->sub : a->csub; h->qual = 0; // quality unset h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set - h->mb = h->me = -2; // mate positions are unset } -void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag) +void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m) { int k; kstring_t str; @@ -637,9 +653,9 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; h.qual = mem_approx_mapq_se(opt, &a->a[k]); - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } - } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP); + } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); s->sam = str.s; } @@ -693,7 +709,7 @@ static void *worker2(void *data) if (!(w->opt->flag&MEM_F_PE)) { for (i = 0; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); - mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0); + mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } } else { diff --git a/bwamem.h b/bwamem.h 
index f20663e..4319911 100644 --- a/bwamem.h +++ b/bwamem.h @@ -58,7 +58,6 @@ typedef struct { int qb, qe, flag, qual; // optional info int score, sub; - int64_t mb, me; // mb: mate start; -1 if single-end; -2 if mate unmapped } bwahit_t; typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; diff --git a/bwamem_pair.c b/bwamem_pair.c index 46d1dde..3dce119 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -15,8 +15,6 @@ #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 -void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); - static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { int j; @@ -221,14 +219,15 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); - extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag); + extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); - extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, bwahit_t *p, int is_hard); + extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m); int n = 0, i, j, z[2], o, subo; kstring_t str; mem_alnreg_v b[2]; + bwahit_t h[2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment @@ -245,9 +244,8 @@ int 
mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - if ((o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { + if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; - bwahit_t h[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) @@ -283,13 +281,17 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP); s[0].sam = strdup(str.s); str.l = 0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP); s[1].sam = str.s; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; } else goto no_pairing; return n; no_pairing: - mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41); - mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81); + for (i = 0; i < 2; ++i) { + if (a[i].n) mem_alnreg2hit(&a[i].a[0], &h[i]); + else h[i].rb = h[i].re = -1; + } + mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); + mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); return n; } From f122fad5625b133640f1f5e1844002744e537724 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 17:09:40 -0500 Subject: [PATCH 256/498] minor code clean up bwtio.c is merged to bwt.c --- Makefile | 7 +++-- bwase.c | 11 
-------- bwt.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwtio.c | 77 ------------------------------------------------------- fastmap.c | 4 ++- utils.c | 4 +++ 6 files changed, 86 insertions(+), 93 deletions(-) delete mode 100644 bwtio.c diff --git a/Makefile b/Makefile index e11a04d..f14d906 100644 --- a/Makefile +++ b/Makefile @@ -3,10 +3,9 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \ - bwaseqio.o bwase.o kstring.o -AOBJS= QSufSort.o bwt_gen.o \ - is.o bwtmisc.o bwtindex.o ksw.o bwape.o \ +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o +AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ + is.o bwtmisc.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwase.c b/bwase.c index 8fa79ac..1f36aaa 100644 --- a/bwase.c +++ b/bwase.c @@ -489,17 +489,6 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } -bntseq_t *bwa_open_nt(const char *prefix) -{ - bntseq_t *ntbns; - char *str; - str = (char*)calloc(strlen(prefix) + 10, 1); - strcat(strcpy(str, prefix), ".nt"); - ntbns = bns_restore(str); - free(str); - return ntbns; -} - void bwa_print_sam_SQ(const bntseq_t *bns) { int i; diff --git a/bwt.c b/bwt.c index 2903daa..7b37fe5 100644 --- a/bwt.c +++ b/bwt.c @@ -338,3 +338,79 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); return ret; } + +/************************* + * Read/write BWT and SA * + *************************/ + +void bwt_dump_bwt(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(bwt->bwt, 4, 
bwt->bwt_size, fp); + fclose(fp); +} + +void bwt_dump_sa(const char *fn, const bwt_t *bwt) +{ + FILE *fp; + fp = xopen(fn, "wb"); + fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); + fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); + fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +void bwt_restore_sa(const char *fn, bwt_t *bwt) +{ + char skipped[256]; + FILE *fp; + bwtint_t primary; + + fp = xopen(fn, "rb"); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); + fread(skipped, sizeof(bwtint_t), 4, fp); // skip + fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); + fread(&primary, sizeof(bwtint_t), 1, fp); + xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); + + bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa[0] = -1; + + fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fclose(fp); +} + +bwt_t *bwt_restore_bwt(const char *fn) +{ + bwt_t *bwt; + FILE *fp; + + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); + fp = xopen(fn, "rb"); + fseek(fp, 0, SEEK_END); + bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; + bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); + fseek(fp, 0, SEEK_SET); + fread(&bwt->primary, sizeof(bwtint_t), 1, fp); + fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); + fread(bwt->bwt, 4, bwt->bwt_size, fp); + bwt->seq_len = bwt->L2[4]; + fclose(fp); + bwt_gen_cnt_table(bwt); + + return bwt; +} + +void bwt_destroy(bwt_t *bwt) +{ + if (bwt == 0) return; + free(bwt->sa); free(bwt->bwt); + free(bwt); +} diff --git a/bwtio.c b/bwtio.c deleted file mode 100644 index 7508609..0000000 --- a/bwtio.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include "bwt.h" -#include "utils.h" - -void bwt_dump_bwt(const char *fn, const bwt_t *bwt) -{ - 
FILE *fp; - fp = xopen(fn, "wb"); - fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fwrite(bwt->bwt, 4, bwt->bwt_size, fp); - fclose(fp); -} - -void bwt_dump_sa(const char *fn, const bwt_t *bwt) -{ - FILE *fp; - fp = xopen(fn, "wb"); - fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); - fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - fclose(fp); -} - -void bwt_restore_sa(const char *fn, bwt_t *bwt) -{ - char skipped[256]; - FILE *fp; - bwtint_t primary; - - fp = xopen(fn, "rb"); - fread(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); - fread(skipped, sizeof(bwtint_t), 4, fp); // skip - fread(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - fread(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); - - bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; - bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); - bwt->sa[0] = -1; - - fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - fclose(fp); -} - -bwt_t *bwt_restore_bwt(const char *fn) -{ - bwt_t *bwt; - FILE *fp; - - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); - fp = xopen(fn, "rb"); - fseek(fp, 0, SEEK_END); - bwt->bwt_size = (ftell(fp) - sizeof(bwtint_t) * 5) >> 2; - bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); - fseek(fp, 0, SEEK_SET); - fread(&bwt->primary, sizeof(bwtint_t), 1, fp); - fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fread(bwt->bwt, 4, bwt->bwt_size, fp); - bwt->seq_len = bwt->L2[4]; - fclose(fp); - bwt_gen_cnt_table(bwt); - - return bwt; -} - -void bwt_destroy(bwt_t *bwt) -{ - if (bwt == 0) return; - free(bwt->sa); free(bwt->bwt); - free(bwt); -} diff --git a/fastmap.c b/fastmap.c index 91a4ecb..d52a315 100644 --- a/fastmap.c +++ b/fastmap.c @@ 
-17,7 +17,7 @@ int main_mem(int argc, char *argv[]) mem_opt_t *opt; bwt_t *bwt; bntseq_t *bns; - int c, n; + int c, n, l; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; uint8_t *pac = 0; @@ -57,6 +57,8 @@ int main_mem(int argc, char *argv[]) pac = calloc(bns->l_pac/4+1, 1); fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); } + for (l = 0; l < bns->n_seqs; ++l) + printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); fp = strcmp(argv[optind + 1], "-")? gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); diff --git a/utils.c b/utils.c index 127c8fe..1cebaab 100644 --- a/utils.c +++ b/utils.c @@ -40,6 +40,10 @@ KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) +/******************** + * System utilities * + ********************/ + FILE *err_xopen_core(const char *func, const char *fn, const char *mode) { FILE *fp = 0; From 545fb87feb5c4ff87b3e2c449f9571581abe89b8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 17:15:57 -0500 Subject: [PATCH 257/498] removed another part related to color-space --- bwtmisc.c | 51 --------------------------------------------------- main.c | 2 -- main.h | 1 - 3 files changed, 54 deletions(-) diff --git a/bwtmisc.c b/bwtmisc.c index c35d684..de96dc2 100644 --- a/bwtmisc.c +++ b/bwtmisc.c @@ -157,57 +157,6 @@ int bwa_bwtupdate(int argc, char *argv[]) return 0; } -const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; - -/* this function is not memory efficient, but this will make life easier - Ideally we should also change .amb files as one 'N' in the nucleotide - sequence leads to two ambiguous colors. I may do this later... 
*/ -uint8_t *bwa_pac2cspac_core(const bntseq_t *bns) -{ - uint8_t *pac, *cspac; - bwtint_t i; - int c1, c2; - pac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); - cspac = (uint8_t*)calloc(bns->l_pac/4 + 1, 1); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); - rewind(bns->fp_pac); - c1 = pac[0]>>6; cspac[0] = c1<<6; - for (i = 1; i < bns->l_pac; ++i) { - c2 = pac[i>>2] >> (~i&3)*2 & 3; - cspac[i>>2] |= nst_color_space_table[(1< \n"); - return 1; - } - bns = bns_restore(argv[1]); - cspac = bwa_pac2cspac_core(bns); - bns_dump(bns, argv[2]); - // now write cspac - str = (char*)calloc(strlen(argv[2]) + 5, 1); - strcat(strcpy(str, argv[2]), ".pac"); - fp = xopen(str, "wb"); - fwrite(cspac, 1, bns->l_pac/4 + 1, fp); - ct = bns->l_pac % 4; - fwrite(&ct, 1, 1, fp); - fclose(fp); - bns_destroy(bns); - free(cspac); - return 0; -} - int bwa_bwt2sa(int argc, char *argv[]) { bwt_t *bwt; diff --git a/main.c b/main.c index fc63c2e..dbe9dd0 100644 --- a/main.c +++ b/main.c @@ -27,7 +27,6 @@ static int usage() fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); - fprintf(stderr, " pac2cspac convert PAC to color-space PAC\n"); fprintf(stderr, "\n"); return 1; } @@ -52,7 +51,6 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "pac2cspac") == 0) ret = bwa_pac2cspac(argc-1, argv+1); else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); diff --git a/main.h b/main.h index 7b638ca..3e70362 100644 --- a/main.h +++ b/main.h @@ -6,7 +6,6 @@ extern "C" { 
#endif int bwa_fa2pac(int argc, char *argv[]); - int bwa_pac2cspac(int argc, char *argv[]); int bwa_pac2bwt(int argc, char *argv[]); int bwa_bwtupdate(int argc, char *argv[]); int bwa_bwt2sa(int argc, char *argv[]); From 6230f86799f30b836495beea1d81ed2d384c61b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 22 Feb 2013 17:23:34 -0500 Subject: [PATCH 258/498] merged bwtmisc.c to bwtindex.c bwtmisc.c implements routines related to indexing --- Makefile | 2 +- bwtindex.c | 149 +++++++++++++++++++++++++++++++++++++++++++- bwtmisc.c | 179 ----------------------------------------------------- 3 files changed, 147 insertions(+), 183 deletions(-) delete mode 100644 bwtmisc.c diff --git a/Makefile b/Makefile index f14d906..bfed694 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtmisc.o bwtindex.o bwape.o \ + is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa diff --git a/bwtindex.c b/bwtindex.c index c01fa95..298153d 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -36,11 +36,154 @@ #include "main.h" #include "utils.h" -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is); -void bwa_pac_rev_core(const char *fn, const char *fn_rev); +#ifdef _DIVBWT +#include "divsufsort.h" +#endif -int bwa_index(int argc, char *argv[]) +int is_bwt(ubyte_t *T, int n); + +int64_t bwa_seq_len(const char *fn_pac) +{ + FILE *fp; + int64_t pac_len; + ubyte_t c; + fp = xopen(fn_pac, "rb"); + fseek(fp, -1, SEEK_END); + pac_len = ftell(fp); + fread(&c, 1, 1, fp); + fclose(fp); + return (pac_len - 1) * 4 + (int)c; +} + +bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) +{ + bwt_t *bwt; + ubyte_t *buf, *buf2; + int i, pac_size; + FILE *fp; + + // initialization + bwt = (bwt_t*)calloc(1, 
sizeof(bwt_t)); + bwt->seq_len = bwa_seq_len(fn_pac); + bwt->bwt_size = (bwt->seq_len + 15) >> 4; + fp = xopen(fn_pac, "rb"); + + // prepare sequence + pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); + fread(buf2, 1, pac_size, fp); + fclose(fp); + memset(bwt->L2, 0, 5 * 4); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); + for (i = 0; i < bwt->seq_len; ++i) { + buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; + ++bwt->L2[1+buf[i]]; + } + for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; + free(buf2); + + // Burrows-Wheeler Transform + if (use_is) { + bwt->primary = is_bwt(buf, bwt->seq_len); + } else { +#ifdef _DIVBWT + bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); +#else + err_fatal_simple("libdivsufsort is not compiled in."); +#endif + } + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); + for (i = 0; i < bwt->seq_len; ++i) + bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); + free(buf); + return bwt; +} + +int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! 
+{ + bwt_t *bwt; + int c, use_is = 1; + while ((c = getopt(argc, argv, "d")) >= 0) { + switch (c) { + case 'd': use_is = 0; break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); + return 1; + } + bwt = bwt_pac2bwt(argv[optind], use_is); + bwt_dump_bwt(argv[optind+1], bwt); + bwt_destroy(bwt); + return 0; +} + +#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) + +void bwt_bwtupdate_core(bwt_t *bwt) { + bwtint_t i, k, c[4], n_occ; + uint32_t *buf; + + n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; + bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt + c[0] = c[1] = c[2] = c[3] = 0; + for (i = k = 0; i < bwt->seq_len; ++i) { + if (i % OCC_INTERVAL == 0) { + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) + } + if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 + ++c[bwt_B00(bwt, i)]; + } + // the last element + memcpy(buf + k, c, sizeof(bwtint_t) * 4); + xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); + // update bwt + free(bwt->bwt); bwt->bwt = buf; +} + +int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command +{ + bwt_t *bwt; + if (argc < 2) { + fprintf(stderr, "Usage: bwa bwtupdate \n"); + return 1; + } + bwt = bwt_restore_bwt(argv[1]); + bwt_bwtupdate_core(bwt); + bwt_dump_bwt(argv[1], bwt); + bwt_destroy(bwt); + return 0; +} + +int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command +{ + bwt_t *bwt; + int c, sa_intv = 32; + while ((c = getopt(argc, argv, "i:")) >= 0) { + switch (c) { + case 'i': sa_intv = atoi(optarg); break; + default: return 1; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); + return 1; + } + bwt = bwt_restore_bwt(argv[optind]); + bwt_cal_sa(bwt, sa_intv); + bwt_dump_sa(argv[optind+1], bwt); + 
bwt_destroy(bwt); + return 0; +} + +int bwa_index(int argc, char *argv[]) // the "index" command +{ + extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); + char *prefix = 0, *str, *str2, *str3; int c, algo_type = 0, is_64 = 0; clock_t t; diff --git a/bwtmisc.c b/bwtmisc.c deleted file mode 100644 index de96dc2..0000000 --- a/bwtmisc.c +++ /dev/null @@ -1,179 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include "bntseq.h" -#include "utils.h" -#include "main.h" -#include "bwt.h" - -#ifdef _DIVBWT -#include "divsufsort.h" -#endif - -int is_bwt(ubyte_t *T, int n); - -int64_t bwa_seq_len(const char *fn_pac) -{ - FILE *fp; - int64_t pac_len; - ubyte_t c; - fp = xopen(fn_pac, "rb"); - fseek(fp, -1, SEEK_END); - pac_len = ftell(fp); - fread(&c, 1, 1, fp); - fclose(fp); - return (pac_len - 1) * 4 + (int)c; -} - -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) -{ - bwt_t *bwt; - ubyte_t *buf, *buf2; - int i, pac_size; - FILE *fp; - - // initialization - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); - bwt->seq_len = bwa_seq_len(fn_pac); - bwt->bwt_size = (bwt->seq_len + 15) >> 4; - fp = xopen(fn_pac, "rb"); - - // prepare sequence - pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 0 : 1); - buf2 = (ubyte_t*)calloc(pac_size, 1); - fread(buf2, 1, pac_size, fp); - fclose(fp); - memset(bwt->L2, 0, 5 * 4); - buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); - for (i = 0; i < bwt->seq_len; ++i) { - buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; - ++bwt->L2[1+buf[i]]; - } - for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; - free(buf2); - - // Burrows-Wheeler Transform - if (use_is) { - bwt->primary = is_bwt(buf, bwt->seq_len); - } else { -#ifdef _DIVBWT - bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); -#else - err_fatal_simple("libdivsufsort is not compiled in."); -#endif - } - bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); - for (i = 0; i < bwt->seq_len; ++i) - bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); - free(buf); - return bwt; -} - -int bwa_pac2bwt(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, use_is = 1; - while ((c = getopt(argc, argv, "d")) >= 0) { - switch (c) { - case 'd': use_is = 0; break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); - return 1; - } - bwt = bwt_pac2bwt(argv[optind], use_is); - 
bwt_dump_bwt(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} - -#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) - -void bwt_bwtupdate_core(bwt_t *bwt) -{ - bwtint_t i, k, c[4], n_occ; - uint32_t *buf; - - n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; - bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt - c[0] = c[1] = c[2] = c[3] = 0; - for (i = k = 0; i < bwt->seq_len; ++i) { - if (i % OCC_INTERVAL == 0) { - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) - } - if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 - ++c[bwt_B00(bwt, i)]; - } - // the last element - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); - // update bwt - free(bwt->bwt); bwt->bwt = buf; -} - -int bwa_bwtupdate(int argc, char *argv[]) -{ - bwt_t *bwt; - if (argc < 2) { - fprintf(stderr, "Usage: bwa bwtupdate \n"); - return 1; - } - bwt = bwt_restore_bwt(argv[1]); - bwt_bwtupdate_core(bwt); - bwt_dump_bwt(argv[1], bwt); - bwt_destroy(bwt); - return 0; -} - -int bwa_bwt2sa(int argc, char *argv[]) -{ - bwt_t *bwt; - int c, sa_intv = 32; - while ((c = getopt(argc, argv, "i:")) >= 0) { - switch (c) { - case 'i': sa_intv = atoi(optarg); break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); - return 1; - } - bwt = bwt_restore_bwt(argv[optind]); - bwt_cal_sa(bwt, sa_intv); - bwt_dump_sa(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} From 904c3205c0a7c205da3ccf5ca6fae3860858c80d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 13:26:50 -0500 Subject: [PATCH 259/498] removed a few unused variables These variables have been assigned but never actually used. Reported by gcc-4.7. Lower version cannot give such warnings. 
--- QSufSort.c | 3 --- bwamem.c | 10 +++++----- bwase.c | 8 ++++---- bwtsw2_aux.c | 4 ++-- stdaln.c | 6 ++---- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/QSufSort.c b/QSufSort.c index e437ac3..36c5a51 100644 --- a/QSufSort.c +++ b/QSufSort.c @@ -59,12 +59,9 @@ void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsin qsint_t i, j; qsint_t s, negatedSortedGroupLength; qsint_t numSymbolAggregated; - qsint_t maxNumInputSymbol; qsint_t numSortedPos = 1; qsint_t newAlphabetSize; - maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; - if (!skipTransform) { /* bucketing possible*/ newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, diff --git a/bwamem.c b/bwamem.c index b52d20b..5bd495c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -418,7 +418,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. When that happens, we should use a SW or extend more seeds int i, k; - int64_t rlen, rmax[2], tmp, max = 0, max_i = 0; + int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; @@ -432,7 +432,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int e = t->rbeg + t->len + ((l_query - t->qbeg - t->len) + cal_max_gap(opt, l_query - t->qbeg - t->len)); rmax[0] = rmax[0] < b? rmax[0] : b; rmax[1] = rmax[1] > e? 
rmax[1] : e; - if (t->len > max) max = t->len, max_i = i; + if (t->len > max) max = t->len; } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); @@ -523,7 +523,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) - int score, n_cigar, is_rev = 0, nn, rid, mid, copy_mate = 0; + int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0; uint32_t *cigar = 0; int64_t pos; bwahit_t ptmp, *p = &ptmp; @@ -548,7 +548,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); + bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); kputw(p->flag, str); kputc('\t', str); kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); kputw(p->qual, str); kputc('\t', str); @@ -569,7 +569,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (m && is_mapped(m)) { // then print mate pos and isize pos = bns_depos(bns, m->rb < bns->l_pac? 
m->rb : m->re - 1, &is_rev); - nn = bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); + bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); kputc('\t', str); if (mid == rid) kputc('=', str); else kputs(bns->anns[mid].name, str); diff --git a/bwase.c b/bwase.c index 1f36aaa..017322b 100644 --- a/bwase.c +++ b/bwase.c @@ -71,8 +71,8 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma } rest -= q->l - q->k + 1; } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. - int j, i, k; - for (j = rest, i = q->l - q->k + 1, k = 0; j > 0; --j) { + int j, i; + for (j = rest, i = q->l - q->k + 1; j > 0; --j) { double p = 1.0, x = drand48(); while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; @@ -412,11 +412,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in // print mate coordinate if (mate && mate->type != BWA_TYPE_NO_MATCH) { - int m_seqid, m_is_N; + int m_seqid; long long isize; am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality // redundant calculation here, but should not matter too much - m_is_N = bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); + bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); err_printf("\t%s\t", (seqid == m_seqid)? "=" : bns->anns[m_seqid].name); isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; if (p->type == BWA_TYPE_NO_MATCH) isize = 0; diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 55c7c64..c727984 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -186,14 +186,14 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 bsw2aux_t *q = b->aux + i; uint8_t *query; bwtint_t k; - int score, path_len, beg, end; + int path_len, beg, end; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 
1 : 0] + beg; for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; - score = aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); + aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); #if 0 if (name && score != p->G) { // debugging only diff --git a/stdaln.c b/stdaln.c index eb41882..cd064cf 100644 --- a/stdaln.c +++ b/stdaln.c @@ -542,13 +542,12 @@ int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, int start, end, max_score; int thres, *suba, *ss; - int gap_open, gap_ext, b; + int gap_open, gap_ext; int *score_matrix, N_MATRIX_ROW; /* initialize some align-related parameters. just for compatibility */ gap_open = ap->gap_open; gap_ext = ap->gap_ext; - b = ap->band_width; score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; thres = _thres > 0? _thres : -_thres; @@ -862,7 +861,7 @@ uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, path_t *path, int *path_len, int G0, uint8_t *_mem) { - int q, r, qr, tmp_len; + int q, r, qr; int32_t **s_array, *score_array; int is_overflow, of_base; uint32_t *eh; @@ -889,7 +888,6 @@ int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2 s_array[i] = (int32_t*)_p, _p += 4 * len1; /* initialization */ aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - tmp_len = len1 + 1; start = 1; end = 2; end_i = end_j = 0; score = 0; From dd85c528d6d34ee594218e98f787a358e1e72de2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 13:59:18 -0500 Subject: [PATCH 260/498] an alternative bwt_invPsi() implementation Cleaner, but not necessarily faster. 
--- bwt.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/bwt.h b/bwt.h index 2aab9d1..e08872a 100644 --- a/bwt.h +++ b/bwt.h @@ -74,13 +74,6 @@ typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v; * called bwt_B0 instead of bwt_B */ #define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) -// inverse Psi function -#define bwt_invPsi(bwt, k) \ - (((k) == (bwt)->primary)? 0 : \ - ((k) < (bwt)->primary)? \ - (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ - : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) - #define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) #ifdef __cplusplus @@ -129,4 +122,21 @@ extern "C" { } #endif +// inverse Psi function +#if 0 +#define bwt_invPsi(bwt, k) \ + (((k) == (bwt)->primary)? 0 : \ + ((k) < (bwt)->primary)? \ + (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ + : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) +#else +static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) +{ + register int64_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary? 0 : x; +} +#endif + #endif From a19ab654dfee2963420e1e00db4a82e51b6a18e7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 14:21:19 -0500 Subject: [PATCH 261/498] no effective change --- bwt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwt.h b/bwt.h index e08872a..e06329a 100644 --- a/bwt.h +++ b/bwt.h @@ -132,7 +132,7 @@ extern "C" { #else static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) { - register int64_t x = k - (k > bwt->primary); + bwtint_t x = k - (k > bwt->primary); x = bwt_B0(bwt, x); x = bwt->L2[x] + bwt_occ(bwt, k, x); return k == bwt->primary? 
0 : x; From d460f2ec9e86cf66e475f2ffcbbe81b84210eeef Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 14:48:54 -0500 Subject: [PATCH 262/498] bugfix in multi-threaded bwa-mem --- bwamem.c | 4 ++-- fastmap.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5bd495c..5202fb4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -707,14 +707,14 @@ static void *worker2(void *data) worker_t *w = (worker_t*)data; int i; if (!(w->opt->flag&MEM_F_PE)) { - for (i = 0; i < w->n; i += w->step) { + for (i = w->start; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } } else { int n = 0; - for (i = 0; i < w->n>>1; i += w->step) { // not implemented yet + for (i = w->start; i < w->n>>1; i += w->step) { // not implemented yet n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } diff --git a/fastmap.c b/fastmap.c index d52a315..1d6ed04 100644 --- a/fastmap.c +++ b/fastmap.c @@ -24,8 +24,9 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:r:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); @@ -37,6 +38,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] \n\n"); fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); From 3c330d50494a10afdc016b9717918319108a3128 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:12:26 -0500 Subject: [PATCH 263/498] for another round of code cleanup --- fastmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index 1d6ed04..c97d566 100644 --- a/fastmap.c +++ b/fastmap.c @@ -69,7 +69,7 @@ int main_mem(int argc, char *argv[]) ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } - while ((seqs = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { mem_process_seqs(opt, bwt, bns, pac, n, seqs); free(seqs); } From e613195e172cea20b903ae848fc4bbf238e0a4c9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:30:46 -0500 Subject: [PATCH 264/498] moved some common code to bwa.{c,h} --- Makefile | 14 ++++---- bwa.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 23 +++++++++++++ bwamem.c | 35 ------------------- bwamem.h | 2 +- bwtsw2_aux.c | 1 + utils.c | 58 ++----------------------------- utils.h | 7 ---- 8 files changed, 132 insertions(+), 104 deletions(-) create mode 100644 bwa.c create mode 100644 bwa.h diff --git a/Makefile b/Makefile 
index bfed694..2029dc1 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ @@ -28,14 +28,16 @@ bwa:libbwa.a $(AOBJS) main.o libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) -bwa.o:bwa.h - QSufSort.o:QSufSort.h +bwt_gen.o:QSufSort.h -bwt.o:bwt.h -bwtio.o:bwt.h -bwtaln.o:bwt.h bwtaln.h kseq.h +ksw.o:ksw.h +utils.o:utils.h ksort.h kseq.h bntseq.o:bntseq.h +bwt.o:bwt.h utils.h +bwa.o:bwa.h + +bwtaln.o:bwt.h bwtaln.h kseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h diff --git a/bwa.c b/bwa.c new file mode 100644 index 0000000..eca721e --- /dev/null +++ b/bwa.c @@ -0,0 +1,96 @@ +#include +#include +#include "bntseq.h" +#include "bwa.h" +#include "ksw.h" + +/************************ + * Batch FASTA/Q reader * + ************************/ + +#include "kseq.h" +KSEQ_DECLARE(gzFile) + +static inline void trim_readno(kstring_t *s) +{ + if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) + s->l -= 2, s->s[s->l] = 0; +} + +static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) +{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(s->comment) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? 
strdup(ks->qual.s) : 0; + s->l_seq = strlen(s->seq); +} + +bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) +{ + kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; + int size = 0, m, n; + bseq1_t *seqs; + m = n = 0; seqs = 0; + while (kseq_read(ks) >= 0) { + if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads + fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); + break; + } + if (n >= m) { + m = m? m<<1 : 256; + seqs = realloc(seqs, m * sizeof(bseq1_t)); + } + trim_readno(&ks->name); + kseq2bseq1(ks, &seqs[n]); + size += seqs[n++].l_seq; + if (ks2) { + trim_readno(&ks2->name); + kseq2bseq1(ks2, &seqs[n]); + size += seqs[n++].l_seq; + } + if (size >= chunk_size) break; + } + if (size == 0) { // test if the 2nd file is finished + if (ks2 && kseq_read(ks2) >= 0) + fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); + } + *n_ = n; + return seqs; +} + +// Generate CIGAR when the alignment end points are known +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +{ + uint32_t *cigar = 0; + uint8_t tmp, *rseq; + int i, w; + int64_t rlen; + *n_cigar = 0; + if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range + if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + for (i = 0; i < rlen>>1; ++i) + tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; + } + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); 
for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + // set the band-width + w = (int)((double)(l_query * mat[0] - q) / r + 1.); + w = w < 1? w : 1; + w = w < w_? w : w_; + w += abs(rlen - l_query); + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + if (rb >= l_pac) // reverse back query + for (i = 0; i < l_query>>1; ++i) + tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; + +ret_gen_cigar: + free(rseq); + return cigar; +} + diff --git a/bwa.h b/bwa.h new file mode 100644 index 0000000..022b784 --- /dev/null +++ b/bwa.h @@ -0,0 +1,23 @@ +#ifndef BWA_H_ +#define BWA_H_ + +#include + +typedef struct { + int l_seq; + char *name, *comment, *seq, *qual, *sam; +} bseq1_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bwamem.c b/bwamem.c index 5202fb4..43f9f2f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -485,41 +485,6 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int * Basic hit->SAM conversion * *****************************/ -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) -{ - uint32_t *cigar = 0; - uint8_t tmp, *rseq; - int i, w; - int64_t rlen; - *n_cigar = 0; - if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range - if (rb >= l_pac) { // then reverse both query and rseq; this is to 
ensure indels to be placed at the leftmost position - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - for (i = 0; i < rlen>>1; ++i) - tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; - } - //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); - // set the band-width - w = (int)((double)(l_query * mat[0] - q) / r + 1.); - w = w < 1? w : 1; - w = w < w_? w : w_; - w += abs(rlen - l_query); - // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); - if (rb >= l_pac) // reverse back query - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - -ret_gen_cigar: - free(rseq); - return cigar; -} - - void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) diff --git a/bwamem.h b/bwamem.h index 4319911..ce27c6e 100644 --- a/bwamem.h +++ b/bwamem.h @@ -3,7 +3,7 @@ #include "bwt.h" #include "bntseq.h" -#include "utils.h" +#include "bwa.h" #define MEM_MAPQ_COEF 40.0 #define MEM_MAPQ_MAX 60 diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index c727984..bc12d20 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -13,6 +13,7 @@ #include "bwtsw2.h" #include "stdaln.h" #include "kstring.h" +#include "bwa.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/utils.c b/utils.c index 1cebaab..20b09ee 100644 --- a/utils.c +++ b/utils.c @@ -40,6 +40,9 @@ KSORT_INIT(128, pair64_t, pair64_lt) KSORT_INIT(64, uint64_t, ks_lt_generic) +#include "kseq.h" +KSEQ_INIT2(, gzFile, gzread) + /******************** * System utilities * ********************/ @@ -160,58 
+163,3 @@ double realtime() gettimeofday(&tp, &tzp); return tp.tv_sec + tp.tv_usec * 1e-6; } - -/************************ - * Batch FASTA/Q reader * - ************************/ - -#include "kseq.h" -KSEQ_INIT2(, gzFile, gzread) - -static inline void trim_readno(kstring_t *s) -{ - if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) - s->l -= 2, s->s[s->l] = 0; -} - -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) -{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = strdup(ks->name.s); - s->comment = ks->comment.l? strdup(s->comment) : 0; - s->seq = strdup(ks->seq.s); - s->qual = ks->qual.l? strdup(ks->qual.s) : 0; - s->l_seq = strlen(s->seq); -} - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) -{ - kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; - int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; - while (kseq_read(ks) >= 0) { - if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads - fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); - break; - } - if (n >= m) { - m = m? 
m<<1 : 256; - seqs = realloc(seqs, m * sizeof(bseq1_t)); - } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - size += seqs[n++].l_seq; - if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - size += seqs[n++].l_seq; - } - if (size >= chunk_size) break; - } - if (size == 0) { // test if the 2nd file is finished - if (ks2 && kseq_read(ks2) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); - } - *n_ = n; - return seqs; -} diff --git a/utils.h b/utils.h index 70f4e11..a3db251 100644 --- a/utils.h +++ b/utils.h @@ -52,11 +52,6 @@ typedef struct { typedef struct { size_t n, m; uint64_t *a; } uint64_v; typedef struct { size_t n, m; pair64_t *a; } pair64_v; -typedef struct { - int l_seq; - char *name, *comment, *seq, *qual, *sam; -} bseq1_t; - #ifdef __cplusplus extern "C" { #endif @@ -80,8 +75,6 @@ extern "C" { void ks_introsort_64 (size_t n, uint64_t *a); void ks_introsort_128(size_t n, pair64_t *a); - bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - #ifdef __cplusplus } #endif From 67543f19a1415c8e9a55f981085a9971771c3cfc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 15:55:55 -0500 Subject: [PATCH 265/498] code refactoring --- bwa.c | 37 +++++++++++++++++++++++++++++++++++++ bwa.h | 16 ++++++++++++++++ bwamem.c | 1 - fastmap.c | 54 +++++++++++++++--------------------------------------- 4 files changed, 68 insertions(+), 40 deletions(-) diff --git a/bwa.c b/bwa.c index eca721e..e8735b5 100644 --- a/bwa.c +++ b/bwa.c @@ -94,3 +94,40 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa return cigar; } +/********************* + * Full index reader * + *********************/ + +bwaidx_t *bwa_idx_load(const char *prefix, int which) +{ + bwaidx_t *idx; + idx = calloc(1, sizeof(bwaidx_t)); + if (which & BWA_IDX_BWT) { + char *tmp; + tmp = calloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + idx->bwt = bwt_restore_bwt(tmp); + 
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) + bwt_restore_sa(tmp, idx->bwt); + free(tmp); + } + if (which & BWA_IDX_BNS) { + idx->bns = bns_restore(prefix); + if (which & BWA_IDX_PAC) { + idx->pac = calloc(idx->bns->l_pac/4+1, 1); + fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + } + fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; + } + return idx; +} + +void bwa_idx_destroy(bwaidx_t *idx) +{ + if (idx == 0) return; + if (idx->bwt) bwt_destroy(idx->bwt); + if (idx->bns) bns_destroy(idx->bns); + if (idx->pac) free(idx->pac); + free(idx); +} diff --git a/bwa.h b/bwa.h index 022b784..ad528c9 100644 --- a/bwa.h +++ b/bwa.h @@ -2,6 +2,19 @@ #define BWA_H_ #include +#include "bntseq.h" +#include "bwt.h" + +#define BWA_IDX_BWT 0x1 +#define BWA_IDX_BNS 0x2 +#define BWA_IDX_PAC 0x4 +#define BWA_IDX_ALL 0x7 + +typedef struct { + bwt_t *bwt; // FM-index + bntseq_t *bns; // information on the reference sequences + uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base +} bwaidx_t; typedef struct { int l_seq; @@ -16,6 +29,9 @@ extern "C" { uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + bwaidx_t *bwa_idx_load(const char *prefix, int which); + void bwa_idx_destroy(bwaidx_t *idx); + #ifdef __cplusplus } #endif diff --git a/bwamem.c b/bwamem.c index 43f9f2f..6b219cf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -112,7 +112,6 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } - const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) { int i, max, max_i, ori_start; diff --git a/fastmap.c b/fastmap.c index c97d566..2800821 100644 --- a/fastmap.c +++ b/fastmap.c @@ -2,8 +2,7 @@ #include #include #include -#include "bntseq.h" -#include "bwt.h" +#include "bwa.h" #include "bwamem.h" #include 
"kvec.h" #include "utils.h" @@ -15,13 +14,11 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - bwt_t *bwt; - bntseq_t *bns; int c, n, l; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; - uint8_t *pac = 0; bseq1_t *seqs; + bwaidx_t *idx; opt = mem_opt_init(); while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { @@ -48,19 +45,9 @@ int main_mem(int argc, char *argv[]) return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - pac = calloc(bns->l_pac/4+1, 1); - fread(pac, 1, bns->l_pac/4+1, bns->fp_pac); - } - for (l = 0; l < bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); + idx = bwa_idx_load(argv[optind], BWA_IDX_ALL); + for (l = 0; l < idx->bns->n_seqs; ++l) + printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len); fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); @@ -70,13 +57,12 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { - mem_process_seqs(opt, bwt, bns, pac, n, seqs); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs); free(seqs); } - free(opt); free(pac); - bns_destroy(bns); - bwt_destroy(bwt); + free(opt); + bwa_idx_destroy(idx); kseq_destroy(ks); gzclose(fp); if (ks2) { @@ -92,10 +78,9 @@ int main_fastmap(int argc, char *argv[]) kseq_t *seq; bwtint_t k; gzFile fp; - bwt_t *bwt; - bntseq_t *bns; smem_i *itr; const bwtintv_v *a; + bwaidx_t *idx; while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { switch (c) { @@ -112,16 +97,8 @@ int main_fastmap(int argc, char *argv[]) fp = gzopen(argv[optind + 1], "r"); seq = kseq_init(fp); - { // load the packed sequences, BWT and SA - char *tmp = calloc(strlen(argv[optind]) + 5, 1); - strcat(strcpy(tmp, argv[optind]), ".bwt"); - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, argv[optind]), ".sa"); - bwt_restore_sa(tmp, bwt); - free(tmp); - bns = bns_restore(argv[optind]); - } - itr = smem_itr_init(bwt); + idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS); + itr = smem_itr_init(idx->bwt); while (kseq_read(seq) >= 0) { printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); if (print_seq) { @@ -141,10 +118,10 @@ int main_fastmap(int argc, char *argv[]) bwtint_t pos; int len, is_rev, ref_id; len = (uint32_t)p->info - (p->info>>32); - pos = bns_depos(bns, bwt_sa(bwt, p->x[0] + k), &is_rev); + pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); if (is_rev) pos -= len - 1; - bns_cnt_ambi(bns, pos, len, &ref_id); - printf("\t%s:%c%ld", bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + bns_cnt_ambi(idx->bns, pos, len, &ref_id); + printf("\t%s:%c%ld", idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); } } else fputs("\t*", 
stdout); putchar('\n'); @@ -154,8 +131,7 @@ int main_fastmap(int argc, char *argv[]) } smem_itr_destroy(itr); - bns_destroy(bns); - bwt_destroy(bwt); + bwa_idx_destroy(idx); kseq_destroy(seq); gzclose(fp); return 0; From cfa7165036a83fffe973dbc25a4ed9406287bf97 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:10:48 -0500 Subject: [PATCH 266/498] cleanup index loading code --- bwa.c | 67 ++++++++++++++++++++++++++++++++++++++++++--------- bwa.h | 6 ++++- bwape.c | 4 +-- bwase.c | 4 +-- bwtaln.c | 29 ++-------------------- bwtsw2_main.c | 23 +++++------------- 6 files changed, 72 insertions(+), 61 deletions(-) diff --git a/bwa.c b/bwa.c index e8735b5..fac0db7 100644 --- a/bwa.c +++ b/bwa.c @@ -4,6 +4,8 @@ #include "bwa.h" #include "ksw.h" +int bwa_verbose = 3; + /************************ * Batch FASTA/Q reader * ************************/ @@ -98,28 +100,69 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa * Full index reader * *********************/ -bwaidx_t *bwa_idx_load(const char *prefix, int which) +char *bwa_idx_infer_prefix(const char *hint) +{ + char *prefix; + int l_hint; + FILE *fp; + l_hint = strlen(hint); + prefix = malloc(l_hint + 3 + 4 + 1); + strcpy(prefix, hint); + strcpy(prefix + l_hint, ".64.bwt"); + if ((fp = fopen(prefix, "rb")) != 0) { + fclose(fp); + prefix[l_hint + 3] = 0; + return prefix; + } else { + strcpy(prefix + l_hint, ".bwt"); + if ((fp = fopen(prefix, "rb")) == 0) { + free(prefix); + return 0; + } else { + fclose(fp); + prefix[l_hint] = 0; + return prefix; + } + } +} + +bwt_t *bwa_idx_load_bwt(const char *hint) +{ + char *tmp, *prefix; + bwt_t *bwt; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) { + if (bwa_verbose >= 1) + fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } + tmp = calloc(strlen(prefix) + 5, 1); + strcat(strcpy(tmp, prefix), ".bwt"); // FM-index + bwt = bwt_restore_bwt(tmp); + strcat(strcpy(tmp, prefix), ".sa"); // partial 
suffix array (SA) + bwt_restore_sa(tmp, bwt); + free(tmp); free(prefix); + return bwt; +} + +bwaidx_t *bwa_idx_load(const char *hint, int which) { bwaidx_t *idx; + char *prefix; + prefix = bwa_idx_infer_prefix(hint); + if (prefix == 0) return 0; idx = calloc(1, sizeof(bwaidx_t)); - if (which & BWA_IDX_BWT) { - char *tmp; - tmp = calloc(strlen(prefix) + 5, 1); - strcat(strcpy(tmp, prefix), ".bwt"); // FM-index - idx->bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) - bwt_restore_sa(tmp, idx->bwt); - free(tmp); - } + if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { idx->bns = bns_restore(prefix); if (which & BWA_IDX_PAC) { idx->pac = calloc(idx->bns->l_pac/4+1, 1); fread(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence + fclose(idx->bns->fp_pac); + idx->bns->fp_pac = 0; } - fclose(idx->bns->fp_pac); - idx->bns->fp_pac = 0; } + free(prefix); return idx; } diff --git a/bwa.h b/bwa.h index ad528c9..b5eda13 100644 --- a/bwa.h +++ b/bwa.h @@ -21,6 +21,8 @@ typedef struct { char *name, *comment, *seq, *qual, *sam; } bseq1_t; +extern int bwa_verbose; + #ifdef __cplusplus extern "C" { #endif @@ -29,7 +31,9 @@ extern "C" { uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); - bwaidx_t *bwa_idx_load(const char *prefix, int which); + char *bwa_idx_infer_prefix(const char *hint); + bwt_t *bwa_idx_load_bwt(const char *hint); + bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); #ifdef __cplusplus diff --git a/bwape.c b/bwape.c index 77ae1fa..87393b1 100644 --- a/bwape.c +++ b/bwape.c @@ -10,6 +10,7 @@ #include "utils.h" #include "stdaln.h" #include "bwase.h" +#include "bwa.h" typedef struct { int n; @@ -716,7 +717,6 @@ int bwa_sai2sam_pe(int argc, char *argv[]) { extern char *bwa_rg_line, 
*bwa_rg_id; extern int bwa_set_rg(const char *s); - extern char *bwa_infer_prefix(const char *hint); int c; pe_opt_t *popt; char *prefix; @@ -762,7 +762,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) fprintf(stderr, "\n"); return 1; } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(bwa_rg_line); free(bwa_rg_id); return 0; diff --git a/bwase.c b/bwase.c index 017322b..8f50c7a 100644 --- a/bwase.c +++ b/bwase.c @@ -10,6 +10,7 @@ #include "bntseq.h" #include "utils.h" #include "kstring.h" +#include "bwa.h" int g_log_n[256]; char *bwa_rg_line, *bwa_rg_id; @@ -606,7 +607,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f int bwa_sai2sam_se(int argc, char *argv[]) { - extern char *bwa_infer_prefix(const char *hint); int c, n_occ = 3; char *prefix; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { @@ -628,7 +628,7 @@ int bwa_sai2sam_se(int argc, char *argv[]) fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); return 1; } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(bwa_rg_line); free(bwa_rg_id); return 0; diff --git a/bwtaln.c b/bwtaln.c index 84be510..96d4026 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -11,6 +11,7 @@ #include "bwtaln.h" #include "bwtgap.h" #include "utils.h" +#include "bwa.h" #ifdef HAVE_PTHREAD #include @@ -219,32 +220,6 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) bwa_seq_close(ks); } -char *bwa_infer_prefix(const char *hint) -{ - char *prefix; - int l_hint; - FILE *fp; - l_hint = strlen(hint); - prefix = malloc(l_hint + 3 + 4 + 1); - strcpy(prefix, hint); - strcpy(prefix + l_hint, ".64.bwt"); - if ((fp = fopen(prefix, "rb")) != 0) { - fclose(fp); - prefix[l_hint + 3] = 0; - return 
prefix; - } else { - strcpy(prefix + l_hint, ".bwt"); - if ((fp = fopen(prefix, "rb")) == 0) { - free(prefix); - return 0; - } else { - fclose(fp); - prefix[l_hint] = 0; - return prefix; - } - } -} - int bwa_aln(int argc, char *argv[]) { int c, opte = -1; @@ -328,7 +303,7 @@ int bwa_aln(int argc, char *argv[]) k = l; } } - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { + if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(opt); return 0; diff --git a/bwtsw2_main.c b/bwtsw2_main.c index e3f57f8..ab126f2 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -6,14 +6,12 @@ #include "bwt.h" #include "bwtsw2.h" #include "utils.h" +#include "bwa.h" int bwa_bwtsw2(int argc, char *argv[]) { - extern char *bwa_infer_prefix(const char *hint); bsw2opt_t *opt; - bwt_t *target; - char buf[1024], *prefix; - bntseq_t *bns; + bwaidx_t *idx; int c; opt = bsw2_init_opt(); @@ -81,19 +79,10 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - if ((prefix = bwa_infer_prefix(argv[optind])) == 0) { - fprintf(stderr, "[%s] fail to locate the index\n", __func__); - return 0; - } - strcpy(buf, prefix); target = bwt_restore_bwt(strcat(buf, ".bwt")); - strcpy(buf, prefix); bwt_restore_sa(strcat(buf, ".sa"), target); - bns = bns_restore(prefix); - - bsw2_aln(opt, bns, target, argv[optind+1], optind+2 < argc? argv[optind+2] : 0); - - bns_destroy(bns); - bwt_destroy(target); - free(opt); free(prefix); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0; + bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? 
argv[optind+2] : 0); + bwa_idx_destroy(idx); + free(opt); return 0; } From ee4540c3948db2f357301fbf9f7e44f41a80bcfc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:41:44 -0500 Subject: [PATCH 267/498] support read group in bwa-mem --- bwa.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- bwa.h | 4 ++++ bwamem.c | 1 + bwape.c | 19 +++++----------- bwase.c | 59 +++++-------------------------------------------- fastmap.c | 15 ++++++++----- 6 files changed, 88 insertions(+), 76 deletions(-) diff --git a/bwa.c b/bwa.c index fac0db7..f5e8692 100644 --- a/bwa.c +++ b/bwa.c @@ -1,10 +1,13 @@ +#include #include #include #include "bntseq.h" #include "bwa.h" #include "ksw.h" +#include "utils.h" int bwa_verbose = 3; +char bwa_rg_id[256]; /************************ * Batch FASTA/Q reader * @@ -132,8 +135,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint) bwt_t *bwt; prefix = bwa_idx_infer_prefix(hint); if (prefix == 0) { - if (bwa_verbose >= 1) - fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); return 0; } tmp = calloc(strlen(prefix) + 5, 1); @@ -150,7 +152,10 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) bwaidx_t *idx; char *prefix; prefix = bwa_idx_infer_prefix(hint); - if (prefix == 0) return 0; + if (prefix == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); + return 0; + } idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { @@ -174,3 +179,58 @@ void bwa_idx_destroy(bwaidx_t *idx) if (idx->pac) free(idx->pac); free(idx); } + +/*********************** + * SAM header routines * + ***********************/ + +void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) +{ + int i; + for (i = 0; i < bns->n_seqs; ++i) + err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); + if (rg_line) 
err_printf("%s\n", rg_line); +} + +static char *bwa_escape(char *s) +{ + char *p, *q; + for (p = q = s; *p; ++p) { + if (*p == '\\') { + ++p; + if (*p == 't') *q++ = '\t'; + else if (*p == 'n') *q++ = '\n'; + else if (*p == 'r') *q++ = '\r'; + else if (*p == '\\') *q++ = '\\'; + } else *q++ = *p; + } + *q = '\0'; + return s; +} + +char *bwa_set_rg(const char *s) +{ + char *p, *q, *r, *rg_line = 0; + memset(bwa_rg_id, 0, 256); + if (strstr(s, "@RG") != s) return 0; + rg_line = strdup(s); + bwa_escape(rg_line); + if ((p = strstr(rg_line, "\tID:")) == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID in the @RG line\n", __func__); + goto err_set_rg; + } + p += 4; + for (q = p; *q && *q != '\t' && *q != '\n'; ++q); + if (q - p + 1 > 256) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] RG:ID is longer than 255 characters\n", __func__); + goto err_set_rg; + } + for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) + *r++ = *q; + return rg_line; + +err_set_rg: + free(rg_line); + return 0; +} + diff --git a/bwa.h b/bwa.h index b5eda13..208db6a 100644 --- a/bwa.h +++ b/bwa.h @@ -22,6 +22,7 @@ typedef struct { } bseq1_t; extern int bwa_verbose; +extern char bwa_rg_id[256]; #ifdef __cplusplus extern "C" { @@ -36,6 +37,9 @@ extern "C" { bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); + void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); + char *bwa_set_rg(const char *s); + #ifdef __cplusplus } #endif diff --git a/bwamem.c b/bwamem.c index 6b219cf..ce55cad 100644 --- a/bwamem.c +++ b/bwamem.c @@ -571,6 +571,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } kputc('\n', str); free(cigar); #undef is_mapped diff --git a/bwape.c b/bwape.c index 87393b1..0b2b8d6 
100644 --- a/bwape.c +++ b/bwape.c @@ -611,7 +611,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, return pacseq; } -void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt) +void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, j, n_seqs, tot_seqs = 0; @@ -654,7 +654,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } // core loop - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); bwa_print_sam_PG(); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; @@ -715,20 +715,15 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f int bwa_sai2sam_pe(int argc, char *argv[]) { - extern char *bwa_rg_line, *bwa_rg_id; - extern int bwa_set_rg(const char *s); int c; pe_opt_t *popt; - char *prefix; + char *prefix, *rg_line = 0; popt = bwa_init_pe_opt(); while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { switch (c) { case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'a': popt->max_isize = atoi(optarg); break; case 'o': popt->max_occ = atoi(optarg); break; @@ -764,11 +759,9 @@ int bwa_sai2sam_pe(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt); - free(bwa_rg_line); free(bwa_rg_id); free(prefix); - free(popt); + bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); + free(prefix); free(popt); return 0; } diff --git a/bwase.c b/bwase.c index 
8f50c7a..27da794 100644 --- a/bwase.c +++ b/bwase.c @@ -13,7 +13,6 @@ #include "bwa.h" int g_log_n[256]; -char *bwa_rg_line, *bwa_rg_id; void bwa_print_sam_PG(); @@ -490,56 +489,13 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } } -void bwa_print_sam_SQ(const bntseq_t *bns) -{ - int i; - for (i = 0; i < bns->n_seqs; ++i) - err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (bwa_rg_line) err_printf("%s\n", bwa_rg_line); -} - void bwase_initialize() { int i; for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); } -char *bwa_escape(char *s) -{ - char *p, *q; - for (p = q = s; *p; ++p) { - if (*p == '\\') { - ++p; - if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; - else if (*p == '\\') *q++ = '\\'; - } else *q++ = *p; - } - *q = '\0'; - return s; -} - -int bwa_set_rg(const char *s) -{ - char *p, *q, *r; - if (strstr(s, "@RG") != s) return -1; - if (bwa_rg_line) free(bwa_rg_line); - if (bwa_rg_id) free(bwa_rg_id); - bwa_rg_line = strdup(s); - bwa_rg_id = 0; - bwa_escape(bwa_rg_line); - p = strstr(bwa_rg_line, "\tID:"); - if (p == 0) return -1; - p += 4; - for (q = p; *q && *q != '\t' && *q != '\n'; ++q); - bwa_rg_id = calloc(q - p + 1, 1); - for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) - *r++ = *q; - return 0; -} - -void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) +void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); int i, n_seqs, tot_seqs = 0, m_aln; @@ -559,7 +515,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f m_aln = 0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa); - bwa_print_sam_SQ(bns); + bwa_print_sam_hdr(bns, rg_line); //bwa_print_sam_PG(); // set ks ks = bwa_open_reads(opt.mode, fn_fa); @@ -608,15 +564,12 @@ 
void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f int bwa_sai2sam_se(int argc, char *argv[]) { int c, n_occ = 3; - char *prefix; + char *prefix, *rg_line = 0; while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { switch (c) { case 'h': break; case 'r': - if (bwa_set_rg(optarg) < 0) { - fprintf(stderr, "[%s] malformated @RG line\n", __func__); - return 1; - } + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; break; case 'n': n_occ = atoi(optarg); break; case 'f': xreopen(optarg, "w", stdout); break; @@ -630,10 +583,8 @@ int bwa_sai2sam_se(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(bwa_rg_line); free(bwa_rg_id); return 0; } - bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ); - free(bwa_rg_line); free(bwa_rg_id); + bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); return 0; } diff --git a/fastmap.c b/fastmap.c index 2800821..adbe04c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -14,14 +14,15 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - int c, n, l; + int c, n; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; + char *rg_line = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:")) >= 0) { + while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; @@ -29,7 +30,9 @@ int main_mem(int argc, char *argv[]) else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') mem_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); - else if (c == 's') opt->split_width = atoi(optarg); + else if (c == 'R') { + if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak + } else if (c == 's') opt->split_width = atoi(optarg); } if (optind + 1 >= argc) { fprintf(stderr, "\n"); @@ -39,15 +42,15 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); fprintf(stderr, "\n"); free(opt); return 1; } mem_fill_scmat(opt->a, opt->b, opt->mat); - idx = bwa_idx_load(argv[optind], BWA_IDX_ALL); - for (l = 0; l < idx->bns->n_seqs; ++l) - printf("@SQ\tSN:%s\tLN:%d\n", idx->bns->anns[l].name, idx->bns->anns[l].len); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak + bwa_print_sam_hdr(idx->bns, rg_line); fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); ks = kseq_init(fp); From 33236de32e132440f7f4202bbaee031464adb69a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:44:02 -0500 Subject: [PATCH 268/498] a bit more error message --- bwa.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index f5e8692..aff9aff 100644 --- a/bwa.c +++ b/bwa.c @@ -212,17 +212,20 @@ char *bwa_set_rg(const char *s) { char *p, *q, *r, *rg_line = 0; memset(bwa_rg_id, 0, 256); - if (strstr(s, "@RG") != s) return 0; + if (strstr(s, "@RG") != s) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); + goto err_set_rg; + } rg_line = strdup(s); bwa_escape(rg_line); if ((p = strstr(rg_line, "\tID:")) == 0) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID in the @RG line\n", __func__); + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__); goto err_set_rg; } p += 4; for (q = p; *q && *q != '\t' && *q != '\n'; ++q); if (q - p + 1 > 256) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] RG:ID is longer than 255 characters\n", __func__); + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); goto err_set_rg; } for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) From b4c38bcc1c8e54657a32537a4aa0e4b4a808f725 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 16:57:34 -0500 Subject: [PATCH 269/498] append fasta/q comment --- bwa.c | 2 +- bwamem.c | 7 +++---- bwamem_pair.c | 2 +- fastmap.c | 18 ++++++++++++++---- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/bwa.c b/bwa.c index aff9aff..c8400b1 100644 --- a/bwa.c +++ b/bwa.c @@ -25,7 +25,7 @@ static inline void trim_readno(kstring_t *s) static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice 
s->name = strdup(ks->name.s); - s->comment = ks->comment.l? strdup(s->comment) : 0; + s->comment = ks->comment.l? strdup(ks->comment.s) : 0; s->seq = strdup(ks->seq.s); s->qual = ks->qual.l? strdup(ks->qual.s) : 0; s->l_seq = strlen(s->seq); diff --git a/bwamem.c b/bwamem.c index ce55cad..553fe1c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -13,8 +13,6 @@ #include "kvec.h" #include "ksort.h" -int mem_verbose = 3; // 1: error only; 2: error+warning; 3: message+error+warning; >=4: debugging - void mem_fill_scmat(int a, int b, int8_t mat[25]) { int i, j, k; @@ -462,7 +460,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (mem_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; @@ -572,6 +570,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); free(cigar); #undef is_mapped @@ -633,7 +632,7 @@ static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bn s->seq[i] = nst_nt4_table[(int)s->seq[i]]; chn = mem_chain(opt, bwt, s->l_seq, (uint8_t*)s->seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); - if (mem_verbose >= 4) mem_print_chain(bns, &chn); + if (bwa_verbose >= 4) 
mem_print_chain(bns, &chn); kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp); diff --git a/bwamem_pair.c b/bwamem_pair.c index 3dce119..57a128a 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -47,7 +47,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } - if (mem_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); + if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] # candidate unique pairs for (FF, FR, RF, RR): (%ld, %ld, %ld, %ld)\n", __func__, isize[0].n, isize[1].n, isize[2].n, isize[3].n); for (d = 0; d < 4; ++d) { // TODO: this block is nearly identical to the one in bwtsw2_pair.c. It would be better to merge these two. mem_pestat_t *r = &pes[d]; uint64_v *q = &isize[d]; diff --git a/fastmap.c b/fastmap.c index adbe04c..437192a 100644 --- a/fastmap.c +++ b/fastmap.c @@ -14,7 +14,7 @@ extern unsigned char nst_nt4_table[256]; int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - int c, n; + int i, c, n, copy_comment = 0; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; @@ -22,14 +22,15 @@ int main_mem(int argc, char *argv[]) char *rg_line = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "PHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "CPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'c') opt->max_occ = atoi(optarg); - else if (c == 'v') mem_verbose = atoi(optarg); + else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); + else if (c == 'C') copy_comment = 1; else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); @@ -43,7 +44,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); - fprintf(stderr, " -v INT verbose level [%d]\n", mem_verbose); + fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, "\n"); free(opt); return 1; @@ -60,6 +62,14 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { + int64_t size = 0; + if (!copy_comment) + for (i = 0; i < n; ++i) { + free(seqs[i].comment); seqs[i].comment = 0; + } + for (i = 0; i < n; ++i) size += seqs[i].l_seq; + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs); free(seqs); } From 6e7903e9f33e15890832e354759d658000a0fe22 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 17:09:23 -0500 Subject: [PATCH 270/498] added kopen support --- Makefile | 2 +- fastmap.c | 16 ++- kopen.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 355 insertions(+), 6 deletions(-) create mode 100644 kopen.c diff --git a/Makefile 
b/Makefile index 2029dc1..f1da07e 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/fastmap.c b/fastmap.c index 437192a..90307b3 100644 --- a/fastmap.c +++ b/fastmap.c @@ -11,15 +11,19 @@ KSEQ_DECLARE(gzFile) extern unsigned char nst_nt4_table[256]; +void *kopen(const char *fn, int *_fd); +int kclose(void *a); + int main_mem(int argc, char *argv[]) { mem_opt_t *opt; - int i, c, n, copy_comment = 0; + int fd, fd2, i, c, n, copy_comment = 0; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; char *rg_line = 0; + void *ko = 0, *ko2 = 0; opt = mem_opt_init(); while ((c = getopt(argc, argv, "CPHk:c:v:s:r:t:R:")) >= 0) { @@ -54,10 +58,12 @@ int main_mem(int argc, char *argv[]) if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak bwa_print_sam_hdr(idx->bns, rg_line); - fp = strcmp(argv[optind + 1], "-")? 
gzopen(argv[optind + 1], "r") : gzdopen(fileno(stdin), "r"); + ko = kopen(argv[optind + 1], &fd); + fp = gzdopen(fd, "r"); ks = kseq_init(fp); if (optind + 2 < argc) { - fp2 = gzopen(argv[optind + 2], "r"); + ko2 = kopen(argv[optind + 2], &fd2); + fp2 = gzdopen(fd2, "r"); ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } @@ -77,10 +83,10 @@ int main_mem(int argc, char *argv[]) free(opt); bwa_idx_destroy(idx); kseq_destroy(ks); - gzclose(fp); + gzclose(fp); kclose(ko); if (ks2) { kseq_destroy(ks2); - gzclose(fp2); + gzclose(fp2); kclose(ko2); } return 0; } diff --git a/kopen.c b/kopen.c new file mode 100644 index 0000000..f72735c --- /dev/null +++ b/kopen.c @@ -0,0 +1,343 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#include +#include +#endif + +#ifdef _WIN32 +#define _KO_NO_NET +#endif + +#ifndef _KO_NO_NET +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); + if (ret == -1) perror("select"); + return ret; +} + +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res = 0; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, 
res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +#undef __err_connect +} + +static int http_open(const char *fn) +{ + char *p, *proxy, *q, *http_host, *host, *port, *path, *buf; + int fd, ret, l; + + /* parse URL; adapted from khttp_parse_url() in knetfile.c */ + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + http_host = calloc(l + 1, 1); + strncpy(http_host, fn + 7, l); + http_host[l] = 0; + for (q = http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = getenv("http_proxy"); + // set host, port and path + if (proxy == 0) { + host = strdup(http_host); // when there is no proxy, server name is identical to http_host name. + port = strdup(*q? q : "80"); + path = strdup(*p? p : "/"); + } else { + host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + port = strdup(*q? q : "80"); + path = strdup(fn); + } + + /* connect; adapted from khttp_connect() in knetfile.c */ + l = 0; + fd = socket_connect(host, port); + buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. 
+ l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", path, http_host); + l += sprintf(buf + l, "\r\n"); + write(fd, buf, l); + l = 0; + while (read(fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + close(fd); + fd = -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret != 200) { + close(fd); + fd = -1; + } + free(buf); free(http_host); free(host); free(port); free(path); + return fd; +} + +typedef struct { + int max_response, ctrl_fd; + char *response; +} ftpaux_t; + +static int kftp_get_response(ftpaux_t *aux) +{ + unsigned char c; + int n = 0; + char *p; + if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0; + while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + if (n >= aux->max_response) { + aux->max_response = aux->max_response? aux->max_response<<1 : 256; + aux->response = realloc(aux->response, aux->max_response); + } + aux->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2]) + && aux->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + aux->response[n-2] = 0; + return strtol(aux->response, &p, 0); +} + +static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get) +{ + if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + write(aux->ctrl_fd, cmd, strlen(cmd)); + return is_get? 
kftp_get_response(aux) : 0; +} + +static int ftp_open(const char *fn) +{ + char *p, *host = 0, *port = 0, *retr = 0; + char host2[80], port2[10]; + int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4]; + ftpaux_t aux; + + /* parse URL */ + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + port = strdup("21"); + host = calloc(l + 1, 1); + strncpy(host, fn + 6, l); + retr = calloc(strlen(p) + 8, 1); + sprintf(retr, "RETR %s\r\n", p); + + /* connect to ctrl */ + memset(&aux, 0, sizeof(ftpaux_t)); + aux.ctrl_fd = socket_connect(host, port); + if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */ + + /* connect to the data stream */ + kftp_get_response(&aux); + kftp_send_cmd(&aux, "USER anonymous\r\n", 1); + kftp_send_cmd(&aux, "PASS kopen@\r\n", 1); + kftp_send_cmd(&aux, "TYPE I\r\n", 1); + kftp_send_cmd(&aux, "PASV\r\n", 1); + for (p = aux.response; *p && *p != '('; ++p); + if (*p != '(') goto ftp_open_end; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(pasv_ip, v, 4 * sizeof(int)); + pasv_port = (v[4]<<8&0xff00) + v[5]; + kftp_send_cmd(&aux, retr, 0); + sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]); + sprintf(port2, "%d", pasv_port); + fd = socket_connect(host2, port2); + if (fd == -1) goto ftp_open_end; + ret = kftp_get_response(&aux); + if (ret != 150) { + close(fd); + fd = -1; + } + close(aux.ctrl_fd); + +ftp_open_end: + free(host); free(port); free(retr); free(aux.response); + return fd; +} +#endif /* !defined(_KO_NO_NET) */ + +static char **cmd2argv(const char *cmd) +{ + int i, beg, end, argc; + char **argv, *p, *q, *str; + end = strlen(cmd); + for (i = end - 1; i >= 0; --i) + if (!isspace(cmd[i])) break; + end = i + 1; + for (beg = 0; beg < end; ++beg) + if (!isspace(cmd[beg])) break; + if (beg == end) return 0; + for (i = beg + 1, argc = 0; i < end; ++i) + if (isspace(cmd[i]) && 
!isspace(cmd[i-1])) + ++argc; + argv = (char**)calloc(argc + 2, sizeof(void*)); + argv[0] = str = (char*)calloc(end - beg + 1, 1); + strncpy(argv[0], cmd + beg, end - beg); + for (i = argc = 1, q = p = str; i < end - beg; ++i) + if (isspace(str[i])) str[i] = 0; + else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i]; + return argv; +} + +#define KO_STDIN 1 +#define KO_FILE 2 +#define KO_PIPE 3 +#define KO_HTTP 4 +#define KO_FTP 5 + +typedef struct { + int type, fd; + pid_t pid; +} koaux_t; + +void *kopen(const char *fn, int *_fd) +{ + koaux_t *aux = 0; + *_fd = -1; + if (strstr(fn, "http://") == fn) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_HTTP; + aux->fd = http_open(fn); + } else if (strstr(fn, "ftp://") == fn) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_FTP; + aux->fd = ftp_open(fn); + } else if (strcmp(fn, "-") == 0) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_STDIN; + aux->fd = STDIN_FILENO; + } else { + const char *p, *q; + for (p = fn; *p; ++p) + if (!isspace(*p)) break; + if (*p == '<') { // pipe open + int need_shell, pfd[2]; + pid_t pid; + // a simple check to see if we need to invoke a shell; not always working + for (q = p + 1; *q; ++q) + if (ispunct(*q) && *q != '.' 
&& *q != '_' && *q != '-' && *q != ':') + break; + need_shell = (*q != 0); + pipe(pfd); + pid = vfork(); + if (pid == -1) { /* vfork() error */ + close(pfd[0]); close(pfd[1]); + return 0; + } + if (pid == 0) { /* the child process */ + char **argv; /* FIXME: I do not know if this will lead to a memory leak */ + close(pfd[0]); + dup2(pfd[1], STDOUT_FILENO); + close(pfd[1]); + if (!need_shell) { + argv = cmd2argv(p + 1); + execvp(argv[0], argv); + free(argv[0]); free(argv); + } else execl("/bin/sh", "sh", "-c", p + 1, NULL); + exit(1); + } else { /* parent process */ + close(pfd[1]); + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_PIPE; + aux->fd = pfd[0]; + aux->pid = pid; + } + } else { +#ifdef _WIN32 + *_fd = open(fn, O_RDONLY | O_BINARY); +#else + *_fd = open(fn, O_RDONLY); +#endif + if (*_fd) { + aux = calloc(1, sizeof(koaux_t)); + aux->type = KO_FILE; + aux->fd = *_fd; + } + } + } + *_fd = aux->fd; + return aux; +} + +int kclose(void *a) +{ + koaux_t *aux = (koaux_t*)a; + if (aux->type == KO_PIPE) { + int status; + pid_t pid; + pid = waitpid(aux->pid, &status, WNOHANG); + if (pid != aux->pid) kill(aux->pid, 15); + } + return 0; +} + +#ifdef _KO_MAIN +#define BUF_SIZE 0x10000 +int main(int argc, char *argv[]) +{ + void *x; + int l, fd; + unsigned char buf[BUF_SIZE]; + FILE *fp; + if (argc == 1) { + fprintf(stderr, "Usage: kopen \n"); + return 1; + } + x = kopen(argv[1], &fd); + fp = fdopen(fd, "r"); + if (fp == 0) { + fprintf(stderr, "ERROR: fail to open the input\n"); + return 1; + } + do { + if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0) + fwrite(buf, 1, l, stdout); + } while (l == BUF_SIZE); + fclose(fp); + kclose(x); + return 0; +} +#endif From cda85be059ad845edaf89d020c0d5edd35f04187 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Feb 2013 17:15:07 -0500 Subject: [PATCH 271/498] fixed a couple bugs identified by gcc Recent gcc is better. 
--- bwamem.c | 2 +- bwase.c | 4 ++-- kopen.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 553fe1c..7daa5d2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -569,7 +569,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } - if (bwa_rg_id) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); free(cigar); diff --git a/bwase.c b/bwase.c index 27da794..2dd783b 100644 --- a/bwase.c +++ b/bwase.c @@ -434,7 +434,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); if (p->type != BWA_TYPE_NO_MATCH) { @@ -482,7 +482,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality err_printf("%s", p->qual); } else err_printf("*"); - if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); + if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); putchar('\n'); diff --git a/kopen.c b/kopen.c index f72735c..8c191bc 100644 --- a/kopen.c +++ b/kopen.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifndef _WIN32 #include From ee59a131094ec3d0576bd72ed6421fc15b655397 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 12:17:29 -0500 Subject: [PATCH 272/498] simplified bwamem.h Hide mem_seed_t and 
mem_chain_t. Don't expose unnecessary routines. --- bwamem.c | 17 +++++++++++++---- bwamem.h | 44 ++++++++------------------------------------ bwamem_pair.c | 9 +++++++++ 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7daa5d2..b6741d2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -155,6 +155,19 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) * Chaining while finding SMEMs * ********************************/ +typedef struct { + int64_t rbeg; + int32_t qbeg, len; +} mem_seed_t; + +typedef struct { + int n, m; + int64_t pos; + mem_seed_t *seeds; +} mem_chain_t; + +typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; + #include "kbtree.h" #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) @@ -398,10 +411,6 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT free(z.a); } -/************************ - * Pick paired-end hits * - ************************/ - /**************************************** * Construct the alignment from a chain * ****************************************/ diff --git a/bwamem.h b/bwamem.h index ce27c6e..27a3dc1 100644 --- a/bwamem.h +++ b/bwamem.h @@ -11,11 +11,6 @@ struct __smem_i; typedef struct __smem_i smem_i; -typedef struct { - int64_t rbeg; - int32_t qbeg, len; -} mem_seed_t; - #define MEM_F_HARDCLIP 0x1 #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 @@ -35,12 +30,6 @@ typedef struct { int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; -typedef struct { - int n, m; - int64_t pos; - mem_seed_t *seeds; -} mem_chain_t; - typedef struct { int64_t rb, re; int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain @@ -60,43 +49,26 @@ typedef struct { int score, sub; } bwahit_t; -typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; -extern int mem_verbose; - #ifdef __cplusplus extern "C" { #endif 
-smem_i *smem_itr_init(const bwt_t *bwt); -void smem_itr_destroy(smem_i *itr); -void smem_set_query(smem_i *itr, int len, const uint8_t *query); -const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); - -mem_opt_t *mem_opt_init(void); -void mem_fill_scmat(int a, int b, int8_t mat[25]); + smem_i *smem_itr_init(const bwt_t *bwt); + void smem_itr_destroy(smem_i *itr); + void smem_set_query(smem_i *itr, int len, const uint8_t *query); + const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq); -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains); -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *a); -uint32_t *mem_gen_cigar(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + mem_opt_t *mem_opt_init(void); + void mem_fill_scmat(int a, int b, int8_t mat[25]); -int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); -void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); + void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); #ifdef __cplusplus } #endif -static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) -{ - int64_t p2; - int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); - p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand - *dist = p2 > b1? p2 - b1 : b1 - p2; - return (r1 == r2? 0 : 1) ^ (p2 > b1? 
0 : 3); -} - #endif diff --git a/bwamem_pair.c b/bwamem_pair.c index 57a128a..51f51c9 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -15,6 +15,15 @@ #define MAPPING_BOUND 3.0 #define MAX_STDDEV 4.0 +static inline int mem_infer_dir(int64_t l_pac, int64_t b1, int64_t b2, int64_t *dist) +{ + int64_t p2; + int r1 = (b1 >= l_pac), r2 = (b2 >= l_pac); + p2 = r1 == r2? b2 : (l_pac<<1) - 1 - b2; // p2 is the coordinate of read 2 on the read 1 strand + *dist = p2 > b1? p2 - b1 : b1 - p2; + return (r1 == r2? 0 : 1) ^ (p2 > b1? 0 : 3); +} + static int cal_sub(const mem_opt_t *opt, mem_alnreg_v *r) { int j; From 6bdccf2a8acf8c8b4d6c397ab0c6e9b7f9906ae7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 13:09:29 -0500 Subject: [PATCH 273/498] added a bit documentation --- bwa.h | 1 + bwamem.c | 16 +++++++------- bwamem.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++----- fastmap.c | 2 +- 4 files changed, 69 insertions(+), 15 deletions(-) diff --git a/bwa.h b/bwa.h index 208db6a..d4ca807 100644 --- a/bwa.h +++ b/bwa.h @@ -34,6 +34,7 @@ extern "C" { char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); + bwaidx_t *bwa_idx_load(const char *hint, int which); void bwa_idx_destroy(bwaidx_t *idx); diff --git a/bwamem.c b/bwamem.c index b6741d2..4fffe38 100644 --- a/bwamem.c +++ b/bwamem.c @@ -6,6 +6,7 @@ #ifdef HAVE_PTHREAD #include #endif + #include "kstring.h" #include "bwamem.h" #include "bntseq.h" @@ -632,19 +633,19 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } -static mem_alnreg_v find_alnreg(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s) +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i, j; mem_chain_v chn; mem_alnreg_v regs, tmp; - for (i = 0; i < s->l_seq; ++i) - s->seq[i] = nst_nt4_table[(int)s->seq[i]]; - chn = mem_chain(opt, bwt, 
s->l_seq, (uint8_t*)s->seq); + for (i = 0; i < l_seq; ++i) + seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { - mem_chain2aln(opt, bns->l_pac, pac, s->l_seq, (uint8_t*)s->seq, &chn.a[i], &tmp); + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, &chn.a[i], &tmp); for (j = 0; j < tmp.n; ++j) kv_push(mem_alnreg_t, regs, tmp.a[j]); free(chn.a[i].seeds); @@ -670,7 +671,7 @@ static void *worker1(void *data) worker_t *w = (worker_t*)data; int i; for (i = w->start; i < w->n; i += w->step) - w->regs[i] = find_alnreg(w->opt, w->bwt, w->bns, w->pac, &w->seqs[i]); + w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); return 0; } @@ -696,7 +697,7 @@ static void *worker2(void *data) return 0; } -int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) { int i; worker_t *w; @@ -737,5 +738,4 @@ int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); } free(regs); free(w); - return 0; } diff --git a/bwamem.h b/bwamem.h index 27a3dc1..fa55b44 100644 --- a/bwamem.h +++ b/bwamem.h @@ -31,10 +31,14 @@ typedef struct { } mem_opt_t; typedef struct { - int64_t rb, re; - int score, qb, qe, seedcov, sub, csub; // sub: suboptimal score; csub: suboptimal inside the chain - int sub_n; // approximate number of suboptimal hits - int secondary; // non-negative if the hit is secondary + int64_t rb, re; // [rb,re): reference sequence in the alignment + int qb, qe; // [qb,qe): query sequence in the alignment + int score; // best SW 
score + int sub; // 2nd best SW score + int csub; // SW score of a tandem hit + int sub_n; // approximate number of suboptimal hits + int seedcov; // length of regions coverged by seeds + int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; typedef struct { @@ -63,8 +67,57 @@ extern "C" { mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); - int mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); - + /** + * Align a batch of sequences and generate the alignments in the SAM format + * + * This routine requires $seqs[i].{l_seq,seq,name} and write $seqs[i].sam. + * Note that $seqs[i].sam may consist of several SAM lines if the + * corresponding sequence has multiple primary hits. + * + * In the paired-end mode (i.e. MEM_F_PE is set in $opt->flag), query + * sequences must be interleaved: $n must be an even number and the 2i-th + * sequence and the (2i+1)-th sequence constitute a read pair. In this + * mode, there should be enough (typically >50) unique pairs for the + * routine to infer the orientation and insert size. + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param n number of query sequences + * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call + */ + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + + /** + * Find the aligned regions for one query sequence + * + * Note that this routine does not generate CIGAR. CIGAR should be + * generated later by bwa_gen_cigar() defined in bwa.c. 
+ * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied + * + * @return list of aligned regions. + */ + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + + /** + * Infer the insert size distribution from interleaved alignment regions + * + * This function can be called after mem_align1(), as long as paired-end + * reads are properly interleaved. + * + * @param opt alignment parameters + * @param l_pac length of concatenated reference sequence + * @param n number of query sequences; must be an even number + * @param regs region array of size $n; 2i-th and (2i+1)-th elements constitute a pair + * @param pes inferred insert size distribution (output) + */ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v *regs, mem_pestat_t pes[4]); #ifdef __cplusplus diff --git a/fastmap.c b/fastmap.c index 90307b3..819e301 100644 --- a/fastmap.c +++ b/fastmap.c @@ -67,7 +67,7 @@ int main_mem(int argc, char *argv[]) ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } - while ((seqs = bseq_read(opt->chunk_size, &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size * (ko2? 
2 : 1), &n, ks, ks2)) != 0) { int64_t size = 0; if (!copy_comment) for (i = 0; i < n; ++i) { From 85775c338432818d1b05805a0357b194e634cb2c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 13:23:43 -0500 Subject: [PATCH 274/498] output multiple hits --- bwamem.c | 10 ++++++---- bwamem.h | 1 + fastmap.c | 6 +++++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4fffe38..ae1886a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -554,7 +554,9 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } else kputw(0, str); kputc('\t', str); } else kputsn("\t*\t0\t0\t", 7, str); - if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand + if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL + kputsn("*\t*", 3, str); + } else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand int i, qb = 0, qe = s->l_seq; if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; ks_resize(str, str->l + (qe - qb) + 1); @@ -610,7 +612,7 @@ void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) { h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; h->score = a->score; - h->sub = a->sub > a->csub? a->sub : a->csub; + h->sub = a->secondary >= 0? -1 : a->sub > a->csub? a->sub : a->csub; h->qual = 0; // quality unset h->flag = a->secondary >= 0? 0x100 : 0; // only the "secondary" bit is set } @@ -623,10 +625,10 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b if (a->n > 0) { for (k = 0; k < a->n; ++k) { bwahit_t h; - if (a->a[k].secondary >= 0) continue; + if (a->a[k].secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; - h.qual = mem_approx_mapq_se(opt, &a->a[k]); + h.qual = a->a[k].secondary >= 0? 
0 : mem_approx_mapq_se(opt, &a->a[k]); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); diff --git a/bwamem.h b/bwamem.h index fa55b44..5cf3ac5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -14,6 +14,7 @@ typedef struct __smem_i smem_i; #define MEM_F_HARDCLIP 0x1 #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 +#define MEM_F_ALL 0x8 typedef struct { int a, b, q, r, w; diff --git a/fastmap.c b/fastmap.c index 819e301..49c46fd 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,11 +26,12 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "CPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "aCPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; + else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); @@ -49,6 +50,9 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -P perform mate SW only but skip pairing\n"); + fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, "\n"); free(opt); From 0b4a40dc25f3191bb1af289af03f568fb6563de3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 
24 Feb 2013 13:34:20 -0500 Subject: [PATCH 275/498] updated revision number; to merge into master --- fastmap.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastmap.c b/fastmap.c index 49c46fd..a5cafd7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -48,7 +48,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - fprintf(stderr, " -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null]\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -P perform mate SW only but skip pairing\n"); diff --git a/main.c b/main.c index dbe9dd0..1e12cfd 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r132" +#define PACKAGE_VERSION "0.6.2-r270-beta" #endif static int usage() From 29e41b592c471e0dc09cbcbce32294af10cfd3aa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 23:00:51 -0500 Subject: [PATCH 276/498] bugfix: isize is off by 1 --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index ae1886a..fe0ecbe 100644 --- a/bwamem.c +++ b/bwamem.c @@ -550,7 +550,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (mid == rid) { int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(p0 - p1, str); + kputw(p0 - p1 + (p0 > p1? 
1 : -1), str); } else kputw(0, str); kputc('\t', str); } else kputsn("\t*\t0\t0\t", 7, str); From 570e082b38a4b8d124afd0b690c2dc4b6a5c0fdd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 24 Feb 2013 23:45:40 -0500 Subject: [PATCH 277/498] change CC back to gcc --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f1da07e..de45ff1 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CC= clang +CC= gcc CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar From 4dc982a3c72d69835682d6f930b66333c622f367 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 00:13:32 -0500 Subject: [PATCH 278/498] support interleaved fastq --- fastmap.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fastmap.c b/fastmap.c index a5cafd7..4cf92b2 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,12 +26,13 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "aCPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "paCPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'a') opt->flag |= MEM_F_ALL; + else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); @@ -42,7 +43,7 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] \n\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); @@ -51,6 +52,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -P perform mate SW only but skip pairing\n"); fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); @@ -58,6 +60,7 @@ int main_mem(int argc, char *argv[]) free(opt); return 1; } + mem_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak bwa_print_sam_hdr(idx->bns, rg_line); @@ -66,10 +69,15 @@ int main_mem(int argc, char *argv[]) fp = gzdopen(fd, "r"); ks = kseq_init(fp); if (optind + 2 < argc) { - ko2 = kopen(argv[optind + 2], &fd2); - fp2 = gzdopen(fd2, "r"); - ks2 = kseq_init(fp2); - opt->flag |= MEM_F_PE; + if (opt->flag&MEM_F_PE) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); + } 
else { + ko2 = kopen(argv[optind + 2], &fd2); + fp2 = gzdopen(fd2, "r"); + ks2 = kseq_init(fp2); + opt->flag |= MEM_F_PE; + } } while ((seqs = bseq_read(opt->chunk_size * (ko2? 2 : 1), &n, ks, ks2)) != 0) { int64_t size = 0; From 514563bd0adfa752e081e3c0e4c6d13277962731 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 10:54:12 -0500 Subject: [PATCH 279/498] no poor hits with -a; reduce mapq for 2nd primary --- bwamem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bwamem.c b/bwamem.c index fe0ecbe..edabd38 100644 --- a/bwamem.c +++ b/bwamem.c @@ -623,12 +623,16 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b kstring_t str; str.l = str.m = 0; str.s = 0; if (a->n > 0) { + int mapq0 = -1; for (k = 0; k < a->n; ++k) { bwahit_t h; if (a->a[k].secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (a->a[k].secondary >= 0 && a->a[k].score < a->a[a->a[k].secondary].score * .5) continue; mem_alnreg2hit(&a->a[k], &h); h.flag |= extra_flag; h.qual = a->a[k].secondary >= 0? 0 : mem_approx_mapq_se(opt, &a->a[k]); + if (k == 0) mapq0 = h.qual; + else if (h.qual > mapq0) h.qual = mapq0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); From 5ead86acd35a7703b36794fbf04973e391014ea7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:18:35 -0500 Subject: [PATCH 280/498] optionally mark split hit as secondary --- bwamem.c | 14 +++++++++----- bwamem.h | 1 + fastmap.c | 33 +++++++++++++++++++-------------- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index edabd38..2d326e2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -515,13 +515,15 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons p->flag |= m && m->rb >= bns->l_pac? 
0x20 : 0; // is mate on reverse strand kputs(s->name, str); kputc('\t', str); if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate + int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag + if (sam_flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - kputw(p->flag, str); kputc('\t', str); + kputw(sam_flag, str); kputc('\t', str); kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); kputw(p->qual, str); kputc('\t', str); if (n_cigar) { @@ -626,11 +628,13 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b int mapq0 = -1; for (k = 0; k < a->n; ++k) { bwahit_t h; - if (a->a[k].secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (a->a[k].secondary >= 0 && a->a[k].score < a->a[a->a[k].secondary].score * .5) continue; - mem_alnreg2hit(&a->a[k], &h); + mem_alnreg_t *p = &a->a[k]; + if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; + mem_alnreg2hit(p, &h); h.flag |= extra_flag; - h.qual = a->a[k].secondary >= 0? 0 : mem_approx_mapq_se(opt, &a->a[k]); + if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard) + h.qual = p->secondary >= 0? 
0 : mem_approx_mapq_se(opt, p); if (k == 0) mapq0 = h.qual; else if (h.qual > mapq0) h.qual = mapq0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); diff --git a/bwamem.h b/bwamem.h index 5cf3ac5..6ab2b01 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,6 +15,7 @@ typedef struct __smem_i smem_i; #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 +#define MEM_F_NO_MULTI 0x16 typedef struct { int a, b, q, r, w; diff --git a/fastmap.c b/fastmap.c index 4cf92b2..72aea0b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paCPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; + else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); @@ -43,19 +44,23 @@ int main_mem(int argc, char *argv[]) } if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); - fprintf(stderr, "Options: -k INT minimum seed length [%d]\n", opt->min_seed_len); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); - fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - 
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); - fprintf(stderr, " -v INT verbose level [%d]\n", bwa_verbose); - fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); - fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); - fprintf(stderr, " -P perform mate SW only but skip pairing\n"); - fprintf(stderr, " -H hard clipping\n"); - fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); + fprintf(stderr, "Algorithm options:\n\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); + fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, "\nInput/output options:\n\n"); + fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -H hard clipping\n"); + fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); fprintf(stderr, "\n"); free(opt); return 1; From 5092211d75f5088824b79ca292620799be951529 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:24:21 -0500 Subject: [PATCH 281/498] controllable 
scoring matrix --- fastmap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index 72aea0b..77b3d75 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,8 +26,12 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'A') opt->a = atoi(optarg); + else if (c == 'B') opt->b = atoi(optarg); + else if (c == 'O') opt->q = atoi(optarg); + else if (c == 'E') opt->r = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; @@ -52,6 +56,10 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); + fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); + fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); + fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); From e9e5ee6a3d1a0d185b74a250c7353b79646a5436 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:34:06 -0500 Subject: [PATCH 282/498] r277: updated the revision number --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index 
1e12cfd..74980c9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r270-beta" +#define PACKAGE_VERSION "0.6.2-r277-beta" #endif static int usage() From 9957e04590da85875f6671ca317926a5e329b971 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 11:56:02 -0500 Subject: [PATCH 283/498] r278: don't perform too many mate-sw --- bwamem.c | 24 ++++++++++++------------ bwamem.h | 28 ++++++++++++++++------------ bwamem_pair.c | 2 +- fastmap.c | 4 +++- main.c | 2 +- 5 files changed, 33 insertions(+), 27 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2d326e2..88086ee 100644 --- a/bwamem.c +++ b/bwamem.c @@ -14,17 +14,6 @@ #include "kvec.h" #include "ksort.h" -void mem_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; -} - /* Theory on probability and scoring *ungapped* alignment * * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution @@ -64,12 +53,23 @@ mem_opt_t *mem_opt_init() o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; - o->pe_dir = 0<<1|1; o->pen_unpaired = 9; + o->max_matesw = 100; mem_fill_scmat(o->a, o->b, o->mat); return o; } +void mem_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} + /*************************** * SMEM iterator interface * ***************************/ diff --git a/bwamem.h b/bwamem.h index 6ab2b01..d99a9da 100644 --- a/bwamem.h +++ b/bwamem.h @@ -18,18 +18,22 @@ typedef struct __smem_i smem_i; #define MEM_F_NO_MULTI 0x16 typedef struct { - int a, b, q, r, w; - int flag; - int split_width; - int min_seed_len, max_occ, max_chain_gap; - int n_threads, chunk_size; - int pe_dir; - float mask_level; - float chain_drop_ratio; - float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor - int pen_unpaired; // phred-scaled penalty for unpaired reads - int max_ins; // maximum insert size - int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset + int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r + int w; // band width + int flag; // see MEM_F_* macros + int min_seed_len; // minimum seed length + float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor + int split_width; // split into a seed if its occurence is smaller than this value + int max_occ; // skip a seed if its occurence is larger than this value + int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed + int n_threads; // number of threads + int chunk_size; // process chunk_size-bp sequences in a batch + float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits + float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain + int pen_unpaired; // phred-scaled penalty for unpaired reads + int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value + int max_matesw; // perform maximally max_matesw rounds of 
mate-SW for each end + int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset } mem_opt_t; typedef struct { diff --git a/bwamem_pair.c b/bwamem_pair.c index 51f51c9..3ef71ea 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -246,7 +246,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) kv_push(mem_alnreg_t, b[i], a[i].a[j]); for (i = 0; i < 2; ++i) - for (j = 0; j < b[i].n; ++j) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); mem_mark_primary_se(opt, a[0].n, a[0].a); diff --git a/fastmap.c b/fastmap.c index 77b3d75..b4f8ea8 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,8 +26,9 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); + else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); @@ -52,6 +53,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "Algorithm options:\n\n"); fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); diff --git a/main.c b/main.c index 74980c9..9ef33fa 100644 --- a/main.c +++ b/main.c @@ 
-4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r277-beta" +#define PACKAGE_VERSION "0.6.2-r278-beta" #endif static int usage() From 20aa848b3c4b48382dc24112e6da6bfad1c991ba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 13:00:35 -0500 Subject: [PATCH 284/498] r279: for PE mapq, consider the number of pairs If there are a lot of proper pairs, it is more likely that the best pair is wrong. --- bwamem.c | 2 +- bwamem.h | 2 +- bwamem_pair.c | 44 ++++++++++++++++++++++++-------------------- main.c | 2 +- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 88086ee..a1f25b7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -604,7 +604,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; - if (a->sub_n) mapq -= (int)(4.343 * log(a->sub_n) + .499); + if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; return mapq; diff --git a/bwamem.h b/bwamem.h index d99a9da..7fc2c85 100644 --- a/bwamem.h +++ b/bwamem.h @@ -5,7 +5,7 @@ #include "bntseq.h" #include "bwa.h" -#define MEM_MAPQ_COEF 40.0 +#define MEM_MAPQ_COEF 30.0 #define MEM_MAPQ_MAX 60 struct __smem_i; diff --git a/bwamem_pair.c b/bwamem_pair.c index 3ef71ea..3fbdec7 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -167,13 +167,12 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me return n; } -int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int z[2]) +int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, 
int *sub, int *n_sub, int z[2]) { extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); - pair64_v v; - pair64_t o, subo; // .x: score<<32 | raw_score<<8 | hash; .y: pair - int r, i, k, y[4]; // y[] keeps the last hit - kv_init(v); + pair64_v v, u; + int r, i, k, y[4], ret; // y[] keeps the last hit + kv_init(v); kv_init(u); for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { pair64_t key; @@ -185,7 +184,6 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ } ks_introsort_128(v.n, v.a); y[0] = y[1] = y[2] = y[3] = -1; - o.x = subo.x = o.y = subo.y = 0; //for (i = 0; i < v.n; ++i) printf("[%d]\t%d\t%c%ld\n", i, (int)(v.a[i].y&1)+1, "+-"[v.a[i].y>>1&1], (long)v.a[i].x); for (i = 0; i < v.n; ++i) { for (r = 0; r < 2; ++r) { // loop through direction @@ -197,7 +195,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int64_t dist; int q; double ns; - uint64_t x, pair; + pair64_t *p; if ((v.a[k].y&3) != which) continue; dist = (int64_t)v.a[i].x - v.a[k].x; //printf("%d: %lld\n", k, dist); @@ -206,23 +204,27 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ ns = (dist - pes[dir].avg) / pes[dir].std; q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) if (q < 0) q = 0; - pair = (uint64_t)k<<32 | i; - x = (uint64_t)q<<32 | (hash_64(pair ^ id<<8) & 0xffffffffU); + p = kv_pushp(pair64_t, u); + p->y = (uint64_t)k<<32 | i; + p->x = (uint64_t)q<<32 | (hash_64(p->y ^ id<<8) & 0xffffffffU); //printf("[%lld,%lld]\t%d\tdist=%ld\n", v.a[k].x, v.a[i].x, q, (long)dist); - if (x > o.x) subo = o, o.x = x, o.y = pair; - else if (x > subo.x) subo.x = x, subo.y = pair; } } y[v.a[i].y&3] = i; } - if (o.x > 0) { - i = o.y >> 32; k = o.y << 32 >> 32; - z[v.a[i].y&1] = v.a[i].y<<32>>34; + if (u.n) { // found at least one proper pair + int tmp = opt->a + opt->b > opt->q + opt->r? 
opt->a + opt->b : opt->q + opt->r; + ks_introsort_128(u.n, u.a); + i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; + z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair z[v.a[k].y&1] = v.a[k].y<<32>>34; - } - free(v.a); - *sub = subo.x>>32; - return o.x>>32; + ret = u.a[u.n-1].x >> 32; + *sub = u.n > 1? u.a[u.n-2].x>>32 : 0; + for (i = (long)u.n - 2, *n_sub = 0; i >= 0; --i) + if (*sub - (int)(u.a[i].x>>32) <= tmp) ++*n_sub; + } else ret = 0, *sub = 0, *n_sub = 0; + free(u.a); free(v.a); + return ret; } int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) @@ -233,7 +235,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t *m); - int n = 0, i, j, z[2], o, subo; + int n = 0, i, j, z[2], o, subo, n_sub; kstring_t str; mem_alnreg_v b[2]; bwahit_t h[2]; @@ -253,7 +255,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co mem_mark_primary_se(opt, a[1].n, a[1].a); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, z)) > 0) { + if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { @@ -267,6 +269,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; subo = subo > score_un? 
subo : score_un; q_pe = (o - subo) * 6; + if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); + if (q_pe < 0) q_pe = 0; if (q_pe > 60) q_pe = 60; // the following assumes no split hits if (o > score_un) { // paired alignment is preferred diff --git a/main.c b/main.c index 9ef33fa..a0e8ec6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r278-beta" +#define PACKAGE_VERSION "0.6.2-r279-beta" #endif static int usage() From d19e834d84dc9a2659b1be665cd7ff48828c3deb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 15:40:15 -0500 Subject: [PATCH 285/498] r280: align two ends in the same thread Otherwise odd-number threads may be of different speed from even-number threads. --- bwamem.c | 11 +++++++++-- main.c | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index a1f25b7..5c274b1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -680,8 +680,15 @@ static void *worker1(void *data) { worker_t *w = (worker_t*)data; int i; - for (i = w->start; i < w->n; i += w->step) - w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + if (!(w->opt->flag&MEM_F_PE)) { + for (i = w->start; i < w->n; i += w->step) + w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + } else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower + for (i = w->start; i < w->n>>1; i += w->step) { + w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); + } + } return 0; } diff --git a/main.c b/main.c index a0e8ec6..b85757e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r279-beta" +#define 
PACKAGE_VERSION "0.6.2-r280-beta" #endif static int usage() From 30cc8a95d1fa96ec5057e05c6dc4a7fcbe92942e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 16:34:19 -0500 Subject: [PATCH 286/498] fixed an unimportant memory leak --- kopen.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kopen.c b/kopen.c index 8c191bc..45f2713 100644 --- a/kopen.c +++ b/kopen.c @@ -312,6 +312,7 @@ int kclose(void *a) pid = waitpid(aux->pid, &status, WNOHANG); if (pid != aux->pid) kill(aux->pid, 15); } + free(aux); return 0; } diff --git a/main.c b/main.c index b85757e..749c7de 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r280-beta" +#define PACKAGE_VERSION "0.6.2-r281-beta" #endif static int usage() From 77b5b586ad9914639b1da4d2289e30711237662a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 17:29:35 -0500 Subject: [PATCH 287/498] r282: set min split_len to read length --- bwamem.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 5c274b1..2c24ba5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -199,6 +199,7 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i { const bwtintv_v *a; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + split_len = split_len < itr->len? 
split_len : itr->len; while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start diff --git a/main.c b/main.c index 749c7de..4e2f15d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r281-beta" +#define PACKAGE_VERSION "0.6.2-r282-beta" #endif static int usage() From 61dd3bf13a1d571f938ac2698999c5f34445f11f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Feb 2013 22:49:15 -0500 Subject: [PATCH 288/498] r283: prepare for fixing cross-ref aln --- bntseq.c | 29 +++++++++++++++++------------ bntseq.h | 1 + bwa.c | 4 ++++ main.c | 2 +- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/bntseq.c b/bntseq.c index 0286c19..972837e 100644 --- a/bntseq.c +++ b/bntseq.c @@ -288,21 +288,26 @@ int bwa_fa2pac(int argc, char *argv[]) return 0; } +int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) +{ + int left, mid, right; + if (pos_f >= bns->l_pac) return -1; + left = 0; mid = 0; right = bns->n_seqs; + while (left < right) { // binary search + mid = (left + right) >> 1; + if (pos_f >= bns->anns[mid].offset) { + if (mid == bns->n_seqs - 1) break; + if (pos_f < bns->anns[mid+1].offset) break; // bracketed + left = mid + 1; + } else right = mid; + } + return mid; +} + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { int left, mid, right, nn; - if (ref_id) { - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { - mid = (left + right) >> 1; - if (pos_f >= bns->anns[mid].offset) { - if (mid == bns->n_seqs - 1) break; - if (pos_f < bns->anns[mid+1].offset) break; // bracketed - left = mid + 1; - } else right = mid; - } - *ref_id = mid; - } + if (ref_id) *ref_id = bns_pos2rid(bns, pos_f); left = 0; right = bns->n_holes; nn = 0; while (left < right) { mid = (left + right) >> 1; diff --git a/bntseq.h b/bntseq.h index 
0425540..4061438 100644 --- a/bntseq.h +++ b/bntseq.h @@ -72,6 +72,7 @@ extern "C" { bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); void bns_destroy(bntseq_t *bns); int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); + int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); diff --git a/bwa.c b/bwa.c index c8400b1..f34bd12 100644 --- a/bwa.c +++ b/bwa.c @@ -64,6 +64,10 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) return seqs; } +/***************** + * CIGAR related * + *****************/ + // Generate CIGAR when the alignment end points are known uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) { diff --git a/main.c b/main.c index 4e2f15d..4bad9ee 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r282-beta" +#define PACKAGE_VERSION "0.6.2-r283-beta" #endif static int usage() From e70c7c2a71744f5a316c84dad8016d054020d425 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 00:03:49 -0500 Subject: [PATCH 289/498] r284: amend cross-reference hit I really hate this: complex and twisted logic for a nasty scenario that almost never happens to short reads - but it may become serious when the reference genome consists of many contigs. On toy examples, the code seems to work. Don't know if it really works... 
--- bwa.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.h | 1 + bwamem.c | 1 + bwamem_pair.c | 8 ++++++-- main.c | 2 +- 5 files changed, 65 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index f34bd12..e8221ca 100644 --- a/bwa.c +++ b/bwa.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "bntseq.h" #include "bwa.h" #include "ksw.h" @@ -103,6 +104,61 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa return cigar; } +int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) +{ + int ib, ie, is_rev; + int64_t fb, fe, mid = -1; + if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary + *qb = *qe = *rb = *re = -1; + return -1; // unable to fix + } else { + fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev); + ib = bns_pos2rid(bns, fb); + if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix + fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev); + ie = bns_pos2rid(bns, fe); + if (ie - ib > 1) { // bridge three or more references + *qb = *qe = *rb = *re = -1; + return -2; // unable to fix + } else { + int l = bns->anns[ib].offset + bns->anns[ib].len - fb; + mid = is_rev? 
*re - l : *rb + l; + } + } + if (mid >= 0) { + int i, score, n_cigar, y; + uint32_t *cigar; + int64_t x; + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar); + for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { + int op = cigar[i]&0xf, len = cigar[i]>>4; + if (op == 0) { + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) { // the first part is longer + if (x == mid) { // need to check the previous operation + assert(i); // mid != *rb should always stand + if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x; + else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4); + else abort(); // should not be here + } else *qe = y + (mid - x), *re = mid; + } else *qb = y + (mid - x), *rb = mid; + break; + } else x += len, y += len; + } else if (op == 1) { // insertion + y += len; + } else if (op == 2) { // deletion + if (x <= mid && mid < x + len) { + if (mid - *rb > *re - mid) *qe = y, *re = x; + else *qb = y, *rb = x + len; + break; + } else x += len; + } else abort(); // should not be here + } + free(cigar); + } + return 1; +} + /********************* * Full index reader * *********************/ diff --git a/bwa.h b/bwa.h index d4ca807..2d6c7bf 100644 --- a/bwa.h +++ b/bwa.h @@ -31,6 +31,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); diff --git a/bwamem.c b/bwamem.c index 2c24ba5..7c837bf 100644 --- a/bwamem.c +++ b/bwamem.c @@ -633,6 +633,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t 
*pac, b if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; mem_alnreg2hit(p, &h); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re); h.flag |= extra_flag; if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard) h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); diff --git a/bwamem_pair.c b/bwamem_pair.c index 3fbdec7..9ff12b3 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -293,7 +293,9 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re); mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re); bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; } else goto no_pairing; @@ -301,8 +303,10 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n) mem_alnreg2hit(&a[i].a[0], &h[i]); - else h[i].rb = h[i].re = -1; + if (a[i].n) { + mem_alnreg2hit(&a[i].a[0], &h[i]); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); + } else h[i].rb = h[i].re = -1; } mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); mem_sam_se(opt, bns, pac, &s[1], &a[1], 
0x81, &h[0]); diff --git a/main.c b/main.c index 4bad9ee..c1c232a 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r283-beta" +#define PACKAGE_VERSION "0.6.2-r284-beta" #endif static int usage() From 174fe0f1d57f54823f0ed2e83b242b4c2f5d765c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 11:14:19 -0500 Subject: [PATCH 290/498] code backup: less dependent on gcc optimization --- bwt.c | 17 ++++++++++------- bwt.h | 6 ++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/bwt.c b/bwt.c index 7b37fe5..47b06e2 100644 --- a/bwt.c +++ b/bwt.c @@ -161,7 +161,7 @@ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { bwtint_t l, j, x; - uint32_t *p; + uint32_t *p, tmp; if (k == (bwtint_t)(-1)) { memset(cnt, 0, 4 * sizeof(bwtint_t)); return; @@ -171,9 +171,10 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) memcpy(cnt, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); j = k >> 4 << 4; - for (l = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; l < j; l += 16, ++p) + for (l = k & ~OCC_INTV_MASK, x = 0; l < j; l += 16, ++p) x += __occ_aux4(bwt, *p); - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; } @@ -188,7 +189,7 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin bwt_occ4(bwt, l, cntl); } else { bwtint_t i, j, x, y; - uint32_t *p; + uint32_t *p, tmp; if (k >= bwt->primary) --k; // because $ is not in bwt if (l >= bwt->primary) --l; p = bwt_occ_intv(bwt, k); @@ -196,14 +197,16 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin p += sizeof(bwtint_t); // prepare cntk[] j = k >> 4 << 4; - for (i = k / OCC_INTERVAL * OCC_INTERVAL, x = 0; i < j; i 
+= 16, ++p) + for (i = k & ~OCC_INTV_MASK, x = 0; i < j; i += 16, ++p) x += __occ_aux4(bwt, *p); y = x; - x += __occ_aux4(bwt, *p & ~((1U<<((~k&15)<<1)) - 1)) - (~k&15); + tmp = *p & ~((1U<<((~k&15)<<1)) - 1); + x += __occ_aux4(bwt, tmp) - (~k&15); // calculate cntl[] and finalize cntk[] j = l >> 4 << 4; for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); - y += __occ_aux4(bwt, *p & ~((1U<<((~l&15)<<1)) - 1)) - (~l&15); + tmp = *p & ~((1U<<((~l&15)<<1)) - 1); + y += __occ_aux4(bwt, tmp) - (~l&15); memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; diff --git a/bwt.h b/bwt.h index e06329a..ab5aecd 100644 --- a/bwt.h +++ b/bwt.h @@ -30,8 +30,10 @@ #include -// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line -#define OCC_INTERVAL 0x80 +// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 +#define OCC_INTV_SHIFT 7 +#define OCC_INTERVAL (1LL< Date: Tue, 26 Feb 2013 11:22:24 -0500 Subject: [PATCH 291/498] code backup --- bwt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bwt.c b/bwt.c index 47b06e2..ff5a4a0 100644 --- a/bwt.c +++ b/bwt.c @@ -166,7 +166,7 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) memset(cnt, 0, 4 * sizeof(bwtint_t)); return; } - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); memcpy(cnt, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); @@ -182,16 +182,16 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) { bwtint_t _k, _l; - _k = (k >= bwt->primary)? k-1 : k; - _l = (l >= bwt->primary)? 
l-1 : l; - if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { + _k = k - (k >= bwt->primary); + _l = l - (l >= bwt->primary); + if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { bwt_occ4(bwt, k, cntk); bwt_occ4(bwt, l, cntl); } else { bwtint_t i, j, x, y; uint32_t *p, tmp; - if (k >= bwt->primary) --k; // because $ is not in bwt - if (l >= bwt->primary) --l; + k -= (k >= bwt->primary); // because $ is not in bwt + l -= (l >= bwt->primary); p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); p += sizeof(bwtint_t); From 264d5e42e5f5ec5f5218a88d4c8cf8d6743cc51a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 11:49:39 -0500 Subject: [PATCH 292/498] simplified bwt_occ4() a little --- bwt.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/bwt.c b/bwt.c index ff5a4a0..d57e2d5 100644 --- a/bwt.c +++ b/bwt.c @@ -160,8 +160,8 @@ void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) { - bwtint_t l, j, x; - uint32_t *p, tmp; + bwtint_t x; + uint32_t *p, tmp, *end; if (k == (bwtint_t)(-1)) { memset(cnt, 0, 4 * sizeof(bwtint_t)); return; @@ -169,10 +169,9 @@ void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) k -= (k >= bwt->primary); // because $ is not in bwt p = bwt_occ_intv(bwt, k); memcpy(cnt, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); - j = k >> 4 << 4; - for (l = k & ~OCC_INTV_MASK, x = 0; l < j; l += 16, ++p) - x += __occ_aux4(bwt, *p); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) + end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop + for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p); tmp = *p & ~((1U<<((~k&15)<<1)) - 1); x += __occ_aux4(bwt, tmp) - (~k&15); cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; 
@@ -188,23 +187,22 @@ void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtin bwt_occ4(bwt, k, cntk); bwt_occ4(bwt, l, cntl); } else { - bwtint_t i, j, x, y; - uint32_t *p, tmp; + bwtint_t x, y; + uint32_t *p, tmp, *endk, *endl; k -= (k >= bwt->primary); // because $ is not in bwt l -= (l >= bwt->primary); p = bwt_occ_intv(bwt, k); memcpy(cntk, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); + p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) // prepare cntk[] - j = k >> 4 << 4; - for (i = k & ~OCC_INTV_MASK, x = 0; i < j; i += 16, ++p) - x += __occ_aux4(bwt, *p); + endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); + endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4)); + for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p); y = x; tmp = *p & ~((1U<<((~k&15)<<1)) - 1); x += __occ_aux4(bwt, tmp) - (~k&15); // calculate cntl[] and finalize cntk[] - j = l >> 4 << 4; - for (; i < j; i += 16, ++p) y += __occ_aux4(bwt, *p); + for (; p < endl; ++p) y += __occ_aux4(bwt, *p); tmp = *p & ~((1U<<((~l&15)<<1)) - 1); y += __occ_aux4(bwt, tmp) - (~l&15); memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); From fd6706420788f9a80996d1f22385291f9adcffed Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 11:51:03 -0500 Subject: [PATCH 293/498] removed an unnecessary condition --- bwt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bwt.c b/bwt.c index d57e2d5..ab0a6fc 100644 --- a/bwt.c +++ b/bwt.c @@ -324,8 +324,7 @@ int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, kv_push(bwtintv_t, *mem, ik); } } // otherwise the match is contained in another longer match - } - if (c >= 0 && ok[c].x[2] >= min_intv && (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2])) { + } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) { ok[c].info = p->info; kv_push(bwtintv_t, *curr, ok[c]); } From 80e1137a6c27f9d2984f28400175a0e9d96eb82a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 
26 Feb 2013 11:57:36 -0500 Subject: [PATCH 294/498] move bwt_invPsi() from bwt.h to bwt.c --- bwt.c | 8 ++++++++ bwt.h | 17 ----------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/bwt.c b/bwt.c index ab0a6fc..43979ac 100644 --- a/bwt.c +++ b/bwt.c @@ -45,6 +45,14 @@ void bwt_gen_cnt_table(bwt_t *bwt) } } +static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA +{ + bwtint_t x = k - (k > bwt->primary); + x = bwt_B0(bwt, x); + x = bwt->L2[x] + bwt_occ(bwt, k, x); + return k == bwt->primary? 0 : x; +} + // bwt->bwt and bwt->occ must be precalculated void bwt_cal_sa(bwt_t *bwt, int intv) { diff --git a/bwt.h b/bwt.h index ab5aecd..e7b0f97 100644 --- a/bwt.h +++ b/bwt.h @@ -124,21 +124,4 @@ extern "C" { } #endif -// inverse Psi function -#if 0 -#define bwt_invPsi(bwt, k) \ - (((k) == (bwt)->primary)? 0 : \ - ((k) < (bwt)->primary)? \ - (bwt)->L2[bwt_B0(bwt, k)] + bwt_occ(bwt, k, bwt_B0(bwt, k)) \ - : (bwt)->L2[bwt_B0(bwt, (k)-1)] + bwt_occ(bwt, k, bwt_B0(bwt, (k)-1))) -#else -static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) -{ - bwtint_t x = k - (k > bwt->primary); - x = bwt_B0(bwt, x); - x = bwt->L2[x] + bwt_occ(bwt, k, x); - return k == bwt->primary? 
0 : x; -} -#endif - #endif From aa92c720b5ddf2d6107a537ed8f7bb6d94fbc1bb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 12:09:28 -0500 Subject: [PATCH 295/498] cleanup bwt_occ() --- bwt.c | 11 +++++------ kopen.c | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/bwt.c b/bwt.c index 43979ac..4ee9ea8 100644 --- a/bwt.c +++ b/bwt.c @@ -105,21 +105,20 @@ static inline int __occ_aux(uint64_t y, int c) bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) { - bwtint_t n, l, j; - uint32_t *p; + bwtint_t n; + uint32_t *p, *end; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; if (k == (bwtint_t)(-1)) return 0; - if (k >= bwt->primary) --k; // because $ is not in bwt + k -= (k >= bwt->primary); // because $ is not in bwt // retrieve Occ at k/OCC_INTERVAL n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; p += sizeof(bwtint_t); // jump to the start of the first BWT cell // calculate Occ up to the last k/32 - j = k >> 5 << 5; - for (l = k/OCC_INTERVAL*OCC_INTERVAL; l < j; l += 32, p += 2) - n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); + end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1); + for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); // calculate Occ n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); diff --git a/kopen.c b/kopen.c index 45f2713..8887932 100644 --- a/kopen.c +++ b/kopen.c @@ -203,7 +203,7 @@ static int ftp_open(const char *fn) static char **cmd2argv(const char *cmd) { int i, beg, end, argc; - char **argv, *p, *q, *str; + char **argv, *str; end = strlen(cmd); for (i = end - 1; i >= 0; --i) if (!isspace(cmd[i])) break; @@ -217,7 +217,7 @@ static char **cmd2argv(const char *cmd) argv = (char**)calloc(argc + 2, sizeof(void*)); argv[0] = str = (char*)calloc(end - beg + 1, 1); strncpy(argv[0], cmd + beg, end - beg); - for (i = argc = 1, q = p = str; i < end - beg; ++i) + for (i = argc = 1; i < end - beg; ++i) if (isspace(str[i])) str[i] = 0; else if (str[i] && str[i-1] 
== 0) argv[argc++] = &str[i]; return argv; From bfb2583d7f52a021bb1ae5d7564f3cf1014a9a0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 12:10:19 -0500 Subject: [PATCH 296/498] r291: summary - bwt.c micro optimization --- main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.c b/main.c index c1c232a..5930a78 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r284-beta" +#define PACKAGE_VERSION "0.6.2-r291-beta" #endif static int usage() From c6b226d71971cfdacb9792ac5cb278ed3615094b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 12:49:48 -0500 Subject: [PATCH 297/498] r292: fixed a very stupid bug on CLI I was thinking 0x10 or 16, but wrote 0x16... --- bwamem.h | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.h b/bwamem.h index 7fc2c85..8a7c7b8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -15,7 +15,7 @@ typedef struct __smem_i smem_i; #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 -#define MEM_F_NO_MULTI 0x16 +#define MEM_F_NO_MULTI 0x10 typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. 
A gap of size k costs q+k*r diff --git a/main.c b/main.c index 5930a78..f7ec3d7 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r291-beta" +#define PACKAGE_VERSION "0.6.2-r292-beta" #endif static int usage() From 619ac4f93d6e0049aff19a22d39685260a03cc28 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 13:03:35 -0500 Subject: [PATCH 298/498] r293: bugfix - wrong RG type in SAM output --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7c837bf..e75a3c4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -584,7 +584,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } - if (bwa_rg_id[0]) { kputsn("\tRG:i:", 6, str); kputs(bwa_rg_id, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); free(cigar); diff --git a/main.c b/main.c index f7ec3d7..473bcd3 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r292-beta" +#define PACKAGE_VERSION "0.6.2-r293-beta" #endif static int usage() From 32f2d60a2e6406c3114a7bd7f6a11f7413dfcb0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 13:14:33 -0500 Subject: [PATCH 299/498] r294: bugfix - -M not working --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index e75a3c4..8d8402b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -517,7 +517,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons kputs(s->name, str); kputc('\t', str); if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate int sam_flag = 
p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag - if (sam_flag&0x10000) sam_flag |= 0x100; + if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) diff --git a/main.c b/main.c index 473bcd3..80c9fb1 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r293-beta" +#define PACKAGE_VERSION "0.6.2-r294-beta" #endif static int usage() From 98787f0ae064241baeebf8cd394913a2b1cc2587 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 13:36:01 -0500 Subject: [PATCH 300/498] r295: generate NM --- bwa.c | 22 ++++++++++++++++++---- bwa.h | 2 +- bwamem.c | 5 +++-- fastmap.c | 3 ++- main.c | 2 +- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/bwa.c b/bwa.c index e8221ca..aef2ec8 100644 --- a/bwa.c +++ b/bwa.c @@ -70,13 +70,13 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) *****************/ // Generate CIGAR when the alignment end points are known -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar) +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { uint32_t *cigar = 0; uint8_t tmp, *rseq; int i, w; int64_t rlen; - *n_cigar = 0; + *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range @@ -95,6 +95,20 @@ 
uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa w += abs(rlen - l_query); // NW alignment *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + {// compute NM + int k, x, y, n_mm = 0, n_gap = 0; + for (k = 0, x = y = 0; k < *n_cigar; ++k) { + int op = cigar[k]&0xf; + int len = cigar[k]>>4; + if (op == 0) { // match + for (i = 0; i < len; ++i) + if (query[x + i] != rseq[y + i]) ++n_mm; + x += len; y += len; + } else if (op == 1) x += len, n_gap += len; + else if (op == 2) y += len, n_gap += len; + } + *NM = n_mm + n_gap; + } if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; @@ -126,10 +140,10 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, } } if (mid >= 0) { - int i, score, n_cigar, y; + int i, score, n_cigar, y, NM; uint32_t *cigar; int64_t x; - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar); + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { int op = cigar[i]&0xf, len = cigar[i]>>4; if (op == 0) { diff --git a/bwa.h b/bwa.h index 2d6c7bf..81d40e0 100644 --- a/bwa.h +++ b/bwa.h @@ -30,7 +30,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar); + uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, 
int64_t *re); char *bwa_idx_infer_prefix(const char *hint); diff --git a/bwamem.c b/bwamem.c index 8d8402b..156e9b7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -496,7 +496,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) - int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0; + int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; uint32_t *cigar = 0; int64_t pos; bwahit_t ptmp, *p = &ptmp; @@ -519,7 +519,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar); + cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? 
p->rb : p->re - 1, &is_rev); @@ -582,6 +582,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons str->s[str->l] = 0; } else kputc('*', str); } + if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } diff --git a/fastmap.c b/fastmap.c index b4f8ea8..81ce665 100644 --- a/fastmap.c +++ b/fastmap.c @@ -47,6 +47,7 @@ int main_mem(int argc, char *argv[]) if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); } + if (opt->n_threads < 1) opt->n_threads = 1; if (optind + 1 >= argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -94,7 +95,7 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - while ((seqs = bseq_read(opt->chunk_size * (ko2? 2 : 1), &n, ks, ks2)) != 0) { + while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; if (!copy_comment) for (i = 0; i < n; ++i) { diff --git a/main.c b/main.c index 80c9fb1..a33830b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r294-beta" +#define PACKAGE_VERSION "0.6.2-r295-beta" #endif static int usage() From 54ab3bbec74658c65ac1be48c631c1b128ea0225 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 14:35:03 -0500 Subject: [PATCH 301/498] Dropped solid2fastq.pl SOLiD is not supported any more. 
--- solid2fastq.pl | 111 ------------------------------------------------- 1 file changed, 111 deletions(-) delete mode 100755 solid2fastq.pl diff --git a/solid2fastq.pl b/solid2fastq.pl deleted file mode 100755 index c60ad81..0000000 --- a/solid2fastq.pl +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/perl -w - -# Author: lh3 -# Note: Ideally, this script should be written in C. It is a bit slow at present. -# Also note that this script is different from the one contained in MAQ. - -use strict; -use warnings; -use Getopt::Std; - -my %opts; -my $version = '0.1.4'; -my $usage = qq{ -Usage: solid2fastq.pl - -Note: is the string showed in the `# Title:' line of a - ".csfasta" read file. Then F3.csfasta is read sequence - file and F3_QV.qual is the quality file. If - R3.csfasta is present, this script assumes reads are - paired; otherwise reads will be regarded as single-end. - - The read name will be :panel_x_y/[12] with `1' for R3 - tag and `2' for F3. Usually you may want to use short - to save diskspace. Long also causes troubles to maq. - -}; - -getopts('', \%opts); -die($usage) if (@ARGV != 2); -my ($title, $pre) = @ARGV; -my (@fhr, @fhw); -my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual'); -my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0; -if ($is_paired) { # paired end - for (0 .. 
3) { - my $fn = "$title$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo - open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die; - open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; - my (@df, @dr); - @df = &read1(1); @dr = &read1(2); - while (@df && @dr) { - if ($df[0] eq $dr[0]) { # mate pair - print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; - @df = &read1(1); @dr = &read1(2); - } else { - if ($df[0] le $dr[0]) { - print {$fhw[2]} $df[1]; - @df = &read1(1); - } else { - print {$fhw[2]} $dr[1]; - @dr = &read1(2); - } - } - } - if (@df) { - print {$fhw[2]} $df[1]; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - } - if (@dr) { - print {$fhw[2]} $dr[1]; - while (@dr = &read1(2, $fhr[2], $fhr[3])) { - print {$fhw[2]} $dr[1]; - } - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[$_]) for (0 .. $#fhw); -} else { # single end - for (0 .. 1) { - my $fn = "$title$fn_suff[$_]"; - $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); - open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); - } - open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; - my @df; - while (@df = &read1(1, $fhr[0], $fhr[1])) { - print {$fhw[2]} $df[1]; - } - close($fhr[$_]) for (0 .. $#fhr); - close($fhw[2]); -} - -sub read1 { - my $i = shift(@_); - my $j = ($i-1)<<1; - my ($key, $seq); - my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); - while (<$fhs>) { - my $t = <$fhq>; - if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { - $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines - die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t); - my $name = "$pre:$1_$2_$3/$i"; - $_ = substr(<$fhs>, 2); - tr/0123./ACGTN/; - my $s = $_; - $_ = <$fhq>; - s/-1\b/0/eg; - s/^(\d+)\s*//; - s/(\d+)\s*/chr($1+33)/eg; - $seq = qq/\@$name\n$s+\n$_\n/; - last; - } - } - return defined($seq)? 
($key, $seq) : (); -} From acd1ab607b8048485e871df139294236f646e679 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 16:26:46 -0500 Subject: [PATCH 302/498] r297: reduce wasteful SW extension This is particularly important for long sequences --- bwamem.c | 20 ++++++++++++++++---- main.c | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 156e9b7..4471682 100644 --- a/bwamem.c +++ b/bwamem.c @@ -648,7 +648,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { - int i, j; + int i, j, k; mem_chain_v chn; mem_alnreg_v regs, tmp; for (i = 0; i < l_seq; ++i) @@ -658,9 +658,21 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * if (bwa_verbose >= 4) mem_print_chain(bns, &chn); kv_init(regs); kv_init(tmp); for (i = 0; i < chn.n; ++i) { - mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, &chn.a[i], &tmp); - for (j = 0; j < tmp.n; ++j) - kv_push(mem_alnreg_t, regs, tmp.a[j]); + mem_chain_t *p = &chn.a[i]; + for (j = 0; j < regs.n; ++j) { // check if all the seeds are contained in alnreg found previously + mem_alnreg_t *q = &regs.a[j]; + for (k = 0; k < p->n; ++k) { + mem_seed_t *s = &p->seeds[k]; + if (!(s->qbeg >= q->qb && s->qbeg + s->len <= q->qe && s->rbeg >= q->rb && s->rbeg + s->len <= q->re)) + break; // stop if seed is not contained + } + if (k == p->n) break; // if all seeds are contained, stop + } + if (j == regs.n) { + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &tmp); + for (j = 0; j < tmp.n; ++j) + kv_push(mem_alnreg_t, regs, tmp.a[j]); + } free(chn.a[i].seeds); } free(chn.a); free(tmp.a); diff --git a/main.c b/main.c index a33830b..f566493 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r295-beta" +#define 
PACKAGE_VERSION "0.6.2-r297-beta" #endif static int usage() From ee80fb8bd07451f0eba4d7dc9f76d507ed325a13 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 26 Feb 2013 22:55:44 -0500 Subject: [PATCH 303/498] Test each seed to see if extension is needed The old version wastefully extends many seeds contained in an aligned region found before. While this wastes little time for short reads, it becomes a serious defect for long query sequences. This is an attempt to fix this problem, but more tuning are needed. --- bwamem.c | 83 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/bwamem.c b/bwamem.c index 4471682..8b2216e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -13,6 +13,7 @@ #include "ksw.h" #include "kvec.h" #include "ksort.h" +#include "utils.h" /* Theory on probability and scoring *ungapped* alignment * @@ -417,6 +418,21 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT * Construct the alignment from a chain * ****************************************/ +static const char LogTable256[256] = { +#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n + -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), + LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) +}; +#undef LT + +static inline int ilog2(uint32_t v) +{ + register uint32_t t, tt; + if ((tt = (v >> 16))) return (t = (tt >> 8)) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; + return (t = (v >> 8)) ? 
8 + LogTable256[t] : LogTable256[v]; +} + static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); @@ -429,8 +445,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; + uint64_t *srt; - av->n = 0; + if (c->n == 0) return; // get the max possible span rmax[0] = l_pac<<1; rmax[1] = 0; for (i = 0; i < c->n; ++i) { @@ -446,11 +463,31 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); if (rlen != rmax[1] - rmax[0]) return; - for (k = 0; k < c->n;) { + srt = malloc(c->n * 8); + for (i = 0; i < c->n; ++i) + srt[i] = (uint64_t)c->seeds[i].len<<32 | i; + ks_introsort_64(c->n, srt); + + for (k = c->n - 1; k >= 0; --k) { mem_alnreg_t *a; + s = &c->seeds[(uint32_t)srt[k]]; + + for (i = 0; i < av->n; ++i) { // test whether extension has been made before + mem_alnreg_t *p = &av->a[i]; + int64_t rd; + int qd, w; + if (s->qbeg < p->qb || s->qbeg + s->len > p->qe || s->rbeg < p->rb || s->rbeg + s->len > p->re) continue; + qd = s->qbeg - p->qb; + rd = s->rbeg - p->rb; + w = ilog2(p->re - p->rb)<<1; // heuristic band width: small size for short hits + w = w < opt->w? 
w : opt->w; + if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + } + if (i < av->n) continue; + a = kv_pushp(mem_alnreg_t, *av); - s = &c->seeds[k]; memset(a, 0, sizeof(mem_alnreg_t)); + if (s->qbeg) { // left extension uint8_t *rs, *qs; int qle, tle; @@ -464,7 +501,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; - if (s->qbeg + s->len != l_query) { // right extension of the first seed + if (s->qbeg + s->len != l_query) { // right extension int qle, tle, qe, re; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; @@ -472,21 +509,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->qe = qe + qle; a->re = rmax[0] + re + tle; } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { const mem_seed_t *t = &c->seeds[i]; if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. 
mapQ, this is good enough } - // jump to the next seed that: 1) has no >7bp overlap with the previous seed, or 2) is not fully contained in the alignment - for (i = k + 1; i < c->n; ++i) { - const mem_seed_t *t = &c->seeds[i]; - if ((t-1)->rbeg + (t-1)->len >= t->rbeg + 7 || (t-1)->qbeg + (t-1)->len >= t->qbeg + 7) break; - if (t->rbeg + t->len > a->re || t->qbeg + t->len > a->qe) break; - } - k = i; } - free(rseq); + free(srt); free(rseq); } /***************************** @@ -648,34 +679,24 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { - int i, j, k; + int i; mem_chain_v chn; - mem_alnreg_v regs, tmp; - for (i = 0; i < l_seq; ++i) + mem_alnreg_v regs; + + for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? seq[i] : nst_nt4_table[(int)seq[i]]; + chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); - kv_init(regs); kv_init(tmp); + + kv_init(regs); for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; - for (j = 0; j < regs.n; ++j) { // check if all the seeds are contained in alnreg found previously - mem_alnreg_t *q = &regs.a[j]; - for (k = 0; k < p->n; ++k) { - mem_seed_t *s = &p->seeds[k]; - if (!(s->qbeg >= q->qb && s->qbeg + s->len <= q->qe && s->rbeg >= q->rb && s->rbeg + s->len <= q->re)) - break; // stop if seed is not contained - } - if (k == p->n) break; // if all seeds are contained, stop - } - if (j == regs.n) { - mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &tmp); - for (j = 0; j < tmp.n; ++j) - kv_push(mem_alnreg_t, regs, tmp.a[j]); - } + mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); free(chn.a[i].seeds); } - free(chn.a); free(tmp.a); + free(chn.a); regs.n = mem_sort_and_dedup(regs.n, regs.a); return regs; } From 
0b533385efff5960506a7832e07196221c476c9f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 00:29:11 -0500 Subject: [PATCH 304/498] r299: better way to exclude seed --- bwamem.c | 35 +++++++++++++---------------------- main.c | 2 +- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 8b2216e..5c526f4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -418,25 +418,11 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT * Construct the alignment from a chain * ****************************************/ -static const char LogTable256[256] = { -#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n - -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6), - LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7) -}; -#undef LT - -static inline int ilog2(uint32_t v) -{ - register uint32_t t, tt; - if ((tt = (v >> 16))) return (t = (tt >> 8)) ? 24 + LogTable256[t] : 16 + LogTable256[tt]; - return (t = (v >> 8)) ? 8 + LogTable256[t] : LogTable256[v]; -} - static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); - return l > 1? l : 1; + l = l > 1? l : 1; + return l < opt->w<<1? l : opt->w<<1; } void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) @@ -475,13 +461,18 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < av->n; ++i) { // test whether extension has been made before mem_alnreg_t *p = &av->a[i]; int64_t rd; - int qd, w; - if (s->qbeg < p->qb || s->qbeg + s->len > p->qe || s->rbeg < p->rb || s->rbeg + s->len > p->re) continue; - qd = s->qbeg - p->qb; - rd = s->rbeg - p->rb; - w = ilog2(p->re - p->rb)<<1; // heuristic band width: small size for short hits - w = w < opt->w? 
w : opt->w; + int qd, w, max_gap; + if (s->rbeg < p->rb || s->rbeg + s->len > p->re || s->qbeg < p->qb || s->qbeg + s->len > p->qe) continue; // not fully contained + // qd: distance ahead of the seed on query; rd: on reference + qd = s->qbeg - p->qb; rd = s->rbeg - p->rb; + max_gap = cal_max_gap(opt, qd < rd? qd : rd); // the maximal gap allowed in regions ahead of the seed + w = max_gap < opt->w? max_gap : opt->w; // bounded by the band width if (qd - rd < w && rd - qd < w) break; // the seed is "around" a previous hit + // similar to the previous four lines, but this time we look at the region behind + qd = p->qe - (s->qbeg + s->len); rd = p->re - (s->rbeg + s->len); + max_gap = cal_max_gap(opt, qd < rd? qd : rd); + w = max_gap < opt->w? max_gap : opt->w; + if (qd - rd < w && rd - qd < w) break; } if (i < av->n) continue; diff --git a/main.c b/main.c index f566493..12fbf20 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r297-beta" +#define PACKAGE_VERSION "0.6.2-r299-beta" #endif static int usage() From 65e099df347d96a845f1161b665e51e352bef6a4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 00:37:17 -0500 Subject: [PATCH 305/498] r300: fixed an out-of-boundary bug in rare case --- bwamem.c | 6 ++++++ main.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 5c526f4..86b3e7a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -445,6 +445,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rmax[1] = rmax[1] > e? rmax[1] : e; if (t->len > max) max = t->len; } + rmax[0] = rmax[0] > 0? rmax[0] : 0; + rmax[1] = rmax[1] < l_pac<<1? 
rmax[1] : l_pac<<1; + if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side + if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac; + else rmax[0] = l_pac; + } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); if (rlen != rmax[1] - rmax[0]) return; diff --git a/main.c b/main.c index 12fbf20..636f818 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r299-beta" +#define PACKAGE_VERSION "0.6.2-r300-beta" #endif static int usage() From b621d3ae38a06484ec4931c944421012b53f775a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 00:42:19 -0500 Subject: [PATCH 306/498] r301: left-align indels Don't know why the change is working... --- ksw.c | 8 ++++---- main.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ksw.c b/ksw.c index 742fec9..4cbcb32 100644 --- a/ksw.c +++ b/ksw.c @@ -492,10 +492,10 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, uint8_t d; // direction p->h = h1; h += q[j]; - d = h > e? 0 : 1; - h = h > e? h : e; - d = h > f? d : 2; - h = h > f? h : f; + d = h >= e? 0 : 1; + h = h >= e? h : e; + d = h >= f? d : 2; + h = h >= f? 
h : f; h1 = h; h -= gapoe; e -= gape; diff --git a/main.c b/main.c index 636f818..7648310 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r300-beta" +#define PACKAGE_VERSION "0.6.2-r301-beta" #endif static int usage() From e620f0ff4ed29b93758128ad1141ec5248b5e591 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 13:16:22 -0500 Subject: [PATCH 307/498] r302: updated the manpage --- bwa.1 | 264 ++++++++++++++++++++++++++++++++++++++++++------------ fastmap.c | 7 +- main.c | 2 +- 3 files changed, 215 insertions(+), 58 deletions(-) diff --git a/bwa.1 b/bwa.1 index 66bc9a2..442d069 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,47 +1,45 @@ -.TH bwa 1 "19 June 2012" "bwa-0.6.2" "Bioinformatics tools" +.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool .SH SYNOPSIS .PP -bwa index -a bwtsw database.fasta +bwa index ref.fa .PP -bwa aln database.fasta short_read.fastq > aln_sa.sai +bwa mem ref.fa reads.fq > aln-se.sam .PP -bwa samse database.fasta aln_sa.sai short_read.fastq > aln.sam +bwa mem ref.fa read1.fq read2.fq > aln-pe.sam .PP -bwa sampe database.fasta aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln.sam +bwa aln ref.fa short_read.fq > aln_sa.sai .PP -bwa bwasw database.fasta long_read.fastq > aln.sam +bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam +.PP +bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam +.PP +bwa bwasw ref.fa long_read.fq > aln.sam .SH DESCRIPTION .PP -BWA is a fast light-weighted tool that aligns relatively short sequences -(queries) to a sequence database (targe), such as the human reference -genome. It implements two different algorithms, both based on -Burrows-Wheeler Transform (BWT). The first algorithm is designed for -short queries up to ~150bp with low error rate (<3%). It does gapped -global alignment w.r.t. 
queries, supports paired-end reads, and is one -of the fastest short read alignment algorithms to date while also -visiting suboptimal hits. The second algorithm, BWA-SW, is designed for -reads longer than 100bp with more errors. It performs a heuristic Smith-Waterman-like -alignment to find high-scoring local hits and split hits. On -low-error short queries, BWA-SW is a little slower and less accurate than the -first algorithm, but on long queries, it is better. -.PP -For both algorithms, the database file in the FASTA format must be -first indexed with the -.B `index' -command, which typically takes a few hours for a 3GB genome. The first algorithm is -implemented via the -.B `aln' -command, which finds the suffix array (SA) coordinates of good hits of -each individual read, and the -.B `samse/sampe' -command, which converts SA coordinates to chromosomal coordinate and -pairs reads (for `sampe'). The second algorithm is invoked by the -.B `bwasw' -command. It works for single-end reads only. +BWA is a software package for mapping low-divergent sequences against a large +reference genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the rest two for longer sequences ranged from +70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read +support and split alignment, but BWA-MEM, which is the latest, is generally +recommended for high-quality queries as it is faster and more accurate. +BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina +reads. + +For all the algorithms, BWA first needs to construct the FM-index for +the reference genome (the +.B index +command). Alignment algorithms are invoked with different sub-commands: +.BR aln / samse / sampe +for BWA-backtrack, +.B bwasw +for BWA-SW and +.B mem +for the BWA-MEM algorithm. 
.SH COMMANDS AND OPTIONS .TP @@ -53,9 +51,6 @@ Index database sequences in the FASTA format. .B OPTIONS: .RS .TP 10 -.B -c -Build color-space index. The input fast should be in nucleotide space. (Disabled since 0.6.x) -.TP .BI -p \ STR Prefix of the output database [same as db filename] .TP @@ -76,6 +71,168 @@ genome. .RE .RE +.TP +.B mem +.B bwa mem +.RB [ -aCHMpP ] +.RB [ -t +.IR nThreads ] +.RB [ -k +.IR minSeedLen ] +.RB [ -w +.IR bandWidth ] +.RB [ -r +.IR seedSplitRatio ] +.RB [ -c +.IR maxOcc ] +.RB [ -A +.IR matchScore ] +.RB [ -B +.IR mmPenalty ] +.RB [ -O +.IR gapOpenPen ] +.RB [ -E +.IR gapExtPen ] +.RB [ -U +.IR unpairPen ] +.RB [ -R +.IR RGline ] +.RB [ -v +.IR verboseLevel ] +.I db.prefix +.I reads.fq +.RI [ mates.fq ] + +Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the +algorithm works by seeding alignments with maximal exact matches (MEMs) and +then extending seeds with the affine-gap Smith-Waterman algorithm (SW). + +If +.I mates.fq +file is absent and option +.B -p +is not set, this command regards input reads are single-end. If +.I mates.fq +is present, this command assumes the +.IR i -th +read in +.I reads.fq +and the +.IR i -th +read in +.I mates.fq +constitute a read pair. If +.B -p +is used, the command assumes the +.RI 2 i -th +and the +.RI (2 i +1)-th +read in +.I reads.fq +constitute a read pair (such input file is said to be interleaved). In this case, +.I mates.fq +is ignored. In the paired-end mode, the +.B mem +command will infer the read orientation and the insert size distribution from a +batch of reads. + +The BWA-MEM algorithm performs local alignment. It may produce multiple primary +alignments for different part of a query sequence. This is a crucial feature +for long sequences. However, some tools such as Picard's markDuplicates does +not work with split alignments. One may consider to use option +.B -M +to flag shorter split hits as secondary. 
+ +.B OPTIONS: +.RS +.TP 10 +.BI -t \ INT +Number of threads [1] +.TP +.BI -k \ INT +Minimum seed length. Matches shorter than +.I INT +will be missed. The alignment speed is usually insensitive to this value unless +it significantly deviates 20. [19] +.TP +.BI -w \ INT +Band width. Essentially, gaps longer than +.I INT +will not be found. Note that the maximum gap length is also affected by the +scoring matrix and the hit length, not solely determined by this option. [100] +.TP +.BI -r \ FLOAT +Trigger re-seeding for a MEM longer than +.IR minSeedLen * FLOAT . +This is a key heuristic parameter for tuning the performance. Larger value +yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5] +.TP +.BI -c \ INT +Discard a MEM if it has more than +.I INT +occurence in the genome. This is an insensitive parameter. [10000] +.TP +.B -P +In the paired-end mode, perform SW to rescue missing hits only but do not try to find +hits that fit a proper pair. +.TP +.BI -A \ INT +Matching score. [1] +.TP +.BI -B \ INT +Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4] +.TP +.BI -O \ INT +Gap open penalty. [6] +.TP +.BI -E \ INT +Gap extension penalty. A gap of length k costs O + k*E (i.e. +.B -O +is for opening a zero-length gap). [1] +.TP +.BI -U \ INT +Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as +.RI scoreRead1+scoreRead2- INT +and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these +two scores to determine whether we should force pairing. [9] +.TP +.B -p +Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details. +.TP +.BI -R \ STR +Complete read group header line. '\\t' can be used in +.I STR +and will be converted to a TAB in the output SAM. The read group ID will be +attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. 
+[null] +.TP +.B -a +Output all found alignments for single-end or unpaired paired-end reads. These +alignments will be flagged as secondary alignments. +.TP +.B -C +Append append FASTA/Q comment to SAM output. This option can be used to +transfer read meta information (e.g. barcode) to the SAM output. Note that the +FASTA/Q comment (the string after a space in the header line) must conform the SAM +spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. +.TP +.B -H +Use hard clipping 'H' in the SAM output. This option may dramatically reduce +the redundancy of output when mapping long contig or BAC sequences. +.TP +.B -M +Mark shorter split hits as secondary (for Picard compatibility). +.TP +.BI -v \ INT +Control the verbose level of the output. This option has not been fully +supported throughout BWA. Ideally, a value 0 for disabling all the output to +stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for +all normal messages; 4 or higher for debugging. When this option takes value +4, the output is not SAM. [3] + +.RE +.RE + .TP .B aln bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i @@ -482,24 +639,6 @@ Pairing is slower for shorter reads. This is mainly because shorter reads have more spurious hits and converting SA coordinates to chromosomal coordinates are very costly. -.SH NOTES ON LONG-READ ALIGNMENT -.PP -Command -.B bwasw -is designed for long-read alignment. BWA-SW essentially aligns the trie -of the reference genome against the directed acyclic word graph (DAWG) of a -read to find seeds not highly repetitive in the genome, and then performs a -standard Smith-Waterman algorithm to extend the seeds. A key heuristic, called -the Z-best heuristic, is that at each vertex in the DAWG, BWA-SW only keeps the -top Z reference suffix intervals that match the vertex. 
BWA-SW is more accurate -if the resultant alignment is supported by more seeds, and therefore BWA-SW -usually performs better on long queries or queries with low divergence to the -reference genome. - -BWA-SW is perhaps a better choice than BWA-short for 100bp single-end HiSeq reads -mainly because it gives better gapped alignment. For paired-end reads, it is yet -to know whether BWA-short or BWA-SW yield overall better results. - .SH CHANGES IN BWA-0.6 .PP Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. @@ -534,16 +673,23 @@ The full BWA package is distributed under GPLv3 as it uses source codes from BWT-SW which is covered by GPL. Sorting, hash table, BWT and IS libraries are distributed under the MIT license. .PP -If you use the short-read alignment component, please cite the following +If you use the BWA-backtrack algorithm, please cite the following paper: .PP Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] .PP -If you use the long-read component (BWA-SW), please cite: +If you use the BWA-SW algorithm, please cite: .PP Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] +.PP +If you use the fastmap component of BWA, please cite: +.PP +Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de +novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178] +.PP +The BWA-MEM algorithm has not been published yet. .SH HISTORY BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW @@ -569,3 +715,11 @@ short-read aligners are being implemented. The BWA-SW algorithm is a new component of BWA. It was conceived in November 2008 and implemented ten months later. 
+ +The BWA-MEM algorithm is based on an algorithm finding super-maximal exact +matches (SMEMs), which was first published with the fermi assembler paper +in 2012. I first implemented the basic SMEM algorithm in the +.B fastmap +command for an experiment and then extended the basic algorithm and added the +extension part in Feburary 2013 to make BWA-MEM a fully featured mapper. + diff --git a/fastmap.c b/fastmap.c index 81ce665..b2d5f39 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:w:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; @@ -56,13 +57,14 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); - fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); +// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -P skip pairing; perform mate SW only\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); + fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -72,6 +74,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); + fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); return 1; diff --git a/main.c b/main.c index 7648310..041a83e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef 
PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r301-beta" +#define PACKAGE_VERSION "0.6.2-r302-beta" #endif static int usage() From 292e92b602f6b198529109887234de5f4b06b84f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 15:39:15 -0500 Subject: [PATCH 308/498] r303: bugfix - wrong band width when CIGAR --- bwa.1 | 2 -- bwa.c | 10 ++++++---- main.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bwa.1 b/bwa.1 index 442d069..198c2ab 100644 --- a/bwa.1 +++ b/bwa.1 @@ -229,8 +229,6 @@ supported throughout BWA. Ideally, a value 0 for disabling all the output to stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for all normal messages; 4 or higher for debugging. When this option takes value 4, the output is not SAM. [3] - -.RE .RE .TP diff --git a/bwa.c b/bwa.c index aef2ec8..3e2f30e 100644 --- a/bwa.c +++ b/bwa.c @@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa { uint32_t *cigar = 0; uint8_t tmp, *rseq; - int i, w; + int i, w, max_gap, min_w; int64_t rlen; *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand @@ -89,10 +89,12 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); // set the band-width - w = (int)((double)(l_query * mat[0] - q) / r + 1.); - w = w < 1? w : 1; + max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = (max_gap + abs(rlen - l_query) + 1) >> 1; w = w < w_? w : w_; - w += abs(rlen - l_query); + min_w = abs(rlen - l_query) + 3; + w = w > min_w? 
w : min_w; // NW alignment *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); {// compute NM diff --git a/main.c b/main.c index 041a83e..da1d5dc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r302-beta" +#define PACKAGE_VERSION "0.6.2-r303-beta" #endif static int usage() From aef179a58006cc96ba6ff0b7cf1465ba470d09c7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 16:55:07 -0500 Subject: [PATCH 309/498] r304: prepare release notes (not released yet) --- NEWS | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/NEWS b/NEWS index d68c693..be7c035 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,47 @@ +Beta Release 0.7.0 (28 Feburary, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query +sequences. BWA-MEM essentially seeds alignments with a variant of the fastmap +algorithm and extends seeds with banded affine-gap-penalty dynamic programming +(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or +longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA +and BWA-SW and is more accurate. It also supports split alignments like BWA-SW +and may optionally output multiple hits like BWA. BWA-MEM does not guarantee +to find hits within a certain edit distance, but BWA is not efficient for such +task given longer reads, either, and the edit-distance criterion is arguably +not as important in long-read alignment. + +In addition to the algorithmic improvements, BWA-SW also implements a few +handy features, some of which are experimental: + + 1. BWA-MEM automatically infers pair orientation from a batch of single-end + alignments. It allows more than one orientations if there are sufficient + reads supporting them. This feature has not been tested on reads from + Illumina jumping library yet. + + 2. 
BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It + is possible to convert a name-sorted BAM to an interleaved fastq on the fly + and feed the data stream to BWA-MEM for mapping. + + 3. BWA-MEM optionally copies FASTA/Q comments to the final SAM output. This + helps to transfer individual read annotations to the output. + + 4. BWA-MEM supports more advanced piping. Users can now run: + (bwa mem ref.fa ' Date: Wed, 27 Feb 2013 16:56:54 -0500 Subject: [PATCH 310/498] r305: in NEWS, convert TAB to space --- NEWS | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index be7c035..0a969e1 100644 --- a/NEWS +++ b/NEWS @@ -16,20 +16,20 @@ In addition to the algorithmic improvements, BWA-SW also implements a few handy features, some of which are experimental: 1. BWA-MEM automatically infers pair orientation from a batch of single-end - alignments. It allows more than one orientations if there are sufficient - reads supporting them. This feature has not been tested on reads from - Illumina jumping library yet. + alignments. It allows more than one orientations if there are sufficient + reads supporting them. This feature has not been tested on reads from + Illumina jumping library yet. 2. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It - is possible to convert a name-sorted BAM to an interleaved fastq on the fly - and feed the data stream to BWA-MEM for mapping. + is possible to convert a name-sorted BAM to an interleaved fastq on the fly + and feed the data stream to BWA-MEM for mapping. 3. BWA-MEM optionally copies FASTA/Q comments to the final SAM output. This helps to transfer individual read annotations to the output. 4. BWA-MEM supports more advanced piping. Users can now run: - (bwa mem ref.fa ' Date: Wed, 27 Feb 2013 21:13:39 -0500 Subject: [PATCH 311/498] r306: introduce clipping penalty More clipping leads to more severe reference bias. 
We should not clip the alignment unless necessary. --- bwa.1 | 9 +++++++++ bwamem.c | 21 +++++++++++++-------- bwamem.h | 4 +++- fastmap.c | 4 +++- ksw.c | 12 +++++++++--- ksw.h | 2 +- main.c | 2 +- 7 files changed, 39 insertions(+), 15 deletions(-) diff --git a/bwa.1 b/bwa.1 index 198c2ab..45b9921 100644 --- a/bwa.1 +++ b/bwa.1 @@ -93,6 +93,8 @@ genome. .IR gapOpenPen ] .RB [ -E .IR gapExtPen ] +.RB [ -L +.IR clipPen ] .RB [ -U .IR unpairPen ] .RB [ -R @@ -190,6 +192,13 @@ Gap extension penalty. A gap of length k costs O + k*E (i.e. .B -O is for opening a zero-length gap). [1] .TP +.BI -L \ INT +Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best +score reaching the end of query. If this score is larger than the best SW score +minus the clipping penalty, clipping will not be applied. Note that in this +case, the SAM AS tag reports the best SW score; clipping penalty is not +deducted. [5] +.TP .BI -U \ INT Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as .RI scoreRead1+scoreRead2- INT diff --git a/bwamem.c b/bwamem.c index 86b3e7a..f5173d3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -42,8 +42,10 @@ mem_opt_t *mem_opt_init() { mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->flag = 0; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->pen_unpaired = 9; + o->pen_clip = 5; o->min_seed_len = 19; o->split_width = 10; o->max_occ = 10000; @@ -54,7 +56,6 @@ mem_opt_t *mem_opt_init() o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; - o->pen_unpaired = 9; o->max_matesw = 100; mem_fill_scmat(o->a, o->b, o->mat); return o; @@ -487,23 +488,27 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle; + int qle, tle, gtle, gscore; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for 
(i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle); - a->qb = s->qbeg - qle; a->rb = s->rbeg - tle; + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, &gtle, &gscore); + // check whether we prefer to reach the end of the query + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits + else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end free(qs); free(rs); } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re; + int qle, tle, qe, re, gtle, gscore; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle); - a->qe = qe + qle; a->re = rmax[0] + re + tle; + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, &gtle, &gscore); + // similar to the above + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; + else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); diff --git a/bwamem.h b/bwamem.h index 8a7c7b8..5c63402 100644 --- a/bwamem.h +++ b/bwamem.h @@ -19,7 +19,10 @@ typedef struct __smem_i smem_i; typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r + int pen_unpaired; // phred-scaled penalty for unpaired reads + int pen_clip; // clipping penalty. This score is not deducted from the DP score. 
int w; // band width + int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor @@ -30,7 +33,6 @@ typedef struct { int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain - int pen_unpaired; // phred-scaled penalty for unpaired reads int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/fastmap.c b/fastmap.c index b2d5f39..56cfb01 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'L') opt->pen_clip = atoi(optarg); else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; @@ -64,6 +65,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); + fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); diff --git a/ksw.c b/ksw.c index 4cbcb32..b97fed5 100644 --- a/ksw.c +++ b/ksw.c @@ -359,11 +359,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore) { eh_t *eh; // score array int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore; if (h0 < 0) h0 = 0; // allocate memory qp = malloc(qlen * m); @@ -385,7 +385,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_gap = max_gap > 1? max_gap : 1; w = w < max_gap? w : max_gap; // DP loop - max = h0, max_i = max_j = -1; + max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1; @@ -421,6 +421,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, f = f > h? f : h; // computed F(i,j+1) } eh[end].h = h1; eh[end].e = 0; + if (j == qlen) { + max_ie = gscore > h1? 
max_ie : i; + gscore = gscore > h1? gscore : h1; + } if (m == 0) break; if (m > max) max = m, max_i = i, max_j = mj; // update beg and end for the next round @@ -433,6 +437,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, free(eh); free(qp); if (_qle) *_qle = max_j + 1; if (_tle) *_tle = max_i + 1; + if (_gtle) *_gtle = max_ie + 1; + if (_gscore) *_gscore = gscore; return max; } diff --git a/ksw.h b/ksw.h index 5162dc0..40216d9 100644 --- a/ksw.h +++ b/ksw.h @@ -62,7 +62,7 @@ extern "C" { */ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore); int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); #ifdef __cplusplus diff --git a/main.c b/main.c index da1d5dc..0590e63 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r303-beta" +#define PACKAGE_VERSION "0.6.2-r306-beta" #endif static int usage() From 64d92d26dfe712be2f28d58ea1be6cdfea881d6f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 21:40:46 -0500 Subject: [PATCH 312/498] more documentation in ksw.h --- Makefile | 17 +++++++++-------- ksw.h | 45 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index de45ff1..98d0eb6 100644 --- a/Makefile +++ b/Makefile @@ -28,14 +28,19 @@ bwa:libbwa.a $(AOBJS) main.o libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) 
-QSufSort.o:QSufSort.h -bwt_gen.o:QSufSort.h - ksw.o:ksw.h +kstring.o:kstring.h utils.o:utils.h ksort.h kseq.h bntseq.o:bntseq.h bwt.o:bwt.h utils.h -bwa.o:bwa.h +bwa.o:bwa.h bwt.h bntseq.h +bwamem.o:ksw.h kbtree.h ksort.h kvec.h kstring.h utils.h bwamem.h +bwamem_pair.o:ksw.h kvec.h kstring.h utils.h bwamem.h + +QSufSort.o:QSufSort.h +bwt_gen.o:QSufSort.h + +fastmap.o:bwt.h bwamem.h bwtaln.o:bwt.h bwtaln.h kseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h @@ -44,9 +49,5 @@ bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h bwtsw2_main.o:bwtsw2.h -bwamem.o:bwamem.h -bwamem_pair.o:bwamem.h -fastmap.o:bwt.h bwamem.h - clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a diff --git a/ksw.h b/ksw.h index 40216d9..d2975de 100644 --- a/ksw.h +++ b/ksw.h @@ -30,7 +30,7 @@ extern "C" { * @param tlen length of the target sequence * @param target target sequence * @param m number of residue types - * @param mat m*m scoring matrix in one-dimention array + * @param mat m*m scoring matrix in one-dimension array * @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)" * @param gape gap extension penalty * @param xtra extra information (see below) @@ -62,8 +62,47 @@ extern "C" { */ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry); - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore); - int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar); + /** + * Banded global alignment + * + * @param qlen query length + * @param query query sequence with 0 <= query[i] < m + * @param tlen target length + * @param target target sequence with 0 <= target[i] < m + * @param m number of residue types + * @param mat m*m 
scoring matrix in one-dimension array + * @param gapo gap open penalty; a gap of length l cost "-(gapo+l*gape)" + * @param gape gap extension penalty + * @param w band width + * @param n_cigar (out) number of CIGAR elements + * @param cigar (out) BAM-encoded CIGAR; caller needs to deallocate with free() + * + * @return score of the alignment + */ + int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar); + + /** + * Extend alignment + * + * The routine aligns $query and $target, assuming their upstream sequences, + * which are not provided, have been aligned with score $h0. In return, + * region [0,*qle) on the query and [0,*tle) on the target sequences are + * aligned together. If *gscore>=0, *gscore keeps the best score such that + * the entire query sequence is aligned; *gtle keeps the position on the + * target where *gscore is achieved. Returning *gscore and *gtle helps the + * caller to decide whether an end-to-end hit or a partial hit is preferred. 
+ * + * The first 9 parameters are identical to those in ksw_global() + * + * @param h0 alignment score of upstream sequences + * @param _qle (out) length of the query in the alignment + * @param _tle (out) length of the target in the alignment + * @param _gtle (out) length of the target if query is fully aligned + * @param _gscore (out) score of the best end-to-end alignment; negative if not found + * + * @return best semi-local alignment score + */ + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore); #ifdef __cplusplus } From df7c3f00004a20a4c80e3d8e9fd27f5874908946 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 22:28:29 -0500 Subject: [PATCH 313/498] r308: added a new API to convert region to CIGAR and an example program demonstrating how to do single-end alignment in <50 lines of C code. --- Makefile | 5 ++++- bwamem.c | 34 ++++++++++++++++++++++++++++++---- bwamem.h | 14 ++++++++++++-- example.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ main.c | 2 +- 5 files changed, 97 insertions(+), 8 deletions(-) create mode 100644 example.c diff --git a/Makefile b/Makefile index 98d0eb6..eab4198 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamli is.o bwtindex.o bwape.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o -PROG= bwa +PROG= bwa bwamem-lite INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . @@ -25,6 +25,9 @@ all:$(PROG) bwa:libbwa.a $(AOBJS) main.o $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa +bwamem-lite:libbwa.a example.o + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. 
-lbwa + libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) diff --git a/bwamem.c b/bwamem.c index f5173d3..9950097 100644 --- a/bwamem.c +++ b/bwamem.c @@ -679,7 +679,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) +mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i; mem_chain_v chn; @@ -703,6 +703,32 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * return regs; } +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) +{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se() + mem_alnreg_v ar; + ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); + mem_mark_primary_se(opt, ar.n, ar.a); + return ar; +} + +// This routine is only used for the API purpose +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar) +{ + mem_aln_t a; + int qb = ar->qb, qe = ar->qe, NM, score, is_rev; + int64_t pos, rb = ar->rb, re = ar->re; + memset(&a, 0, sizeof(mem_aln_t)); + a.mapq = mem_approx_mapq_se(opt, ar); + bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); + a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, opt->w, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + a.NM = NM; + pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); + a.is_rev = is_rev; + a.rid = bns_pos2rid(bns, pos); + a.pos = pos - bns->anns[a.rid].offset; + return a; +} + typedef struct { int start, step, n; const mem_opt_t *opt; @@ -720,11 +746,11 @@ static void *worker1(void *data) int i; if (!(w->opt->flag&MEM_F_PE)) { for (i = w->start; i < w->n; i += w->step) - w->regs[i] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); } else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower for (i = w->start; i < w->n>>1; i += w->step) { - w->regs[i<<1|0] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); - w->regs[i<<1|1] = mem_align1(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); + w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); } } return 0; diff --git a/bwamem.h b/bwamem.h index 5c63402..9996f6b 100644 --- a/bwamem.h +++ b/bwamem.h @@ -49,19 +49,27 @@ typedef struct { int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; +typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; + typedef struct { int low, high, failed; double avg, std; } mem_pestat_t; -typedef struct { +typedef struct { // TODO: This is an intermediate struct only. Better get rid of it. int64_t rb, re; int qb, qe, flag, qual; // optional info int score, sub; } bwahit_t; -typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; +typedef struct { // This struct is only used for the convenience of API. 
+ int rid; + int pos; + uint32_t is_rev:1, mapq:8, NM:23; + int n_cigar; + uint32_t *cigar; +} mem_aln_t; #ifdef __cplusplus extern "C" { @@ -114,6 +122,8 @@ extern "C" { */ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar); + /** * Infer the insert size distribution from interleaved alignment regions * diff --git a/example.c b/example.c new file mode 100644 index 0000000..2fefde4 --- /dev/null +++ b/example.c @@ -0,0 +1,50 @@ +#include <stdio.h> +#include <zlib.h> +#include <string.h> +#include <assert.h> +#include "bwamem.h" +#include "kseq.h" // for the FASTA/Q parser +KSEQ_DECLARE(gzFile) + +int main(int argc, char *argv[]) +{ + bwaidx_t *idx; + gzFile fp; + kseq_t *ks; + mem_opt_t *opt; + + if (argc < 3) { + fprintf(stderr, "Usage: bwamem-lite <idx.base> <reads.fq>\n"); + return 1; + } + + idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index + assert(idx); + fp = strcmp(argv[2], "-")? 
gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + assert(fp); + ks = kseq_init(fp); // initialize the FASTA/Q parser + opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values + + while (kseq_read(ks) >= 0) { // read one sequence + mem_alnreg_v ar; + int i, k; + ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits + for (i = 0; i < ar.n; ++i) { // traverse each hit + mem_aln_t a; + if (ar.a[i].secondary >= 0) continue; // skip secondary alignments + a = mem_reg2aln(opt, idx->bns, idx->pac, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); + for (k = 0; k < a.n_cigar; ++k) + printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); + printf("\t%d\n", a.NM); + free(a.cigar); // don't forget to deallocate CIGAR + } + free(ar.a); // and deallocate the hit list + } + + free(opt); + kseq_destroy(ks); + gzclose(fp); + bwa_idx_destroy(idx); + return 0; +} diff --git a/main.c b/main.c index 0590e63..0009fc6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r306-beta" +#define PACKAGE_VERSION "0.6.2-r308-beta" #endif static int usage() From 6a4d8c79d8b69104a0780c5a3cc837f928460a5e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 22:45:18 -0500 Subject: [PATCH 314/498] r309: bugfix - soft clipping missing in example.c --- bwamem.c | 14 +++++++++++++- bwamem.h | 2 +- example.c | 2 +- main.c | 2 +- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 9950097..6c59f59 100644 --- a/bwamem.c +++ b/bwamem.c @@ -712,7 +712,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * } // This routine is only used for the API purpose -mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const 
mem_alnreg_t *ar) +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar) { mem_aln_t a; int qb = ar->qb, qe = ar->qe, NM, score, is_rev; @@ -722,6 +722,18 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, opt->w, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.NM = NM; + if (qb != 0 || qe != l_query) { // add clipping to CIGAR + int clip5, clip3; + clip5 = is_rev? l_query - qe : qb; + clip3 = is_rev? qb : l_query - qe; + a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); + if (clip5) { + memmove(a.cigar+1, a.cigar, a.n_cigar * 4); + a.cigar[0] = clip5<<4|3; + ++a.n_cigar; + } + if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3; + } pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); a.is_rev = is_rev; a.rid = bns_pos2rid(bns, pos); diff --git a/bwamem.h b/bwamem.h index 9996f6b..c2f124c 100644 --- a/bwamem.h +++ b/bwamem.h @@ -122,7 +122,7 @@ extern "C" { */ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); - mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, const mem_alnreg_t *ar); + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar); /** * Infer the insert size distribution from interleaved alignment regions diff --git a/example.c b/example.c index 2fefde4..c0fede6 100644 --- a/example.c +++ b/example.c @@ -32,7 +32,7 @@ int main(int argc, char *argv[]) for (i = 0; i < ar.n; ++i) { // traverse each hit mem_aln_t a; if (ar.a[i].secondary >= 0) continue; // skip secondary alignments - a = mem_reg2aln(opt, idx->bns, idx->pac, (uint8_t*)ks->seq.s, &ar.a[i]); 
// get forward-strand position and CIGAR + a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); diff --git a/main.c b/main.c index 0009fc6..d301bb4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r308-beta" +#define PACKAGE_VERSION "0.6.2-r309-beta" #endif static int usage() From a33b9c0633cf0a720db1deeaaa892169fbc85087 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 23:40:46 -0500 Subject: [PATCH 315/498] tighter bw for cigar SW --- bwamem.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6c59f59..37e65e2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -552,7 +552,10 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); + int w2 = (int)((double)((p->qe - p->qb < p->re - p->rb? p->qe - p->qb : p->re - p->rb) * mat[0] - p->score - q) / r + 1.499); + w2 = w2 > 1? w2 : 1; + w2 = w2 < w? w2 : w; + cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) } else n_cigar = 0, cigar = 0; pos = bns_depos(bns, p->rb < bns->l_pac? 
p->rb : p->re - 1, &is_rev); @@ -715,13 +718,18 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar) { mem_aln_t a; - int qb = ar->qb, qe = ar->qe, NM, score, is_rev; + int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; int64_t pos, rb = ar->rb, re = ar->re; memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); - a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, opt->w, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + w2 = (int)((double)((qe - qb < re - rb? qe - qb : re - rb) * opt->a - ar->score - opt->q) / opt->r + 1.499); + w2 = w2 > 1? w2 : 1; + w2 = w2 < opt->w? w2 : opt->w; + a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.NM = NM; + pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); + a.is_rev = is_rev; if (qb != 0 || qe != l_query) { // add clipping to CIGAR int clip5, clip3; clip5 = is_rev? l_query - qe : qb; @@ -734,8 +742,6 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3; } - pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); - a.is_rev = is_rev; a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; return a; From f3cff1c609903b71d56a0d1fe94361e95502b140 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 27 Feb 2013 23:59:50 -0500 Subject: [PATCH 316/498] r311: even tighter bw for CIGAR --- bwa.c | 33 +++++++++++++++++++++------------ bwamem.c | 16 ++++++++++++---- main.c | 2 +- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/bwa.c b/bwa.c index 3e2f30e..beea6d1 100644 --- a/bwa.c +++ b/bwa.c @@ -74,7 +74,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa { uint32_t *cigar = 0; uint8_t tmp, *rseq; - int i, w, max_gap, min_w; + int i; int64_t rlen; *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand @@ -86,17 +86,26 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa for (i = 0; i < rlen>>1; ++i) tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } - //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); - // set the band-width - max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); - max_gap = max_gap > 1? max_gap : 1; - w = (max_gap + abs(rlen - l_query) + 1) >> 1; - w = w < w_? w : w_; - min_w = abs(rlen - l_query) + 3; - w = w > min_w? 
w : min_w; - // NW alignment - *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP + cigar = malloc(4); + cigar[0] = l_query<<4 | 0; + *n_cigar = 1; + for (i = 0, *score = 0; i < l_query; ++i) + *score += mat[rseq[i]*5 + query[i]]; + } else { + int w, max_gap, min_w; + //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + // set the band-width + max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); + max_gap = max_gap > 1? max_gap : 1; + w = (max_gap + abs(rlen - l_query) + 1) >> 1; + w = w < w_? w : w_; + min_w = abs(rlen - l_query) + 3; + w = w > min_w? w : min_w; + // NW alignment + *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + } {// compute NM int k, x, y, n_mm = 0, n_gap = 0; for (k = 0, x = y = 0; k < *n_cigar; ++k) { diff --git a/bwamem.c b/bwamem.c index 37e65e2..52dc7fb 100644 --- a/bwamem.c +++ b/bwamem.c @@ -526,6 +526,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int * Basic hit->SAM conversion * *****************************/ +static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) +{ + int w; + if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps + w = ((double)((l1 < l2? 
l1 : l2) * a - score - q) / r + 1.); + if (w < abs(l1 - l2)) w = abs(l1 - l2); + return w; +} + void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) { #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) @@ -552,8 +561,8 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag if (p->flag&0x10000) sam_flag |= 0x100; if (!copy_mate) { - int w2 = (int)((double)((p->qe - p->qb < p->re - p->rb? p->qe - p->qb : p->re - p->rb) * mat[0] - p->score - q) / r + 1.499); - w2 = w2 > 1? w2 : 1; + int w2; + w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r); w2 = w2 < w? w2 : w; cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) @@ -723,8 +732,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); - w2 = (int)((double)((qe - qb < re - rb? qe - qb : re - rb) * opt->a - ar->score - opt->q) / opt->r + 1.499); - w2 = w2 > 1? w2 : 1; + w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); w2 = w2 < opt->w? 
w2 : opt->w; a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.NM = NM; diff --git a/main.c b/main.c index d301bb4..bc40374 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r309-beta" +#define PACKAGE_VERSION "0.6.2-r311-beta" #endif static int usage() From 39fcde9c19eb9b5dbd08648431386ad451725646 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 28 Feb 2013 00:58:24 -0500 Subject: [PATCH 317/498] updated NEWS further --- NEWS | 25 +++++++++++++++++-------- example.c | 5 +++-- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 0a969e1..25ad9ff 100644 --- a/NEWS +++ b/NEWS @@ -13,23 +13,32 @@ task given longer reads, either, and the edit-distance criterion is arguably not as important in long-read alignment. In addition to the algorithmic improvements, BWA-SW also implements a few -handy features, some of which are experimental: +handy features in practical aspects: - 1. BWA-MEM automatically infers pair orientation from a batch of single-end + 1. BWA-MEM automatically switches between local and glocal (global wrt reads; + local wrt reference) alignment. It reports the end-to-end glocal alignment + if the glocal alignment is not much worse than the optimal local alignment. + Glocal alignment reduces reference bias. + + 2. BWA-MEM automatically infers pair orientation from a batch of single-end alignments. It allows more than one orientations if there are sufficient - reads supporting them. This feature has not been tested on reads from - Illumina jumping library yet. + supporting reads. This feature has not been tested on reads from Illumina + jumping library yet. (EXPERIMENTAL) - 2. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It + 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. 
It is possible to convert a name-sorted BAM to an interleaved fastq on the fly and feed the data stream to BWA-MEM for mapping. - 3. BWA-MEM optionally copies FASTA/Q comments to the final SAM output. This + 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which helps to transfer individual read annotations to the output. - 4. BWA-MEM supports more advanced piping. Users can now run: + 5. BWA-MEM supports more advanced piping. Users can now run: (bwa mem ref.fa '= 0) continue; // skip secondary alignments a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + // print alignment printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); - for (k = 0; k < a.n_cigar; ++k) + for (k = 0; k < a.n_cigar; ++k) // print CIGAR printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); - printf("\t%d\n", a.NM); + printf("\t%d\n", a.NM); // print edit distance free(a.cigar); // don't forget to deallocate CIGAR } free(ar.a); // and deallocate the hit list From c5434ac865b71fe6fc842d63ffebb7aedc217c7a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 28 Feb 2013 15:56:05 -0500 Subject: [PATCH 318/498] r313: release bwa-0.7.0 --- NEWS | 6 +++--- main.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index 25ad9ff..35202f1 100644 --- a/NEWS +++ b/NEWS @@ -9,10 +9,10 @@ longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA and BWA-SW and is more accurate. It also supports split alignments like BWA-SW and may optionally output multiple hits like BWA. BWA-MEM does not guarantee to find hits within a certain edit distance, but BWA is not efficient for such -task given longer reads, either, and the edit-distance criterion is arguably +task given longer reads anyway, and the edit-distance criterion is arguably not as important in long-read alignment. 
-In addition to the algorithmic improvements, BWA-SW also implements a few +In addition to the algorithmic improvements, BWA-MEM also implements a few handy features in practical aspects: 1. BWA-MEM automatically switches between local and glocal (global wrt reads; @@ -47,7 +47,7 @@ for 76bp or longer Illumina reads and long query sequences. The original BWA short-read algorithm will not deliver satisfactory results for 150bp+ Illumina reads. Change of mappers will be necessary sooner or later. -(0.7.0 beta: 28 Feburary 2013, r304) +(0.7.0 beta: 28 Feburary 2013, r313) diff --git a/main.c b/main.c index bc40374..ba60cf7 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.6.2-r311-beta" +#define PACKAGE_VERSION "0.7.0-r313" #endif static int usage() From 3e4a178e084397ced83533680219c88549b1619f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Mar 2013 11:14:51 -0500 Subject: [PATCH 319/498] r314: cleanup bwamem API Don't modify input sequences; more documentations --- bwamem.c | 18 ++++++++++++++---- bwamem.h | 29 +++++++++++++++++++++-------- example.c | 2 +- main.c | 2 +- 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/bwamem.c b/bwamem.c index 52dc7fb..7efbb8d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -715,20 +715,29 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse return regs; } -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) -{ // the difference from mem_align1_core() lies in that this routine calls mem_mark_primary_se() +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) +{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence mem_alnreg_v ar; + char *seq; + seq = malloc(l_seq); + memcpy(seq, seq_, 
l_seq); // makes a copy of seq_ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); mem_mark_primary_se(opt, ar.n, ar.a); + free(seq); return ar; } // This routine is only used for the API purpose -mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar) +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; + int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; int64_t pos, rb = ar->rb, re = ar->re; + uint8_t *query; + + query = malloc(l_query); + for (i = 0; i < l_query; ++i) // convert to the nt4 encoding + query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); @@ -752,6 +761,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; + free(query); return a; } diff --git a/bwamem.h b/bwamem.h index c2f124c..a856fa5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -64,11 +64,11 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of } bwahit_t; typedef struct { // This struct is only used for the convenience of API. 
- int rid; - int pos; - uint32_t is_rev:1, mapq:8, NM:23; - int n_cigar; - uint32_t *cigar; + int rid; // reference sequence index in bntseq_t + int pos; // forward strand 5'-end mapping position + uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance + int n_cigar; // number of CIGAR operations + uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 } mem_aln_t; #ifdef __cplusplus @@ -116,13 +116,26 @@ extern "C" { * @param bns Information of the reference * @param pac 2-bit encoded reference * @param l_seq length of query sequence - * @param seq query sequence; conversion ACGTN/acgtn=>01234 to be applied + * @param seq query sequence * * @return list of aligned regions. */ - mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq); - mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, uint8_t *query, const mem_alnreg_t *ar); + /** + * Generate CIGAR and forward-strand position from alignment region + * + * @param opt alignment parameters + * @param bwt FM-index of the reference sequence + * @param bns Information of the reference + * @param pac 2-bit encoded reference + * @param l_seq length of query sequence + * @param seq query sequence + * @param ar one alignment region + * + * @return CIGAR, strand, mapping quality and forward-strand position + */ + mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); /** * Infer the insert size distribution from interleaved alignment regions diff --git a/example.c b/example.c index 6564cbd..b59eec2 100644 --- a/example.c +++ b/example.c @@ -32,7 +32,7 @@ int main(int argc, char *argv[]) for 
(i = 0; i < ar.n; ++i) { // traverse each hit mem_aln_t a; if (ar.a[i].secondary >= 0) continue; // skip secondary alignments - a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, (uint8_t*)ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR + a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR diff --git a/main.c b/main.c index ba60cf7..fee5bd6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r313" +#define PACKAGE_VERSION "0.7.0-r314" #endif static int usage() From 35fb7f9fdfb6bc1396e63f9727301a3a12e9b155 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 1 Mar 2013 11:47:51 -0500 Subject: [PATCH 320/498] r315: move kopen.o out of libbwa.a --- Makefile | 4 ++-- bwamem.h | 3 +-- main.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index eab4198..36f951f 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,9 @@ CFLAGS= -g -Wall -O2 CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 -LOBJS= utils.o kstring.o ksw.o kopen.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o \ + is.o bwtindex.o bwape.o kopen.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa bwamem-lite diff --git a/bwamem.h b/bwamem.h index a856fa5..3f4ce32 100644 --- a/bwamem.h +++ b/bwamem.h @@ -109,7 +109,7 @@ extern "C" { * Find the aligned regions for one query sequence * * Note that this routine does not generate CIGAR. 
CIGAR should be - * generated later by bwa_gen_cigar() defined in bwa.c. + * generated later by mem_reg2aln() below. * * @param opt alignment parameters * @param bwt FM-index of the reference sequence @@ -126,7 +126,6 @@ extern "C" { * Generate CIGAR and forward-strand position from alignment region * * @param opt alignment parameters - * @param bwt FM-index of the reference sequence * @param bns Information of the reference * @param pac 2-bit encoded reference * @param l_seq length of query sequence diff --git a/main.c b/main.c index fee5bd6..7499609 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r314" +#define PACKAGE_VERSION "0.7.0-r315" #endif static int usage() From d35f33b5135ce2ea0017e34e04967c34ed04a1bf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 10:22:18 -0500 Subject: [PATCH 321/498] r316: don't allocate zero-length memory It is not a bug, but Electric Fence does not like that. --- bntseq.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bntseq.c b/bntseq.c index 972837e..540e966 100644 --- a/bntseq.c +++ b/bntseq.c @@ -124,7 +124,7 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); l_pac = xx; xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); - bns->ambs = (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)); + bns->ambs = bns->n_holes? 
(bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; for (i = 0; i < bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; fscanf(fp, "%lld%d%s", &xx, &p->len, str); diff --git a/main.c b/main.c index 7499609..91a62cc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r315" +#define PACKAGE_VERSION "0.7.0-r316" #endif static int usage() From 1a451df80082872890dd3a0904abb837be9c4f03 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 10:32:33 -0500 Subject: [PATCH 322/498] prepare to ditch stdaln.{h,c} --- bwtsw2_pair.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index cf29087..cad96e9 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -7,11 +7,7 @@ #include "bwtsw2.h" #include "kstring.h" #include "utils.h" -#ifndef _NO_SSE2 #include "ksw.h" -#else -#include "stdaln.h" -#endif #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 @@ -126,8 +122,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b for (i = 0; i < l_mseq; ++i) // on the forward strand seq[i] = nst_nt4_table[(int)mseq[i]]; } -#ifndef _NO_SSE2 - { // FIXME!!! The following block has not been tested since the update of the ksw library + { int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0) | opt->t; kswr_t aln; aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); @@ -146,24 +141,6 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len); */ } -#else - { - AlnParam ap; - path_t path[2]; - int matrix[25]; - for (i = 0; i < 25; ++i) matrix[i] = g_mat[i]; - ap.gap_open = opt->q; ap.gap_ext = opt->r; ap.gap_end = opt->r; - ap.matrix = matrix; ap.row = 5; ap.band_width = 50; - a->G = aln_local_core(ref, end - beg, seq, l_mseq, &ap, path, 0, opt->t, &a->G2); - if (a->G < opt->t) a->G = 0; - if (a->G2 < opt->t) a->G2 = 0; - if (a->G2) a->flag |= BSW2_FLAG_TANDEM; - a->k = beg + path[0].i - 1; - a->len = path[1].i - path[0].i + 1; - a->beg = path[0].j - 1; - a->end = path[1].j; - } -#endif if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; free(seq); } From 7e00dbcac524f09f6e49b14b118c2fb374ef5867 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 11:35:23 -0500 Subject: [PATCH 323/498] r317: bugfix - out-of-range extension This happens when target region crosses the forward-reverse boundary. This will almost never happen to short-read alignment. 
--- bwamem.c | 19 +++++++++++-------- main.c | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7efbb8d..10cfae8 100644 --- a/bwamem.c +++ b/bwamem.c @@ -176,7 +176,7 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) KBTREE_INIT(chn, mem_chain_t, chain_cmp) -static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t *p) +static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p) { int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; @@ -184,6 +184,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t rend = last->rbeg + last->len; if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing + if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain @@ -197,7 +198,7 @@ static int test_and_merge(const mem_opt_t *opt, mem_chain_t *c, const mem_seed_t return 0; // request to add a new chain } -static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *itr) +static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *tree, smem_i *itr) { const bwtintv_v *a; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); @@ -216,9 +217,10 @@ static void mem_insert_seed(const mem_opt_t *opt, kbtree_t(chn) *tree, smem_i *i s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; + if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) 
continue; // bridging forward-reverse boundary; skip if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, lower, &s)) to_add = 1; + if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1; } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; @@ -249,7 +251,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) } } -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq) +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq) { mem_chain_v chain; smem_i *itr; @@ -260,7 +262,7 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int len, const uin tree = kb_init(chn, KB_DEFAULT_SIZE); itr = smem_itr_init(bwt); smem_set_query(itr, len, seq); - mem_insert_seed(opt, tree, itr); + mem_insert_seed(opt, l_pac, tree, itr); kv_resize(mem_chain_t, chain, kb_size(tree)); @@ -449,12 +451,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rmax[0] = rmax[0] > 0? rmax[0] : 0; rmax[1] = rmax[1] < l_pac<<1? 
rmax[1] : l_pac<<1; if (rmax[0] < l_pac && l_pac < rmax[1]) { // crossing the forward-reverse boundary; then choose one side - if (l_pac - rmax[0] > rmax[1] - l_pac) rmax[1] = l_pac; + if (c->seeds[0].rbeg < l_pac) rmax[1] = l_pac; // this works because all seeds are guaranteed to be on the same strand else rmax[0] = l_pac; } // retrieve the reference sequence rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); - if (rlen != rmax[1] - rmax[0]) return; + assert(rlen == rmax[1] - rmax[0]); srt = malloc(c->n * 8); for (i = 0; i < c->n; ++i) @@ -505,6 +507,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int qle, tle, qe, re, gtle, gscore; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; + assert(re >= 0); a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore); // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; @@ -700,7 +703,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? 
seq[i] : nst_nt4_table[(int)seq[i]]; - chn = mem_chain(opt, bwt, l_seq, (uint8_t*)seq); + chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); diff --git a/main.c b/main.c index 91a62cc..ec69e81 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r316" +#define PACKAGE_VERSION "0.7.0-r317" #endif static int usage() From 40f121473621f8ef8afbc253ab22f2395a4e0827 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 11:52:11 -0500 Subject: [PATCH 324/498] change to debugging code only --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 10cfae8..848ade7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -513,7 +513,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (bwa_verbose >= 4) printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); + if (bwa_verbose >= 4) { printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { From 733410b50d4bcd772851c4d0930823c0227d1d67 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 14:43:49 -0500 Subject: [PATCH 325/498] r320: speed up very long sequence alignment 100-200bp read alignment should not be affected at all. 
--- bwamem.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- main.c | 2 +- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 848ade7..84c8424 100644 --- a/bwamem.c +++ b/bwamem.c @@ -421,6 +421,68 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT * Construct the alignment from a chain * ****************************************/ +/* mem_chain2aln() vs mem_chain2aln_short() + * + * mem_chain2aln() covers all the functionality of mem_chain2aln_short(). + * However, it may waste time on extracting the reference sequences given a + * very long query. mem_chain2aln_short() is faster for very short chains in a + * long query. It may fail when the matches are long or reach the end of the + * query. In this case, mem_chain2aln() will be called again. + * mem_chain2aln_short() is almost never used for short-read alignment. + */ + +#define MEM_SHORT_EXT 50 +#define MEM_SHORT_LEN 200 + +int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +{ + int i, qb, qe, xtra; + int64_t rb, re, rlen; + uint8_t *rseq = 0; + mem_alnreg_t a; + kswr_t x; + + if (c->n == 0) return -1; + qb = l_query; qe = 0; + rb = l_pac<<1; re = 0; + memset(&a, 0, sizeof(mem_alnreg_t)); + for (i = 0; i < c->n; ++i) { + const mem_seed_t *s = &c->seeds[i]; + qb = qb < s->qbeg? qb : s->qbeg; + qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len; + rb = rb < s->rbeg? rb : s->rbeg; + re = re > s->rbeg + s->len? re : s->rbeg + s->len; + a.seedcov += s->len; + } + qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT; + if (qb <= 10 || qe >= l_query - 10) return 1; // because ksw_align() does not support end-to-end alignment + rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT; + rb = rb > 0? rb : 0; + re = re < l_pac<<1? 
re : l_pac<<1; + if (rb < l_pac && l_pac < re) { + if (c->seeds[0].rbeg < l_pac) re = l_pac; + else rb = l_pac; + } + if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1; + if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; + if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; + + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + assert(rlen == re - rb); + xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); + x = ksw_align(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->q, opt->r, xtra, 0); + free(rseq); + if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; + + a.rb = rb + x.tb; a.re = rb + x.te + 1; + a.qb = qb + x.qb; a.qe = qb + x.qe + 1; + a.score = x.score; + a.csub = x.score2; + kv_push(mem_alnreg_t, *av, a); + if (bwa_verbose >= 4) printf("SHORT: [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); + return 0; +} + static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); @@ -429,7 +491,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) } void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) -{ // FIXME: in general, we SHOULD check funny seed patterns such as contained seeds. 
When that happens, we should use a SW or extend more seeds +{ int i, k; int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; @@ -710,7 +772,9 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse kv_init(regs); for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; - mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); + int ret; + ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); + if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); free(chn.a[i].seeds); } free(chn.a); diff --git a/main.c b/main.c index ec69e81..be31bf0 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r317" +#define PACKAGE_VERSION "0.7.0-r320" #endif static int usage() From 59bc9341f6e71d7b59774e8a6e51b864fa3c9084 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 17:29:07 -0500 Subject: [PATCH 326/498] code backup; more changes coming later --- bwamem.c | 28 ++++++++++++++++++++-------- bwamem.h | 2 ++ ksw.c | 13 +++++++++---- ksw.h | 2 +- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/bwamem.c b/bwamem.c index 84c8424..42788c4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -43,7 +43,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; o->pen_unpaired = 9; o->pen_clip = 5; o->min_seed_len = 19; @@ -492,7 +492,7 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { - int i, k; + int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension int64_t rlen, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; @@ -549,16 +549,22 @@ void mem_chain2aln(const mem_opt_t
*opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); + a->w = aw[0] = aw[1] = opt->w; if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle, gtle, gscore; + int qle, tle, gtle, gscore, tmps = -1; qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, opt->w, s->len * opt->a, &qle, &tle, >le, &gscore); + for (aw[0] = opt->w;; aw[0] <<= 1) { + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); + if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; + tmps = a->score; + } // check whether we prefer to reach the end of the query if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end @@ -566,16 +572,21 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore; + int qle, tle, qe, re, gtle, gscore, tmps = -1; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, opt->w, a->score, &qle, &tle, >le, &gscore); + for (aw[1] = opt->w;; aw[1] <<= 1) { + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], a->score, &qle, &tle, >le, &gscore, &max_off[1]); + if (bwa_verbose >= 4) 
printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); + if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; + tmps = a->score; + } // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; else a->qe = l_query, a->re = rmax[0] + re + gtle; } else a->qe = l_query, a->re = s->rbeg + s->len; - if (bwa_verbose >= 4) { printf("[%d] score=%d\t[%d,%d) <=> [%ld,%ld)\n", k, a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } + if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { @@ -583,6 +594,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (t->qbeg >= a->qb && t->qbeg + t->len <= a->qe && t->rbeg >= a->rb && t->rbeg + t->len <= a->re) // seed fully contained a->seedcov += t->len; // this is not very accurate, but for approx. mapQ, this is good enough } + a->w = aw[0] > aw[1]? aw[0] : aw[1]; } free(srt); free(rseq); } @@ -750,7 +762,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); if (k == 0) mapq0 = h.qual; else if (h.qual > mapq0) h.qual = mapq0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); s->sam = str.s; diff --git a/bwamem.h b/bwamem.h index 3f4ce32..4d632e7 100644 --- a/bwamem.h +++ b/bwamem.h @@ -22,6 +22,7 @@ typedef struct { int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip; // clipping penalty. 
This score is not deducted from the DP score. int w; // band width + int max_w; // max band width int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length @@ -45,6 +46,7 @@ typedef struct { int sub; // 2nd best SW score int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits + int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary } mem_alnreg_t; diff --git a/ksw.c b/ksw.c index b97fed5..3747cb0 100644 --- a/ksw.c +++ b/ksw.c @@ -359,11 +359,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore; + int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off; if (h0 < 0) h0 = 0; // allocate memory qp = malloc(qlen * m); @@ -386,6 +386,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, w = w < max_gap? w : max_gap; // DP loop max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; + max_off = 0; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { int f = 0, h1, m = 0, mj = -1; @@ -410,7 +411,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h = h > e? h : e; h = h > f? h : f; h1 = h; // save H(i,j) to h1 for the next column - mj = m > h? mj : j; + mj = m > h? mj : j; // record the position where max score is achieved m = m > h? 
m : h; // m is stored at eh[mj+1] h -= gapoe; h = h > 0? h : 0; @@ -426,7 +427,10 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, gscore = gscore > h1? gscore : h1; } if (m == 0) break; - if (m > max) max = m, max_i = i, max_j = mj; + if (m > max) { + max = m, max_i = i, max_j = mj; + max_off = max_off > abs(mj - i)? max_off : abs(mj - i); + } // update beg and end for the next round for (j = mj; j >= beg && eh[j].h; --j); beg = j + 1; @@ -439,6 +443,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, if (_tle) *_tle = max_i + 1; if (_gtle) *_gtle = max_ie + 1; if (_gscore) *_gscore = gscore; + if (_max_off) *_max_off = max_off; return max; } diff --git a/ksw.h b/ksw.h index d2975de..6d1f7cf 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } From d6096c3f997d52f40aeef1c352461dbd45fd163b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 4 Mar 2013 18:41:57 -0500 Subject: [PATCH 327/498] bugfix: caused by the latest change --- bwamem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 42788c4..d751cfb 100644 --- a/bwamem.c +++ b/bwamem.c @@ -572,12 +572,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore, tmps = -1; + int qle, tle, qe, re, gtle, gscore, tmps = -1, sc0 = a->score; qe = s->qbeg + s->len; re = 
s->rbeg + s->len - rmax[0]; assert(re >= 0); for (aw[1] = opt->w;; aw[1] <<= 1) { - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], a->score, &qle, &tle, >le, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; tmps = a->score; From e0991d6a459daa27031df17405e0d62f0395a6f1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 00:34:33 -0500 Subject: [PATCH 328/498] r323: added Z-dropoff, a variant of blast's X-drop --- bwamem.c | 5 +++-- bwamem.h | 1 + fastmap.c | 6 +++++- ksw.c | 4 ++-- ksw.h | 2 +- main.c | 2 +- 6 files changed, 13 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index d751cfb..20d8bdb 100644 --- a/bwamem.c +++ b/bwamem.c @@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init() o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; + o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; o->min_seed_len = 19; @@ -560,7 +561,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; for (aw[0] = opt->w;; aw[0] <<= 1) { - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < 
(aw[0]>>1) + (aw[0]>>2)) break; tmps = a->score; @@ -577,7 +578,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int re = s->rbeg + s->len - rmax[0]; assert(re >= 0); for (aw[1] = opt->w;; aw[1] <<= 1) { - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], sc0, &qle, &tle, >le, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; tmps = a->score; diff --git a/bwamem.h b/bwamem.h index 4d632e7..7d6e821 100644 --- a/bwamem.h +++ b/bwamem.h @@ -23,6 +23,7 @@ typedef struct { int pen_clip; // clipping penalty. This score is not deducted from the DP score. 
int w; // band width int max_w; // max band width + int zdrop; // Z-dropoff int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length diff --git a/fastmap.c b/fastmap.c index 56cfb01..eb93994 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,9 +26,10 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:W:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); + else if (c == 'W') opt->max_w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); @@ -42,6 +43,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'c') opt->max_occ = atoi(optarg); + else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'C') copy_comment = 1; @@ -57,6 +59,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); + fprintf(stderr, " -W INT max band width [%d]\n", opt->max_w); + fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); diff --git a/ksw.c b/ksw.c index 3747cb0..5666a8f 100644 --- a/ksw.c +++ b/ksw.c @@ -359,7 +359,7 @@ typedef 
struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile @@ -426,7 +426,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_ie = gscore > h1? max_ie : i; gscore = gscore > h1? gscore : h1; } - if (m == 0) break; + if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? max_off : abs(mj - i); diff --git a/ksw.h b/ksw.h index 6d1f7cf..2dd6499 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } diff --git a/main.c b/main.c index be31bf0..dd1a481 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r320" +#define PACKAGE_VERSION "0.7.0-r323-beta" #endif static int usage() From efd9769b07114d2d8539e6115277c7e24ce2930f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 00:57:16 -0500 Subject: [PATCH 329/498] r324: a little code cleanup The changes after r317 aim to improve the performance and 
accuracy for very long query alignment. The short-read alignment should not be affected. The changes include: 1) Z-dropoff. This is a variant of blast's X-dropoff. I originally thought this heuristic only improves speed, but now I realize it also reduces poor alignment with long good flanking alignments. The difference from blast's X-dropoff is that Z-dropoff allows big gaps, but X-dropoff does not. 2) Band width doubling. When band width is too small, we will get a poor alignment in the middle. Sometimes such alignments cannot be fully excluded with Z-dropoff. Band width doubling is an alternative heuristic. It is based on the observation that the existence of a close-to-boundary high score possibly implies inadequate band width. When we see such a signal, we double the band width. --- bwamem.c | 26 +++++++++++++++----------- bwamem.h | 1 - fastmap.c | 4 +--- main.c | 2 +- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 20d8bdb..a6897ba 100644 --- a/bwamem.c +++ b/bwamem.c @@ -43,7 +43,7 @@ mem_opt_t *mem_opt_init() mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 60; o->max_w = 500; + o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; @@ -434,6 +434,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORT #define MEM_SHORT_EXT 50 #define MEM_SHORT_LEN 200 +#define MAX_BAND_TRY 2 int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { @@ -551,20 +552,22 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; + a->score = -1; if (s->qbeg) { // left extension uint8_t *rs, *qs; - int qle, tle, gtle, gscore, tmps = -1; + int qle, tle, gtle, gscore; qs =
malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; - for (aw[0] = opt->w;; aw[0] <<= 1) { + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[0] = opt->w << i; a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); - if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[0], max_off[0]); fflush(stdout); - if (a->score == tmps || aw[0]<<1 > opt->max_w || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; - tmps = a->score; + if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); + if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits @@ -573,15 +576,16 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; if (s->qbeg + s->len != l_query) { // right extension - int qle, tle, qe, re, gtle, gscore, tmps = -1, sc0 = a->score; + int qle, tle, qe, re, gtle, gscore, sc0 = a->score; qe = s->qbeg + s->len; re = s->rbeg + s->len - rmax[0]; assert(re >= 0); - for (aw[1] = opt->w;; aw[1] <<= 1) { + for (i = 0; i < MAX_BAND_TRY; ++i) { + int prev = a->score; + aw[1] = opt->w << i; a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); - if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", tmps, a->score, aw[1], max_off[1]); fflush(stdout); - if (a->score == tmps || aw[1]<<1 > opt->max_w || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; - tmps = a->score; + if 
(bwa_verbose >= 4) printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); + if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; diff --git a/bwamem.h b/bwamem.h index 7d6e821..96a3308 100644 --- a/bwamem.h +++ b/bwamem.h @@ -22,7 +22,6 @@ typedef struct { int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip; // clipping penalty. This score is not deducted from the DP score. int w; // band width - int max_w; // max band width int zdrop; // Z-dropoff int flag; // see MEM_F_* macros diff --git a/fastmap.c b/fastmap.c index eb93994..40c3a02 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,10 +26,9 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:W:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); - else if (c == 'W') opt->max_w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); @@ -59,7 +58,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -W INT max band width [%d]\n", opt->max_w); fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); diff --git a/main.c b/main.c index 
dd1a481..8b49c8b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r323-beta" +#define PACKAGE_VERSION "0.7.0-r324-beta" #endif static int usage() From 07921659cf28db08961545351ba188287c40c2b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 09:38:12 -0500 Subject: [PATCH 330/498] move mem_fill_scmat() to bwa.{h,c} --- bwa.c | 11 +++++++++++ bwa.h | 1 + bwamem.c | 13 +------------ fastmap.c | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/bwa.c b/bwa.c index beea6d1..76b54ae 100644 --- a/bwa.c +++ b/bwa.c @@ -69,6 +69,17 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) * CIGAR related * *****************/ +void bwa_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} + // Generate CIGAR when the alignment end points are known uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { diff --git a/bwa.h b/bwa.h index 81d40e0..9d5b2aa 100644 --- a/bwa.h +++ b/bwa.h @@ -30,6 +30,7 @@ extern "C" { bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); + void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); diff --git a/bwamem.c b/bwamem.c index a6897ba..cd3a1f5 100644 --- a/bwamem.c +++ b/bwamem.c @@ -58,21 +58,10 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads 
= 1; o->max_matesw = 100; - mem_fill_scmat(o->a, o->b, o->mat); + bwa_fill_scmat(o->a, o->b, o->mat); return o; } -void mem_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; -} - /*************************** * SMEM iterator interface * ***************************/ diff --git a/fastmap.c b/fastmap.c index 40c3a02..d4e5626 100644 --- a/fastmap.c +++ b/fastmap.c @@ -84,7 +84,7 @@ int main_mem(int argc, char *argv[]) return 1; } - mem_fill_scmat(opt->a, opt->b, opt->mat); + bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak bwa_print_sam_hdr(idx->bns, rg_line); From 086c9d0e7dbba0519c598b819876c2b230780f32 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 09:54:49 -0500 Subject: [PATCH 331/498] bwa-sw: use bwa_gen_cigar() for cigar generation --- bwtsw2_aux.c | 56 +++++++++++----------------------------------------- 1 file changed, 11 insertions(+), 45 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index bc12d20..4455c5f 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -169,33 +169,22 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, } /* generate CIGAR array(s) in b->cigar[] */ -static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) +static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name) { - uint8_t *target; - int i, matrix[25]; - AlnParam par; - path_t *path; + int i; + int8_t mat[25]; - par.matrix = matrix; - __gen_ap(par, opt); - i = ((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq; // maximum possible target length - target = calloc(i, 1); - path = calloc(i + lq, sizeof(path_t)); - // generate CIGAR + bwa_fill_scmat(opt->a, 
opt->b, mat); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; bsw2aux_t *q = b->aux + i; uint8_t *query; - bwtint_t k; - int path_len, beg, end; + int beg, end, score; if (p->l) continue; beg = (p->flag & 0x10)? lq - p->end : p->beg; end = (p->flag & 0x10)? lq - p->beg : p->end; query = seq[(p->flag & 0x10)? 1 : 0] + beg; - for (k = p->k; k < p->k + p->len; ++k) // in principle, no out-of-boundary here - target[k - p->k] = pac[k>>2] >> (~k&3)*2 & 0x3; - aln_global_core(target, p->len, query, end - beg, &par, path, &path_len); - q->cigar = aln_path2cigar32(path, path_len, &q->n_cigar); + q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm); #if 0 if (name && score != p->G) { // debugging only int j, glen = 0; @@ -206,7 +195,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); } #endif - if (beg != 0 || end < lq) { // write soft clipping + if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); @@ -219,7 +208,6 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], const uint8 } } } - free(target); free(path); } /* this is for the debugging purpose only */ @@ -407,27 +395,6 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c return n_cigar; } -static int compute_nm(bsw2hit_t *p, int n_cigar, const uint32_t *cigar, const uint8_t *pac, const uint8_t *seq) -{ - int k, x, n_mm = 0, i, n_gap = 0; - bwtint_t y; - x = 0; y = p->k; - for (k = 0; k < n_cigar; ++k) { - int op = cigar[k]&0xf; - int len = cigar[k]>>4; - if (op == 0) { // match - for (i = 0; i < len; ++i) { - int ref = pac[(y+i)>>2] >> (~(y+i)&3)*2 & 0x3; - if (seq[x + i] != ref) ++n_mm; - } - x += len; y += len; - } else if (op == 1) x += len, n_gap += 
len; - else if (op == 2) y += len, n_gap += len; - else if (op == 4) x += len; - } - return n_mm + n_gap; -} - static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) { int i; @@ -439,7 +406,7 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 } b->aux = calloc(b->n, sizeof(bsw2aux_t)); // generate CIGAR - gen_cigar(opt, qlen, seq, pac, b, name); + gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name); // fix CIGAR, generate mapQ, and write chromosomal position for (i = 0; i < b->n; ++i) { bsw2hit_t *p = &b->hits[i]; @@ -451,8 +418,6 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 int subo; // fix out-of-boundary CIGAR q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); - // compute the NM tag - q->nm = compute_nm(p, q->n_cigar, q->cigar, pac, seq[p->is_rev]); // compute mapQ subo = p->G2 > opt->t? p->G2 : opt->t; if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; @@ -527,9 +492,10 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks bsw2aux_t *q = b->aux + i; int j, beg, end, type = 0; // print mandatory fields before SEQ + if (q->cigar == 0) q->flag |= 0x4; ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); - if (p->l == 0) { // not a repetitive hit + if (p->l == 0 && q->cigar) { // not a repetitive hit ksprintf(&str, "\t%d\t", q->pqual); for (k = 0; k < q->n_cigar; ++k) ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); @@ -538,7 +504,7 @@ static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); // get the sequence begin and end beg = 0; end = ks->l; - if (opt->hard_clip) { + if (opt->hard_clip && q->cigar) { if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; } From e6c262594fcb6c48aa4f1ced3f49827cfdb5543f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 10:12:38 -0500 Subject: [PATCH 332/498] bwa-sw: ditch stdaln --- bwtsw2_aux.c | 38 +++++++++++++++++--------------------- ksw.c | 2 +- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 4455c5f..6527495 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -11,9 +11,9 @@ #include "bwt_lite.h" #include "utils.h" #include "bwtsw2.h" -#include "stdaln.h" #include "kstring.h" #include "bwa.h" +#include "ksw.h" #include "kseq.h" KSEQ_DECLARE(gzFile) @@ -94,13 +94,12 @@ bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target = 0, *query; - AlnParam par; + int8_t mat[25]; - par.matrix = matrix; - __gen_ap(par, opt); + bwa_fill_scmat(opt->a, opt->b, mat); query = calloc(lq, 1); // sort according to the descending order of query end ks_introsort(hit, b->n, b->hits); @@ -111,8 +110,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int score, j; - path_t path; + int score, j, qle, tle; p->n_seeds = 1; if (p->l || p->k == 0) continue; for (j = score = 0; j < i; ++j) { @@ -127,12 +125,12 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! 
target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + lq - p->beg, p->beg, &par, &path, 0, p->G, _mem); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; - p->len += path.i; - p->beg -= path.j; - p->k -= path.i; + p->k -= tle; + p->len += tle; + p->beg -= qle; } } free(query); free(target); @@ -140,29 +138,27 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) { - int i, matrix[25]; + int i; bwtint_t k; uint8_t *target; - AlnParam par; - - par.matrix = matrix; - __gen_ap(par, opt); + int8_t mat[25]; + + bwa_fill_scmat(opt->a, opt->b, mat); target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int j, score; - path_t path; + int j, score, qle, tle; if (p->l) continue; for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = aln_extend_core(target, lt, query + p->beg, lq - p->beg, &par, &path, 0, 1, _mem); + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0); // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; - p->len = path.i; - p->end = path.j + p->beg; + p->len = tle; + p->end = p->beg + qle; } } free(target); diff --git a/ksw.c b/ksw.c index 5666a8f..e331390 100644 --- a/ksw.c +++ b/ksw.c @@ -426,7 +426,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, max_ie = gscore > h1? max_ie : i; gscore = gscore > h1? 
gscore : h1; } - if (m == 0 || max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop) break; // drop to zero, or below Z-dropoff + if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? max_off : abs(mj - i); From bb37e14d02bea1eb13dfa78e4b25065429efec41 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 10:38:47 -0500 Subject: [PATCH 333/498] replace aln_global in bwase.c --- bwase.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bwase.c b/bwase.c index 2dd783b..9e2696b 100644 --- a/bwase.c +++ b/bwase.c @@ -11,6 +11,7 @@ #include "utils.h" #include "kstring.h" #include "bwa.h" +#include "ksw.h" int g_log_n[256]; @@ -164,12 +165,13 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l int ext, int *n_cigar, int is_end_correct) { bwa_cigar_t *cigar = 0; + uint32_t *cigar32 = 0; ubyte_t *ref_seq; - int l = 0, path_len, ref_len; - AlnParam ap = aln_param_bwa; - path_t *path; + int l = 0, ref_len; int64_t k, __pos = *_pos; + int8_t mat[25]; + bwa_fill_scmat(1, 3, mat); ref_len = len + abs(ext); if (ext > 0) { ref_seq = (ubyte_t*)calloc(ref_len, 1); @@ -181,10 +183,11 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l for (l = 0, k = x - ref_len > 0? 
x - ref_len : 0; k < x && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; } - path = (path_t*)calloc(l+len, sizeof(path_t)); - aln_global_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len); - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); + ksw_global(len, seq, l, ref_seq, 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand for (l = k = 0; k < *n_cigar; ++k) { @@ -206,7 +209,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); *_pos = (bwtint_t)__pos; - free(ref_seq); free(path); + free(ref_seq); return cigar; } From 98f896675094c3bb12203717f29b45757e5fd056 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 12:00:24 -0500 Subject: [PATCH 334/498] r329: ditch stdaln.{c,h}; no changes to bwa-mem stdaln.{c,h} was written ten years ago. Its local and SW extension code are actually buggy (though that rarely happens and usually does not affect the results too much). ksw.{c,h} is more concise, potentially faster, less buggy, and richer in features. 
--- Makefile | 6 +- bwape.c | 41 ++- bwase.c | 6 +- bwtaln.c | 15 - bwtaln.h | 12 +- main.c | 2 +- stdaln.c | 1070 ------------------------------------------------------ stdaln.h | 162 --------- 8 files changed, 33 insertions(+), 1281 deletions(-) delete mode 100644 stdaln.c delete mode 100644 stdaln.h diff --git a/Makefile b/Makefile index 36f951f..96c3047 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CXXFLAGS= $(CFLAGS) AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o -AOBJS= QSufSort.o bwt_gen.o stdaln.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ +AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o @@ -48,8 +48,8 @@ fastmap.o:bwt.h bwamem.h bwtaln.o:bwt.h bwtaln.h kseq.h bwtgap.o:bwtgap.h bwtaln.h bwt.h -bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h -bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h stdaln.h +bwtsw2_core.o:bwtsw2.h bwt.h bwt_lite.h +bwtsw2_aux.o:bwtsw2.h bwt.h bwt_lite.h bwtsw2_main.o:bwtsw2.h clean: diff --git a/bwape.c b/bwape.c index 0b2b8d6..9fd12b1 100644 --- a/bwape.c +++ b/bwape.c @@ -8,9 +8,9 @@ #include "kvec.h" #include "bntseq.h" #include "utils.h" -#include "stdaln.h" #include "bwase.h" #include "bwa.h" +#include "ksw.h" typedef struct { int n; @@ -397,16 +397,17 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw #define SW_MIN_MAPQ 17 // cnt = n_mm<<16 | n_gapo<<8 | n_gape -bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, - int *n_cigar, uint32_t *_cnt) +bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) { + kswr_t r; + uint32_t *cigar32 = 0; bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; 
bwtint_t k, x, y, l; - int path_len, ret, subo; - AlnParam ap = aln_param_bwa; - path_t *path, *p; + int xtra; + int8_t mat[25]; + bwa_fill_scmat(1, 3, mat); // check whether there are too many N's if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; for (k = 0, x = 0; k < len; ++k) @@ -417,15 +418,19 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u ref_seq = (ubyte_t*)calloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - path = (path_t*)calloc(l+len, sizeof(path_t)); // do alignment - ret = aln_local_core(ref_seq, l, (ubyte_t*)seq, len, &ap, path, &path_len, 1, &subo); - if (ret < 0 || subo == ret) { // no hit or tandem hits - free(path); free(cigar); free(ref_seq); *n_cigar = 0; + xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); + r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); + ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + cigar = (bwa_cigar_t*)cigar32; + for (k = 0; k < *n_cigar; ++k) + cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); + + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } - cigar = bwa_aln_path2cigar(path, path_len, n_cigar); // check whether the alignment is good enough for (k = 0, x = y = 0; k < *n_cigar; ++k) { @@ -435,17 +440,14 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u else y += __cigar_len(c); } if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough - free(path); free(cigar); free(ref_seq); + free(cigar); free(ref_seq); *n_cigar = 0; return 0; } { // update cigar and coordinate; - int start, end; - p = path + path_len - 1; - *beg += (p->i? p->i : 1) - 1; - start = (p->j? 
p->j : 1) - 1; - end = path->j; + int start = r.qb, end = r.qe + 1; + *beg += r.tb; cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); @@ -462,8 +464,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u { // set *cnt int n_mm, n_gapo, n_gape; n_mm = n_gapo = n_gape = 0; - p = path + path_len - 1; - x = p->i? p->i - 1 : 0; y = p->j? p->j - 1 : 0; + x = r.tb; y = r.qb; for (k = 0; k < *n_cigar; ++k) { bwa_cigar_t c = cigar[k]; if (__cigar_op(c) == FROM_M) { @@ -479,7 +480,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u *_cnt = (uint32_t)n_mm<<16 | n_gapo<<8 | n_gape; } - free(ref_seq); free(path); + free(ref_seq); return cigar; } diff --git a/bwase.c b/bwase.c index 9e2696b..eebe22b 100644 --- a/bwase.c +++ b/bwase.c @@ -4,7 +4,6 @@ #include #include #include -#include "stdaln.h" #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" @@ -205,8 +204,8 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end // change "I" at either end of the read to S. just in case. This should rarely happen... 
- if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(3, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(3, (__cigar_len(cigar[0]))); + if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(FROM_S, (__cigar_len(cigar[*n_cigar-1]))); + if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(FROM_S, (__cigar_len(cigar[0]))); *_pos = (bwtint_t)__pos; free(ref_seq); @@ -589,5 +588,6 @@ int bwa_sai2sam_se(int argc, char *argv[]) return 0; } bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); + free(prefix); return 0; } diff --git a/bwtaln.c b/bwtaln.c index 96d4026..d132157 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -312,18 +312,3 @@ int bwa_aln(int argc, char *argv[]) free(opt); free(prefix); return 0; } - -/* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, -__cigar_op and __cigar_len while keeping stdaln stand alone */ -bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - bwa_cigar_t *cigar; - int i; - cigar32 = aln_path2cigar32((path_t*) path, path_len, n_cigar); - cigar = (bwa_cigar_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = __cigar_create( (cigar32[i]&0xf), (cigar32[i]>>4) ); - return cigar; -} - diff --git a/bwtaln.h b/bwtaln.h index 412cc04..556f259 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -28,6 +28,11 @@ #define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) #endif +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 +#define FROM_S 3 + typedef struct { bwtint_t w; int bid; @@ -138,13 +143,6 @@ extern "C" { void bwa_cs2nt_core(bwa_seq_t *p, bwtint_t l_pac, ubyte_t *pac); - - /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, - __cigar_op and __cigar_len while keeping stdaln stand alone */ -#include "stdaln.h" - - bwa_cigar_t *bwa_aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - #ifdef 
__cplusplus } #endif diff --git a/main.c b/main.c index 8b49c8b..00a21b9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r324-beta" +#define PACKAGE_VERSION "0.7.0-r329-beta" #endif static int usage() diff --git a/stdaln.c b/stdaln.c deleted file mode 100644 index cd064cf..0000000 --- a/stdaln.c +++ /dev/null @@ -1,1070 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, 2009, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -#include -#include -#include -#include -#include "stdaln.h" - -/* char -> 17 (=16+1) nucleotides */ -unsigned char aln_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,16 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15, 1,14, 4, 11,15,15, 2, 13,15,15,10, 15, 5,15,15, - 15,15, 3, 6, 8,15, 7, 9, 0,12,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; -char *aln_nt16_rev_table = "XAGRCMSVTWKDYHBN-"; - -/* char -> 5 (=4+1) nucleotides */ -unsigned char aln_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 2, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; -char *aln_nt4_rev_table = "AGCTN-"; - -/* 
char -> 22 (=20+1+1) amino acids */ -unsigned char aln_aa_table[256] = { - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,20,21, 21,22 /*'-'*/,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21, 0,21, 4, 3, 6,13, 7, 8, 9,21,11, 10,12, 2,21, - 14, 5, 1,15, 16,21,19,17, 21,18,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21, - 21,21,21,21, 21,21,21,21, 21,21,21,21, 21,21,21,21 -}; -char *aln_aa_rev_table = "ARNDCQEGHILKMFPSTWYV*X-"; - /* 01234567890123456789012 */ - -/* translation table. They are useless in stdaln.c, but when you realize you need it, you need not write the table again. 
*/ -unsigned char aln_trans_table_eu[66] = { - 11,11, 2, 2, 1, 1,15,15, 16,16,16,16, 9,12, 9, 9, - 6, 6, 3, 3, 7, 7, 7, 7, 0, 0, 0, 0, 19,19,19,19, - 5, 5, 8, 8, 1, 1, 1, 1, 14,14,14,14, 10,10,10,10, - 20,20,18,18, 20,17, 4, 4, 15,15,15,15, 10,10,13,13, 21, 22 -}; -char *aln_trans_table_eu_char = "KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX"; - /* 01234567890123456789012345678901234567890123456789012345678901234 */ -int aln_sm_blosum62[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, - -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, - -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, - -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, - 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, - -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, - -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, - 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, - -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, - -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, - -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, - -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, - -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, - -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, - -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, - -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, - -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, - 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, - -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, - 
0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 -}; - -int aln_sm_blosum45[] = { -/* A R N D C Q E G H I L K M F P S T W Y V * X */ - 5,-2,-1,-2,-1,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-2,-2, 0,-5, 0, - -2, 7, 0,-1,-3, 1, 0,-2, 0,-3,-2, 3,-1,-2,-2,-1,-1,-2,-1,-2,-5,-1, - -1, 0, 6, 2,-2, 0, 0, 0, 1,-2,-3, 0,-2,-2,-2, 1, 0,-4,-2,-3,-5,-1, - -2,-1, 2, 7,-3, 0, 2,-1, 0,-4,-3, 0,-3,-4,-1, 0,-1,-4,-2,-3,-5,-1, - -1,-3,-2,-3,12,-3,-3,-3,-3,-3,-2,-3,-2,-2,-4,-1,-1,-5,-3,-1,-5,-2, - -1, 1, 0, 0,-3, 6, 2,-2, 1,-2,-2, 1, 0,-4,-1, 0,-1,-2,-1,-3,-5,-1, - -1, 0, 0, 2,-3, 2, 6,-2, 0,-3,-2, 1,-2,-3, 0, 0,-1,-3,-2,-3,-5,-1, - 0,-2, 0,-1,-3,-2,-2, 7,-2,-4,-3,-2,-2,-3,-2, 0,-2,-2,-3,-3,-5,-1, - -2, 0, 1, 0,-3, 1, 0,-2,10,-3,-2,-1, 0,-2,-2,-1,-2,-3, 2,-3,-5,-1, - -1,-3,-2,-4,-3,-2,-3,-4,-3, 5, 2,-3, 2, 0,-2,-2,-1,-2, 0, 3,-5,-1, - -1,-2,-3,-3,-2,-2,-2,-3,-2, 2, 5,-3, 2, 1,-3,-3,-1,-2, 0, 1,-5,-1, - -1, 3, 0, 0,-3, 1, 1,-2,-1,-3,-3, 5,-1,-3,-1,-1,-1,-2,-1,-2,-5,-1, - -1,-1,-2,-3,-2, 0,-2,-2, 0, 2, 2,-1, 6, 0,-2,-2,-1,-2, 0, 1,-5,-1, - -2,-2,-2,-4,-2,-4,-3,-3,-2, 0, 1,-3, 0, 8,-3,-2,-1, 1, 3, 0,-5,-1, - -1,-2,-2,-1,-4,-1, 0,-2,-2,-2,-3,-1,-2,-3, 9,-1,-1,-3,-3,-3,-5,-1, - 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-3,-1,-2,-2,-1, 4, 2,-4,-2,-1,-5, 0, - 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-1,-1, 2, 5,-3,-1, 0,-5, 0, - -2,-2,-4,-4,-5,-2,-3,-2,-3,-2,-2,-2,-2, 1,-3,-4,-3,15, 3,-3,-5,-2, - -2,-1,-2,-2,-3,-1,-2,-3, 2, 0, 0,-1, 0, 3,-3,-2,-1, 3, 8,-1,-5,-1, - 0,-2,-3,-3,-1,-3,-3,-3,-3, 3, 1,-2, 1, 0,-3,-1, 0,-3,-1, 5,-5,-1, - -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5, 1,-5, - 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0,-2,-1,-1,-5,-1 -}; - -int aln_sm_nt[] = { -/* X A G R C M S V T W K D Y H B N */ - -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2, - -2, 2,-1, 1,-2, 1,-2, 0,-2, 1,-2, 0,-2, 0,-2, 0, - -2,-1, 2, 1,-2,-2, 1, 0,-2,-2, 1, 0,-2,-2, 0, 0, - -2, 1, 1, 1,-2,-1,-1, 0,-2,-1,-1, 0,-2, 0, 0, 0, - -2,-2,-2,-2, 2, 1, 1, 0,-1,-2,-2,-2, 1, 0, 0, 0, - -2, 1,-2,-1, 1, 
1,-1, 0,-2,-1,-2, 0,-1, 0, 0, 0, - -2,-2, 1,-1, 1,-1, 1, 0,-2,-2,-1, 0,-1, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2,-1,-2,-2,-2, 2, 1, 1, 0, 1, 0, 0, 0, - -2, 1,-2,-1,-2,-1,-2, 0, 1, 1,-1, 0,-1, 0, 0, 0, - -2,-2, 1,-1,-2,-2,-1, 0, 1,-1, 1, 0,-1, 0, 0, 0, - -2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2,-2,-2, 1,-1,-1, 0, 1,-1,-1, 0, 1, 0, 0, 0, - -2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_read[] = { -/* X A G R C M S V T W K D Y H B N */ - -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, - -17, 2,-17, 1,-17, 1,-17, 0,-17, 1,-17, 0,-17, 0,-17, 0, - -17,-17, 2, 1,-17,-17, 1, 0,-17,-17, 1, 0,-17,-17, 0, 0, - -17, 1, 1, 1,-17,-17,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17,-17,-17, 2, 1, 1, 0,-17,-17,-17,-17, 1, 0, 0, 0, - -17, 1,-17,-17, 1, 1,-17, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17, 1,-17, 1, 0,-17,-17,-17, 0,-17, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17,-17,-17,-17,-17, 2, 1, 1, 0, 1, 0, 0, 0, - -17, 1,-17,-17,-17,-17,-17, 0, 1, 1,-17, 0,-17, 0, 0, 0, - -17,-17, 1,-17,-17,-17,-17, 0, 1,-17, 1, 0,-17, 0, 0, 0, - -17, 0, 0, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17,-17,-17, 1,-17,-17, 0, 1,-17,-17, 0, 1, 0, 0, 0, - -17, 0,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17,-17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -int aln_sm_hs[] = { -/* A G C T N */ - 91, -31,-114,-123, -44, - -31, 100,-125,-114, -42, - -123,-125, 100, -31, -42, - -114,-114, -31, 91, -42, - -44, -42, -42, -42, -43 -}; - -int aln_sm_maq[] = { - 11, -19, -19, -19, -13, - -19, 11, -19, -19, -13, - -19, -19, 11, -19, -13, - -19, -19, -19, 11, -13, - -13, -13, -13, -13, -13 -}; - -int aln_sm_blast[] = { - 1, -3, -3, -3, -2, - -3, 1, -3, -3, -2, - -3, -3, 1, -3, -2, - -3, -3, -3, 1, -2, - -2, -2, -2, -2, 
-2 -}; - -/********************/ -/* START OF align.c */ -/********************/ - -AlnParam aln_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; -AlnParam aln_param_bwa = { 26, 9, 5, aln_sm_maq, 5, 50 }; -AlnParam aln_param_nt2nt = { 8, 2, 2, aln_sm_nt, 16, 75 }; -AlnParam aln_param_rd2rd = { 1, 19, 19, aln_sm_read, 16, 75 }; -AlnParam aln_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; - -AlnAln *aln_init_AlnAln() -{ - AlnAln *aa; - aa = (AlnAln*)malloc(sizeof(AlnAln)); - aa->path = 0; - aa->out1 = aa->out2 = aa->outm = 0; - aa->path_len = 0; - return aa; -} -void aln_free_AlnAln(AlnAln *aa) -{ - free(aa->path); free(aa->cigar32); - free(aa->out1); free(aa->out2); free(aa->outm); - free(aa); -} - -/***************************/ -/* START OF common_align.c */ -/***************************/ - -#define LOCAL_OVERFLOW_THRESHOLD 32000 -#define LOCAL_OVERFLOW_REDUCE 16000 -#define NT_LOCAL_SCORE int -#define NT_LOCAL_SHIFT 16 -#define NT_LOCAL_MASK 0xffff - -#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; - -#define set_M(MM, cur, p, sc) \ -{ \ - if ((p)->M >= (p)->I) { \ - if ((p)->M >= (p)->D) { \ - (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } else { \ - if ((p)->I > (p)->D) { \ - (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ - } else { \ - (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ - } \ - } \ -} -#define set_I(II, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_ext; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_ext; \ - } \ -} -#define set_end_I(II, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->I) { \ - (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end; \ - } \ - } else set_I(II, cur, p); \ -} -#define set_D(DD, cur, p) \ -{ \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - 
gap_ext; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_ext; \ - } \ -} -#define set_end_D(DD, cur, p) \ -{ \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->D) { \ - (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_end; \ - } else { \ - (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end; \ - } \ - } else set_D(DD, cur, p); \ -} - -typedef struct -{ - unsigned char Mt:3, It:2, Dt:2; -} dpcell_t; - -typedef struct -{ - int M, I, D; -} dpscore_t; - -/* build score profile for accelerating alignment, in theory */ -void aln_init_score_array(unsigned char *seq, int len, int row, int *score_matrix, int **s_array) -{ - int *tmp, *tmp2, i, k; - for (i = 0; i != row; ++i) { - tmp = score_matrix + i * row; - tmp2 = s_array[i]; - for (k = 0; k != len; ++k) - tmp2[k] = tmp[seq[k]]; - } -} -/*************************** - * banded global alignment * - ***************************/ -int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len) -{ - register int i, j; - dpcell_t **dpcell, *q; - dpscore_t *curr, *last, *s; - path_t *p; - int b1, b2, tmp_end; - int *mat, end, max; - unsigned char type, ctype; - - int gap_open, gap_ext, gap_end, b; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - gap_end = ap->gap_end; - b = ap->band_width; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) { - *path_len = 0; - return 0; - } - /* calculate b1 and b2 */ - if (len1 > len2) { - b1 = len1 - len2 + b; - b2 = b; - } else { - b1 = b; - b2 = len2 - len1 + b; - } - if (b1 > len1) b1 = len1; - if (b2 > len2) b2 = len2; - --seq1; --seq2; - - /* allocate memory */ - end = (b1 + b2 <= len1)? 
(b1 + b2 + 1) : (len1 + 1); - dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); - for (j = 0; j <= len2; ++j) - dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] -= j - b2; - curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); - - /* set first row */ - SET_INF(*curr); curr->M = 0; - for (i = 1, s = curr + 1; i < b1; ++i, ++s) { - SET_INF(*s); - set_end_D(s->D, dpcell[0] + i, s - 1); - } - s = curr; curr = last; last = s; - - /* core dynamic programming, part 1 */ - tmp_end = (b2 < len2)? b2 : len2 - 1; - for (j = 1; j <= tmp_end; ++j) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - /* last row for part 1, use set_end_D() instead of set_D() */ - if (j == len2 && b2 != len2 - 1) { - q = dpcell[j]; s = curr; SET_INF(*s); - set_end_I(s->I, q, last); - end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - ++s; ++q; - for (i = 1; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! 
*/ - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_end_D(s->D, q, s - 1); - if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ - set_end_I(s->I, q, last + i); - } else s->I = MINOR_INF; - s = curr; curr = last; last = s; - ++j; - } - - /* core dynamic programming, part 2 */ - for (; j <= len2 - b2 + 1; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - end = j + b1 - 1; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_D(s->D, q, s - 1); - s->I = MINOR_INF; - s = curr; curr = last; last = s; - } - - /* core dynamic programming, part 3 */ - for (; j < len2; ++j) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - /* last row */ - if (j == len2) { - SET_INF(curr[j - b2]); - mat = score_matrix + seq2[j] * N_MATRIX_ROW; - for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { - set_M(s->M, q, last + i - 1, mat[seq1[i]]); - set_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - } - set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); - set_end_I(s->I, q, last + i); - set_end_D(s->D, q, s - 1); - s = curr; curr = last; last = s; - } - - /* backtrace */ - i = len1; j = len2; - q = dpcell[j] + i; - s = last + len1; - max = s->M; type = q->Mt; ctype = FROM_M; - if (s->I > max) { max = s->I; type = q->It; ctype = FROM_I; } - if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } - - p = path; - p->ctype = ctype; p->i = i; 
p->j = j; /* bug fixed 040408 */ - ++p; - do { - switch (ctype) { - case FROM_M: --i; --j; break; - case FROM_I: --j; break; - case FROM_D: --i; break; - } - q = dpcell[j] + i; - ctype = type; - switch (type) { - case FROM_M: type = q->Mt; break; - case FROM_I: type = q->It; break; - case FROM_D: type = q->Dt; break; - } - p->ctype = ctype; p->i = i; p->j = j; - ++p; - } while (i || j); - *path_len = p - path - 1; - - /* free memory */ - for (j = b2 + 1; j <= len2; ++j) - dpcell[j] += j - b2; - for (j = 0; j <= len2; ++j) - free(dpcell[j]); - free(dpcell); - free(curr); free(last); - - return max; -} -/************************************************* - * local alignment combined with banded strategy * - *************************************************/ -int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo) -{ - register NT_LOCAL_SCORE *s; - register int i; - int q, r, qr, tmp_len, qr_shift; - int **s_array, *score_array; - int e, f; - int is_overflow, of_base; - NT_LOCAL_SCORE *eh, curr_h, last_h, curr_last_h; - int j, start_i, start_j, end_i, end_j; - path_t *p; - int score_f, score_r, score_g; - int start, end, max_score; - int thres, *suba, *ss; - - int gap_open, gap_ext; - int *score_matrix, N_MATRIX_ROW; - - /* initialize some align-related parameters. just for compatibility */ - gap_open = ap->gap_open; - gap_ext = ap->gap_ext; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - thres = _thres > 0? 
_thres : -_thres; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - suba = (int*)malloc(sizeof(int) * (len2 + 1)); - eh = (NT_LOCAL_SCORE*)malloc(sizeof(NT_LOCAL_SCORE) * (len1 + 1)); - s_array = (int**)malloc(sizeof(int*) * N_MATRIX_ROW); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int*)malloc(sizeof(int) * len1); - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - q = gap_open; - r = gap_ext; - qr = q + r; - qr_shift = (qr+1) << NT_LOCAL_SHIFT; - tmp_len = len1 + 1; - start_i = start_j = end_i = end_j = 0; - for (i = 0, max_score = 0; i != N_MATRIX_ROW * N_MATRIX_ROW; ++i) - if (max_score < score_matrix[i]) max_score = score_matrix[i]; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* forward dynamic programming */ - for (i = 0, s = eh; i != tmp_len; ++i, ++s) *s = 0; - score_f = 0; - is_overflow = of_base = 0; - suba[0] = 0; - for (j = 1, ss = suba + 1; j <= len2; ++j, ++ss) { - int subo = 0; - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - /* If LOCAL_OVERFLOW_REDUCE is too small, optimal alignment might be missed. - * If it is too large, this block will be excuted frequently and therefore - * slow down the whole program. - * Acually, smaller LOCAL_OVERFLOW_REDUCE might also help to reduce the - * number of assignments because it sets some cells to zero when overflow - * happens. 
*/ - int tmp, tmp2; - score_f -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = 1, s = eh; i <= tmp_len; ++i, ++s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = 1, s = eh; i != tmp_len; ++i, ++s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - if (*(s+1) >= qr_shift) { /* initialize e */ - curr_last_h = *(s+1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - } else *s = last_h << NT_LOCAL_SHIFT; /* e = 0 */ - last_h = curr_h; - if (subo < curr_h) subo = curr_h; - if (score_f < curr_h) { - score_f = curr_h; end_i = i; end_j = j; - if (score_f > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - *ss = subo + of_base; - } - score_f += of_base; - - if (score_f < thres) { /* no matching residue at all, 090218 */ - if (path_len) *path_len = 0; - goto end_func; - } - if (path == 0) goto end_func; /* skip path-filling */ - - /* reverse dynamic programming */ - for (i = end_i, s = eh + end_i; i >= 0; --i, --s) *s = 0; - if (end_i == 0 || end_j == 0) goto end_func; /* no local match */ - score_r = score_matrix[seq1[end_i] * N_MATRIX_ROW + seq2[end_j]]; - is_overflow = of_base = 0; - start_i = end_i; start_j = end_j; - eh[end_i] = ((NT_LOCAL_SCORE)(qr + score_r)) << NT_LOCAL_SHIFT; /* in order to initialize f and e, 040408 */ - start = end_i - 1; - end = end_i - 3; - if (end <= 0) end = 0; - - /* second pass DP can be done in a band, speed will 
thus be enhanced */ - for (j = end_j - 1; j != 0; --j) { - last_h = f = 0; - score_array = s_array[seq2[j]]; - if (is_overflow) { /* adjust eh[] array if overflow occurs. */ - int tmp, tmp2; - score_r -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start, s = eh + start + 1; i >= end; --i, --s) { - tmp = *s >> NT_LOCAL_SHIFT; tmp2 = *s & NT_LOCAL_MASK; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << NT_LOCAL_SHIFT) | tmp2; - } - } - for (i = start, s = eh + start + 1; i != end; --i, --s) { - /* prepare for calculate current h */ - curr_h = (*s >> NT_LOCAL_SHIFT) + score_array[i]; - if (curr_h < 0) curr_h = 0; - if (last_h > 0) { /* initialize f */ - f = (f > last_h - q)? f - r : last_h - qr; - if (curr_h < f) curr_h = f; - } - curr_last_h = *(s-1) >> NT_LOCAL_SHIFT; - e = ((*s & NT_LOCAL_MASK) > curr_last_h - q)? (*s & NT_LOCAL_MASK) - r : curr_last_h - qr; - if (e < 0) e = 0; - if (curr_h < e) curr_h = e; - *s = (last_h << NT_LOCAL_SHIFT) | e; - last_h = curr_h; - if (score_r < curr_h) { - score_r = curr_h; start_i = i; start_j = j; - if (score_r + of_base - qr == score_f) { - j = 1; break; - } - if (score_r > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - *s = last_h << NT_LOCAL_SHIFT; - /* recalculate start and end, the boundaries of the band */ - if ((eh[start] >> NT_LOCAL_SHIFT) <= qr) --start; - if (start <= 0) start = 0; - end = start_i - (start_j - j) - (score_r + of_base + (start_j - j) * max_score) / r - 1; - if (end <= 0) end = 0; - } - - if (_subo) { - int tmp2 = 0, tmp = (int)(start_j - .33 * (end_j - start_j) + .499); - for (j = 1; j <= tmp; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - tmp = (int)(end_j + .33 * (end_j - start_j) + .499); - for (j = tmp; j <= len2; ++j) - if (tmp2 < suba[j]) tmp2 = suba[j]; - *_subo = tmp2; - } - - if (path_len == 0) { - path[0].i = 
start_i; path[0].j = start_j; - path[1].i = end_i; path[1].j = end_j; - goto end_func; - } - - score_r += of_base; - score_r -= qr; - -#ifdef DEBUG - /* this seems not a bug */ - if (score_f != score_r) - fprintf(stderr, "[aln_local_core] unknown flaw occurs: score_f(%d) != score_r(%d)\n", score_f, score_r); -#endif - - if (_thres > 0) { /* call global alignment to fill the path */ - score_g = 0; - j = (end_i - start_i > end_j - start_j)? end_i - start_i : end_j - start_j; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + start_i, end_i - start_i + 1, seq2 + start_j, - end_j - start_j + 1, &ap_real, path, path_len); - if (score_g == score_r || score_f == score_g) break; - if (i > j) break; - } - if (score_r > score_g && score_f > score_g) { - fprintf(stderr, "[aln_local_core] Potential bug: (%d,%d) > %d\n", score_f, score_r, score_g); - score_f = score_r = -1; - } else score_f = score_g; - - /* convert coordinate */ - for (p = path + *path_len - 1; p >= path; --p) { - p->i += start_i - 1; - p->j += start_j - 1; - } - } else { /* just store the start and end */ - *path_len = 2; - path[1].i = start_i; path[1].j = start_j; - path->i = end_i; path->j = end_j; - } - -end_func: - /* free */ - free(eh); free(suba); - for (i = 0; i != N_MATRIX_ROW; ++i) { - ++s_array[i]; - free(s_array[i]); - } - free(s_array); - return score_f; -} -AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int thres, int len1, int len2) -{ - unsigned char *seq11, *seq22; - int score; - int i, j, l; - path_t *p; - char *out1, *out2, *outm; - AlnAln *aa; - - if (len1 < 0) len1 = strlen(seq1); - if (len2 < 0) len2 = strlen(seq2); - - aa = aln_init_AlnAln(); - seq11 = (unsigned char*)malloc(sizeof(unsigned char) * len1); - seq22 = (unsigned char*)malloc(sizeof(unsigned char) * len2); - aa->path = 
(path_t*)malloc(sizeof(path_t) * (len1 + len2 + 1)); - - if (ap->row < 10) { /* 4-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt4_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt4_table[(int)seq2[j]]; - } else if (ap->row < 20) { /* 16-nucleotide alignment */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_nt16_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_nt16_table[(int)seq2[j]]; - } else { /* amino acids */ - for (i = 0; i < len1; ++i) - seq11[i] = aln_aa_table[(int)seq1[i]]; - for (j = 0; j < len2; ++j) - seq22[j] = aln_aa_table[(int)seq2[j]]; - } - - if (type == ALN_TYPE_GLOBAL) score = aln_global_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len); - else if (type == ALN_TYPE_LOCAL) score = aln_local_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, thres, &aa->subo); - else if (type == ALN_TYPE_EXTEND) score = aln_extend_core(seq11, len1, seq22, len2, ap, aa->path, &aa->path_len, 1, 0); - else { - free(seq11); free(seq22); free(aa->path); - aln_free_AlnAln(aa); - return 0; - } - aa->score = score; - - if (thres > 0) { - out1 = aa->out1 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - out2 = aa->out2 = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - outm = aa->outm = (char*)malloc(sizeof(char) * (aa->path_len + 1)); - - --seq1; --seq2; - --seq11; --seq22; - - p = aa->path + aa->path_len - 1; - - for (l = 0; p >= aa->path; --p, ++l) { - switch (p->ctype) { - case FROM_M: out1[l] = seq1[p->i]; out2[l] = seq2[p->j]; - outm[l] = (seq11[p->i] == seq22[p->j] && seq11[p->i] != ap->row)? '|' : ' '; - break; - case FROM_I: out1[l] = '-'; out2[l] = seq2[p->j]; outm[l] = ' '; break; - case FROM_D: out1[l] = seq1[p->i]; out2[l] = '-'; outm[l] = ' '; break; - } - } - out1[l] = out2[l] = outm[l] = '\0'; - ++seq11; ++seq22; - } - - free(seq11); - free(seq22); - - p = aa->path + aa->path_len - 1; - aa->start1 = p->i? p->i : 1; - aa->end1 = aa->path->i; - aa->start2 = p->j? 
p->j : 1; - aa->end2 = aa->path->j; - aa->cigar32 = aln_path2cigar32(aa->path, aa->path_len, &aa->n_cigar); - - return aa; -} -AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int thres) -{ - return aln_stdaln_aux(seq1, seq2, ap, type, thres, -1, -1); -} - -/* for backward compatibility */ -uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar) -{ - uint32_t *cigar32; - uint16_t *cigar; - int i; - cigar32 = aln_path2cigar32(path, path_len, n_cigar); - cigar = (uint16_t*)cigar32; - for (i = 0; i < *n_cigar; ++i) - cigar[i] = (cigar32[i]&0xf)<<14 | (cigar32[i]>>4&0x3fff); - return cigar; -} - -/* newly added functions (2009-07-21) */ - -int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem) -{ - int q, r, qr; - int32_t **s_array, *score_array; - int is_overflow, of_base; - uint32_t *eh; - int i, j, end_i, end_j; - int score, start, end; - int *score_matrix, N_MATRIX_ROW; - uint8_t *mem, *_p; - - /* initialize some align-related parameters. just for compatibility */ - q = ap->gap_open; - r = ap->gap_ext; - qr = q + r; - score_matrix = ap->matrix; - N_MATRIX_ROW = ap->row; - - if (len1 == 0 || len2 == 0) return -1; - - /* allocate memory */ - mem = _mem? 
_mem : calloc((len1 + 2) * (N_MATRIX_ROW + 1), 4); - _p = mem; - eh = (uint32_t*)_p, _p += 4 * (len1 + 2); - s_array = calloc(N_MATRIX_ROW, sizeof(void*)); - for (i = 0; i != N_MATRIX_ROW; ++i) - s_array[i] = (int32_t*)_p, _p += 4 * len1; - /* initialization */ - aln_init_score_array(seq1, len1, N_MATRIX_ROW, score_matrix, s_array); - start = 1; end = 2; - end_i = end_j = 0; - score = 0; - is_overflow = of_base = 0; - /* convert the coordinate */ - --seq1; --seq2; - for (i = 0; i != N_MATRIX_ROW; ++i) --s_array[i]; - - /* dynamic programming */ - memset(eh, 0, 4 * (len1 + 2)); - eh[1] = (uint32_t)G0<<16; - for (j = 1; j <= len2; ++j) { - int _start, _end; - int h1 = 0, f = 0; - score_array = s_array[seq2[j]]; - /* set start and end */ - _start = j - ap->band_width; - if (_start < 1) _start = 1; - if (_start > start) start = _start; - _end = j + ap->band_width; - if (_end > len1 + 1) _end = len1 + 1; - if (_end < end) end = _end; - if (start == end) break; - /* adjust eh[] array if overflow occurs. */ - if (is_overflow) { - int tmp, tmp2; - score -= LOCAL_OVERFLOW_REDUCE; - of_base += LOCAL_OVERFLOW_REDUCE; - is_overflow = 0; - for (i = start; i <= end; ++i) { - uint32_t *s = &eh[i]; - tmp = *s >> 16; tmp2 = *s & 0xffff; - if (tmp2 < LOCAL_OVERFLOW_REDUCE) tmp2 = 0; - else tmp2 -= LOCAL_OVERFLOW_REDUCE; - if (tmp < LOCAL_OVERFLOW_REDUCE) tmp = 0; - else tmp -= LOCAL_OVERFLOW_REDUCE; - *s = (tmp << 16) | tmp2; - } - } - _start = _end = 0; - /* the inner loop */ - for (i = start; i < end; ++i) { - /* At the beginning of each cycle: - eh[i] -> h[j-1,i-1]<<16 | e[j,i] - f -> f[j,i] - h1 -> h[j,i-1] - */ - uint32_t *s = &eh[i]; - int h = (int)(*s >> 16); - int e = *s & 0xffff; /* this is e[j,i] */ - *s = (uint32_t)h1 << 16; /* eh[i] now stores h[j,i-1]<<16 */ - h += h? score_array[i] : 0; /* this is left_core() specific */ - /* calculate h[j,i]; don't need to test 0, as {e,f}>=0 */ - h = h > e? h : e; - h = h > f? 
h : f; /* h now is h[j,i] */ - h1 = h; - if (h > 0) { - if (_start == 0) _start = i; - _end = i; - if (score < h) { - score = h; end_i = i; end_j = j; - if (score > LOCAL_OVERFLOW_THRESHOLD) is_overflow = 1; - } - } - /* calculate e[j+1,i] and f[j,i+1] */ - h -= qr; - h = h > 0? h : 0; - e -= r; - e = e > h? e : h; - f -= r; - f = f > h? f : h; - *s |= e; - } - eh[end] = h1 << 16; - /* recalculate start and end, the boundaries of the band */ - if (_end <= 0) break; /* no cell in this row has a positive score */ - start = _start; - end = _end + 3; - } - - score += of_base - 1; - if (score <= 0) { - if (path_len) *path_len = 0; - goto end_left_func; - } - - if (path == 0) goto end_left_func; - - if (path_len == 0) { - path[0].i = end_i; path[0].j = end_j; - goto end_left_func; - } - - { /* call global alignment to fill the path */ - int score_g = 0; - j = (end_i - 1 > end_j - 1)? end_i - 1 : end_j - 1; - ++j; /* j is the maximum band_width */ - for (i = ap->band_width;; i <<= 1) { - AlnParam ap_real = *ap; - ap_real.gap_end = -1; - ap_real.band_width = i; - score_g = aln_global_core(seq1 + 1, end_i, seq2 + 1, end_j, &ap_real, path, path_len); - if (score == score_g) break; - if (i > j) break; - } - if (score > score_g) - fprintf(stderr, "[aln_left_core] no suitable bandwidth: %d < %d\n", score_g, score); - score = score_g; - } - -end_left_func: - /* free */ - free(s_array); - if (!_mem) free(mem); - return score; -} - -uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar) -{ - int i, n; - uint32_t *cigar; - unsigned char last_type; - - if (path_len == 0 || path == 0) { - *n_cigar = 0; - return 0; - } - - last_type = path->ctype; - for (i = n = 1; i < path_len; ++i) { - if (last_type != path[i].ctype) ++n; - last_type = path[i].ctype; - } - *n_cigar = n; - cigar = (uint32_t*)malloc(*n_cigar * 4); - - cigar[0] = 1u << 4 | path[path_len-1].ctype; - last_type = path[path_len-1].ctype; - for (i = path_len - 2, n = 0; i >= 0; --i) { - if (path[i].ctype 
== last_type) cigar[n] += 1u << 4; - else { - cigar[++n] = 1u << 4 | path[i].ctype; - last_type = path[i].ctype; - } - } - - return cigar; -} - -#ifdef STDALN_MAIN -int main() -{ - AlnAln *aln_local, *aln_global, *aln_left; - int i; - - aln_local = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCgCTCGGCTAGCTGT", &aln_param_blast, 0, 1); - aln_global = aln_stdaln("CGTGCGATGCactgCATACGGCTCGCCTAGATCA", "AAGGGATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 1, 1); -// aln_left = aln_stdaln( "GATGCACTGCATACGGCTCGCCTAGATCA", "GATGCTCTGCATCGgCTCGGCTAGCTGT", &aln_param_blast, 2, 1); - aln_left = aln_stdaln("CACCTTCGACTCACGTCTCATTCTCGGAGTCGAGTGGACGGTCCCTCATACACGAACAGGTTC", - "CACCTTCGACTTTCACCTCTCATTCTCGGACTCGAGTGGACGGTCCCTCATCCAAGAACAGGGTCTGTGAAA", &aln_param_blast, 2, 1); - - printf(">%d,%d\t%d,%d\n", aln_local->start1, aln_local->end1, aln_local->start2, aln_local->end2); - printf("%s\n%s\n%s\n", aln_local->out1, aln_local->outm, aln_local->out2); - - printf(">%d,%d\t%d,%d\t", aln_global->start1, aln_global->end1, aln_global->start2, aln_global->end2); - for (i = 0; i != aln_global->n_cigar; ++i) - printf("%d%c", aln_global->cigar32[i]>>4, "MID"[aln_global->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_global->out1, aln_global->outm, aln_global->out2); - - printf(">%d\t%d,%d\t%d,%d\t", aln_left->score, aln_left->start1, aln_left->end1, aln_left->start2, aln_left->end2); - for (i = 0; i != aln_left->n_cigar; ++i) - printf("%d%c", aln_left->cigar32[i]>>4, "MID"[aln_left->cigar32[i]&0xf]); - printf("\n%s\n%s\n%s\n", aln_left->out1, aln_left->outm, aln_left->out2); - - aln_free_AlnAln(aln_local); - aln_free_AlnAln(aln_global); - aln_free_AlnAln(aln_left); - return 0; -} -#endif diff --git a/stdaln.h b/stdaln.h deleted file mode 100644 index f0048b3..0000000 --- a/stdaln.h +++ /dev/null @@ -1,162 +0,0 @@ -/* The MIT License - - Copyright (c) 2003-2006, 2008, by Heng Li - - Permission is hereby granted, free of charge, to any person obtaining - a copy of 
this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - 2009-07-23, 0.10.0 - - - Use 32-bit to store CIGAR - - - Report suboptimal aligments - - - Implemented half-fixed-half-open DP - - 2009-04-26, 0.9.10 - - - Allow to set a threshold for local alignment - - 2009-02-18, 0.9.9 - - - Fixed a bug when no residue matches - - 2008-08-04, 0.9.8 - - - Fixed the wrong declaration of aln_stdaln_aux() - - - Avoid 0 coordinate for global alignment - - 2008-08-01, 0.9.7 - - - Change gap_end penalty to 5 in aln_param_bwa - - - Add function to convert path_t to the CIGAR format - - 2008-08-01, 0.9.6 - - - The first gap now costs (gap_open+gap_ext), instead of - gap_open. Scoring systems are modified accordingly. - - - Gap end is now correctly handled. Previously it is not correct. - - - Change license to MIT. 
- - */ - -#ifndef LH3_STDALN_H_ -#define LH3_STDALN_H_ - - -#define STDALN_VERSION 0.11.0 - -#include - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 -#define FROM_S 3 - -#define ALN_TYPE_LOCAL 0 -#define ALN_TYPE_GLOBAL 1 -#define ALN_TYPE_EXTEND 2 - -/* This is the smallest integer. It might be CPU-dependent in very RARE cases. */ -#define MINOR_INF -1073741823 - -typedef struct -{ - int gap_open; - int gap_ext; - int gap_end; - - int *matrix; - int row; - int band_width; -} AlnParam; - -typedef struct -{ - int i, j; - unsigned char ctype; -} path_t; - -typedef struct -{ - path_t *path; /* for advanced users... :-) */ - int path_len; /* for advanced users... :-) */ - int start1, end1; /* start and end of the first sequence, coordinations are 1-based */ - int start2, end2; /* start and end of the second sequence, coordinations are 1-based */ - int score, subo; /* score */ - - char *out1, *out2; /* print them, and then you will know */ - char *outm; - - int n_cigar; - uint32_t *cigar32; -} AlnAln; - -#ifdef __cplusplus -extern "C" { -#endif - - AlnAln *aln_stdaln_aux(const char *seq1, const char *seq2, const AlnParam *ap, - int type, int do_align, int len1, int len2); - AlnAln *aln_stdaln(const char *seq1, const char *seq2, const AlnParam *ap, int type, int do_align); - void aln_free_AlnAln(AlnAln *aa); - - int aln_global_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len); - int aln_local_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int _thres, int *_subo); - int aln_extend_core(unsigned char *seq1, int len1, unsigned char *seq2, int len2, const AlnParam *ap, - path_t *path, int *path_len, int G0, uint8_t *_mem); - uint16_t *aln_path2cigar(const path_t *path, int path_len, int *n_cigar); - uint32_t *aln_path2cigar32(const path_t *path, int path_len, int *n_cigar); - -#ifdef __cplusplus -} -#endif - 
-/******************** - * global variables * - ********************/ - -extern AlnParam aln_param_bwa; /* = { 37, 9, 0, aln_sm_maq, 5, 50 }; */ -extern AlnParam aln_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ -extern AlnParam aln_param_nt2nt; /* = { 10, 2, 2, aln_sm_nt, 16, 75 }; */ -extern AlnParam aln_param_aa2aa; /* = { 20, 19, 19, aln_sm_read, 16, 75 }; */ -extern AlnParam aln_param_rd2rd; /* = { 12, 2, 2, aln_sm_blosum62, 22, 50 }; */ - -/* common nucleotide score matrix for 16 bases */ -extern int aln_sm_nt[], aln_sm_bwa[]; - -/* BLOSUM62 and BLOSUM45 */ -extern int aln_sm_blosum62[], aln_sm_blosum45[]; - -/* common read for 16 bases. note that read alignment is quite different from common nucleotide alignment */ -extern int aln_sm_read[]; - -/* human-mouse score matrix for 4 bases */ -extern int aln_sm_hs[]; - -#endif From 25366c72206df5066f268845c03c1437f8e55dfd Mon Sep 17 00:00:00 2001 From: Jon Sorenson Date: Tue, 5 Mar 2013 20:48:16 +0000 Subject: [PATCH 335/498] Fixing problem with linking to libm on some Ubuntu systems (I see this on machine running 11.04, kernel 3.0.0-14-virtual). Changing order of -lm on the command line seems to do the trick and should be tolerated in other environments. --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 96c3047..bd4ee22 100644 --- a/Makefile +++ b/Makefile @@ -23,10 +23,10 @@ SUBDIRS= . all:$(PROG) bwa:libbwa.a $(AOBJS) main.o - $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ $(LIBS) -L. -lbwa + $(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS) bwamem-lite:libbwa.a example.o - $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ $(LIBS) -L. -lbwa + $(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. 
-lbwa $(LIBS) libbwa.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) From 6476343a8310ea4886ac2e35702818955bde1c81 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 19:56:37 -0500 Subject: [PATCH 336/498] r331: rewrote CIGAR generation for bwa-short When backtracking, bwa-short does not keep the detailed alignment or the exact start and end positions. To find the boundary and the CIGAR, the old code does a global alignment with a small end-gap penalty. It then deals with a lot of special cases to derive the right position and CIGAR, which are actually not always right. It is a mess. As the new ksw.{c,h} does not support a different end-gap penalty, the old strategy does not work. But we get something better. The new code finds the boundaries with ksw_extend(). It is cleaner and gives more accurate CIGAR in most cases. --- bwase.c | 89 ++++++++++++++++++++++++++++----------------------------- main.c | 2 +- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/bwase.c b/bwase.c index eebe22b..83e4baa 100644 --- a/bwase.c +++ b/bwase.c @@ -156,59 +156,58 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se bwt_destroy(bwt); } -/* is_end_correct == 1 if (*pos+len) gives the correct coordinate on - * forward strand. This happens when p->pos is calculated by - * bwa_cal_pac_pos(). is_end_correct==0 if (*pos) gives the correct - * coordinate. This happens only for color-converted alignment. 
*/ -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, bwtint_t *_pos, - int ext, int *n_cigar, int is_end_correct) +#define SW_BW 50 + +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; - ubyte_t *ref_seq; - int l = 0, ref_len; - int64_t k, __pos = *_pos; + ubyte_t *rseq; + int tle, qle, gtle, gscore, lscore; + int64_t k, rb, re, rlen; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); - ref_len = len + abs(ext); - if (ext > 0) { - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (k = __pos; k < __pos + ref_len && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - } else { - int64_t x = __pos + (is_end_correct? len : ref_len); - ref_seq = (ubyte_t*)calloc(ref_len, 1); - for (l = 0, k = x - ref_len > 0? x - ref_len : 0; k < x && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; + if (!is_rev) { // forward strand, the end position is correct + re = *_pos + len; + if (re > l_pac) re = l_pac; + rb = re - (len + SW_BW); + if (rb < 0) rb = 0; + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences + seq_reverse(rlen, rseq, 0); + lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + rb = re - tle; rlen = tle; + seq_reverse(len, seq, 0); + seq_reverse(rlen, rseq, 0); + ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); + if (qle < len) { // write soft clip + cigar = realloc(cigar, (*n_cigar + 1) * 4); + memmove(cigar + 1, cigar, *n_cigar * 4); + cigar[0] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); + } + } else { // reverse strand, the start position is correct + rb = *_pos; re = rb + len + SW_BW; + if (re > l_pac) re = l_pac;
+ rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); + if (gscore > 0) tle = gtle, qle = len; + re = rb + tle; rlen = tle; + ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension + if (qle < len) { + cigar = realloc(cigar, (*n_cigar + 1) * 4); + cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S; + ++(*n_cigar); + } } + *_pos = rb; - ksw_global(len, seq, l, ref_seq, 5, mat, 5, 1, 50, n_cigar, &cigar32); cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); - - if (ext < 0 && is_end_correct) { // fix coordinate for reads mapped to the forward strand - for (l = k = 0; k < *n_cigar; ++k) { - if (__cigar_op(cigar[k]) == FROM_D) l -= __cigar_len(cigar[k]); - else if (__cigar_op(cigar[k]) == FROM_I) l += __cigar_len(cigar[k]); - } - __pos += l; - } - - if (__cigar_op(cigar[0]) == FROM_D) { // deletion at the 5'-end - __pos += __cigar_len(cigar[0]); - for (k = 0; k < *n_cigar - 1; ++k) cigar[k] = cigar[k+1]; - --(*n_cigar); - } - if (__cigar_op(cigar[*n_cigar-1]) == FROM_D) --(*n_cigar); // deletion at the 3'-end - - // change "I" at either end of the read to S. just in case. This should rarely happen... - if (__cigar_op(cigar[*n_cigar-1]) == FROM_I) cigar[*n_cigar-1] = __cigar_create(FROM_S, (__cigar_len(cigar[*n_cigar-1]))); - if (__cigar_op(cigar[0]) == FROM_I) cigar[0] = __cigar_create(FROM_S, (__cigar_len(cigar[0]))); - - *_pos = (bwtint_t)__pos; - free(ref_seq); + free(rseq); return cigar; } @@ -316,13 +315,11 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap == 0) continue; - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, - (q->strand?
1 : -1) * q->gap, &n_cigar, 1); + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand); q->n_cigar = n_cigar; } if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, - (s->strand? 1 : -1) * (s->n_gapo + s->n_gape), &s->n_cigar, 1); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand); } // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); diff --git a/main.c b/main.c index 00a21b9..0954e01 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r329-beta" +#define PACKAGE_VERSION "0.7.0-r331-beta" #endif static int usage() From 5fbd4546829d552eec52b1e45ea9ea86b7d5419c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2013 22:49:38 -0500 Subject: [PATCH 337/498] r332: added output threshold Otherwise there are far too many short hits --- bwamem.c | 4 +++- bwamem.h | 1 + fastmap.c | 4 +++- main.c | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index cd3a1f5..7a6d175 100644 --- a/bwamem.c +++ b/bwamem.c @@ -44,6 +44,7 @@ mem_opt_t *mem_opt_init() o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->T = 30; o->zdrop = 100; o->pen_unpaired = 9; o->pen_clip = 5; @@ -742,11 +743,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b int k; kstring_t str; str.l = str.m = 0; str.s = 0; - if (a->n > 0) { + if (a->n > 0 && a->a[0].score >= opt->T) { int mapq0 = -1; for (k = 0; k < a->n; ++k) { bwahit_t h; mem_alnreg_t *p = &a->a[k]; + if (p->score < opt->T) continue; if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) 
continue; mem_alnreg2hit(p, &h); diff --git a/bwamem.h b/bwamem.h index 96a3308..14f04d5 100644 --- a/bwamem.h +++ b/bwamem.h @@ -24,6 +24,7 @@ typedef struct { int w; // band width int zdrop; // Z-dropoff + int T; // output score threshold; only affecting output int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor diff --git a/fastmap.c b/fastmap.c index d4e5626..e6c2b1e 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,13 +26,14 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:")) >= 0) { + while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); + else if (c == 'T') opt->T = atoi(optarg); else if (c == 'L') opt->pen_clip = atoi(optarg); else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; @@ -74,6 +75,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, "\n"); fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -H hard clipping\n"); diff --git a/main.c b/main.c index 0954e01..c342fd4 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r331-beta" +#define PACKAGE_VERSION "0.7.0-r332-beta" #endif static int usage() From 773b86331b2a207337b519fb6eb82d5e70d6bda3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 19:23:45 -0500 Subject: [PATCH 338/498] De-overlap paired-end reads --- pemerge.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 pemerge.c diff --git a/pemerge.c b/pemerge.c new file mode 100644 index 0000000..90d53a3 --- /dev/null +++ b/pemerge.c @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include "ksw.h" +#include "kseq.h" + +#ifdef _PEM_MAIN +KSEQ_INIT(gzFile, gzread) + +unsigned char nst_nt4_table[128] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +void bwa_fill_scmat(int a, int b, int8_t mat[25]) +{ + int i, j, k; + for (i = k = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + mat[k++] = i == j? 
a : -b; + mat[k++] = 0; // ambiguous base + } + for (j = 0; j < 5; ++j) mat[k++] = 0; +} +#else +#include "bwa.h" +KSEQ_DECLARE(gzFile) +#endif + +#define MAX_SCORE_RATIO 0.9f +#define MAX_ERR 8 + +static const char *err_msg[MAX_ERR+1] = { + "successful merges", + "low-scoring pairs", + "pairs where the best SW alignment is not an overlap (long left end)", + "pairs where the best SW alignment is not an overlap (long right end)", + "pairs with large 2nd best SW score", + "pairs with gapped overlap", + "pairs where the end-to-end ungapped alignment score is not high enough", + "pairs potentially with tandem overlaps", + "pairs with high sum of errors" +}; + +typedef struct { + kstring_t n, s, q; +} mseq_t; + +typedef struct { + int a, b, q, r, w; + int q_def, q_thres; + int T; + int8_t mat[25]; +} pem_opt_t; + +pem_opt_t *pem_opt_init() +{ + pem_opt_t *opt; + opt = calloc(1, sizeof(pem_opt_t)); + opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; + opt->T = opt->a * 10; + opt->q_def = 20; + opt->q_thres = 70; + bwa_fill_scmat(opt->a, opt->b, opt->mat); + return opt; +} + +int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qual_) +{ + uint8_t *s[2], *q[2], *seq, *qual; + int i, xtra, l, l_seq, sum_q; + kswr_t r; + + *seq_ = *qual_ = 0; + s[0] = malloc(x[0].s.l); q[0] = malloc(x[0].s.l); + s[1] = malloc(x[1].s.l); q[1] = malloc(x[1].s.l); + for (i = 0; i < x[0].s.l; ++i) { + int c = x[0].s.s[i]; + s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; + q[0][i] = x[0].q.l? x[0].q.s[i] - 33 : opt->q_def; + } + for (i = 0; i < x[1].s.l; ++i) { + int c = x[1].s.s[x[1].s.l - 1 - i]; + c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; + s[1][i] = c < 4? 3 - c : 4; + q[1][i] = x[1].q.l? 
x[1].q.s[x[1].q.l - 1 - i] - 33 : opt->q_def; + } + + xtra = KSW_XSTART | KSW_XSUBO; + r = ksw_align(x[1].s.l, s[1], x[0].s.l, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); + ++r.qe; ++r.te; // change to the half-close-half-open coordinates + + if (r.score < opt->T) return -1; // poor alignment + if (r.tb < r.qb) return -2; // no enough space for the left end + if (x[0].s.l - r.te > x[1].s.l - r.qe) return -3; // no enough space for the right end + if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) return -4; // the second best score is too large + if (r.qe - r.qb != r.te - r.tb) return -5; // we do not allow gaps + + { // test tandem match; O(n^2) + int max_m, max_m2, min_l, max_l, max_l2; + max_m = max_m2 = 0; max_l = max_l2 = 0; + min_l = x[0].s.l < x[1].s.l? x[0].s.l : x[1].s.l; + for (l = 0; l < min_l; ++l) { + int m = 0, o = x[0].s.l - l; + int a = opt->a, b = -opt->b; + for (i = 0; i < l; ++i) + m += (s[0][o + i] == s[1][i])? a : b; + if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; + else if (m > max_m2) max_m2 = m, max_l2 = l; + } + if (max_m < opt->T) return -6; + if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) + return -7; + //printf("*** %d,%d; %d,%d\n", max_m, max_m2, max_l, max_l2); + } + + l = x[0].s.l - (r.tb - r.qb); // length to merge + l_seq = x[0].s.l + x[1].s.l - l; + seq = malloc(l_seq + 1); + qual = malloc(l_seq + 1); + memcpy(seq, s[0], x[0].s.l); memcpy(seq + x[0].s.l, &s[1][l], x[1].s.l - l); + memcpy(qual, q[0], x[0].s.l); memcpy(qual + x[0].s.l, &q[1][l], x[1].s.l - l); + for (i = 0, sum_q = 0; i < l; ++i) { + int k = x[0].s.l - l + i; + if (s[0][k] == 4) { // ambiguous + seq[k] = s[1][i]; + qual[k] = q[1][i]; + } else if (s[1][i] == 4) { // do nothing + } else if (s[0][k] == s[1][i]) { + qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i]; + } else { // s[0][k] != s[1][i] and neither is N + int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i]; + sum_q += qq >= 3? 
qq<<1 : 1; + seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i]; + qual[k] = abs((int)q[0][k] - (int)q[1][i]); + } + } + if (sum_q>>1 > opt->q_thres) { // too many mismatches + free(seq); free(qual); + return -8; + } + + for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; + seq[l_seq] = qual[l_seq] = 0; + *seq_ = seq, *qual_ = qual; + return l_seq; +} + +static inline void kstrcpy(kstring_t *dst, const kstring_t *src) +{ + dst->l = 0; + if (src->l == 0) return; + if (dst->m < src->l + 1) { + dst->m = src->l + 2; + kroundup32(dst->m); + dst->s = realloc(dst->s, dst->m); + } + dst->l = src->l; + memcpy(dst->s, src->s, src->l + 1); +} + +static inline void kseq2mseq(mseq_t *ms, const kseq_t *ks) +{ + kstrcpy(&ms->n, &ks->name); + kstrcpy(&ms->s, &ks->seq); + kstrcpy(&ms->q, &ks->qual); +} + +static inline void print_seq(const char *n, const char *s, const char *q) +{ + putchar(q? '@' : '>'); + puts(n); puts(s); + if (q) { + puts("+"); puts(q); + } +} + +#ifdef _PEM_MAIN +int main(int argc, char *argv[]) +#else +int main_pemerge(int argc, char *argv[]) +#endif +{ + int c, flag = 0, i; + int64_t cnt[MAX_ERR+1]; + gzFile fp, fp2 = 0; + kseq_t *ks, *ks2 = 0; + mseq_t r[2]; + pem_opt_t *opt; + + opt = pem_opt_init(); + while ((c = getopt(argc, argv, "muQ:")) >= 0) { + if (c == 'm') flag |= 1; + else if (c == 'u') flag |= 2; + else if (c == 'Q') opt->q_thres = atoi(optarg); + } + if (flag == 0) flag = 3; + + if (optind == argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); + fprintf(stderr, "Options: -m output merged reads only\n"); + fprintf(stderr, " -u output unmerged reads only\n"); + fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); + fprintf(stderr, "\n"); + free(opt); + return 1; + } + + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + ks = kseq_init(fp); + if (optind + 1 < argc) { + fp2 = strcmp(argv[optind+1], "-")? 
gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r"); + ks2 = kseq_init(fp2); + } + + memset(r, 0, sizeof(mseq_t)<<1); + memset(cnt, 0, 8 * (MAX_ERR+1)); + while (kseq_read(ks) >= 0) { + uint8_t *seq, *qual; + int l_seq; + kseq2mseq(&r[0], ks); + if (ks2) { + if (kseq_read(ks2) < 0) break; + kseq2mseq(&r[1], ks2); + } else { + if (kseq_read(ks) < 0) break; + kseq2mseq(&r[1], ks); + } + l_seq = bwa_pemerge(opt, r, &seq, &qual); + if (l_seq > 0) { + ++cnt[0]; + if (flag & 1) { + if (r[0].n.l > 2 && (r[0].n.s[r[0].n.l-1] == '1' || r[0].n.s[r[0].n.l-1] == '2') && r[0].n.s[r[0].n.l-2] == '/') + r[0].n.s[r[0].n.l-2] = 0, r[0].n.l -= 2; + print_seq(r[0].n.s, (char*)seq, (char*)qual); + } + } else { + ++cnt[-l_seq]; + if (flag & 2) { + printf("*** %d\n", l_seq); + print_seq(r[0].n.s, r[0].s.s, r[0].q.l? r[0].q.s : 0); + print_seq(r[1].n.s, r[1].s.s, r[1].q.l? r[1].q.s : 0); + } + } + } + + fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); + for (i = 1; i <= MAX_ERR; ++i) + fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]); + kseq_destroy(ks); + gzclose(fp); + if (ks2) { + kseq_destroy(ks2); + gzclose(fp2); + } + free(opt); + return 0; +} From 042e1f4442af7ddea66bc669c37e7bec9d2ded40 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 21:55:02 -0500 Subject: [PATCH 339/498] r334: added pemerge to bwa --- Makefile | 2 +- bntseq.c | 1 - main.c | 27 +++++++++++++++++++++++---- main.h | 28 ---------------------------- pemerge.c | 32 -------------------------------- 5 files changed, 24 insertions(+), 66 deletions(-) delete mode 100644 main.h diff --git a/Makefile b/Makefile index bd4ee22..93e3266 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ AR= ar DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ - is.o bwtindex.o bwape.o kopen.o \ + is.o bwtindex.o bwape.o kopen.o pemerge.o \ 
bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa bwamem-lite diff --git a/bntseq.c b/bntseq.c index 540e966..29b3b12 100644 --- a/bntseq.c +++ b/bntseq.c @@ -31,7 +31,6 @@ #include #include #include "bntseq.h" -#include "main.h" #include "utils.h" #include "kseq.h" diff --git a/main.c b/main.c index c342fd4..26f22ae 100644 --- a/main.c +++ b/main.c @@ -1,12 +1,29 @@ #include #include -#include "main.h" #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r332-beta" +#define PACKAGE_VERSION "0.7.0-r334-beta" #endif +int bwa_fa2pac(int argc, char *argv[]); +int bwa_pac2bwt(int argc, char *argv[]); +int bwa_bwtupdate(int argc, char *argv[]); +int bwa_bwt2sa(int argc, char *argv[]); +int bwa_index(int argc, char *argv[]); +int bwt_bwtgen_main(int argc, char *argv[]); + +int bwa_aln(int argc, char *argv[]); +int bwa_sai2sam_se(int argc, char *argv[]); +int bwa_sai2sam_pe(int argc, char *argv[]); + +int bwa_bwtsw2(int argc, char *argv[]); + +int main_fastmap(int argc, char *argv[]); +int main_mem(int argc, char *argv[]); + +int main_pemerge(int argc, char *argv[]); + static int usage() { fprintf(stderr, "\n"); @@ -15,12 +32,13 @@ static int usage() fprintf(stderr, "Contact: Heng Li \n\n"); fprintf(stderr, "Usage: bwa [options]\n\n"); fprintf(stderr, "Command: index index sequences in the FASTA format\n"); + fprintf(stderr, " mem BWA-MEM algorithm\n"); + fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " pemerge merge overlapping paired ends\n"); fprintf(stderr, " aln gapped/ungapped alignment\n"); fprintf(stderr, " samse generate alignment (single ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); fprintf(stderr, " bwasw BWA-SW for long queries\n"); - fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC 
format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -56,6 +74,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/main.h b/main.h deleted file mode 100644 index 3e70362..0000000 --- a/main.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef BWA_MAIN_H -#define BWA_MAIN_H - -#ifdef __cplusplus -extern "C" { -#endif - - int bwa_fa2pac(int argc, char *argv[]); - int bwa_pac2bwt(int argc, char *argv[]); - int bwa_bwtupdate(int argc, char *argv[]); - int bwa_bwt2sa(int argc, char *argv[]); - int bwa_index(int argc, char *argv[]); - int bwa_aln(int argc, char *argv[]); - int bwt_bwtgen_main(int argc, char *argv[]); - - int bwa_sai2sam_se(int argc, char *argv[]); - int bwa_sai2sam_pe(int argc, char *argv[]); - - int bwa_bwtsw2(int argc, char *argv[]); - - int main_fastmap(int argc, char *argv[]); - int main_mem(int argc, char *argv[]); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/pemerge.c b/pemerge.c index 90d53a3..6756187 100644 --- a/pemerge.c +++ b/pemerge.c @@ -5,35 +5,8 @@ #include #include "ksw.h" #include "kseq.h" - -#ifdef _PEM_MAIN -KSEQ_INIT(gzFile, gzread) - -unsigned char nst_nt4_table[128] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -void bwa_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; 
- for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; -} -#else #include "bwa.h" KSEQ_DECLARE(gzFile) -#endif #define MAX_SCORE_RATIO 0.9f #define MAX_ERR 8 @@ -183,11 +156,7 @@ static inline void print_seq(const char *n, const char *s, const char *q) } } -#ifdef _PEM_MAIN -int main(int argc, char *argv[]) -#else int main_pemerge(int argc, char *argv[]) -#endif { int c, flag = 0, i; int64_t cnt[MAX_ERR+1]; @@ -246,7 +215,6 @@ int main_pemerge(int argc, char *argv[]) } else { ++cnt[-l_seq]; if (flag & 2) { - printf("*** %d\n", l_seq); print_seq(r[0].n.s, r[0].s.s, r[0].q.l? r[0].q.s : 0); print_seq(r[1].n.s, r[1].s.s, r[1].q.l? r[1].q.s : 0); } From 557d50c7e1d47ceff556847be52e9d9f51fa84a5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 21:57:13 -0500 Subject: [PATCH 340/498] r335: fixed a compiling error Caused by the last change --- bwtindex.c | 1 - main.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bwtindex.c b/bwtindex.c index 298153d..934b382 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -33,7 +33,6 @@ #include #include "bntseq.h" #include "bwt.h" -#include "main.h" #include "utils.h" #ifdef _DIVBWT diff --git a/main.c b/main.c index 26f22ae..2663902 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r334-beta" +#define PACKAGE_VERSION "0.7.0-r335-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 72817b664e0fdab8a2f4837dea6c7a0a5f58eae5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 6 Mar 2013 23:38:07 -0500 Subject: [PATCH 341/498] r336: fine tuning pemerge --- bwase.c | 6 +++--- main.c | 2 +- pemerge.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/bwase.c b/bwase.c index 83e4baa..fd06c7d 100644 --- a/bwase.c +++ b/bwase.c @@ -163,7 +163,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, 
const ubyte_t *pacseq, int l bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; ubyte_t *rseq; - int tle, qle, gtle, gscore, lscore; + int tle, qle, gtle, gscore; int64_t k, rb, re, rlen; int8_t mat[25]; @@ -176,7 +176,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences seq_reverse(rlen, rseq, 0); - lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; rb = re - tle; rlen = tle; seq_reverse(len, seq, 0); @@ -192,7 +192,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rb = *_pos; re = rb + len + SW_BW; if (re > l_pac) re = l_pac; rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - lscore = ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, >le, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; re = rb + tle; rlen = tle; ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension diff --git a/main.c b/main.c index 2663902..d9a87f1 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r335-beta" +#define PACKAGE_VERSION "0.7.0-r336-beta" #endif int bwa_fa2pac(int argc, char *argv[]); diff --git a/pemerge.c b/pemerge.c index 6756187..5f5e37e 100644 --- a/pemerge.c +++ b/pemerge.c @@ -18,7 +18,7 @@ static const char *err_msg[MAX_ERR+1] = { "pairs where the best SW alignment is not an overlap (long right end)", "pairs with large 2nd best SW score", "pairs with gapped overlap", - "pairs where the end-to-end ungapped alignment score 
is not high enough", + "pairs where the end-to-end alignment is inconsistent with SW", "pairs potentially with tandem overlaps", "pairs with high sum of errors" }; @@ -81,18 +81,18 @@ int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qua int max_m, max_m2, min_l, max_l, max_l2; max_m = max_m2 = 0; max_l = max_l2 = 0; min_l = x[0].s.l < x[1].s.l? x[0].s.l : x[1].s.l; - for (l = 0; l < min_l; ++l) { + for (l = 1; l < min_l; ++l) { int m = 0, o = x[0].s.l - l; - int a = opt->a, b = -opt->b; - for (i = 0; i < l; ++i) - m += (s[0][o + i] == s[1][i])? a : b; + uint8_t *s0o = &s[0][o], *s1 = s[1]; + for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! + m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; else if (m > max_m2) max_m2 = m, max_l2 = l; } - if (max_m < opt->T) return -6; + if (max_m < opt->T || max_l != x[0].s.l - (r.tb - r.qb)) return -6; if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) return -7; - //printf("*** %d,%d; %d,%d\n", max_m, max_m2, max_l, max_l2); + if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) return -7; } l = x[0].s.l - (r.tb - r.qb); // length to merge From 3e3236dfc400b1157726af61468914f5ebcbdc8a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:00:15 -0500 Subject: [PATCH 342/498] r337: mem - always read even number of reads In the old code, we may read odd number of reads from an interleaved fastq. 
--- bwa.c | 2 +- fastmap.c | 5 +++++ main.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bwa.c b/bwa.c index 76b54ae..991b23a 100644 --- a/bwa.c +++ b/bwa.c @@ -55,7 +55,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) kseq2bseq1(ks2, &seqs[n]); size += seqs[n++].l_seq; } - if (size >= chunk_size) break; + if (size >= chunk_size && (n&1) == 0) break; } if (size == 0) { // test if the 2nd file is finished if (ks2 && kseq_read(ks2) >= 0) diff --git a/fastmap.c b/fastmap.c index e6c2b1e..9204399 100644 --- a/fastmap.c +++ b/fastmap.c @@ -106,6 +106,11 @@ int main_mem(int argc, char *argv[]) } while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; + if ((opt->flag & MEM_F_PE) && (n&1) == 1) { + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__); + n = n>>1<<1; + } if (!copy_comment) for (i = 0; i < n; ++i) { free(seqs[i].comment); seqs[i].comment = 0; diff --git a/main.c b/main.c index d9a87f1..f7b3a03 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r336-beta" +#define PACKAGE_VERSION "0.7.0-r337-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 1cadfa15520ae611c8324ee8528a28dea1f33f8d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:14:52 -0500 Subject: [PATCH 343/498] r338: pemerge - fixed memory leaks; multithreading pemerge is actually quite slow. 
--- main.c | 2 +- pemerge.c | 195 +++++++++++++++++++++++++++++++----------------------- 2 files changed, 113 insertions(+), 84 deletions(-) diff --git a/main.c b/main.c index f7b3a03..31bba4b 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r337-beta" +#define PACKAGE_VERSION "0.7.0-r338-beta" #endif int bwa_fa2pac(int argc, char *argv[]); diff --git a/pemerge.c b/pemerge.c index 5f5e37e..696137d 100644 --- a/pemerge.c +++ b/pemerge.c @@ -3,8 +3,10 @@ #include #include #include +#include #include "ksw.h" #include "kseq.h" +#include "kstring.h" #include "bwa.h" KSEQ_DECLARE(gzFile) @@ -23,14 +25,13 @@ static const char *err_msg[MAX_ERR+1] = { "pairs with high sum of errors" }; -typedef struct { - kstring_t n, s, q; -} mseq_t; - typedef struct { int a, b, q, r, w; int q_def, q_thres; int T; + int chunk_size; + int n_threads; + int flag; // bit 1: print merged; 2: print unmerged int8_t mat[25]; } pem_opt_t; @@ -42,67 +43,70 @@ pem_opt_t *pem_opt_init() opt->T = opt->a * 10; opt->q_def = 20; opt->q_thres = 70; + opt->chunk_size = 10000000; + opt->n_threads = 1; + opt->flag = 3; bwa_fill_scmat(opt->a, opt->b, opt->mat); return opt; } -int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qual_) +int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) { uint8_t *s[2], *q[2], *seq, *qual; - int i, xtra, l, l_seq, sum_q; + int i, xtra, l, l_seq, sum_q, ret = 0; kswr_t r; - *seq_ = *qual_ = 0; - s[0] = malloc(x[0].s.l); q[0] = malloc(x[0].s.l); - s[1] = malloc(x[1].s.l); q[1] = malloc(x[1].s.l); - for (i = 0; i < x[0].s.l; ++i) { - int c = x[0].s.s[i]; + s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq); + s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq); + for (i = 0; i < x[0].l_seq; ++i) { + int c = x[0].seq[i]; s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; - q[0][i] = x[0].q.l? x[0].q.s[i] - 33 : opt->q_def; + q[0][i] = x[0].qual? 
x[0].qual[i] - 33 : opt->q_def; } - for (i = 0; i < x[1].s.l; ++i) { - int c = x[1].s.s[x[1].s.l - 1 - i]; + for (i = 0; i < x[1].l_seq; ++i) { + int c = x[1].seq[x[1].l_seq - 1 - i]; c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; s[1][i] = c < 4? 3 - c : 4; - q[1][i] = x[1].q.l? x[1].q.s[x[1].q.l - 1 - i] - 33 : opt->q_def; + q[1][i] = x[1].qual? x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def; } xtra = KSW_XSTART | KSW_XSUBO; - r = ksw_align(x[1].s.l, s[1], x[0].s.l, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); + r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); ++r.qe; ++r.te; // change to the half-close-half-open coordinates - if (r.score < opt->T) return -1; // poor alignment - if (r.tb < r.qb) return -2; // no enough space for the left end - if (x[0].s.l - r.te > x[1].s.l - r.qe) return -3; // no enough space for the right end - if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) return -4; // the second best score is too large - if (r.qe - r.qb != r.te - r.tb) return -5; // we do not allow gaps + if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment + if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end + if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end + if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large + if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps { // test tandem match; O(n^2) int max_m, max_m2, min_l, max_l, max_l2; max_m = max_m2 = 0; max_l = max_l2 = 0; - min_l = x[0].s.l < x[1].s.l? x[0].s.l : x[1].s.l; + min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq; for (l = 1; l < min_l; ++l) { - int m = 0, o = x[0].s.l - l; + int m = 0, o = x[0].l_seq - l; uint8_t *s0o = &s[0][o], *s1 = s[1]; for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! 
m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; else if (m > max_m2) max_m2 = m, max_l2 = l; } - if (max_m < opt->T || max_l != x[0].s.l - (r.tb - r.qb)) return -6; - if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) - return -7; - if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) return -7; + if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; } + if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) { + ret = -7; goto pem_ret; + } + if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } } - l = x[0].s.l - (r.tb - r.qb); // length to merge - l_seq = x[0].s.l + x[1].s.l - l; + l = x[0].l_seq - (r.tb - r.qb); // length to merge + l_seq = x[0].l_seq + x[1].l_seq - l; seq = malloc(l_seq + 1); qual = malloc(l_seq + 1); - memcpy(seq, s[0], x[0].s.l); memcpy(seq + x[0].s.l, &s[1][l], x[1].s.l - l); - memcpy(qual, q[0], x[0].s.l); memcpy(qual + x[0].s.l, &q[1][l], x[1].s.l - l); + memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); + memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); for (i = 0, sum_q = 0; i < l; ++i) { - int k = x[0].s.l - l + i; + int k = x[0].l_seq - l + i; if (s[0][k] == 4) { // ambiguous seq[k] = s[1][i]; qual[k] = q[1][i]; @@ -118,51 +122,99 @@ int bwa_pemerge(const pem_opt_t *opt, mseq_t x[2], uint8_t **seq_, uint8_t **qua } if (sum_q>>1 > opt->q_thres) { // too many mismatches free(seq); free(qual); - return -8; + ret = -8; goto pem_ret; } for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; seq[l_seq] = qual[l_seq] = 0; - *seq_ = seq, *qual_ = qual; - return l_seq; + + free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment); + 
memset(&x[1], 0, sizeof(bseq1_t)); + free(x[0].seq); free(x[0].qual); + x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual; + +pem_ret: + free(s[0]); free(s[1]); free(q[0]); free(q[1]); + return ret; } -static inline void kstrcpy(kstring_t *dst, const kstring_t *src) +static inline void print_bseq(const bseq1_t *s, int rn) { - dst->l = 0; - if (src->l == 0) return; - if (dst->m < src->l + 1) { - dst->m = src->l + 2; - kroundup32(dst->m); - dst->s = realloc(dst->s, dst->m); - } - dst->l = src->l; - memcpy(dst->s, src->s, src->l + 1); + putchar(s->qual? '@' : '>'); + fputs(s->name, stdout); + if (rn == 1 || rn == 2) { + putchar('/'); putchar('0' + rn); putchar('\n'); + } else puts(" merged"); + puts(s->seq); + if (s->qual) { + puts("+"); puts(s->qual); + } } -static inline void kseq2mseq(mseq_t *ms, const kseq_t *ks) +typedef struct { + int n, start; + bseq1_t *seqs; + int64_t cnt[MAX_ERR+1]; + const pem_opt_t *opt; +} worker_t; + +void *worker(void *data) { - kstrcpy(&ms->n, &ks->name); - kstrcpy(&ms->s, &ks->seq); - kstrcpy(&ms->q, &ks->qual); + worker_t *w = (worker_t*)data; + int i; + for (i = w->start; i < w->n>>1; i += w->opt->n_threads) + ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])]; + return 0; } -static inline void print_seq(const char *n, const char *s, const char *q) +static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1]) { - putchar(q? 
'@' : '>'); - puts(n); puts(s); - if (q) { - puts("+"); puts(q); + int i, j, n = n_>>1<<1; + worker_t *w; + + w = calloc(opt->n_threads, sizeof(worker_t)); + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + p->start = i; p->n = n; + p->opt = opt; + p->seqs = seqs; + } + if (opt->n_threads == 1) { + worker(w); + } else { + pthread_t *tid; + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); + for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); + for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); + free(tid); + } + for (i = 0; i < opt->n_threads; ++i) { + worker_t *p = &w[i]; + for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j]; + } + free(w); + for (i = 0; i < n>>1; ++i) { + if (seqs[i<<1|1].l_seq != 0) { + if (opt->flag&2) { + print_bseq(&seqs[i<<1|0], 1); + print_bseq(&seqs[i<<1|1], 2); + } + } else if (opt->flag&1) + print_bseq(&seqs[i<<1|0], 0); + } + for (i = 0; i < n; ++i) { + bseq1_t *s = &seqs[i]; + free(s->name); free(s->seq); free(s->qual); free(s->comment); } } int main_pemerge(int argc, char *argv[]) { - int c, flag = 0, i; + int c, flag = 0, i, n; int64_t cnt[MAX_ERR+1]; + bseq1_t *bseq; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; - mseq_t r[2]; pem_opt_t *opt; opt = pem_opt_init(); @@ -172,6 +224,7 @@ int main_pemerge(int argc, char *argv[]) else if (c == 'Q') opt->q_thres = atoi(optarg); } if (flag == 0) flag = 3; + opt->flag = flag; if (optind == argc) { fprintf(stderr, "\n"); @@ -191,34 +244,10 @@ int main_pemerge(int argc, char *argv[]) ks2 = kseq_init(fp2); } - memset(r, 0, sizeof(mseq_t)<<1); memset(cnt, 0, 8 * (MAX_ERR+1)); - while (kseq_read(ks) >= 0) { - uint8_t *seq, *qual; - int l_seq; - kseq2mseq(&r[0], ks); - if (ks2) { - if (kseq_read(ks2) < 0) break; - kseq2mseq(&r[1], ks2); - } else { - if (kseq_read(ks) < 0) break; - kseq2mseq(&r[1], ks); - } - l_seq = bwa_pemerge(opt, r, &seq, &qual); - if (l_seq > 0) { - ++cnt[0]; - if (flag & 1) { - if (r[0].n.l > 2 && 
(r[0].n.s[r[0].n.l-1] == '1' || r[0].n.s[r[0].n.l-1] == '2') && r[0].n.s[r[0].n.l-2] == '/') - r[0].n.s[r[0].n.l-2] = 0, r[0].n.l -= 2; - print_seq(r[0].n.s, (char*)seq, (char*)qual); - } - } else { - ++cnt[-l_seq]; - if (flag & 2) { - print_seq(r[0].n.s, r[0].s.s, r[0].q.l? r[0].q.s : 0); - print_seq(r[1].n.s, r[1].s.s, r[1].q.l? r[1].q.s : 0); - } - } + while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { + process_seqs(opt, n, bseq, cnt); + free(bseq); } fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); From 503ca9ed2e9d82727c38ed587c368905da43cc66 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:22:19 -0500 Subject: [PATCH 344/498] r339: pemerge - expose some settings to CLI --- main.c | 2 +- pemerge.c | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/main.c b/main.c index 31bba4b..b06ce8f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r338-beta" +#define PACKAGE_VERSION "0.7.0-r339-beta" #endif int bwa_fa2pac(int argc, char *argv[]); diff --git a/pemerge.c b/pemerge.c index 696137d..e37567a 100644 --- a/pemerge.c +++ b/pemerge.c @@ -210,7 +210,7 @@ static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cn int main_pemerge(int argc, char *argv[]) { - int c, flag = 0, i, n; + int c, flag = 0, i, n, min_ovlp = 10; int64_t cnt[MAX_ERR+1]; bseq1_t *bseq; gzFile fp, fp2 = 0; @@ -218,19 +218,24 @@ int main_pemerge(int argc, char *argv[]) pem_opt_t *opt; opt = pem_opt_init(); - while ((c = getopt(argc, argv, "muQ:")) >= 0) { + while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { if (c == 'm') flag |= 1; else if (c == 'u') flag |= 2; else if (c == 'Q') opt->q_thres = atoi(optarg); + else if (c == 't') opt->n_threads = atoi(optarg); + else if (c == 'T') min_ovlp = atoi(optarg); } if (flag == 0) flag = 3; opt->flag = flag; + opt->T = opt->a * min_ovlp; if (optind == argc) { fprintf(stderr, "\n"); 
fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); fprintf(stderr, "Options: -m output merged reads only\n"); fprintf(stderr, " -u output unmerged reads only\n"); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp); fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); fprintf(stderr, "\n"); free(opt); From b0a76884e8db42f3004fd218a94d663289936f05 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 11:51:23 -0500 Subject: [PATCH 345/498] r340: feature freeze; updated the manpage I will stop adding new features to bwa and prepare for the next release. I will briefly evaluate the variant calling accuracy before the release. --- bwa.1 | 24 +++++++++++++++++++++++- fastmap.c | 2 +- main.c | 4 ++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/bwa.1 b/bwa.1 index 45b9921..b1bbb2a 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "27 Feburary 2013" "bwa-0.7.0" "Bioinformatics tools" +.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -81,6 +81,8 @@ genome. .IR minSeedLen ] .RB [ -w .IR bandWidth ] +.RB [ -d +.IR zDropoff ] .RB [ -r .IR seedSplitRatio ] .RB [ -c @@ -163,6 +165,21 @@ Band width. Essentially, gaps longer than will not be found. Note that the maximum gap length is also affected by the scoring matrix and the hit length, not solely determined by this option. [100] .TP +.BI -d \ INT +Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between +the best and the current extension score is above +.RI | i - j |* A + INT , +where +.I i +and +.I j +are the current positions of the query and reference, respectively, and +.I A +is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it +doesn't penalize gaps in one of the sequences in the alignment. 
Z-dropoff not +only avoids unnecessary extension, but also reduces poor alignments inside a +long good alignment. [100] +.TP .BI -r \ FLOAT Trigger re-seeding for a MEM longer than .IR minSeedLen * FLOAT . @@ -215,6 +232,11 @@ and will be converted to a TAB in the output SAM. The read group ID will be attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. [null] .TP +.BI -T \ INT +Don't output alignment with score lower than +.IR INT . +This option only affects output. [30] +.TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These alignments will be flagged as secondary alignments. diff --git a/fastmap.c b/fastmap.c index 9204399..eda06bb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -59,7 +59,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -d INT off-diagnal X-dropoff [%d]\n", opt->zdrop); + fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); diff --git a/main.c b/main.c index b06ce8f..da986a7 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r339-beta" +#define PACKAGE_VERSION "0.7.0-r340-beta" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -34,7 +34,7 @@ static int usage() fprintf(stderr, "Command: index index sequences in the FASTA format\n"); fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - 
fprintf(stderr, " pemerge merge overlapping paired ends\n"); + fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); fprintf(stderr, " aln gapped/ungapped alignment\n"); fprintf(stderr, " samse generate alignment (single ended)\n"); fprintf(stderr, " sampe generate alignment (paired ended)\n"); From b5b50ac8da72bfe06810bbe24055dfb1218f83e3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 7 Mar 2013 21:35:57 -0500 Subject: [PATCH 346/498] r341: bugfix - wrong mate position when one end is mapped with a score less than -T. Caused by the -T option. --- bwamem_pair.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 9ff12b3..49e9ad2 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -303,7 +303,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n) { + if (a[i].n && a[i].a[0].score >= opt->T) { mem_alnreg2hit(&a[i].a[0], &h[i]); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); } else h[i].rb = h[i].re = -1; diff --git a/main.c b/main.c index da986a7..a772e3b 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r340-beta" +#define PACKAGE_VERSION "0.7.0-r341-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From fb21db0f4ce48d9d798f129f0f3f65e0c740c597 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Fri, 8 Mar 2013 16:49:42 +0100 Subject: [PATCH 347/498] Print out the posterior probabilities for sub-optimal hits. 
--- Makefile | 4 ++-- bwase.c | 35 +++++++++++++++++++---------------- bwtaln.h | 1 + 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/Makefile b/Makefile index 5a2f792..5d9b018 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ CC= gcc CXX= g++ -#CFLAGS= -g -Wall +CFLAGS= -g -Wall #CFLAGS= -pg -Wall -O2 #CFLAGS= -O3 -L/scr/plastilin/pkerp/local/lib #CFLAGS = -pg #CFLAGS = -O3 -pg -CFLAGS =-O3 -Wall +#CFLAGS =-O3 -Wall CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ diff --git a/bwase.c b/bwase.c index 7d9f129..1a71c06 100644 --- a/bwase.c +++ b/bwase.c @@ -128,7 +128,9 @@ void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int s s->best_pssm_score = best_score; s->posterior_prob = exp2((double)s->best_pssm_score) / total_prob; for (i = 0; i < n_aln; i++) { - const bwt_aln1_t *p = aln + i; + bwt_aln1_t *p = aln + i; + p->posterior_p = exp2((double)itf(p->pssm_score)) / total_prob; + if (itf(p->pssm_score) == best_score) s->c1 += p->l - p->k + 1; } @@ -543,7 +545,7 @@ static int64_t pos_5(const bwa_seq_t *p) void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) { - int j; + int j,i; if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { int seqid, nn, am = 0, flag = p->extra_flag; char XT; @@ -604,6 +606,13 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); + if (p->n_aln > 1) { + err_printf("\tXS:A:%f", p->aln[0].posterior_p); + for (i = 1; i < p->n_aln; i++) { + err_printf(";%f", p->aln[i].posterior_p); + } + err_printf("\n"); + } if (p->type != BWA_TYPE_NO_MATCH) { int i; // calculate XT tag @@ -721,8 +730,7 @@ int bwa_set_rg(const char *s) void 
bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ) { extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, n_seqs, tot_seqs = 0, m_aln; - bwt_aln1_t *aln = 0; + int i, n_seqs, tot_seqs = 0; bwa_seq_t *seqs; bwa_seqio_t *ks; clock_t t; @@ -736,7 +744,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f srand48(bns->seed); fp_sa = xopen(fn_sa, "r"); - m_aln = 0; fread(&opt, sizeof(gap_opt_t), 1, fp_sa); if (!(opt.mode & BWA_MODE_COMPREAD)) // in color space; initialize ntpac ntbns = bwa_open_nt(prefix); @@ -753,20 +760,17 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; p->posterior_prob = 5.0; - int n_aln; - fread(&n_aln, 4, 1, fp_sa); - if (n_aln > m_aln) { - m_aln = n_aln; - aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); - } - fread(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); - if (aln && aln->pssm) { + fread(&p->n_aln, 4, 1, fp_sa); + //aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); + p->aln = (bwt_aln1_t*)calloc(p->n_aln, sizeof(bwt_aln1_t)); + fread(p->aln, sizeof(bwt_aln1_t), p->n_aln, fp_sa); + if (p->aln && p->aln->pssm) { p->pssm = 1; - bwa_pssm_aln2seq_core(n_aln, aln, p, 1, n_occ); + bwa_pssm_aln2seq_core(p->n_aln, p->aln, p, 1, n_occ); adjust_pssm_score(bns, p, opt.prior); } else { p->pssm = 0; - bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); + bwa_aln2seq_core(p->n_aln, p->aln, p, 1, n_occ); } } @@ -792,7 +796,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f if (ntbns) bns_destroy(ntbns); bns_destroy(bns); fclose(fp_sa); - free(aln); } int bwa_sai2sam_se(int argc, char *argv[]) diff --git a/bwtaln.h b/bwtaln.h index 0b16859..1ba9346 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -40,6 +40,7 @@ typedef struct { bwtint_t k, l; int score; float pssm_score; + float posterior_p; char pssm; //indicate whether this alignment was 
made with a PSSM search } bwt_aln1_t; From 017be4540734273d60e33caa38161b35c872bd2a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 12:06:45 -0500 Subject: [PATCH 348/498] r342: bugfix in bwasw - AS is off by one but I do not understand why the old code does not have the same problem. --- Makefile | 5 +---- bwtsw2_aux.c | 2 +- main.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 93e3266..b660557 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,7 @@ CC= gcc CFLAGS= -g -Wall -O2 -CXXFLAGS= $(CFLAGS) AR= ar -DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64 +DFLAGS= -DHAVE_PTHREAD LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ @@ -17,8 +16,6 @@ SUBDIRS= . .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ -.cc.o: - $(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ all:$(PROG) diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 6527495..a84d7e0 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0); + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; diff --git a/main.c b/main.c index a772e3b..4240ac7 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r341-beta" +#define PACKAGE_VERSION "0.7.0-r342-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 
274c0ac96c749b352144bdc539b058b703489cb1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 12:40:31 -0500 Subject: [PATCH 349/498] r343: bugfix in mem - wrong mate info for unmap SAM generation is always among the nastiest bits. I would need to refactor at some point (hardly happening). --- bwamem.c | 13 +++++++++++-- bwamem_pair.c | 6 +++++- main.c | 2 +- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7a6d175..766227d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -670,9 +670,13 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (mid == rid) { int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(p0 - p1 + (p0 > p1? 1 : -1), str); + kputw(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0), str); } else kputw(0, str); kputc('\t', str); + } else if (m && is_mapped(p)) { // then copy the position + kputsn("\t=\t", 3, str); + kputuw(pos - bns->anns[rid].offset + 1, str); + kputsn("\t0\t", 3, str); } else kputsn("\t*\t0\t0\t", 7, str); if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL kputsn("*\t*", 3, str); @@ -760,7 +764,12 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b else if (h.qual > mapq0) h.qual = mapq0; bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); } - } else bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, 0, opt->flag&MEM_F_HARDCLIP, m); + } else { + bwahit_t h; + memset(&h, 0, sizeof(bwahit_t)); + h.rb = h.re = -1; h.flag = extra_flag; + bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); + } s->sam = str.s; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 49e9ad2..bbbbe02 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -306,7 +306,11 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co 
if (a[i].n && a[i].a[0].score >= opt->T) { mem_alnreg2hit(&a[i].a[0], &h[i]); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); - } else h[i].rb = h[i].re = -1; + } else { + memset(&h[i], 0, sizeof(bwahit_t)); + h[i].rb = h[i].re = -1; + h[i].flag = 1<<(6+i) | 1; + } } mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); diff --git a/main.c b/main.c index 4240ac7..1304a35 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r342-beta" +#define PACKAGE_VERSION "0.7.0-r343-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From af7b4d89808f1c48f320f4a2f76d3466d6ff50bc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 12:45:50 -0500 Subject: [PATCH 350/498] gcc wrongly thinks a variable may be uninitialized It should always be initialized. To avoid a warning, made a change. --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 766227d..40b481a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -612,7 +612,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) int score, n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; uint32_t *cigar = 0; - int64_t pos; + int64_t pos = -1; bwahit_t ptmp, *p = &ptmp; if (!p_) { // in this case, generate an unmapped alignment From 66c9783dafccd0b202eef2c331c4a9dc8d44fcba Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 13:15:43 -0500 Subject: [PATCH 351/498] r345: bugfix in mem - wrong mate strand for unmap Received a clean bill from Picard --- bwamem.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 40b481a..224309f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -628,6 +628,7 @@ void bwa_hit2sam(kstring_t *str, 
const int8_t mat[25], int q, int r, int w, cons } p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand + if (is_mapped(p) && m && !is_mapped(m) && (p->flag&0x10)) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read kputs(s->name, str); kputc('\t', str); if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag diff --git a/main.c b/main.c index 1304a35..3ae3f28 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r343-beta" +#define PACKAGE_VERSION "0.7.0-r345-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 5370bb23a3f9b263aa8c41e9610ad937094863bd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 14:14:42 -0500 Subject: [PATCH 352/498] Updated NEWS; added stddef.h for size_t I thought size_t is defined in stdlib.h, but it is not always. --- NEWS | 50 +++++++++++++++++++++++++++++++++++++++++++------- bwt.h | 1 + 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 35202f1..c94b28e 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,39 @@ +Release 0.7.1 (8 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes to BWA-MEM: + + * Bugfix: rare segmentation fault caused by a partial hit to the end of the + last sequence. + + * Bugfix: occasional mis-pairing given an interleaved fastq. + + * Bugfix: wrong mate information when the mate is unmapped. SAM generated by + BWA-MEM can now be validated with Picard. + + * Improved the performance and accuracy for ultra-long query sequences. + Short-read alignment is not affected. + +Changes to other components: + + * In BWA-backtrack and BWA-SW, replaced the code for global alignment, + Smith-Waterman and SW extension. 
The performance and accuracy of the two + algorithms stay the same. + + * Added an experimental subcommand to merge overlapping paired ends. The + algorithm is very conservative: it may miss true overlaps but rarely makes + mistakes. + +An important note is that like BWA-SW, BWA-MEM may output multiple primary +alignments for a read, which may cause problems to some tools. For aligning +sequence reads, it is advised to use `-M' to flag extra hits as secondary. This +option is not the default because multiple primary alignments are theoretically +possible in sequence alignment. + +(0.7.1: 8 March 2013, r347) + + + Beta Release 0.7.0 (28 Feburary, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -16,14 +52,14 @@ In addition to the algorithmic improvements, BWA-MEM also implements a few handy features in practical aspects: 1. BWA-MEM automatically switches between local and glocal (global wrt reads; - local wrt reference) alignment. It reports the end-to-end glocal alignment - if the glocal alignment is not much worse than the optimal local alignment. - Glocal alignment reduces reference bias. + local wrt reference) alignment. It reports the end-to-end glocal alignment + if the glocal alignment is not much worse than the optimal local alignment. + Glocal alignment reduces reference bias. 2. BWA-MEM automatically infers pair orientation from a batch of single-end alignments. It allows more than one orientations if there are sufficient - supporting reads. This feature has not been tested on reads from Illumina - jumping library yet. (EXPERIMENTAL) + supporting reads. This feature has not been tested on reads from Illumina + jumping library yet. (EXPERIMENTAL) 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It is possible to convert a name-sorted BAM to an interleaved fastq on the fly @@ -37,8 +73,8 @@ handy features in practical aspects: files without replying on bash features. 6. BWA-MEM provides a few basic APIs for single-end mapping. 
The `example.c' - program in the source code directory implements a full single-end mapper in - 50 lines of code. + program in the source code directory implements a full single-end mapper in + 50 lines of code. The BWA-MEM algorithm is in the beta phase. It is not advised to use BWA-MEM for production use yet. However, when the implementation becomes stable after a diff --git a/bwt.h b/bwt.h index e7b0f97..c36bf9b 100644 --- a/bwt.h +++ b/bwt.h @@ -29,6 +29,7 @@ #define BWA_BWT_H #include +#include // requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 #define OCC_INTV_SHIFT 7 From 1d132a546de2beafa01bd072c1dea4008e594797 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 8 Mar 2013 15:30:06 -0500 Subject: [PATCH 353/498] Release 0.7.1-r347 --- bwa.1 | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwa.1 b/bwa.1 index b1bbb2a..495df69 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "10 March 2013" "bwa-0.7.1" "Bioinformatics tools" +.TH bwa 1 "8 March 2013" "bwa-0.7.1" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index 3ae3f28..505cfbd 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.0-r345-beta" +#define PACKAGE_VERSION "0.7.1-r347" #endif int bwa_fa2pac(int argc, char *argv[]); From c0a7ecfb1e7179a4b9dfea0cff35da4f9b02ef61 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Sat, 9 Mar 2013 18:35:10 +0100 Subject: [PATCH 354/498] Name change and small usage change in the README file. --- README | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README b/README index 6c950c1..c855a38 100644 --- a/README +++ b/README @@ -1,15 +1,15 @@ -BPM Readme +BWA-PSSM Readme -The usage of BPM is very similar to that of bwa. Due to some of the +The usage of BWA-PSSM is very similar to that of BWA. 
Due to some of the underlying differences, howerver, the recommended parameters are different. Installation: -BPM requires the gdsl library. It can be downloaded from: +BWA-PSSM requires the gdsl library. It can be downloaded from: http://home.gna.org/gdsl/ -Once gdsl is installed, BPM can be compiled by simply running 'make'. +Once gdsl is installed, BWA-PSSM can be compiled by simply running 'make'. Usage: @@ -31,7 +31,7 @@ It is presumed that maximum length reads have not been degraded and should not b In the examples below, pssm-file can be either a pssm file as generated by fastq2wm33.pl or a regular fastq file. If it is a regular fastq file, it will be converted to a pssm internally. -bwa pssm -z 3.0 -l 18 -k 3 -n 30 -m 2000 index-file pssm-file | bwa samse index-file - pssm-file > out.sam +bwa pssm -m 2000 index-file pssm-file | bwa samse index-file - pssm-file > out.sam ** Helicos data ** From 9ea7f83974498297b9876f9c76e93ef42e76d2a2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:03:15 -0500 Subject: [PATCH 355/498] Emergent bugfix: wrong TLEN sign It is interesting that Picard did not find the issue. --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 224309f..9350943 100644 --- a/bwamem.c +++ b/bwamem.c @@ -671,7 +671,7 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons if (mid == rid) { int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0), str); + kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); } else kputw(0, str); kputc('\t', str); } else if (m && is_mapped(p)) { // then copy the position From 740d2c131494fcc57a8084c2f6e5f51ad846d284 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:03:57 -0500 Subject: [PATCH 356/498] Match to 'N' costs -1, instead of 0. 
This is to prevent alignment through 'N'. --- bwa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwa.c b/bwa.c index 991b23a..08d96b8 100644 --- a/bwa.c +++ b/bwa.c @@ -75,9 +75,9 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25]) for (i = k = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) mat[k++] = i == j? a : -b; - mat[k++] = 0; // ambiguous base + mat[k++] = -1; // ambiguous base } - for (j = 0; j < 5; ++j) mat[k++] = 0; + for (j = 0; j < 5; ++j) mat[k++] = -1; } // Generate CIGAR when the alignment end points are known From 2d01a297fbccfcd81f306f202bec91dbafa89b03 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:05:50 -0500 Subject: [PATCH 357/498] Improving 'properly paired' flag. If one end has a low quality tail that happens to have a score-20 hit, the pair won't be flagged as properly paired because bwa-mem thought it has multiple hits. By filtering with -T, we won't have this problem. --- bwa.1 | 2 +- bwamem_pair.c | 2 +- main.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwa.1 b/bwa.1 index 495df69..3e92456 100644 --- a/bwa.1 +++ b/bwa.1 @@ -235,7 +235,7 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. .BI -T \ INT Don't output alignment with score lower than .IR INT . -This option only affects output. [30] +This option affects output and occasionally SAM flag 2. [30] .TP .B -a Output all found alignments for single-end or unpaired paired-end reads. These diff --git a/bwamem_pair.c b/bwamem_pair.c index bbbbe02..b9a68f1 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -260,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) - if (a[i].a[j].secondary < 0) break; + if (a[i].a[j].secondary < 0 && a[i].a[j].score >= opt->T) break; is_multi[i] = j < a[i].n? 
1 : 0; } if (is_multi[0] || is_multi[1]) goto no_pairing; // TODO: in rare cases, the true hit may be long but with low score diff --git a/main.c b/main.c index 505cfbd..a49bdf8 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.1-r347" +#define PACKAGE_VERSION "0.7.1-r348-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 5581cb9152876913685fae153dad976c24552ca2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 9 Mar 2013 18:15:41 -0500 Subject: [PATCH 358/498] Release bwa-0.7.2-r351 For the TLEN sign fix. Sorry for the significant bug in 0.7.0/0.7.1 --- NEWS | 10 ++++++++++ bwa.1 | 2 +- main.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index c94b28e..7474726 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,13 @@ +Release 0.7.2 (9 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, +flagging `properly paired' also gets improved a little. + +(0.7.2: 9 March 2013, r351) + + + Release 0.7.1 (8 March, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index 3e92456..8e90848 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "8 March 2013" "bwa-0.7.1" "Bioinformatics tools" +.TH bwa 1 "9 March 2013" "bwa-0.7.2" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index a49bdf8..b0874f0 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.1-r348-beta" +#define PACKAGE_VERSION "0.7.2-r351" #endif int bwa_fa2pac(int argc, char *argv[]); From 8f0d43991356bd26bcb8fc9fcea4bdbba566373e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:25:17 -0400 Subject: [PATCH 359/498] prepare to replace the SAM printing code This move is dangerous as SAM printing is very complex, but it will benefit in the long run. 
The planned change will reduce the redundancy, improves clarity and most importantly makes it much easier to output multiple primary hits in an optional tag. --- bwamem.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++--- bwamem.h | 8 +++-- kstring.h | 17 +++++++++ 3 files changed, 124 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index 9350943..e1b6205 100644 --- a/bwamem.c +++ b/bwamem.c @@ -714,6 +714,95 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #undef is_mapped } +void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m) +{ + int i, copy_mate = 0; + mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert + + // set flag + p->flag |= m? 0x1 : 0; // is paired in sequencing + p->flag |= p->rid < 0? 4 : 0; // is mapped + p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped + if (p->rid < 0 && m && m->rid >= 0) + p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1; + p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand + p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand + if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read + + // print up to CIGAR + kputs(s->name, str); kputc('\t', str); // QNAME + kputw(p->flag, str); kputc('\t', str); // FLAG + if (p->rid >= 0) { // with coordinate + kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME + kputl(p->pos + 1, str); kputc('\t', str); // POS + kputw(p->mapq, str); kputc('\t', str); // MAPQ + if (p->n_cigar) { // aligned + for (i = 0; i < p->n_cigar; ++i) { + kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str); + } + } else kputc('*', str); // having a coordinate but unaligned (e.g. 
when copy_mate is true) + } else kputsn("*\t0\t0\t*", 7, str); // without coordinte + kputc('\t', str); + + // print the mate position if applicable + if (m && m->rid) { + if (p->rid == m->rid) kputc('=', str); + else kputs(bns->anns[m->rid].name, str); + kputc('\t', str); + kputl(m->pos + 1, str); + if (p->rid == m->rid) { + int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5; + int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5; + kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); + } else kputc('0', str); + } else if (m && p->rid) { + kputsn("\t=\t", 3, str); + kputl(p->pos + 1, str); + kputsn("\t0\t", 3, str); + } else kputsn("*\t0\t0\t", 6, str); + + // print SEQ and QUAL + if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL + kputsn("*\t*", 3, str); + } else if (!p->is_rev) { // the forward strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } else { // the reverse strand + int i, qb = 0, qe = s->l_seq; + if (p->n_cigar) { + if ((p->cigar[0]&0xf) == 4) qe -= p->cigar[0]>>4; + if ((p->cigar[p->n_cigar-1]&0xf) == 4) qb += p->cigar[p->n_cigar-1]>>4; + } + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; + kputc('\t', str); + if (s->qual) { // printf qual + ks_resize(str, str->l + (qe - qb) + 1); + for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; + str->s[str->l] = 0; + } else kputc('*', str); + } + + // print optional tags + if (p->n_cigar) { kputsn("\tNM:i:", 6, str); 
kputw(p->NM, str); } + if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } + if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } + if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } + if (s->comment) { kputc('\t', str); kputs(s->comment, str); } + kputc('\n', str); +} + /************************ * Integrated interface * ************************/ @@ -816,14 +905,20 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int i, w2, qb = ar->qb, qe = ar->qe, NM, score, is_rev; - int64_t pos, rb = ar->rb, re = ar->re; + int i, w2, qb, qe, NM, score, is_rev; + int64_t pos, rb, re; uint8_t *query; + memset(&a, 0, sizeof(mem_aln_t)); + if (ar == 0 || ar->rb < 0 || ar->re < 0) { // generate an unmapped record + a.rid = -1; a.pos = -1; a.flag |= 0x4; + return a; + } + qb = ar->qb, qe = ar->qe; + rb = ar->rb, re = ar->re; query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; - memset(&a, 0, sizeof(mem_aln_t)); a.mapq = mem_approx_mapq_se(opt, ar); bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); @@ -839,13 +934,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); if (clip5) { memmove(a.cigar+1, a.cigar, a.n_cigar * 4); - a.cigar[0] = clip5<<4|3; + a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); ++a.n_cigar; } - if (clip3) a.cigar[a.n_cigar++] = clip3<<4|3; + if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 
4 : 3); } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; + a.r5 = rb; a.score = ar->score; a.sub = ar->sub; free(query); return a; } diff --git a/bwamem.h b/bwamem.h index 14f04d5..dce10b3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -67,11 +67,15 @@ typedef struct { // TODO: This is an intermediate struct only. Better get rid of } bwahit_t; typedef struct { // This struct is only used for the convenience of API. - int rid; // reference sequence index in bntseq_t - int pos; // forward strand 5'-end mapping position + int64_t pos; // forward strand 5'-end mapping position + int rid; // reference sequence index in bntseq_t; <0 for unmapped + int flag; // extra flag uint32_t is_rev:1, mapq:8, NM:23; // is_rev: whether on the reverse strand; mapq: mapping quality; NM: edit distance int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 + + int64_t r5; // position of the 5'-end of read (for computing TLEN) + int score, sub; } mem_aln_t; #ifdef __cplusplus diff --git a/kstring.h b/kstring.h index 81d7d60..04f1c42 100644 --- a/kstring.h +++ b/kstring.h @@ -89,6 +89,23 @@ static inline int kputuw(unsigned c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + long l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + int ksprintf(kstring_t *s, const char *fmt, ...); #endif From 47952b6f3ff3fd3ca48285ffc16f6535bb0b69aa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:35:32 -0400 Subject: [PATCH 360/498] drop an unnecessary member from mem_aln_t --- bwamem.c | 17 ++++++++++++++--- bwamem.h | 1 - 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index e1b6205..3305658 100644 --- a/bwamem.c +++ b/bwamem.c @@ -714,6 +714,17 @@ void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, cons #undef is_mapped } +static inline int get_rlen(int n_cigar, const uint32_t *cigar) +{ + int k, l; + for (k = l = 0; k < n_cigar; ++k) { + int op = cigar[k]&0xf; + if (op == 0 || op == 2) + l += cigar[k]>>4; + } + return l; +} + void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m) { int i, copy_mate = 0; @@ -751,8 +762,8 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputc('\t', str); kputl(m->pos + 1, str); if (p->rid == m->rid) { - int64_t p0 = p->r5 < bns->l_pac? p->r5 : (bns->l_pac<<1) - 1 - p->r5; - int64_t p1 = m->r5 < bns->l_pac? m->r5 : (bns->l_pac<<1) - 1 - m->r5; + int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) : 0); + int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) : 0); kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? 
-1 : 0)), str); } else kputc('0', str); } else if (m && p->rid) { @@ -941,7 +952,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; - a.r5 = rb; a.score = ar->score; a.sub = ar->sub; + a.score = ar->score; a.sub = ar->sub; free(query); return a; } diff --git a/bwamem.h b/bwamem.h index dce10b3..55cff76 100644 --- a/bwamem.h +++ b/bwamem.h @@ -74,7 +74,6 @@ typedef struct { // This struct is only used for the convenience of API. int n_cigar; // number of CIGAR operations uint32_t *cigar; // CIGAR in the BAM encoding: opLen<<4|op; op to integer mapping: MIDSH=>01234 - int64_t r5; // position of the 5'-end of read (for computing TLEN) int score, sub; } mem_aln_t; From c7edaa8e84eadceae0fd9bfc475ad0d6faf7c936 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:55:52 -0400 Subject: [PATCH 361/498] to test the new sam writer... --- bwamem.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 3305658..7d45c44 100644 --- a/bwamem.c +++ b/bwamem.c @@ -742,7 +742,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m // print up to CIGAR kputs(s->name, str); kputc('\t', str); // QNAME - kputw(p->flag, str); kputc('\t', str); // FLAG + kputw((p->flag&0xffff) | (p->flag&0x10000? 
0x100 : 0), str); kputc('\t', str); // FLAG if (p->rid >= 0) { // with coordinate kputs(bns->anns[p->rid].name, str); kputc('\t', str); // RNAME kputl(p->pos + 1, str); kputc('\t', str); // POS @@ -874,6 +874,38 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } +void mem_aln2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +{ + kstring_t str; + str.l = str.m = 0; str.s = 0; + if (a->n > 0 && a->a[0].score >= opt->T) { + int k; + kvec_t(mem_aln_t) aa; + kv_init(aa); + for (k = 0; k < a->n; ++k) { + mem_alnreg_t *p = &a->a[k]; + mem_aln_t *q; + if (p->score < opt->T) continue; + if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; + q = kv_pushp(mem_aln_t, aa); + *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + q->flag |= extra_flag; + if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; + if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + } + for (k = 0; k < aa.n; ++k) + mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m); + for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); + free(aa.a); + } else { + mem_aln_t t; + t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + mem_aln2sam(bns, &str, s, 1, &t, 0, m); + } + s->sam = str.s; +} + mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq) { int i; From ebb45dc42ecbdef77ffd5cd45186f0ef74feb5c1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 21:59:15 -0400 Subject: [PATCH 362/498] new code works for SE --- bwamem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7d45c44..38ab6e6 100644 --- a/bwamem.c +++ b/bwamem.c @@ -874,7 +874,7 @@ void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, b s->sam = str.s; } 
-void mem_aln2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) +void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { kstring_t str; str.l = str.m = 0; str.s = 0; @@ -1024,7 +1024,8 @@ static void *worker2(void *data) if (!(w->opt->flag&MEM_F_PE)) { for (i = w->start; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); - mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + //mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } } else { From 0b0455ca51a7291ea16caf4113c33c03c3691251 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 22:18:23 -0400 Subject: [PATCH 363/498] replace PE; BUGGY right now!! --- bwamem_pair.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index b9a68f1..7cbf5e0 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -230,15 +230,14 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); - extern void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); - extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); - extern void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p, int is_hard, const bwahit_t 
*m); + extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); + extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); - int n = 0, i, j, z[2], o, subo, n_sub; + int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; kstring_t str; mem_alnreg_v b[2]; - bwahit_t h[2]; + mem_aln_t h[2]; str.l = str.m = 0; str.s = 0; // perform SW for the best alignment @@ -256,7 +255,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { - int is_multi[2], q_pe, extra_flag = 1, score_un, q_se[2]; + int is_multi[2], q_pe, score_un, q_se[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { for (j = 1; j < a[i].n; ++j) @@ -292,27 +291,21 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); q_se[1] = mem_approx_mapq_se(opt, &a[1].a[0]); } - mem_alnreg2hit(&a[0].a[z[0]], &h[0]); h[0].qual = q_se[0]; h[0].flag |= 0x40 | extra_flag; - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[0].seq, &h[0].qb, &h[0].qe, &h[0].rb, &h[0].re); - mem_alnreg2hit(&a[1].a[z[1]], &h[1]); h[1].qual = q_se[1]; h[1].flag |= 0x80 | extra_flag; - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[1].seq, &h[1].qb, &h[1].qe, &h[1].rb, &h[1].re); - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[0], &h[0], opt->flag&MEM_F_HARDCLIP, &h[1]); s[0].sam = strdup(str.s); str.l = 0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, &s[1], &h[1], opt->flag&MEM_F_HARDCLIP, &h[0]); s[1].sam = str.s; + // write SAM + h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, 
s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; + h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; + mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; + mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; } else goto no_pairing; return n; no_pairing: for (i = 0; i < 2; ++i) { - if (a[i].n && a[i].a[0].score >= opt->T) { - mem_alnreg2hit(&a[i].a[0], &h[i]); - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s[i].seq, &h[i].qb, &h[i].qe, &h[i].rb, &h[i].re); - } else { - memset(&h[i], 0, sizeof(bwahit_t)); - h[i].rb = h[i].re = -1; - h[i].flag = 1<<(6+i) | 1; - } + if (a[i].n && a[i].a[0].score >= opt->T) + h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); + else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); } - mem_sam_se(opt, bns, pac, &s[0], &a[0], 0x41, &h[1]); - mem_sam_se(opt, bns, pac, &s[1], &a[1], 0x81, &h[0]); + mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); + mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); return n; } From 26f4c704ed9dcaffb057163636165e52cf6f6b94 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 22:24:54 -0400 Subject: [PATCH 364/498] drop the old SAM writer --- bwamem.c | 148 -------------------------------------------------- bwamem.h | 7 --- bwamem_pair.c | 1 - 3 files changed, 156 deletions(-) diff --git a/bwamem.c b/bwamem.c index 38ab6e6..f780949 100644 --- a/bwamem.c +++ b/bwamem.c @@ -607,113 +607,6 @@ static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) return w; } -void bwa_hit2sam(kstring_t *str, const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, const bwahit_t *p_, int is_hard, const bwahit_t *m) -{ -#define is_mapped(x) ((x)->rb >= 0 && (x)->rb < (x)->re && (x)->re <= bns->l_pac<<1) - int score, 
n_cigar, is_rev = 0, rid, mid, copy_mate = 0, NM = -1; - uint32_t *cigar = 0; - int64_t pos = -1; - bwahit_t ptmp, *p = &ptmp; - - if (!p_) { // in this case, generate an unmapped alignment - memset(&ptmp, 0, sizeof(bwahit_t)); - ptmp.rb = ptmp.re = -1; - } else ptmp = *p_; - p->flag |= m? 1 : 0; // is paired in sequencing - p->flag |= !is_mapped(p)? 4 : 0; // is mapped - p->flag |= m && !is_mapped(m)? 8 : 0; // is mate mapped - if (m && !is_mapped(p) && is_mapped(m)) { - p->rb = m->rb; p->re = m->re; p->qb = 0; p->qe = s->l_seq; - copy_mate = 1; - } - p->flag |= p->rb >= bns->l_pac? 0x10 : 0; // is reverse strand - p->flag |= m && m->rb >= bns->l_pac? 0x20 : 0; // is mate on reverse strand - if (is_mapped(p) && m && !is_mapped(m) && (p->flag&0x10)) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read - kputs(s->name, str); kputc('\t', str); - if (is_mapped(p)) { // has a coordinate, no matter whether it is mapped or copied from the mate - int sam_flag = p->flag&0xff; // the flag that will be outputed to SAM; it is not always the same as p->flag - if (p->flag&0x10000) sam_flag |= 0x100; - if (!copy_mate) { - int w2; - w2 = infer_bw(p->qe - p->qb, p->re - p->rb, p->score, mat[0], q, r); - w2 = w2 < w? w2 : w; - cigar = bwa_gen_cigar(mat, q, r, w2, bns->l_pac, pac, p->qe - p->qb, (uint8_t*)&s->seq[p->qb], p->rb, p->re, &score, &n_cigar, &NM); - p->flag |= n_cigar == 0? 4 : 0; // FIXME: check why this may happen (this has already happened) - } else n_cigar = 0, cigar = 0; - pos = bns_depos(bns, p->rb < bns->l_pac? p->rb : p->re - 1, &is_rev); - bns_cnt_ambi(bns, pos, p->re - p->rb, &rid); - kputw(sam_flag, str); kputc('\t', str); - kputs(bns->anns[rid].name, str); kputc('\t', str); kputuw(pos - bns->anns[rid].offset + 1, str); kputc('\t', str); - kputw(p->qual, str); kputc('\t', str); - if (n_cigar) { - int i, clip5, clip3; - clip5 = is_rev? s->l_seq - p->qe : p->qb; - clip3 = is_rev? 
p->qb : s->l_seq - p->qe; - if (clip5) { kputw(clip5, str); kputc("SH"[(is_hard!=0)], str); } - for (i = 0; i < n_cigar; ++i) { - kputw(cigar[i]>>4, str); kputc("MIDSH"[cigar[i]&0xf], str); - } - if (clip3) { kputw(clip3, str); kputc("SH"[(is_hard!=0)], str); } - } else kputc('*', str); - } else { // no coordinate - kputw(p->flag, str); - kputs("\t*\t0\t0\t*", str); - rid = -1; - } - if (m && is_mapped(m)) { // then print mate pos and isize - pos = bns_depos(bns, m->rb < bns->l_pac? m->rb : m->re - 1, &is_rev); - bns_cnt_ambi(bns, pos, m->re - m->rb, &mid); - kputc('\t', str); - if (mid == rid) kputc('=', str); - else kputs(bns->anns[mid].name, str); - kputc('\t', str); kputuw(pos - bns->anns[mid].offset + 1, str); - kputc('\t', str); - if (mid == rid) { - int64_t p0 = p->rb < bns->l_pac? p->rb : (bns->l_pac<<1) - 1 - p->rb; - int64_t p1 = m->rb < bns->l_pac? m->rb : (bns->l_pac<<1) - 1 - m->rb; - kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); - } else kputw(0, str); - kputc('\t', str); - } else if (m && is_mapped(p)) { // then copy the position - kputsn("\t=\t", 3, str); - kputuw(pos - bns->anns[rid].offset + 1, str); - kputsn("\t0\t", 3, str); - } else kputsn("\t*\t0\t0\t", 7, str); - if (p->flag&0x100) { // for secondary alignments, don't write SEQ and QUAL - kputsn("*\t*", 3, str); - } else if (!(p->flag&0x10)) { // print SEQ and QUAL, the forward strand - int i, qb = 0, qe = s->l_seq; - if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; - kputc('\t', str); - if (s->qual) { // printf qual - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qb; i < qe; ++i) str->s[str->l++] = s->qual[i]; - str->s[str->l] = 0; - } else kputc('*', str); - } else { // the reverse strand - int i, qb = 0, qe = s->l_seq; - if (!(p->flag&4) && is_hard) qb = p->qb, qe = p->qe; - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qe-1; i >= qb; --i) 
str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; - kputc('\t', str); - if (s->qual) { // printf qual - ks_resize(str, str->l + (qe - qb) + 1); - for (i = qe-1; i >= qb; --i) str->s[str->l++] = s->qual[i]; - str->s[str->l] = 0; - } else kputc('*', str); - } - if (NM >= 0) { kputsn("\tNM:i:", 6, str); kputw(NM, str); } - if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } - if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } - if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - if (s->comment) { kputc('\t', str); kputs(s->comment, str); } - kputc('\n', str); - free(cigar); -#undef is_mapped -} - static inline int get_rlen(int n_cigar, const uint32_t *cigar) { int k, l; @@ -834,46 +727,6 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) return mapq; } -void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h) -{ - h->rb = a->rb; h->re = a->re; h->qb = a->qb; h->qe = a->qe; - h->score = a->score; - h->sub = a->secondary >= 0? -1 : a->sub > a->csub? a->sub : a->csub; - h->qual = 0; // quality unset - h->flag = a->secondary >= 0? 
0x100 : 0; // only the "secondary" bit is set -} - -void mem_sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const bwahit_t *m) -{ - int k; - kstring_t str; - str.l = str.m = 0; str.s = 0; - if (a->n > 0 && a->a[0].score >= opt->T) { - int mapq0 = -1; - for (k = 0; k < a->n; ++k) { - bwahit_t h; - mem_alnreg_t *p = &a->a[k]; - if (p->score < opt->T) continue; - if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; - mem_alnreg2hit(p, &h); - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)s->seq, &h.qb, &h.qe, &h.rb, &h.re); - h.flag |= extra_flag; - if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) h.flag |= 0x10000; // print the sequence, but flag as secondary (for Picard) - h.qual = p->secondary >= 0? 0 : mem_approx_mapq_se(opt, p); - if (k == 0) mapq0 = h.qual; - else if (h.qual > mapq0) h.qual = mapq0; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, p->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); - } - } else { - bwahit_t h; - memset(&h, 0, sizeof(bwahit_t)); - h.rb = h.re = -1; h.flag = extra_flag; - bwa_hit2sam(&str, opt->mat, opt->q, opt->r, opt->w, bns, pac, s, &h, opt->flag&MEM_F_HARDCLIP, m); - } - s->sam = str.s; -} - void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { kstring_t str; @@ -1024,7 +877,6 @@ static void *worker2(void *data) if (!(w->opt->flag&MEM_F_PE)) { for (i = w->start; i < w->n; i += w->step) { mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); - //mem_sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } diff --git a/bwamem.h b/bwamem.h index 55cff76..ae201e6 100644 --- a/bwamem.h +++ b/bwamem.h @@ -59,13 +59,6 @@ typedef struct { double avg, std; } 
mem_pestat_t; -typedef struct { // TODO: This is an intermediate struct only. Better get rid of it. - int64_t rb, re; - int qb, qe, flag, qual; - // optional info - int score, sub; -} bwahit_t; - typedef struct { // This struct is only used for the convenience of API. int64_t pos; // forward strand 5'-end mapping position int rid; // reference sequence index in bntseq_t; <0 for unmapped diff --git a/bwamem_pair.c b/bwamem_pair.c index 7cbf5e0..5f2c7bb 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -169,7 +169,6 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) { - extern void mem_alnreg2hit(const mem_alnreg_t *a, bwahit_t *h); pair64_v v, u; int r, i, k, y[4], ret; // y[] keeps the last hit kv_init(v); kv_init(u); From 0f88103d2a203a136fac73253eef30bc7311a220 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 23:01:51 -0400 Subject: [PATCH 365/498] SAM almost identical to 0.7.2 --- bwamem.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index f780949..a10448a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -618,20 +618,22 @@ static inline int get_rlen(int n_cigar, const uint32_t *cigar) return l; } -void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m) +void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m_) { - int i, copy_mate = 0; - mem_aln_t ptmp = list[which], *p = &ptmp; // make a copy of the alignment to convert + int i; + mem_aln_t ptmp = list[which], *p = &ptmp, mtmp, *m = 0; // make a copy of the alignment to convert + if (m_) mtmp = *m_, m = &mtmp; // set flag p->flag |= m? 0x1 : 0; // is paired in sequencing - p->flag |= p->rid < 0? 
4 : 0; // is mapped - p->flag |= m && m->rid < 0? 8 : 0; // is mate mapped - if (p->rid < 0 && m && m->rid >= 0) - p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0, copy_mate = 1; + p->flag |= p->rid < 0? 0x4 : 0; // is mapped + p->flag |= m && m->rid < 0? 0x8 : 0; // is mate mapped + if (p->rid < 0 && m && m->rid >= 0) // copy mate to alignment + p->rid = m->rid, p->pos = m->pos, p->is_rev = m->is_rev, p->n_cigar = 0; + if (m && m->rid < 0 && p->rid >= 0) // copy alignment to mate + m->rid = p->rid, m->pos = p->pos, m->is_rev = p->is_rev, m->n_cigar = 0; p->flag |= p->is_rev? 0x10 : 0; // is on the reverse strand p->flag |= m && m->is_rev? 0x20 : 0; // is mate on the reverse strand - if (p->rid >= 0 && m && m->rid <= 0 && p->is_rev) p->flag |= 0x20; // if mate is unmapped, it takes the strand of the current read // print up to CIGAR kputs(s->name, str); kputc('\t', str); // QNAME @@ -649,21 +651,18 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputc('\t', str); // print the mate position if applicable - if (m && m->rid) { + if (m && m->rid >= 0) { if (p->rid == m->rid) kputc('=', str); else kputs(bns->anns[m->rid].name, str); kputc('\t', str); - kputl(m->pos + 1, str); + kputl(m->pos + 1, str); kputc('\t', str); if (p->rid == m->rid) { int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) : 0); int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) : 0); - kputw(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? -1 : 0)), str); + kputw(m->n_cigar && p->n_cigar? 
p1 - p0 : 0, str); // compute TLEN if both ends mapped; otherwise, set to zero } else kputc('0', str); - } else if (m && p->rid) { - kputsn("\t=\t", 3, str); - kputl(p->pos + 1, str); - kputsn("\t0\t", 3, str); - } else kputsn("*\t0\t0\t", 6, str); + } else kputsn("*\t0\t0", 5, str); + kputc('\t', str); // print SEQ and QUAL if (p->flag & 0x100) { // for secondary alignments, don't write SEQ and QUAL @@ -754,6 +753,7 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa } else { mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); + t.flag |= extra_flag; mem_aln2sam(bns, &str, s, 1, &t, 0, m); } s->sam = str.s; From 6c665189ad628a08ee421ce408d8fc25e7684878 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 23:16:18 -0400 Subject: [PATCH 366/498] r359: identical output to 0.7.2 (without -a) --- bwamem.c | 2 +- bwamem_pair.c | 2 ++ example.c | 2 +- main.c | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index a10448a..71eb436 100644 --- a/bwamem.c +++ b/bwamem.c @@ -837,7 +837,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; - a.score = ar->score; a.sub = ar->sub; + a.score = ar->score; a.sub = ar->sub > ar->csub? 
ar->sub : ar->csub; free(query); return a; } diff --git a/bwamem_pair.c b/bwamem_pair.c index 5f2c7bb..6316f6a 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -295,6 +295,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; + free(h[0].cigar); free(h[1].cigar); } else goto no_pairing; return n; @@ -306,5 +307,6 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); + free(h[0].cigar); free(h[1].cigar); return n; } diff --git a/example.c b/example.c index b59eec2..7c25674 100644 --- a/example.c +++ b/example.c @@ -34,7 +34,7 @@ int main(int argc, char *argv[]) if (ar.a[i].secondary >= 0) continue; // skip secondary alignments a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR // print alignment - printf("%s\t%c\t%s\t%d\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, a.pos, a.mapq); + printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); for (k = 0; k < a.n_cigar; ++k) // print CIGAR printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); printf("\t%d\n", a.NM); // print edit distance diff --git a/main.c b/main.c index b0874f0..e6cb68d 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r351" +#define PACKAGE_VERSION "0.7.2-r359-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From dab5b17c1a6c168e2ada49ccfa495b5e9eacc24a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 11 Mar 2013 
23:43:58 -0400 Subject: [PATCH 367/498] r360: output alternative primary alignments in XA --- bwamem.c | 19 ++++++++++++++++++- main.c | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 71eb436..7925dee 100644 --- a/bwamem.c +++ b/bwamem.c @@ -702,6 +702,22 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } + if (n > 1) { // output multiple primary hits + kputsn("\tXA:Z:", 6, str); + for (i = 0; i < n; ++i) { + const mem_aln_t *r = &list[i]; + int k; + if (i == which) continue; + kputs(bns->anns[r->rid].name, str); kputc(',', str); + kputc("+-"[r->is_rev], str); + kputl(r->pos+1, str); kputc(',', str); + for (k = 0; k < r->n_cigar; ++k) { + kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + } + kputc(',', str); + kputw(r->NM, str); kputc(';', str); + } + } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } kputc('\n', str); } @@ -742,7 +758,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); - q->flag |= extra_flag; + q->flag |= extra_flag | (p->secondary >= 0? 
0x100 : 0); // flag secondary + if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; } diff --git a/main.c b/main.c index e6cb68d..c6d6e29 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r359-beta" +#define PACKAGE_VERSION "0.7.2-r360-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From aa7cdf4bb3e2f6c323f0b45a9528dcea8f694c75 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Mar 2013 00:00:04 -0400 Subject: [PATCH 368/498] r361: flag proper pair even if multi-primary Up to here, all the features in my checklist have been implemented. --- bwamem_pair.c | 6 ++++++ main.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 6316f6a..51a844b 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -305,6 +305,12 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); } + if (h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. 
+ int64_t dist; + int d; + d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); + if (!pes[d].failed && dist >= pes[d].low && dist <= pes[d].high) extra_flag |= 2; + } mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); free(h[0].cigar); free(h[1].cigar); diff --git a/main.c b/main.c index c6d6e29..26e6661 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r360-beta" +#define PACKAGE_VERSION "0.7.2-r361-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From c29b176cb6412fc9b2f6fafaa11e7bd4606334ff Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Mar 2013 00:14:36 -0400 Subject: [PATCH 369/498] r362: bugfix - occasionally wrong TLEN Use the 0.7.2 way to compute TLEN --- bwamem.c | 7 ++++--- main.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index 7925dee..924c97f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -657,9 +657,10 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputc('\t', str); kputl(m->pos + 1, str); kputc('\t', str); if (p->rid == m->rid) { - int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) : 0); - int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) : 0); - kputw(m->n_cigar && p->n_cigar? p1 - p0 : 0, str); // compute TLEN if both ends mapped; otherwise, set to zero + int64_t p0 = p->pos + (p->is_rev? get_rlen(p->n_cigar, p->cigar) - 1 : 0); + int64_t p1 = m->pos + (m->is_rev? get_rlen(m->n_cigar, m->cigar) - 1 : 0); + if (m->n_cigar == 0 || p->n_cigar == 0) kputc('0', str); + else kputl(-(p0 - p1 + (p0 > p1? 1 : p0 < p1? 
-1 : 0)), str); } else kputc('0', str); } else kputsn("*\t0\t0", 5, str); kputc('\t', str); diff --git a/main.c b/main.c index 26e6661..b749dfc 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r361-beta" +#define PACKAGE_VERSION "0.7.2-r362-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From bdf34f6ce7438bcc2d5ca3abce4f29930f93aef4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 Mar 2013 09:56:04 -0400 Subject: [PATCH 370/498] r363: XA=>XP; output mapQ in XP In BWA, XA gives hits "shadowed" by the primary hit. In BWA-MEM, we output primary hits only. Primary hits may have non-zero mapping quality. --- bwa.1 | 6 ++++-- bwamem.c | 16 ++++++++++------ main.c | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/bwa.1 b/bwa.1 index 8e90848..f01dce1 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "9 March 2013" "bwa-0.7.2" "Bioinformatics tools" +.TH bwa 1 "13 March 2013" "bwa-0.7.3" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -574,11 +574,13 @@ XM Number of mismatches in the alignment XO Number of gap opens XG Number of gap extentions XT Type: Unique/Repeat/N/Mate-sw -XA Alternative hits; format: (chr,pos,CIGAR,NM;)* +XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/ _ XS Suboptimal alignment score XF Support from forward/reverse alignment XE Number of supporting seeds +_ +XP Alt primary hits; format: /(chr,pos,CIGAR;mapQ,NM;)+/ .TE .PP diff --git a/bwamem.c b/bwamem.c index 924c97f..b6a32f7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -703,20 +703,23 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - if (n > 1) { // output multiple primary hits - kputsn("\tXA:Z:", 6, str); + for (i = 0; 
i < n; ++i) + if (i != which && !(list[i].flag&0x20000)) break; // 0x20000: shadowed multi hit + if (i < n) { // there are other primary hits; output them + kputsn("\tXP:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; - if (i == which) continue; + if (i == which || (list[i].flag&0x20000)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit kputs(bns->anns[r->rid].name, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputl(r->pos+1, str); kputc(',', str); for (k = 0; k < r->n_cigar; ++k) { kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); } - kputc(',', str); - kputw(r->NM, str); kputc(';', str); + kputc(',', str); kputw(r->mapq, str); + kputc(',', str); kputw(r->NM, str); + kputc(';', str); } } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } @@ -833,7 +836,8 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; - a.mapq = mem_approx_mapq_se(opt, ar); + a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; + if (ar->secondary >= 0) a.flag |= 0x20000; bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); w2 = w2 < opt->w? 
w2 : opt->w; diff --git a/main.c b/main.c index b749dfc..c5b84be 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r362-beta" +#define PACKAGE_VERSION "0.7.2-r363-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 925aa6f60db008f43bc575e22dcd0491999e7cab Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 13 Mar 2013 09:18:18 +0000 Subject: [PATCH 371/498] Added Makefile.bak and bwamem-lite to .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 57cb318..fba548e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ bwa test test64 .*.swp +Makefile.bak +bwamem-lite From e5355fe3a055307aaf8f6868770573a6c6aef0b4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 14 Mar 2013 22:01:26 -0400 Subject: [PATCH 372/498] r364: bug in mem pairing (no effect with -A=1) Forgot to adjust for matching score. This bug has no effect when -A takes the default value. --- bwamem_pair.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index 51a844b..23c99ef 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -201,7 +201,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ if (dist > pes[dir].high) break; if (dist < pes[dir].low) continue; ns = (dist - pes[dir].avg) / pes[dir].std; - q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. * erfc(fabs(ns) * M_SQRT1_2)) + .499); // .721 = 1/log(4) + q = (int)((v.a[i].y>>32) + (v.a[k].y>>32) + .721 * log(2. 
* erfc(fabs(ns) * M_SQRT1_2)) * opt->a + .499); // .721 = 1/log(4) if (q < 0) q = 0; p = kv_pushp(pair64_t, u); p->y = (uint64_t)k<<32 | i; diff --git a/main.c b/main.c index c5b84be..905161f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r363-beta" +#define PACKAGE_VERSION "0.7.2-r364-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From dd511778374b100a98be761e698b2ba5be2bfa4b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 15 Mar 2013 11:59:05 -0400 Subject: [PATCH 373/498] r365: bugfix - wrong alignment (right mapping) The bug only happens when there is a 1bp del and 1bp ins which are close to the end and there are no other substitutions or indels. In this case, bwa mem gave a wrong band width. --- bwamem.c | 6 +++--- main.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c index b6a32f7..da9f2be 100644 --- a/bwamem.c +++ b/bwamem.c @@ -556,7 +556,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int prev = a->score; aw[0] = opt->w << i; a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]); - if (bwa_verbose >= 4) printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); + if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query @@ -574,7 +574,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int int prev = a->score; aw[1] = opt->w << i; a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); - if (bwa_verbose >= 4) printf("R\t%d < %d; w=%d; 
max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); + if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above @@ -601,7 +601,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) { int w; - if (l1 == l2 && l1 * a - score < (q + r)<<1) return 0; // to get equal alignment length, we need at least two gaps + if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.); if (w < abs(l1 - l2)) w = abs(l1 - l2); return w; diff --git a/main.c b/main.c index 905161f..4761bc9 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r364-beta" +#define PACKAGE_VERSION "0.7.2-r365-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 7dec00c217268d1eebf9dd0fc42c73aa91800695 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 15 Mar 2013 12:51:53 -0400 Subject: [PATCH 374/498] Release BWA-0.7.3-r366 --- NEWS | 31 +++++++++++++++++++++++++++++++ bwa.1 | 4 ++-- main.c | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 7474726..788ae40 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,34 @@ +Release 0.7.3 (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes to BWA-MEM: + + * Bugfix: pairing score is inaccurate when option -A does not take the default + value. This is a very minor issue even if it happens. + + * Bugfix: occasionally wrong CIGAR. This happens when in the alignment there + is a 1bp deletion and a 1bp insertion which are close to the end of the + reads, and there are no other substitutions or indels. BWA-MEM would not do + a gapped alignment due to the bug. 
+ + * New feature: output other non-overlapping alignments in the XP tag such that + we can see the entire picture of alignment from one SAM line. XP gives the + position, CIGAR, NM and mapQ of each aligned subsequence of the query. + +BWA-MEM has been used to align ~300Gbp 100-700bp SE/PE reads. SNP/indel calling +has also been evaluated on part of these data. BWA-MEM generally gives better +pre-filtered SNP calls than BWA. No significant issues have been observed since +0.7.2, though minor improvements or bugs (e.g. the bug fixed in this release) +are still possible. If you find potential issues, please send bug reports to +<bio-bwa-help@lists.sourceforge.net> (free registration required). + +In addition, more detailed description of the BWA-MEM algorithm can be found at +<https://github.com/lh3/mem-paper>. + +(0.7.3: 15 March 2013, r366) + + + Release 0.7.2 (9 March, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index f01dce1..a02aebe 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "13 March 2013" "bwa-0.7.3" "Bioinformatics tools" +.TH bwa 1 "15 March 2013" "bwa-0.7.3" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -580,7 +580,7 @@ XS Suboptimal alignment score XF Support from forward/reverse alignment XE Number of supporting seeds _ -XP Alt primary hits; format: /(chr,pos,CIGAR;mapQ,NM;)+/ +XP Alt primary hits; format: /(chr,pos,CIGAR,mapQ,NM;)+/ .TE .PP diff --git a/main.c b/main.c index 4761bc9..a03c49f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.2-r365-beta" +#define PACKAGE_VERSION "0.7.3-r366" #endif int bwa_fa2pac(int argc, char *argv[]); From 9346acde1b2c930e13948b7cb43076f2ebc63dfc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 15 Mar 2013 21:26:37 -0400 Subject: [PATCH 375/498] Release bwa-0.7.3a-r367 In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed in another corner case. 
--- NEWS | 10 ++++++++++ bwa.1 | 2 +- bwamem.c | 24 +++++++++++++++++------- bwamem.h | 3 ++- main.c | 2 +- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 788ae40..0cf0591 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,13 @@ +Release 0.7.3a (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed +in another corner case. + +(0.7.3a: 15 March 2013, r367) + + + Release 0.7.3 (15 March, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index a02aebe..1edbf12 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "15 March 2013" "bwa-0.7.3" "Bioinformatics tools" +.TH bwa 1 "15 March 2013" "bwa-0.7.3a" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/bwamem.c b/bwamem.c index da9f2be..82dbe45 100644 --- a/bwamem.c +++ b/bwamem.c @@ -542,7 +542,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; - a->score = -1; + a->score = a->truesc = -1; if (s->qbeg) { // left extension uint8_t *rs, *qs; @@ -560,10 +560,15 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query - if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; // local hits - else a->qb = 0, a->rb = s->rbeg - gtle; // reach the end + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) { // local extension + a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; + a->truesc = a->score; + } else { // to-end extension + a->qb = 0, a->rb = s->rbeg - gtle; + a->truesc = gscore; + } free(qs); free(rs); - } else a->score = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; + } else a->score = a->truesc = s->len * opt->a, a->qb = 0, a->rb = s->rbeg; 
if (s->qbeg + s->len != l_query) { // right extension int qle, tle, qe, re, gtle, gscore, sc0 = a->score; @@ -578,8 +583,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above - if (gscore <= 0 || gscore <= a->score - opt->pen_clip) a->qe = qe + qle, a->re = rmax[0] + re + tle; - else a->qe = l_query, a->re = rmax[0] + re + gtle; + if (gscore <= 0 || gscore <= a->score - opt->pen_clip) { // local extension + a->qe = qe + qle, a->re = rmax[0] + re + tle; + a->truesc += a->score - sc0; + } else { // to-end extension + a->qe = l_query, a->re = rmax[0] + re + gtle; + a->truesc += gscore - sc0; + } } else a->qe = l_query, a->re = s->rbeg + s->len; if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } @@ -839,7 +849,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x20000; bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); - w2 = infer_bw(qe - qb, re - rb, ar->score, opt->a, opt->q, opt->r); + w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); w2 = w2 < opt->w? 
w2 : opt->w; a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.NM = NM; diff --git a/bwamem.h b/bwamem.h index ae201e6..b7a4e79 100644 --- a/bwamem.h +++ b/bwamem.h @@ -43,7 +43,8 @@ typedef struct { typedef struct { int64_t rb, re; // [rb,re): reference sequence in the alignment int qb, qe; // [qb,qe): query sequence in the alignment - int score; // best SW score + int score; // best local SW score + int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int sub; // 2nd best SW score int csub; // SW score of a tandem hit int sub_n; // approximate number of suboptimal hits diff --git a/main.c b/main.c index a03c49f..d9e75e2 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r366" +#define PACKAGE_VERSION "0.7.3a-r367" #endif int bwa_fa2pac(int argc, char *argv[]); From 1e3cadbfc29714af36cb9fe9475ebaad9eb5a546 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 18 Mar 2013 20:49:32 -0400 Subject: [PATCH 376/498] r368: bugfix - wrong CIGAR when bridging 3 contigs In this case, bwa_fix_xref() will return insane coordinates. The old version did not check the return status and wrote a wrong CIGAR. This bug only happens with very short assembly contigs. 
--- bwa.c | 2 +- bwamem.c | 9 ++++++++- main.c | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index 08d96b8..e86a87e 100644 --- a/bwa.c +++ b/bwa.c @@ -144,7 +144,7 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, { int ib, ie, is_rev; int64_t fb, fe, mid = -1; - if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary + if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here *qb = *qe = *rb = *re = -1; return -1; // unable to fix } else { diff --git a/bwamem.c b/bwamem.c index 82dbe45..1af7f87 100644 --- a/bwamem.c +++ b/bwamem.c @@ -772,6 +772,10 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + if (q->rid < 0) { // unfixable cross-reference alignment + --aa.n; + continue; + } q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; @@ -848,7 +852,10 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x20000; - bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re); + if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { // unfixable cross-reference alignment + a.rid = -1; a.pos = -1; a.flag |= 0x4; + return a; + } w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); w2 = w2 < opt->w? 
w2 : opt->w; a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); diff --git a/main.c b/main.c index d9e75e2..0a79581 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3a-r367" +#define PACKAGE_VERSION "0.7.3-r368-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 8437cd4edde26eff4c48bc77e0ec15ca9b535dd8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 19 Mar 2013 01:04:57 -0400 Subject: [PATCH 377/498] r369: bugfix - segfault caused by the last change Sigh... Even the simplest change can lead to new bugs. --- bwamem.c | 51 ++++++++++++++++++++++++++------------------------- main.c | 2 +- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/bwamem.c b/bwamem.c index 1af7f87..d19848d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -759,37 +759,38 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { kstring_t str; + kvec_t(mem_aln_t) aa; + int k; + + kv_init(aa); str.l = str.m = 0; str.s = 0; - if (a->n > 0 && a->a[0].score >= opt->T) { - int k; - kvec_t(mem_aln_t) aa; - kv_init(aa); - for (k = 0; k < a->n; ++k) { - mem_alnreg_t *p = &a->a[k]; - mem_aln_t *q; - if (p->score < opt->T) continue; - if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; - q = kv_pushp(mem_aln_t, aa); - *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); - if (q->rid < 0) { // unfixable cross-reference alignment - --aa.n; - continue; - } - q->flag |= extra_flag | (p->secondary >= 0? 
0x100 : 0); // flag secondary - if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score - if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; - if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + for (k = 0; k < a->n; ++k) { + mem_alnreg_t *p = &a->a[k]; + mem_aln_t *q; + if (p->score < opt->T) continue; + if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; + q = kv_pushp(mem_aln_t, aa); + *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + if (q->rid < 0) { // unfixable cross-reference alignment + --aa.n; + continue; } - for (k = 0; k < aa.n; ++k) - mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m); - for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); - free(aa.a); - } else { + q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary + if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score + if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; + if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; + } + if (aa.n == 0) { // no alignments good enough; then write an unaligned record mem_aln_t t; t = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, 0); t.flag |= extra_flag; mem_aln2sam(bns, &str, s, 1, &t, 0, m); + } else { + for (k = 0; k < aa.n; ++k) + mem_aln2sam(bns, &str, s, aa.n, aa.a, k, m); + for (k = 0; k < aa.n; ++k) free(aa.a[k].cigar); + free(aa.a); } s->sam = str.s; } diff --git a/main.c b/main.c index 0a79581..54c3093 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r368-beta" +#define PACKAGE_VERSION "0.7.3-r369-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 678e30e57fb19f7ed8b3448feb6cc79b78b2a281 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Tue, 19 Mar 2013 14:34:30 +0100 Subject: [PATCH 378/498] Don't add a newline after the XS tag. 
--- bwase.c | 1 - 1 file changed, 1 deletion(-) diff --git a/bwase.c b/bwase.c index 1a71c06..0580277 100644 --- a/bwase.c +++ b/bwase.c @@ -611,7 +611,6 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in for (i = 1; i < p->n_aln; i++) { err_printf(";%f", p->aln[i].posterior_p); } - err_printf("\n"); } if (p->type != BWA_TYPE_NO_MATCH) { int i; From ce53264723c4c8bcfa3d05c7e97d1be385c118ca Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Tue, 19 Mar 2013 15:23:38 +0100 Subject: [PATCH 379/498] More consistent output of the XS tag. --- bwase.c | 24 +++++++++++++++--------- bwtaln.h | 1 + 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/bwase.c b/bwase.c index 0580277..245ef5d 100644 --- a/bwase.c +++ b/bwase.c @@ -100,7 +100,7 @@ float itf(int score) { return ((float)score) / 1000.; } -void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) +void bwa_pssm_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) { int i, cnt; double best_score, total_prob=0.0; @@ -162,6 +162,7 @@ void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int s for (l = q->k; l <= q->l; ++l) { s->multi[z].pos = l; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].posterior_p = q->posterior_p; s->multi[z++].mm = q->n_mm; } rest -= q->l - q->k + 1; @@ -172,6 +173,7 @@ void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int s while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].posterior_p = q->posterior_p; s->multi[z++].mm = q->n_mm; } rest = 0; @@ -196,11 +198,16 @@ void adjust_pssm_score(const bntseq_t *bns, bwa_seq_t *seq, float prior) { double P = prior; double p = seq->posterior_prob; double new_pp; + int i; new_pp = e / ((e / p) + L * ((1 - P) / P)); seq->posterior_prob = new_pp; - //fprintf(stderr, "best_score: %f e: %f L: %f P: %f p: %f new_pp: %f\n", 
seq->best_pssm_score, e, L, P, p, new_pp); + for (i = 0; i < seq->n_multi; i++) { + bwt_multi1_t *q = seq->multi + i; + p = seq->aln[i].posterior_p; + q->posterior_p = e / ((e / p) + L * ((1 - P) / P)); + } } void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) @@ -545,7 +552,7 @@ static int64_t pos_5(const bwa_seq_t *p) void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) { - int j,i; + int j; if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { int seqid, nn, am = 0, flag = p->extra_flag; char XT; @@ -606,12 +613,6 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in if (bwa_rg_id) err_printf("\tRG:Z:%s", bwa_rg_id); if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); - if (p->n_aln > 1) { - err_printf("\tXS:A:%f", p->aln[0].posterior_p); - for (i = 1; i < p->n_aln; i++) { - err_printf(";%f", p->aln[i].posterior_p); - } - } if (p->type != BWA_TYPE_NO_MATCH) { int i; // calculate XT tag @@ -645,6 +646,11 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in } else err_printf("%dM", p->len); err_printf(",%d;", q->gap + q->mm); } + err_printf("\tXS:A:"); + for (i = 0; i < p->n_multi; ++i) { + bwt_multi1_t *q = p->multi + i; + err_printf("%f;", q->posterior_p); + } } } putchar('\n'); diff --git a/bwtaln.h b/bwtaln.h index 1ba9346..6cb4707 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -59,6 +59,7 @@ typedef struct { uint32_t n_cigar:15, gap:8, mm:8, strand:1; bwtint_t pos; bwa_cigar_t *cigar; + float posterior_p; } bwt_multi1_t; typedef struct { From f585ac016c0532deee8ef9634943a6216790b4d1 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Wed, 20 Mar 2013 19:33:29 +0100 Subject: [PATCH 380/498] Fixed bug where the last posterior probability was set to 0 --- Makefile | 4 ++-- bwape.c | 4 ++-- bwase.c | 4 ++-- bwase.h | 2 +- 4 files changed, 7 insertions(+), 7 
deletions(-) diff --git a/Makefile b/Makefile index 5d9b018..5a2f792 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ CC= gcc CXX= g++ -CFLAGS= -g -Wall +#CFLAGS= -g -Wall #CFLAGS= -pg -Wall -O2 #CFLAGS= -O3 -L/scr/plastilin/pkerp/local/lib #CFLAGS = -pg #CFLAGS = -O3 -pg -#CFLAGS =-O3 -Wall +CFLAGS =-O3 -Wall CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ diff --git a/bwape.c b/bwape.c index 6fe2c36..5f32b6c 100644 --- a/bwape.c +++ b/bwape.c @@ -49,8 +49,8 @@ static kh_b128_t *g_hash; void adjust_pssm_score(const bntseq_t *bns, bwa_seq_t *seq, float prior); void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); -void bwa_pssm_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); +void bwa_pssm_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); +void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s); int bwa_approx_mapQ(const bwa_seq_t *p, int mm); int bwa_pssm_approx_mapQ(const bwa_seq_t *p, int mm); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); diff --git a/bwase.c b/bwase.c index 245ef5d..54b756b 100644 --- a/bwase.c +++ b/bwase.c @@ -205,12 +205,12 @@ void adjust_pssm_score(const bntseq_t *bns, bwa_seq_t *seq, float prior) { for (i = 0; i < seq->n_multi; i++) { bwt_multi1_t *q = seq->multi + i; - p = seq->aln[i].posterior_p; + p = q->posterior_p; q->posterior_p = e / ((e / p) + L * ((1 - P) / P)); } } -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) +void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s) { if (n_aln == 0 || !aln) { s->type = BWA_TYPE_NO_MATCH; diff --git a/bwase.h b/bwase.h index f8e9b0a..b499948 100644 --- a/bwase.h +++ b/bwase.h @@ -16,7 +16,7 @@ extern "C" { // Refine the approximate position 
of the sequence to an actual placement for the sequence. void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq, bntseq_t *ntbns); // Backfill certain alignment properties mainly centering around number of matches. - void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. int64_t pos_end(const bwa_seq_t *p); // From 1e118e0823823dce3e576e7f47200a5a7e3c5844 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 3 Apr 2013 23:57:19 -0400 Subject: [PATCH 381/498] r370: suppress "D" at the end of a cigar This is caused by seeds in tandem repeats, in which case, bwa-mem may not extend the true seed. The change in this commit is only a temporary cure. --- bwamem.c | 7 +++++++ main.c | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index d19848d..3da9775 100644 --- a/bwamem.c +++ b/bwamem.c @@ -863,6 +863,13 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); a.is_rev = is_rev; + if (a.n_cigar > 0) { + if ((a.cigar[0]&0xf) == 2) { + pos += a.cigar[0]>>4; + --a.n_cigar; + memmove(a.cigar, a.cigar + 1, a.n_cigar * 4); + } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar; + } if (qb != 0 || qe != l_query) { // add clipping to CIGAR int clip5, clip3; clip5 = is_rev? 
l_query - qe : qb; diff --git a/main.c b/main.c index 54c3093..16f6f27 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r369-beta" +#define PACKAGE_VERSION "0.7.3-r370-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From d7ca0885eb686128fe4c1b5e2cb1290faacac849 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 4 Apr 2013 00:43:43 -0400 Subject: [PATCH 382/498] r371: extend overlapping seeds to avoid misalignment in tandem repeats --- bwamem.c | 15 ++++++++++++++- main.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 3da9775..a221d28 100644 --- a/bwamem.c +++ b/bwamem.c @@ -537,7 +537,20 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int w = max_gap < opt->w? max_gap : opt->w; if (qd - rd < w && rd - qd < w) break; } - if (i < av->n) continue; + if (i < av->n) { // the seed is (almost) contained in an existing alignment + for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain + const mem_seed_t *t; + if (srt[i] == 0) continue; + t = &c->seeds[(uint32_t)srt[i]]; + if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping + if (s->qbeg <= t->qbeg && s->qbeg + s->len >= t->qbeg && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; + if (t->qbeg <= s->qbeg && t->qbeg + t->len >= s->qbeg && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; + } + if (i == c->n) { // no overlapping seeds; then skip extension + srt[k] = 0; // mark that seed extension has not been performed + continue; + } + } a = kv_pushp(mem_alnreg_t, *av); memset(a, 0, sizeof(mem_alnreg_t)); diff --git a/main.c b/main.c index 16f6f27..0d490bc 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r370-beta" +#define PACKAGE_VERSION "0.7.3-r371-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 
d64eaa851d97d34d0255d2e03e4ec61a195efe0c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 9 Apr 2013 15:17:04 -0400 Subject: [PATCH 383/498] fixed an issue caused by a Mac/Darwin bug On Mac/Darwin, it is not possible to read >2GB data with one fread(). --- bwt.c | 16 ++++++++++++++-- main.c | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bwt.c b/bwt.c index 4ee9ea8..8656b0e 100644 --- a/bwt.c +++ b/bwt.c @@ -372,6 +372,18 @@ void bwt_dump_sa(const char *fn, const bwt_t *bwt) fclose(fp); } +static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) +{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks + const int bufsize = 0x1000000; // 16M block + bwtint_t offset = 0; + while (size) { + int x = bufsize < size? bufsize : size; + if ((x = fread(a + offset, 1, x, fp)) == 0) break; + size -= x; offset += x; + } + return offset; +} + void bwt_restore_sa(const char *fn, bwt_t *bwt) { char skipped[256]; @@ -390,7 +402,7 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt) bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); bwt->sa[0] = -1; - fread(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); + fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); fclose(fp); } @@ -407,7 +419,7 @@ bwt_t *bwt_restore_bwt(const char *fn) fseek(fp, 0, SEEK_SET); fread(&bwt->primary, sizeof(bwtint_t), 1, fp); fread(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fread(bwt->bwt, 4, bwt->bwt_size, fp); + fread_fix(fp, bwt->bwt_size<<2, bwt->bwt); bwt->seq_len = bwt->L2[4]; fclose(fp); bwt_gen_cnt_table(bwt); diff --git a/main.c b/main.c index 0d490bc..1104838 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r371-beta" +#define PACKAGE_VERSION "0.7.3-r372-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 53bb846407b3847b90b4b1d946ab1c3a880cdb3b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 9 Apr 2013 
16:13:55 -0400 Subject: [PATCH 384/498] r373: optionally disable mate rescue --- bwamem.h | 1 + bwamem_pair.c | 25 +++++++++++++------------ fastmap.c | 6 ++++-- main.c | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/bwamem.h b/bwamem.h index b7a4e79..40f47f8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct __smem_i smem_i; #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 +#define MEM_F_NO_RESCUE 0x20 typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r diff --git a/bwamem_pair.c b/bwamem_pair.c index 23c99ef..0f6ff08 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -235,20 +235,21 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co int n = 0, i, j, z[2], o, subo, n_sub, extra_flag = 1; kstring_t str; - mem_alnreg_v b[2]; mem_aln_t h[2]; str.l = str.m = 0; str.s = 0; - // perform SW for the best alignment - kv_init(b[0]); kv_init(b[1]); - for (i = 0; i < 2; ++i) - for (j = 0; j < a[i].n; ++j) - if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) - kv_push(mem_alnreg_t, b[i], a[i].a[j]); - for (i = 0; i < 2; ++i) - for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) - n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); - free(b[0].a); free(b[1].a); + if (!(opt->flag & MEM_F_NO_RESCUE)) { // then perform SW for the best alignment + mem_alnreg_v b[2]; + kv_init(b[0]); kv_init(b[1]); + for (i = 0; i < 2; ++i) + for (j = 0; j < a[i].n; ++j) + if (a[i].a[j].score >= a[i].a[0].score - opt->pen_unpaired) + kv_push(mem_alnreg_t, b[i], a[i].a[j]); + for (i = 0; i < 2; ++i) + for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) + n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + free(b[0].a); free(b[1].a); + } mem_mark_primary_se(opt, a[0].n, a[0].a); mem_mark_primary_se(opt, a[1].n, a[1].a); if 
(opt->flag&MEM_F_NOPAIRING) goto no_pairing; @@ -305,7 +306,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, &a[i].a[0]); else h[i] = mem_reg2aln(opt, bns, pac, s[i].l_seq, s[i].seq, 0); } - if (h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. + if (!(opt->flag & MEM_F_NOPAIRING) && h[0].rid == h[1].rid && h[0].rid >= 0) { // if the top hits from the two ends constitute a proper pair, flag it. int64_t dist; int d; d = mem_infer_dir(bns->l_pac, a[0].a[0].rb, a[1].a[0].rb, &dist); diff --git a/fastmap.c b/fastmap.c index eda06bb..98963c0 100644 --- a/fastmap.c +++ b/fastmap.c @@ -26,7 +26,7 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { + while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -42,6 +42,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; + else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); @@ -63,7 +64,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -P skip pairing; perform mate SW only\n"); + fprintf(stderr, " -S skip mate 
rescue\n"); + fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); diff --git a/main.c b/main.c index 1104838..c9acc72 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r372-beta" +#define PACKAGE_VERSION "0.7.3-r373-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 3d8a8c1e373249b799006fb6e571982b4175daaa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 10 Apr 2013 01:09:37 -0400 Subject: [PATCH 385/498] r374: fix - clipping penalty not always working This only happens to gaps where mem underestimates the bandwidth without considering the clipping penalty. --- bwamem.c | 4 ++-- ksw.c | 4 ++-- ksw.h | 2 +- main.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index a221d28..6fc2e5a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -568,7 +568,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]); if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } @@ -591,7 +591,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[1] = opt->w << i; - a->score = ksw_extend(l_query - qe, query + qe, 
rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } diff --git a/ksw.c b/ksw.c index e331390..2daf809 100644 --- a/ksw.c +++ b/ksw.c @@ -359,7 +359,7 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile @@ -381,7 +381,7 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, k = m * m; for (i = 0, max = 0; i < k; ++i) // get the max score max = max > mat[i]? max : mat[i]; - max_gap = (int)((double)(qlen * max - gapo) / gape + 1.); + max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.); max_gap = max_gap > 1? max_gap : 1; w = w < max_gap? 
w : max_gap; // DP loop diff --git a/ksw.h b/ksw.h index 2dd6499..97559fd 100644 --- a/ksw.h +++ b/ksw.h @@ -102,7 +102,7 @@ extern "C" { * * @return best semi-local alignment score */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } diff --git a/main.c b/main.c index c9acc72..b47c897 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r373-beta" +#define PACKAGE_VERSION "0.7.3-r374-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 47520134e784a9c3d6661a5de7e74e21a54fdbfb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 10 Apr 2013 11:04:32 -0400 Subject: [PATCH 386/498] r375: fixed compiling errors by the last change --- bwase.c | 4 ++-- bwtsw2_aux.c | 4 ++-- main.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bwase.c b/bwase.c index fd06c7d..0d49739 100644 --- a/bwase.c +++ b/bwase.c @@ -176,7 +176,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences seq_reverse(rlen, rseq, 0); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; rb = re - tle; rlen = tle; seq_reverse(len, seq, 0); @@ -192,7 +192,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int 
l rb = *_pos; re = rb + len + SW_BW; if (re > l_pac) re = l_pac; rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); + ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; re = rb + tle; rlen = tle; ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index a84d7e0..d26d3fa 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -125,7 +125,7 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, p->G, &qle, &tle, 0, 0, 0); + score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); if (score > p->G) { // extensible p->G = score; p->k -= tle; @@ -153,7 +153,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, -1, 1, &qle, &tle, 0, 0, 0) - 1; + score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; // if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); if (score >= p->G) { p->G = score; diff --git a/main.c b/main.c index b47c897..8a0542c 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r374-beta" +#define PACKAGE_VERSION "0.7.3-r375-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 
499cf4c00d6bde76e28c48500a058351f8267038 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 10 Apr 2013 12:18:56 -0400 Subject: [PATCH 387/498] r376: reduce wasteful seed extension mainly for contig alignment --- bwamem.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6fc2e5a..182b57a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -543,8 +543,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (srt[i] == 0) continue; t = &c->seeds[(uint32_t)srt[i]]; if (t->len < s->len * .95) continue; // only check overlapping if t is long enough; TODO: more efficient by early stopping - if (s->qbeg <= t->qbeg && s->qbeg + s->len >= t->qbeg && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; - if (t->qbeg <= s->qbeg && t->qbeg + t->len >= s->qbeg && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; + if (s->qbeg <= t->qbeg && s->qbeg + s->len - t->qbeg >= s->len>>2 && t->qbeg - s->qbeg != t->rbeg - s->rbeg) break; + if (t->qbeg <= s->qbeg && t->qbeg + t->len - s->qbeg >= s->len>>2 && s->qbeg - t->qbeg != s->rbeg - t->rbeg) break; } if (i == c->n) { // no overlapping seeds; then skip extension srt[k] = 0; // mark that seed extension has not been performed diff --git a/main.c b/main.c index 8a0542c..8c1343c 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r375-beta" +#define PACKAGE_VERSION "0.7.3-r376-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 2087dc162f5ff5eb6c8da5d4bead13199013864f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Apr 2013 16:50:20 -0400 Subject: [PATCH 388/498] r377: increased unpaired penalty from 9 to 17 This leads to more aggressive pairing - more properly paired reads. I have found a few cases where, for example, read1 is unambiguously mapped to chr20 while its 100bp mate has a perfect match to another chr but has 3 mismatches and 1 deletion when it is paired with read1 on chr20. 
With longer reads, it seems that the chr20 hit is correct, although it is not obvious how this happened in evolution. --- bwa.1 | 4 ++-- bwamem.c | 2 +- main.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bwa.1 b/bwa.1 index 1edbf12..9d45a3d 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "15 March 2013" "bwa-0.7.3a" "Bioinformatics tools" +.TH bwa 1 "15 March 2013" "bwa-0.7.4" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -220,7 +220,7 @@ deducted. [5] Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as .RI scoreRead1+scoreRead2- INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these -two scores to determine whether we should force pairing. [9] +two scores to determine whether we should force pairing. [17] .TP .B -p Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details. diff --git a/bwamem.c b/bwamem.c index 182b57a..921b512 100644 --- a/bwamem.c +++ b/bwamem.c @@ -46,7 +46,7 @@ mem_opt_t *mem_opt_init() o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->T = 30; o->zdrop = 100; - o->pen_unpaired = 9; + o->pen_unpaired = 17; o->pen_clip = 5; o->min_seed_len = 19; o->split_width = 10; diff --git a/main.c b/main.c index 8c1343c..ea92d81 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r376-beta" +#define PACKAGE_VERSION "0.7.3-r377-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From be11e27e126fd680835eb5287c5d66c9f0d4a32d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 12:00:37 -0400 Subject: [PATCH 389/498] r378: bugfix - wrong CIGAR This is actually caused by a bug in SSE2-SW, where the query begin may be smaller than the true one if there is an exact tandem repeat. 
--- bwape.c | 6 +++--- ksw.c | 4 +++- main.c | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bwape.c b/bwape.c index 9fd12b1..92161a5 100644 --- a/bwape.c +++ b/bwape.c @@ -404,7 +404,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u bwa_cigar_t *cigar = 0; ubyte_t *ref_seq; bwtint_t k, x, y, l; - int xtra; + int xtra, gscore; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); @@ -422,12 +422,12 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u // do alignment xtra = KSW_XSUBO | KSW_XSTART | (len < 250? KSW_XBYTE : 0); r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); - ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); + gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); - if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score) { // poor hit or tandem hits + if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment free(cigar); free(ref_seq); *n_cigar = 0; return 0; } diff --git a/ksw.c b/ksw.c index 2daf809..a786a2b 100644 --- a/ksw.c +++ b/ksw.c @@ -201,10 +201,11 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, r.score = gmax + q->shift < 255? 
gmax : 255; r.te = te; if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score - int max = -1, low, high, qlen = slen * 16; + int max = -1, tmp, low, high, qlen = slen * 16; uint8_t *t = (uint8_t*)Hmax; for (i = 0; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; + else ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; //printf("%d,%d\n", max, gmax); if (b) { i = (r.score + q->max - 1) / q->max; @@ -306,6 +307,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, uint16_t *t = (uint16_t*)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; + else ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; if (b) { i = (r.score + q->max - 1) / q->max; low = te - i; high = te + i; diff --git a/main.c b/main.c index ea92d81..75e4e6b 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r377-beta" +#define PACKAGE_VERSION "0.7.3-r378-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From f0c94d80d1bc1639549aa272c1d1950668f19c48 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 12:04:00 -0400 Subject: [PATCH 390/498] r379: fixed compiling error --- ksw.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ksw.c b/ksw.c index a786a2b..0a9b40e 100644 --- a/ksw.c +++ b/ksw.c @@ -205,7 +205,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, uint8_t *t = (uint8_t*)Hmax; for (i = 0; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; - else ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; + else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; //printf("%d,%d\n", max, gmax); if (b) { i = (r.score + q->max - 1) / q->max; @@ -307,7 +307,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const 
uint8_t *target, int _gapo, int _gape, uint16_t *t = (uint16_t*)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; - else ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; + else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; if (b) { i = (r.score + q->max - 1) / q->max; low = te - i; high = te + i; diff --git a/main.c b/main.c index 75e4e6b..1580451 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r378-beta" +#define PACKAGE_VERSION "0.7.3-r379-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From db7a98636f9907107c41a5c3e2f11a62a264758d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 12:04:44 -0400 Subject: [PATCH 391/498] r380: er... another compiling error --- ksw.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ksw.c b/ksw.c index 0a9b40e..7d9dfb3 100644 --- a/ksw.c +++ b/ksw.c @@ -303,7 +303,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, } r.score = gmax; r.te = te; { - int max = -1, low, high, qlen = slen * 8; + int max = -1, tmp, low, high, qlen = slen * 8; uint16_t *t = (uint16_t*)Hmax; for (i = 0, r.qe = -1; i < qlen; ++i, ++t) if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; diff --git a/main.c b/main.c index 1580451..4fa2522 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r379-beta" +#define PACKAGE_VERSION "0.7.3-r380-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 3f8caef33c92d1dff61a547ad7bf0cd72ea776a2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 17:44:35 -0400 Subject: [PATCH 392/498] r381: fixed a bug when upper bound < max read len --- bwape.c | 5 +++++ main.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bwape.c b/bwape.c index 92161a5..6752a18 100644 
--- a/bwape.c +++ b/bwape.c @@ -106,6 +106,11 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (ii->low > ii->high) { + fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n"); + free(isizes); + return -1; + } for (i = 0, x = n = 0; i < tot; ++i) if (isizes[i] >= ii->low && isizes[i] <= ii->high) ++n, x += isizes[i]; diff --git a/main.c b/main.c index 4fa2522..85e41b2 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r380-beta" +#define PACKAGE_VERSION "0.7.3-r381-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From f6ae0d4d0f2f16392d254f0621008299f53eac0f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 19 Apr 2013 17:52:06 -0400 Subject: [PATCH 393/498] r382: similar treatment in bwa-sw (see r381) --- bwtsw2_pair.c | 8 +++++++- main.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index cad96e9..cdd822f 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -46,7 +46,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) p75 = isize[(int)(.75 * k + .499)]; ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); if (k < 8) { - ksprintf(msg, "[%s] fail to infer the insert size distribution.\n", __func__); + ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__); free(isize); r.failed = 1; return r; @@ -55,6 +55,12 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) r.low = tmp > max_len? 
tmp : max_len; if (r.low < 1) r.low = 1; r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); + if (r.low > r.high) { + ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__); + free(isize); + r.failed = 1; + return r; + } ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); for (i = x = 0, r.avg = 0; i < k; ++i) diff --git a/main.c b/main.c index 85e41b2..aee4772 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r381-beta" +#define PACKAGE_VERSION "0.7.3-r382-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 78ed00021f07e6b5e1d531795ecf0289feac13ff Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 23 Apr 2013 11:25:46 -0400 Subject: [PATCH 394/498] r384: updated NEWS --- NEWS | 33 +++++++++++++++++++++++++++++++++ main.c | 4 ++++ 2 files changed, 37 insertions(+) diff --git a/NEWS b/NEWS index 0cf0591..42e7c79 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,36 @@ +Release 0.7.4 (23 April, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is a bugfix release. Most of bugs are considered to be minor which only + very rarely. + ccur + + * Bugfix: wrong CIGAR when a query sequence bridges three or more target + sequences. This only happens when aligning reads to short assembly contigs. + + * Bugfix: leading "D" operator in CIGAR. + + * Extend more seeds for better alignment around tandem repeats. This is also + a cause of the leading "D" operator in CIGAR. + + * Bugfix: SSE2-SSW may occasionally find incorrect query starting position + around tandem repeat. This will lead to a suboptimal CIGAR in BWA-MEM and + a wrong CIGAR in BWA. + + * Bugfix: clipping penalty does not work as is intended when there is a gap + towards the end of a read. 
+ + * Fixed an issue caused by a bug in the libc from Mac/Darwin. In Darwin, + fread() is unable to read a data block longer than 2GB due to an integer + overflow bug in its implementation. + +Since version 0.7.4, BWA-MEM is considered to reach similar stability to +BWA-backtrack for short-read mapping. + +(0.7.4: 23 April, r385) + + + Release 0.7.3a (15 March, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/main.c b/main.c index aee4772..f7616b7 100644 --- a/main.c +++ b/main.c @@ -46,6 +46,10 @@ static int usage() fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); fprintf(stderr, "\n"); + fprintf(stderr, "Note: To use BWA, you need to first index the genome with `bwa index'. There are\n"); + fprintf(stderr, " three alignment algorithms in BWA: `mem', `bwasw' and `aln/samse/sampe'. If\n"); + fprintf(stderr, " you are not sure which to use, try `bwa mem' first. Please `man ./bwa.1' for\n"); + fprintf(stderr, " for the manual.\n\n"); return 1; } From 2f6897c72bf1c23504cf23c07d98a8ff90251d61 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 23 Apr 2013 11:27:30 -0400 Subject: [PATCH 395/498] r384: don't compile bwamem-lite by default --- Makefile | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index b660557..abbe42f 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ bwtsw2_chain.o fastmap.o bwtsw2_pair.o -PROG= bwa bwamem-lite +PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread SUBDIRS= . 
diff --git a/main.c b/main.c index f7616b7..2c1678e 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r382-beta" +#define PACKAGE_VERSION "0.7.3-r384-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From c14aaad1ce72f5784bfe04df757a6b12fe07b7ea Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 23 Apr 2013 11:40:56 -0400 Subject: [PATCH 396/498] Released bwa-0.7.4-r385 --- bwa.1 | 37 ++++++++++++++++++++----------------- main.c | 2 +- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/bwa.1 b/bwa.1 index 9d45a3d..d25ba4a 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "15 March 2013" "bwa-0.7.4" "Bioinformatics tools" +.TH bwa 1 "23 April 2013" "bwa-0.7.4" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -44,7 +44,12 @@ for the BWA-MEM algorithm. .SH COMMANDS AND OPTIONS .TP .B index -bwa index [-p prefix] [-a algoType] +.B bwa index +.RB [ -p +.IR prefix ] +.RB [ -a +.IR algoType ] +.I db.fa Index database sequences in the FASTA format. @@ -55,20 +60,16 @@ Index database sequences in the FASTA format. Prefix of the output database [same as db filename] .TP .BI -a \ STR -Algorithm for constructing BWT index. Available options are: -.RS -.TP +Algorithm for constructing BWT index. BWA implements two algorithms for BWT +construction: .B is -IS linear-time algorithm for constructing suffix array. It requires -5.37N memory where N is the size of the database. IS is moderately fast, -but does not work with database larger than 2GB. IS is the default -algorithm due to its simplicity. The current codes for IS algorithm are -reimplemented by Yuta Mori. -.TP -.B bwtsw -Algorithm implemented in BWT-SW. This method works with the whole human -genome. -.RE +and +.BR bwtsw . +The first algorithm is a little faster for small database but requires large +RAM and does not work for databases with total length longer than 2GB. 
The +second algorithm is adapted from the BWT-SW source code. It in theory works +with database with trillions of bases. When this option is not specified, the +appropriate algorithm will be chosen automatically. .RE .TP @@ -220,10 +221,12 @@ deducted. [5] Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as .RI scoreRead1+scoreRead2- INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these -two scores to determine whether we should force pairing. [17] +two scores to determine whether we should force pairing. A larger value leads to +more aggressive read pair. [17] .TP .B -p -Assume the first input query file is interleaved paired-end FASTA/Q. See the command description for details. +Assume the first input query file is interleaved paired-end FASTA/Q. See the +command description for details. .TP .BI -R \ STR Complete read group header line. '\\t' can be used in diff --git a/main.c b/main.c index 2c1678e..816fa17 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.3-r384-beta" +#define PACKAGE_VERSION "0.7.4-r385" #endif int bwa_fa2pac(int argc, char *argv[]); From 6b59bc550055daf5fba544af9a2e2a319fa0aa78 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 23 Apr 2013 12:30:22 -0400 Subject: [PATCH 397/498] r386: typo in NEWS --- NEWS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/NEWS b/NEWS index 42e7c79..86f4114 100644 --- a/NEWS +++ b/NEWS @@ -2,8 +2,7 @@ Release 0.7.4 (23 April, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is a bugfix release. Most of bugs are considered to be minor which only - very rarely. - ccur +occur very rarely. * Bugfix: wrong CIGAR when a query sequence bridges three or more target sequences. This only happens when aligning reads to short assembly contigs. 
From 8896cb942e320ccda75dd77d8b9aa5e009e80b21 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 24 Apr 2013 16:00:02 -0400 Subject: [PATCH 398/498] r386: bugfix - samse/pe segfault This happens when a read is aligned across the forward-reverse boundary. --- bwase.c | 9 +++++++++ main.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/bwase.c b/bwase.c index 0d49739..b2ea700 100644 --- a/bwase.c +++ b/bwase.c @@ -181,6 +181,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l rb = re - tle; rlen = tle; seq_reverse(len, seq, 0); seq_reverse(rlen, rseq, 0); + if (rlen == 0) goto refine_gapped_err; ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); if (qle < len) { // write soft clip cigar = realloc(cigar, (*n_cigar + 1) * 4); @@ -195,6 +196,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); if (gscore > 0) tle = gtle, qle = len; re = rb + tle; rlen = tle; + if (rlen == 0) goto refine_gapped_err; ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension if (qle < len) { cigar = realloc(cigar, (*n_cigar + 1) * 4); @@ -209,6 +211,11 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); free(rseq); return cigar; + +refine_gapped_err: + free(rseq); + *n_cigar = 0; + return 0; } char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, @@ -320,6 +327,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t } if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? 
s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand); + if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag str = (kstring_t*)calloc(1, sizeof(kstring_t)); @@ -457,6 +465,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in for (i = 0; i < p->n_multi; ++i) { bwt_multi1_t *q = p->multi + i; int k; + if (q->cigar == 0) continue; j = pos_end_multi(q, p->len) - q->pos; nn = bns_cnt_ambi(bns, q->pos, j, &seqid); err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', diff --git a/main.c b/main.c index 816fa17..88ab94b 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r385" +#define PACKAGE_VERSION "0.7.4-r386-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 19cb7cd7edadc5f983750a5c0e0c9169e2ef5312 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 26 Apr 2013 12:31:18 -0400 Subject: [PATCH 399/498] r388: cleanup mem_process_seqs() interface Print output outside the function and allow to feed insert size distribution. 
--- bwamem.c | 23 ++++++++++++----------- bwamem.h | 9 ++++++--- fastmap.c | 6 +++++- main.c | 2 +- 4 files changed, 24 insertions(+), 16 deletions(-) diff --git a/bwamem.c b/bwamem.c index 921b512..c3a08cd 100644 --- a/bwamem.c +++ b/bwamem.c @@ -951,7 +951,7 @@ static void *worker2(void *data) return 0; } -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs) +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0) { int i; worker_t *w; @@ -967,29 +967,30 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn p->seqs = seqs; p->regs = regs; p->pes = &pes[0]; } + #ifdef HAVE_PTHREAD if (opt->n_threads == 1) { +#endif worker1(w); - if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) { // paired-end mode + if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 + else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data + } worker2(w); +#ifdef HAVE_PTHREAD } else { pthread_t *tid; tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); + if (opt->flag&MEM_F_PE) { + if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); + else mem_pestat(opt, bns->l_pac, n, regs, pes); + } for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); } -#else - worker1(w); - if (opt->flag&MEM_F_PE) mem_pestat(opt, bns->l_pac, n, regs, pes); - worker2(w); #endif - for (i = 0; i < n; ++i) { - fputs(seqs[i].sam, stdout); - free(seqs[i].name); 
free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); - } free(regs); free(w); } diff --git a/bwamem.h b/bwamem.h index 40f47f8..76be8e3 100644 --- a/bwamem.h +++ b/bwamem.h @@ -57,8 +57,9 @@ typedef struct { typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; typedef struct { - int low, high, failed; - double avg, std; + int low, high; // lower and upper bounds within which a read pair is considered to be properly paired + int failed; // non-zero if the orientation is not supported by sufficient data + double avg, std; // mean and stddev of the insert size distribution } mem_pestat_t; typedef struct { // This struct is only used for the convenience of API. @@ -103,8 +104,10 @@ extern "C" { * @param pac 2-bit encoded reference * @param n number of query sequences * @param seqs query sequences; $seqs[i].seq/sam to be modified after the call + * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, + * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. 
*/ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs); + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0); /** * Find the aligned regions for one query sequence diff --git a/fastmap.c b/fastmap.c index 98963c0..3e8e3b4 100644 --- a/fastmap.c +++ b/fastmap.c @@ -120,7 +120,11 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < n; ++i) size += seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); - mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs, 0); + for (i = 0; i < n; ++i) { + fputs(seqs[i].sam, stdout); + free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); + } free(seqs); } diff --git a/main.c b/main.c index 88ab94b..cb4289c 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r386-beta" +#define PACKAGE_VERSION "0.7.4-r388-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 1a2bd2cf918648e3150e50c38a7547479813a0b7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 27 Apr 2013 10:08:01 -0400 Subject: [PATCH 400/498] r389: return non-zero upon errors --- fastmap.c | 10 +++++++++- kopen.c | 4 ++-- main.c | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fastmap.c b/fastmap.c index 3e8e3b4..69dbcbe 100644 --- a/fastmap.c +++ b/fastmap.c @@ -90,9 +90,12 @@ int main_mem(int argc, char *argv[]) bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak - bwa_print_sam_hdr(idx->bns, rg_line); ko = kopen(argv[optind + 1], &fd); + if (ko == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, 
argv[optind + 1]); + return 1; + } fp = gzdopen(fd, "r"); ks = kseq_init(fp); if (optind + 2 < argc) { @@ -101,11 +104,16 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); } else { ko2 = kopen(argv[optind + 2], &fd2); + if (ko2 == 0) { + if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]); + return 1; + } fp2 = gzdopen(fd2, "r"); ks2 = kseq_init(fp2); opt->flag |= MEM_F_PE; } } + bwa_print_sam_hdr(idx->bns, rg_line); while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; if ((opt->flag & MEM_F_PE) && (n&1) == 1) { diff --git a/kopen.c b/kopen.c index 8887932..c0bc975 100644 --- a/kopen.c +++ b/kopen.c @@ -292,14 +292,14 @@ void *kopen(const char *fn, int *_fd) #else *_fd = open(fn, O_RDONLY); #endif - if (*_fd) { + if (*_fd >= 0) { aux = calloc(1, sizeof(koaux_t)); aux->type = KO_FILE; aux->fd = *_fd; } } } - *_fd = aux->fd; + if (aux) *_fd = aux->fd; return aux; } diff --git a/main.c b/main.c index cb4289c..d40d996 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r388-beta" +#define PACKAGE_VERSION "0.7.4-r389-beta" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -92,5 +92,5 @@ int main(int argc, char *argv[]) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); } - return 0; + return ret; } From 0aa7e0a4022fea82a2f038a6d44aa3721e3c60f3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 29 Apr 2013 13:58:28 +0100 Subject: [PATCH 401/498] Ensure exit status of 1 if given invalid options or index files are not found. Added missing default cases in option scanning. Ensure exit value is 1 if bwa_idx_load or bwa_idx_infer_prefix fail. These changes extend the previous one, which only fixed the mem aligner. 
--- bwape.c | 2 +- bwase.c | 2 +- bwtaln.c | 2 +- bwtsw2_main.c | 3 ++- fastmap.c | 4 +++- pemerge.c | 1 + 6 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bwape.c b/bwape.c index b219527..030af0d 100644 --- a/bwape.c +++ b/bwape.c @@ -765,7 +765,7 @@ int bwa_sai2sam_pe(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - return 0; + return 1; } bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); free(prefix); free(popt); diff --git a/bwase.c b/bwase.c index 41ab175..bf7e9cb 100644 --- a/bwase.c +++ b/bwase.c @@ -611,7 +611,7 @@ int bwa_sai2sam_se(int argc, char *argv[]) } if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); - return 0; + return 1; } bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); free(prefix); diff --git a/bwtaln.c b/bwtaln.c index b906e1e..f6b26a8 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -306,7 +306,7 @@ int bwa_aln(int argc, char *argv[]) if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { fprintf(stderr, "[%s] fail to locate the index\n", __func__); free(opt); - return 0; + return 1; } bwa_aln_core(prefix, argv[optind+1], opt); free(opt); free(prefix); diff --git a/bwtsw2_main.c b/bwtsw2_main.c index ab126f2..40a9e0a 100644 --- a/bwtsw2_main.c +++ b/bwtsw2_main.c @@ -37,6 +37,7 @@ int bwa_bwtsw2(int argc, char *argv[]) case 'S': opt->skip_sw = 1; break; case 'C': opt->cpy_cmt = 1; break; case 'G': opt->max_chain_gap = atoi(optarg); break; + default: return 1; } } opt->qr = opt->q + opt->r; @@ -79,7 +80,7 @@ int bwa_bwtsw2(int argc, char *argv[]) opt->t *= opt->a; opt->coef *= opt->a; - if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 0; + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? 
argv[optind+2] : 0); bwa_idx_destroy(idx); free(opt); diff --git a/fastmap.c b/fastmap.c index d0651f9..592df02 100644 --- a/fastmap.c +++ b/fastmap.c @@ -52,6 +52,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); + else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; if (optind + 1 >= argc) { @@ -164,6 +165,7 @@ int main_fastmap(int argc, char *argv[]) case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; + default: return 1; } } if (optind + 1 >= argc) { @@ -173,7 +175,7 @@ int main_fastmap(int argc, char *argv[]) fp = xzopen(argv[optind + 1], "r"); seq = kseq_init(fp); - idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; itr = smem_itr_init(idx->bwt); while (kseq_read(seq) >= 0) { err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); diff --git a/pemerge.c b/pemerge.c index dbcfaab..b944819 100644 --- a/pemerge.c +++ b/pemerge.c @@ -226,6 +226,7 @@ int main_pemerge(int argc, char *argv[]) else if (c == 'Q') opt->q_thres = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg); else if (c == 'T') min_ovlp = atoi(optarg); + else return 1; } if (flag == 0) flag = 3; opt->flag = flag; From 96e445d9e43e7d4a4cb32a1681098a6e7a170561 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 2 May 2013 15:12:01 +0100 Subject: [PATCH 402/498] Reduce dependency on utils.h - new malloc wrapping scheme. Remove xmalloc, xcalloc, xrealloc and xstrdup from utils.h and revert calls to the normal malloc, calloc, realloc, strdup. Add new files malloc_wrap.[ch] with the wrapper functions. malloc_wrap.h #defines malloc etc. to the wrapper, but only if USE_MALLOC_WRAPPERS has been defined. Put #include "malloc_wrap.h" in any file that uses *alloc or strdup. 
This is also in a #ifdef USE_MALLOC_WRAPPERS ... #endif block to make using the wrappers optional. Add -DUSE_MALLOC_WRAPPERS into the makefile so they should normally get added. This is an improvement on the previous method as we now don't need to worry about stray function calls that were not changed to the wrapped version and the code will still work even if the wrapping is disabled. Other possible methods of doing this are using malloc_hook (glibc-specific), adding -include malloc_wrap.h to the gcc command-line (somewhat gcc-specific) or making our own malloc function and using dlopen (scary). This way is probably the most portable. --- Makefile | 70 +++++++++++++++++++++++++++----------------------- bamlite.c | 17 +++++++----- bamlite.h | 6 ++++- bntseq.c | 38 +++++++++++++++------------ bwa.c | 26 +++++++++++-------- bwamem.c | 42 ++++++++++++++++-------------- bwamem_pair.c | 9 +++++-- bwape.c | 24 +++++++++-------- bwase.c | 26 +++++++++++-------- bwaseqio.c | 28 +++++++++++--------- bwt.c | 12 ++++++--- bwt_gen.c | 20 +++++++++------ bwt_lite.c | 15 ++++++----- bwtaln.c | 16 +++++++----- bwtgap.c | 15 ++++++----- bwtindex.c | 25 ++++++++++-------- bwtsw2_aux.c | 59 +++++++++++++++++++++++------------------- bwtsw2_chain.c | 11 +++++--- bwtsw2_core.c | 33 +++++++++++++----------- bwtsw2_pair.c | 11 +++++--- example.c | 4 +++ is.c | 6 ++++- kbtree.h | 21 ++++++++------- khash.h | 11 +++++--- kopen.c | 42 +++++++++++++++--------------- kseq.h | 21 ++++++++------- ksort.h | 9 ++++--- kstring.c | 9 ++++--- kstring.h | 17 +++++++----- ksw.c | 25 ++++++++++-------- ksw.h | 4 +++ kvec.h | 12 ++++++--- malloc_wrap.c | 57 ++++++++++++++++++++++++++++++++++++++++ malloc_wrap.h | 47 +++++++++++++++++++++++++++++++++ pemerge.c | 18 ++++++++----- utils.c | 41 ----------------------------- utils.h | 11 -------- 37 files changed, 516 insertions(+), 342 deletions(-) create mode 100644 malloc_wrap.c create mode 100644 malloc_wrap.h diff --git a/Makefile b/Makefile index 
5bbbc89..7d78889 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,13 @@ CC= gcc CFLAGS= -g -Wall -O2 +WRAP_MALLOC= -DUSE_MALLOC_WRAPPERS AR= ar -DFLAGS= -DHAVE_PTHREAD +DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o fastmap.o bwtsw2_pair.o + bwtsw2_chain.o fastmap.o bwtsw2_pair.o malloc_wrap.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread @@ -32,39 +33,44 @@ clean: rm -f gmon.out *.o a.out $(PROG) *~ *.a depend: - ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) -- *.c ) + ( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c ) # DO NOT DELETE THIS LINE -- make depend depends on it. QSufSort.o: QSufSort.h -bamlite.o: utils.h bamlite.h -bntseq.o: bntseq.h utils.h kseq.h -bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kseq.h -bwamem.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h -bwamem.o: ksort.h kbtree.h -bwamem_pair.o: kstring.h utils.h bwamem.h bwt.h bntseq.h bwa.h kvec.h ksw.h -bwape.o: bwtaln.h bwt.h kvec.h bntseq.h utils.h bwase.h bwa.h ksw.h khash.h -bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h bwa.h ksw.h -bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h kseq.h -bwt.o: utils.h bwt.h kvec.h -bwt_gen.o: QSufSort.h utils.h -bwt_lite.o: bwt_lite.h utils.h -bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h -bwtgap.o: bwtgap.h bwt.h bwtaln.h utils.h -bwtindex.o: bntseq.h bwt.h utils.h -bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h bwa.h -bwtsw2_aux.o: ksw.h kseq.h ksort.h -bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h utils.h ksort.h -bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h utils.h khash.h -bwtsw2_core.o: ksort.h +bamlite.o: bamlite.h utils.h malloc_wrap.h +bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h +bwa.o: 
bntseq.h bwa.h bwt.h ksw.h malloc_wrap.h utils.h kseq.h +bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h +bwamem.o: ksort.h utils.h kbtree.h +bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h +bwamem_pair.o: utils.h ksw.h +bwape.o: bwtaln.h bwt.h kvec.h malloc_wrap.h bntseq.h utils.h bwase.h bwa.h +bwape.o: ksw.h khash.h +bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h +bwase.o: bwa.h ksw.h +bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h +bwt.o: utils.h bwt.h kvec.h malloc_wrap.h +bwt_gen.o: QSufSort.h utils.h malloc_wrap.h +bwt_lite.o: bwt_lite.h malloc_wrap.h +bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h +bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h +bwtindex.o: bntseq.h bwt.h utils.h malloc_wrap.h +bwtsw2_aux.o: bntseq.h bwt_lite.h utils.h bwtsw2.h bwt.h kstring.h +bwtsw2_aux.o: malloc_wrap.h bwa.h ksw.h kseq.h ksort.h +bwtsw2_chain.o: bwtsw2.h bntseq.h bwt_lite.h bwt.h malloc_wrap.h ksort.h +bwtsw2_core.o: bwt_lite.h bwtsw2.h bntseq.h bwt.h kvec.h malloc_wrap.h +bwtsw2_core.o: khash.h ksort.h bwtsw2_main.o: bwt.h bwtsw2.h bntseq.h bwt_lite.h utils.h bwa.h -bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h ksw.h -example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h utils.h -fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h utils.h kseq.h -is.o: utils.h -kopen.o: utils.h -kstring.o: kstring.h utils.h -ksw.o: ksw.h utils.h +bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h +bwtsw2_pair.o: malloc_wrap.h ksw.h +example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h +fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h +is.o: utils.h malloc_wrap.h +kopen.o: malloc_wrap.h +kstring.o: kstring.h malloc_wrap.h +ksw.o: ksw.h malloc_wrap.h main.o: utils.h -pemerge.o: ksw.h kseq.h utils.h kstring.h bwa.h bntseq.h bwt.h -utils.o: utils.h ksort.h kseq.h +malloc_wrap.o: malloc_wrap.h 
+pemerge.o: ksw.h malloc_wrap.h kseq.h kstring.h bwa.h bntseq.h bwt.h utils.h +utils.o: utils.h ksort.h malloc_wrap.h kseq.h diff --git a/bamlite.c b/bamlite.c index ec365d1..851cb6f 100644 --- a/bamlite.c +++ b/bamlite.c @@ -2,9 +2,12 @@ #include #include #include -#include "utils.h" #include "bamlite.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + /********************* * from bam_endian.c * *********************/ @@ -54,7 +57,7 @@ int bam_is_be; bam_header_t *bam_header_init() { bam_is_be = bam_is_big_endian(); - return (bam_header_t*)xcalloc(1, sizeof(bam_header_t)); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); } void bam_header_destroy(bam_header_t *header) @@ -87,17 +90,17 @@ bam_header_t *bam_header_read(bamFile fp) // read plain text and the number of reference sequences if (bam_read(fp, &header->l_text, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->l_text); - header->text = (char*)xcalloc(header->l_text + 1, 1); + header->text = (char*)calloc(header->l_text + 1, 1); if (bam_read(fp, header->text, header->l_text) != header->l_text) goto fail; if (bam_read(fp, &header->n_targets, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&header->n_targets); // read reference sequence names and lengths - header->target_name = (char**)xcalloc(header->n_targets, sizeof(char*)); - header->target_len = (uint32_t*)xcalloc(header->n_targets, 4); + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); for (i = 0; i != header->n_targets; ++i) { if (bam_read(fp, &name_len, 4) != 4) goto fail; if (bam_is_be) bam_swap_endian_4p(&name_len); - header->target_name[i] = (char*)xcalloc(name_len, 1); + header->target_name[i] = (char*)calloc(name_len, 1); if (bam_read(fp, header->target_name[i], name_len) != name_len) { goto fail; } @@ -152,7 +155,7 @@ int bam_read1(bamFile fp, bam1_t *b) if (b->m_data < b->data_len) { b->m_data = b->data_len; 
kroundup32(b->m_data); - b->data = (uint8_t*)xrealloc(b->data, b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); } if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; diff --git a/bamlite.h b/bamlite.h index 0c080fd..640e863 100644 --- a/bamlite.h +++ b/bamlite.h @@ -5,6 +5,10 @@ #include #include "utils.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef gzFile bamFile; #define bam_open(fn, mode) xzopen(fn, mode) #define bam_dopen(fd, mode) gzdopen(fd, mode) @@ -72,7 +76,7 @@ typedef struct { #define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) #define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) -#define bam_init1() ((bam1_t*)xcalloc(1, sizeof(bam1_t))) +#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) #define bam_destroy1(b) do { \ if (b) { free((b)->data); free(b); } \ } while (0) diff --git a/bntseq.c b/bntseq.c index 01a5e3c..e1cd323 100644 --- a/bntseq.c +++ b/bntseq.c @@ -37,6 +37,10 @@ #include "kseq.h" KSEQ_DECLARE(gzFile) +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + unsigned char nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -97,13 +101,13 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c long long xx; int i; int scanres; - bns = (bntseq_t*)xcalloc(1, sizeof(bntseq_t)); + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); { // read .ann fp = xopen(fname = ann_filename, "r"); scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); if (scanres != 3) goto badread; bns->l_pac = xx; - bns->anns = (bntann1_t*)xcalloc(bns->n_seqs, sizeof(bntann1_t)); + bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); for (i = 0; i < bns->n_seqs; ++i) { bntann1_t *p = bns->anns + i; char *q = str; @@ -111,7 +115,7 @@ 
bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c // read gi and sequence name scanres = fscanf(fp, "%u%s", &p->gi, str); if (scanres != 2) goto badread; - p->name = xstrdup(str); + p->name = strdup(str); // read fasta comments while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; while (c != '\n' && c != EOF) c = fgetc(fp); @@ -120,8 +124,8 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c goto badread; } *q = 0; - if (q - str > 1) p->anno = xstrdup(str + 1); // skip leading space - else p->anno = xstrdup(""); + if (q - str > 1) p->anno = strdup(str + 1); // skip leading space + else p->anno = strdup(""); // read the rest scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); if (scanres != 3) goto badread; @@ -137,7 +141,7 @@ bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, c if (scanres != 3) goto badread; l_pac = xx; xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); - bns->ambs = bns->n_holes? (bntamb1_t*)xcalloc(bns->n_holes, sizeof(bntamb1_t)) : 0; + bns->ambs = bns->n_holes? (bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; for (i = 0; i < bns->n_holes; ++i) { bntamb1_t *p = bns->ambs + i; scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str); @@ -193,11 +197,11 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ int i, lasts; if (bns->n_seqs == *m_seqs) { *m_seqs <<= 1; - bns->anns = (bntann1_t*)xrealloc(bns->anns, *m_seqs * sizeof(bntann1_t)); + bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); } p = bns->anns + bns->n_seqs; - p->name = xstrdup((char*)seq->name.s); - p->anno = seq->comment.s? xstrdup((char*)seq->comment.s) : xstrdup("(null)"); + p->name = strdup((char*)seq->name.s); + p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); p->gi = 0; p->len = seq->seq.l; p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; p->n_ambs = 0; @@ -209,7 +213,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ } else { if (bns->n_holes == *m_holes) { (*m_holes) <<= 1; - bns->ambs = (bntamb1_t*)xrealloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); + bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); } *q = bns->ambs + bns->n_holes; (*q)->len = 1; @@ -224,7 +228,7 @@ static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_ if (c >= 4) c = lrand48()&3; if (bns->l_pac == *m_pac) { // double the pac size *m_pac <<= 1; - pac = xrealloc(pac, *m_pac/4); + pac = realloc(pac, *m_pac/4); memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); } _set_pac(pac, bns->l_pac, c); @@ -249,13 +253,13 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) // initialization seq = kseq_init(fp_fa); - bns = (bntseq_t*)xcalloc(1, sizeof(bntseq_t)); + bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); bns->seed = 11; // fixed seed for random generator srand48(bns->seed); m_seqs = m_holes = 8; m_pac = 0x10000; - bns->anns = (bntann1_t*)xcalloc(m_seqs, sizeof(bntann1_t)); - bns->ambs = (bntamb1_t*)xcalloc(m_holes, sizeof(bntamb1_t)); - pac = xcalloc(m_pac/4, 1); + bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); + bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); + pac = calloc(m_pac/4, 1); q = bns->ambs; strcpy(name, prefix); strcat(name, ".pac"); fp = xopen(name, "wb"); @@ -263,7 +267,7 @@ int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); if (!for_only) { // add the reverse complemented sequence m_pac = (bns->l_pac * 2 + 3) / 4 * 4; - pac = xrealloc(pac, m_pac/4); + pac = realloc(pac, m_pac/4); memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) _set_pac(pac, bns->l_pac, 3-_get_pac(pac, l)); 
@@ -357,7 +361,7 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end if (beg >= l_pac || end <= l_pac) { int64_t k, l = 0; *len = end - beg; - seq = xmalloc(end - beg); + seq = malloc(end - beg); if (beg >= l_pac) { // reverse strand int64_t beg_f = (l_pac<<1) - 1 - end; int64_t end_f = (l_pac<<1) - 1 - beg; diff --git a/bwa.c b/bwa.c index 181124c..a20c027 100644 --- a/bwa.c +++ b/bwa.c @@ -7,6 +7,10 @@ #include "ksw.h" #include "utils.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + int bwa_verbose = 3; char bwa_rg_id[256]; @@ -25,10 +29,10 @@ static inline void trim_readno(kstring_t *s) static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) { // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = xstrdup(ks->name.s); - s->comment = ks->comment.l? xstrdup(ks->comment.s) : 0; - s->seq = xstrdup(ks->seq.s); - s->qual = ks->qual.l? xstrdup(ks->qual.s) : 0; + s->name = strdup(ks->name.s); + s->comment = ks->comment.l? strdup(ks->comment.s) : 0; + s->seq = strdup(ks->seq.s); + s->qual = ks->qual.l? strdup(ks->qual.s) : 0; s->l_seq = strlen(s->seq); } @@ -45,7 +49,7 @@ bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) } if (n >= m) { m = m? 
m<<1 : 256; - seqs = xrealloc(seqs, m * sizeof(bseq1_t)); + seqs = realloc(seqs, m * sizeof(bseq1_t)); } trim_readno(&ks->name); kseq2bseq1(ks, &seqs[n]); @@ -98,7 +102,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP - cigar = xmalloc(4); + cigar = malloc(4); cigar[0] = l_query<<4 | 0; *n_cigar = 1; for (i = 0, *score = 0; i < l_query; ++i) @@ -205,7 +209,7 @@ char *bwa_idx_infer_prefix(const char *hint) int l_hint; FILE *fp; l_hint = strlen(hint); - prefix = xmalloc(l_hint + 3 + 4 + 1); + prefix = malloc(l_hint + 3 + 4 + 1); strcpy(prefix, hint); strcpy(prefix + l_hint, ".64.bwt"); if ((fp = fopen(prefix, "rb")) != 0) { @@ -234,7 +238,7 @@ bwt_t *bwa_idx_load_bwt(const char *hint) if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); return 0; } - tmp = xcalloc(strlen(prefix) + 5, 1); + tmp = calloc(strlen(prefix) + 5, 1); strcat(strcpy(tmp, prefix), ".bwt"); // FM-index bwt = bwt_restore_bwt(tmp); strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) @@ -252,12 +256,12 @@ bwaidx_t *bwa_idx_load(const char *hint, int which) if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); return 0; } - idx = xcalloc(1, sizeof(bwaidx_t)); + idx = calloc(1, sizeof(bwaidx_t)); if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); if (which & BWA_IDX_BNS) { idx->bns = bns_restore(prefix); if (which & BWA_IDX_PAC) { - idx->pac = xcalloc(idx->bns->l_pac/4+1, 1); + idx->pac = calloc(idx->bns->l_pac/4+1, 1); err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence err_fclose(idx->bns->fp_pac); idx->bns->fp_pac = 0; @@ -312,7 +316,7 @@ char *bwa_set_rg(const char *s) if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", 
__func__); goto err_set_rg; } - rg_line = xstrdup(s); + rg_line = strdup(s); bwa_escape(rg_line); if ((p = strstr(rg_line, "\tID:")) == 0) { if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__); diff --git a/bwamem.c b/bwamem.c index 9f3aa9b..779a221 100644 --- a/bwamem.c +++ b/bwamem.c @@ -15,6 +15,10 @@ #include "ksort.h" #include "utils.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + /* Theory on probability and scoring *ungapped* alignment * * s'(a,b) = log[P(b|a)/P(b)] = log[4P(b|a)], assuming uniform base distribution @@ -41,7 +45,7 @@ mem_opt_t *mem_opt_init() { mem_opt_t *o; - o = xcalloc(1, sizeof(mem_opt_t)); + o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; o->T = 30; @@ -79,12 +83,12 @@ struct __smem_i { smem_i *smem_itr_init(const bwt_t *bwt) { smem_i *itr; - itr = xcalloc(1, sizeof(smem_i)); + itr = calloc(1, sizeof(smem_i)); itr->bwt = bwt; - itr->tmpvec[0] = xcalloc(1, sizeof(bwtintv_v)); - itr->tmpvec[1] = xcalloc(1, sizeof(bwtintv_v)); - itr->matches = xcalloc(1, sizeof(bwtintv_v)); - itr->sub = xcalloc(1, sizeof(bwtintv_v)); + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + itr->sub = calloc(1, sizeof(bwtintv_v)); return itr; } @@ -181,7 +185,7 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c if (y >= 0 && x - y <= opt->w && y - x <= opt->w && x - last->len < opt->max_chain_gap && y - last->len < opt->max_chain_gap) { // grow the chain if (c->n == c->m) { c->m <<= 1; - c->seeds = xrealloc(c->seeds, c->m * sizeof(mem_seed_t)); + c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; return 1; @@ -215,7 +219,7 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) * } else to_add = 1; if (to_add) { // add the seed as a new chain tmp.n = 1; tmp.m = 4; - 
tmp.seeds = xcalloc(tmp.m, sizeof(mem_seed_t)); + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; kb_putp(chn, tree, &tmp); } @@ -283,7 +287,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) flt_aux_t *a; int i, j, n; if (n_chn <= 1) return n_chn; // no need to filter - a = xmalloc(sizeof(flt_aux_t) * n_chn); + a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i < n_chn; ++i) { mem_chain_t *c = &chains[i]; int64_t end; @@ -309,7 +313,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) ks_introsort(mem_flt, n_chn, a); { // reorder chains such that the best chain appears first mem_chain_t *swap; - swap = xmalloc(sizeof(mem_chain_t) * n_chn); + swap = malloc(sizeof(mem_chain_t) * n_chn); for (i = 0; i < n_chn; ++i) { swap[i] = *((mem_chain_t*)a[i].p); a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed @@ -512,7 +516,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); assert(rlen == rmax[1] - rmax[0]); - srt = xmalloc(c->n * 8); + srt = malloc(c->n * 8); for (i = 0; i < c->n; ++i) srt[i] = (uint64_t)c->seeds[i].len<<32 | i; ks_introsort_64(c->n, srt); @@ -560,10 +564,10 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int if (s->qbeg) { // left extension uint8_t *rs, *qs; int qle, tle, gtle, gscore; - qs = xmalloc(s->qbeg); + qs = malloc(s->qbeg); for (i = 0; i < s->qbeg; ++i) qs[i] = query[s->qbeg - 1 - i]; tmp = s->rbeg - rmax[0]; - rs = xmalloc(tmp); + rs = malloc(tmp); for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i]; for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; @@ -838,7 +842,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * { // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence mem_alnreg_v ar; char *seq; - seq = xmalloc(l_seq); 
+ seq = malloc(l_seq); memcpy(seq, seq_, l_seq); // makes a copy of seq_ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); mem_mark_primary_se(opt, ar.n, ar.a); @@ -861,7 +865,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } qb = ar->qb, qe = ar->qe; rb = ar->rb, re = ar->re; - query = xmalloc(l_query); + query = malloc(l_query); for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; @@ -887,7 +891,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * int clip5, clip3; clip5 = is_rev? l_query - qe : qb; clip3 = is_rev? qb : l_query - qe; - a.cigar = xrealloc(a.cigar, 4 * (a.n_cigar + 2)); + a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); if (clip5) { memmove(a.cigar+1, a.cigar, a.n_cigar * 4); a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); @@ -958,8 +962,8 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn mem_alnreg_v *regs; mem_pestat_t pes[4]; - w = xcalloc(opt->n_threads, sizeof(worker_t)); - regs = xmalloc(n * sizeof(mem_alnreg_v)); + w = calloc(opt->n_threads, sizeof(worker_t)); + regs = malloc(n * sizeof(mem_alnreg_v)); for (i = 0; i < opt->n_threads; ++i) { worker_t *p = &w[i]; p->start = i; p->step = opt->n_threads; p->n = n; @@ -980,7 +984,7 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn #ifdef HAVE_PTHREAD } else { pthread_t *tid; - tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); if (opt->flag&MEM_F_PE) { diff --git a/bwamem_pair.c b/bwamem_pair.c index 19fc83b..06aacff 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -8,6 +8,11 @@ #include 
"utils.h" #include "ksw.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + #define MIN_RATIO 0.8 #define MIN_DIR_CNT 10 #define MIN_DIR_RATIO 0.05 @@ -121,7 +126,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate is_larger = !(r>>1); // whether the mate has larger coordinate if (is_rev) { - rev = xmalloc(l_ms); // this is the reverse complement of $ms + rev = malloc(l_ms); // this is the reverse complement of $ms for (i = 0; i < l_ms; ++i) rev[l_ms - 1 - i] = ms[i] < 4? 3 - ms[i] : 4; seq = rev; } else seq = (uint8_t*)ms; @@ -294,7 +299,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co // write SAM h[0] = mem_reg2aln(opt, bns, pac, s[0].l_seq, s[0].seq, &a[0].a[z[0]]); h[0].mapq = q_se[0]; h[0].flag |= 0x40 | extra_flag; h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; - mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = xstrdup(str.s); str.l = 0; + mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; free(h[0].cigar); free(h[1].cigar); } else goto no_pairing; diff --git a/bwape.c b/bwape.c index 030af0d..2a7f46e 100644 --- a/bwape.c +++ b/bwape.c @@ -12,6 +12,10 @@ #include "bwa.h" #include "ksw.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef struct { int n; bwtint_t *a; @@ -50,7 +54,7 @@ void bwa_print_sam_PG(); pe_opt_t *bwa_init_pe_opt() { pe_opt_t *po; - po = (pe_opt_t*)xcalloc(1, sizeof(pe_opt_t)); + po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); po->max_isize = 500; po->force_isize = 0; po->max_occ = 100000; @@ -83,7 +87,7 @@ static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ii->avg = ii->std = -1.0; ii->low = ii->high = ii->high_bayesian = 0; - isizes = 
(uint64_t*)xcalloc(n_seqs, 8); + isizes = (uint64_t*)calloc(n_seqs, 8); for (i = 0, tot = 0; i != n_seqs; ++i) { bwa_seq_t *p[2]; p[0] = seqs[0] + i; p[1] = seqs[1] + i; @@ -263,9 +267,9 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw pe_data_t *d; aln_buf_t *buf[2]; - d = (pe_data_t*)xcalloc(1, sizeof(pe_data_t)); - buf[0] = (aln_buf_t*)xcalloc(n_seqs, sizeof(aln_buf_t)); - buf[1] = (aln_buf_t*)xcalloc(n_seqs, sizeof(aln_buf_t)); + d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); + buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); + buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); if (_bwt == 0) { // load forward SA strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); @@ -338,7 +342,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw if (ret) { // not in the hash table; ret must equal 1 as we never remove elements poslist_t *z = &kh_val(g_hash, iter); z->n = r->l - r->k + 1; - z->a = (bwtint_t*)xmalloc(sizeof(bwtint_t) * z->n); + z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); for (l = r->k; l <= r->l; ++l) { int strand; z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand)<<1; @@ -420,7 +424,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; // get reference subsequence - ref_seq = (ubyte_t*)xcalloc(reglen, 1); + ref_seq = (ubyte_t*)calloc(reglen, 1); for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; @@ -453,7 +457,7 @@ bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const u { // update cigar and coordinate; int start = r.qb, end = r.qe + 1; *beg += r.tb; - cigar = (bwa_cigar_t*)xrealloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); + cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); if (start) { memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); 
cigar[0] = __cigar_create(3, start); @@ -497,7 +501,7 @@ ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, // load reference sequence if (_pacseq == 0) { - pacseq = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1); + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); err_rewind(bns->fp_pac); err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = (ubyte_t*)_pacseq; @@ -653,7 +657,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f if (popt->is_preload) { strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); - pac = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1); + pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); err_rewind(bns->fp_pac); err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); } diff --git a/bwase.c b/bwase.c index bf7e9cb..85165d6 100644 --- a/bwase.c +++ b/bwase.c @@ -12,6 +12,10 @@ #include "bwa.h" #include "ksw.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + int g_log_n[256]; void bwa_print_sam_PG(); @@ -59,7 +63,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma * simply output all hits, but the following samples "rest" * number of random hits. */ rest = n_occ > n_multi + 1? 
n_multi + 1 : n_occ; // find one additional for ->sa - s->multi = xcalloc(rest, sizeof(bwt_multi1_t)); + s->multi = calloc(rest, sizeof(bwt_multi1_t)); for (k = 0; k < n_aln; ++k) { const bwt_aln1_t *q = aln + k; if (q->l - q->k + 1 <= rest) { @@ -184,7 +188,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l if (rlen == 0) goto refine_gapped_err; ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); if (qle < len) { // write soft clip - cigar = xrealloc(cigar, (*n_cigar + 1) * 4); + cigar = realloc(cigar, (*n_cigar + 1) * 4); memmove(cigar + 1, cigar, *n_cigar * 4); cigar[0] = (len - qle)<<4 | FROM_S; ++(*n_cigar); @@ -199,7 +203,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l if (rlen == 0) goto refine_gapped_err; ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension if (qle < len) { - cigar = xrealloc(cigar, (*n_cigar + 1) * 4); + cigar = realloc(cigar, (*n_cigar + 1) * 4); cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S; ++(*n_cigar); } @@ -265,7 +269,7 @@ char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_ } ksprintf(str, "%d", u); *_nm = nm; - return xstrdup(str->s); + return strdup(str->s); } void bwa_correct_trimmed(bwa_seq_t *s) @@ -277,11 +281,11 @@ void bwa_correct_trimmed(bwa_seq_t *s) } else { if (s->cigar == 0) { s->n_cigar = 2; - s->cigar = xcalloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); s->cigar[0] = __cigar_create(0, s->len); } else { ++s->n_cigar; - s->cigar = xrealloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); } s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len)); } @@ -291,11 +295,11 @@ void bwa_correct_trimmed(bwa_seq_t *s) } else { if (s->cigar == 0) { s->n_cigar = 2; - s->cigar = xcalloc(s->n_cigar, sizeof(bwa_cigar_t)); + s->cigar = 
calloc(s->n_cigar, sizeof(bwa_cigar_t)); s->cigar[1] = __cigar_create(0, s->len); } else { ++s->n_cigar; - s->cigar = xrealloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); + s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t)); } s->cigar[0] = __cigar_create(3, (s->full_len - s->len)); @@ -311,7 +315,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t kstring_t *str; if (!_pacseq) { - pacseq = (ubyte_t*)xcalloc(bns->l_pac/4+1, 1); + pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); err_rewind(bns->fp_pac); err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); } else pacseq = _pacseq; @@ -330,7 +334,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag - str = (kstring_t*)xcalloc(1, sizeof(kstring_t)); + str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; if (s->type != BWA_TYPE_NO_MATCH) { @@ -559,7 +563,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f err_fread_noeof(&n_aln, 4, 1, fp_sa); if (n_aln > m_aln) { m_aln = n_aln; - aln = (bwt_aln1_t*)xrealloc(aln, sizeof(bwt_aln1_t) * m_aln); + aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); } err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); diff --git a/bwaseqio.c b/bwaseqio.c index 57ed654..d157945 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -7,6 +7,10 @@ #include "kseq.h" KSEQ_DECLARE(gzFile) +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + extern unsigned char nst_nt4_table[256]; static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; @@ -22,7 +26,7 @@ bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; bam_header_t *h; - bs = (bwa_seqio_t*)xcalloc(1, sizeof(bwa_seqio_t)); + bs = 
(bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; bs->fp = bam_open(fn, "r"); @@ -35,7 +39,7 @@ bwa_seqio_t *bwa_seq_open(const char *fn) { gzFile fp; bwa_seqio_t *bs; - bs = (bwa_seqio_t*)xcalloc(1, sizeof(bwa_seqio_t)); + bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); fp = xzopen(fn, "r"); bs->ks = kseq_init(fp); return bs; @@ -93,7 +97,7 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com b = bam_init1(); n_seqs = 0; - seqs = (bwa_seq_t*)xcalloc(n_needed, sizeof(bwa_seq_t)); + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while (bam_read1(bs->fp, b) >= 0) { uint8_t *s, *q; int go = 0; @@ -108,8 +112,8 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; s = bam1_seq(b); q = bam1_qual(b); - p->seq = (ubyte_t*)xcalloc(p->len + 1, 1); - p->qual = (ubyte_t*)xcalloc(p->len + 1, 1); + p->seq = (ubyte_t*)calloc(p->len + 1, 1); + p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; p->qual[i] = q[i] + 33 < 126? 
q[i] + 33 : 126; @@ -119,11 +123,11 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); - p->rseq = (ubyte_t*)xcalloc(p->full_len, 1); + p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); - p->name = xstrdup((const char*)bam1_qname(b)); + p->name = strdup((const char*)bam1_qname(b)); if (n_seqs == n_needed) break; } *n = n_seqs; @@ -153,7 +157,7 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri } if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; - seqs = (bwa_seq_t*)xcalloc(n_needed, sizeof(bwa_seq_t)); + seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { // skip reads that are marked to be filtered by Casava @@ -184,18 +188,18 @@ bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int tri p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; - p->seq = (ubyte_t*)xcalloc(p->full_len, 1); + p->seq = (ubyte_t*)calloc(p->full_len, 1); for (i = 0; i != p->full_len; ++i) p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; if (seq->qual.l) { // copy quality - p->qual = (ubyte_t*)xstrdup((char*)seq->qual.s); + p->qual = (ubyte_t*)strdup((char*)seq->qual.s); if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); } - p->rseq = (ubyte_t*)xcalloc(p->full_len, 1); + p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); - p->name = xstrdup((const char*)seq->name.s); + p->name = strdup((const 
char*)seq->name.s); { // trim /[12]$ int t = strlen(p->name); if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; diff --git a/bwt.c b/bwt.c index edd0afc..c9bf6a3 100644 --- a/bwt.c +++ b/bwt.c @@ -34,6 +34,10 @@ #include "bwt.h" #include "kvec.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + void bwt_gen_cnt_table(bwt_t *bwt) { int i, j; @@ -66,7 +70,7 @@ void bwt_cal_sa(bwt_t *bwt, int intv) if (bwt->sa) free(bwt->sa); bwt->sa_intv = intv; bwt->n_sa = (bwt->seq_len + intv) / intv; - bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); // calculate SA value isa = 0; sa = bwt->seq_len; for (i = 0; i < bwt->seq_len; ++i) { @@ -397,7 +401,7 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt) xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; - bwt->sa = (bwtint_t*)xcalloc(bwt->n_sa, sizeof(bwtint_t)); + bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); bwt->sa[0] = -1; fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); @@ -409,11 +413,11 @@ bwt_t *bwt_restore_bwt(const char *fn) bwt_t *bwt; FILE *fp; - bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); fp = xopen(fn, "rb"); err_fseek(fp, 0, SEEK_END); bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; - bwt->bwt = (uint32_t*)xcalloc(bwt->bwt_size, 4); + bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); err_fseek(fp, 0, SEEK_SET); err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); diff --git a/bwt_gen.c b/bwt_gen.c index e9a5c93..d68b30f 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -30,6 +30,10 @@ #include "QSufSort.h" #include "utils.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef uint64_t bgint_t; typedef int64_t sbgint_t; @@ -320,25 +324,25 @@ BWT 
*BWTCreate(const bgint_t textLength, unsigned int *decodeTable) { BWT *bwt; - bwt = (BWT*)xcalloc(1, sizeof(BWT)); + bwt = (BWT*)calloc(1, sizeof(BWT)); bwt->textLength = 0; - bwt->cumulativeFreq = (bgint_t*)xcalloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); bwt->bwtSizeInWord = 0; // Generate decode tables if (decodeTable == NULL) { - bwt->decodeTable = (unsigned*)xcalloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); + bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); GenerateDNAOccCountTable(bwt->decodeTable); } else { bwt->decodeTable = decodeTable; } bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); - bwt->occValueMajor = (bgint_t*)xcalloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); + bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); bwt->occSizeInWord = 0; bwt->occValue = NULL; @@ -354,16 +358,16 @@ BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; - bwtInc = (BWTInc*)xcalloc(1, sizeof(BWTInc)); + bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); bwtInc->numberOfIterationDone = 0; bwtInc->bwt = BWTCreate(textLength, NULL); bwtInc->initialMaxBuildSize = initialMaxBuildSize; bwtInc->incMaxBuildSize = incMaxBuildSize; - bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)xcalloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); + bwtInc->cumulativeCountInCurrentBuild = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); // Build frequently accessed data - bwtInc->packedShift = (unsigned*)xcalloc(CHAR_PER_WORD, sizeof(unsigned int)); + bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned 
int)); for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; @@ -373,7 +377,7 @@ BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, + incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); - bwtInc->workingMemory = (unsigned*)xcalloc(bwtInc->availableWord, BYTES_IN_WORD); + bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); return bwtInc; } diff --git a/bwt_lite.c b/bwt_lite.c index 83dafc4..6cd3b1d 100644 --- a/bwt_lite.c +++ b/bwt_lite.c @@ -2,7 +2,10 @@ #include #include #include "bwt_lite.h" -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif int is_sa(const uint8_t *T, uint32_t *SA, int n); int is_bwt(uint8_t *T, int n); @@ -11,21 +14,21 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) { bwtl_t *b; int i; - b = (bwtl_t*)xcalloc(1, sizeof(bwtl_t)); + b = (bwtl_t*)calloc(1, sizeof(bwtl_t)); b->seq_len = len; { // calculate b->bwt uint8_t *s; - b->sa = (uint32_t*)xcalloc(len + 1, 4); + b->sa = (uint32_t*)calloc(len + 1, 4); is_sa(seq, b->sa, len); - s = (uint8_t*)xcalloc(len + 1, 1); + s = (uint8_t*)calloc(len + 1, 1); for (i = 0; i <= len; ++i) { if (b->sa[i] == 0) b->primary = i; else s[i] = seq[b->sa[i] - 1]; } for (i = b->primary; i < len; ++i) s[i] = s[i + 1]; b->bwt_size = (len + 15) / 16; - b->bwt = (uint32_t*)xcalloc(b->bwt_size, 4); + b->bwt = (uint32_t*)calloc(b->bwt_size, 4); for (i = 0; i < len; ++i) b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1); free(s); @@ -33,7 +36,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) { // calculate b->occ uint32_t c[4]; b->n_occ = (len + 15) / 16 * 4; - b->occ = (uint32_t*)xcalloc(b->n_occ, 4); + 
b->occ = (uint32_t*)calloc(b->n_occ, 4); memset(c, 0, 16); for (i = 0; i < len; ++i) { if (i % 16 == 0) diff --git a/bwtaln.c b/bwtaln.c index f6b26a8..e772792 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -17,10 +17,14 @@ #include #endif +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + gap_opt_t *gap_init_opt() { gap_opt_t *o; - o = (gap_opt_t*)xcalloc(1, sizeof(gap_opt_t)); + o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); /* IMPORTANT: s_mm*10 should be about the average base error rate. Voilating this requirement will break pairing! */ o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; @@ -90,7 +94,7 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); - seed_w = (bwt_width_t*)xcalloc(opt->seed_len+1, sizeof(bwt_width_t)); + seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); w = 0; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; @@ -100,7 +104,7 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; if (max_l < p->len) { max_l = p->len; - w = (bwt_width_t*)xrealloc(w, (max_l + 1) * sizeof(bwt_width_t)); + w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); } bwt_cal_width(bwt, p->len, p->seq, w); @@ -163,7 +167,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT - char *str = (char*)xcalloc(strlen(prefix) + 10, 1); + char *str = (char*)calloc(strlen(prefix) + 10, 1); strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); free(str); } @@ -186,8 +190,8 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) int j; 
pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)xcalloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t)); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { data[j].tid = j; data[j].bwt = bwt; data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; diff --git a/bwtgap.c b/bwtgap.c index cef9561..16d9025 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -3,7 +3,10 @@ #include #include "bwtgap.h" #include "bwtaln.h" -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif #define STATE_M 0 #define STATE_I 1 @@ -14,9 +17,9 @@ gap_stack_t *gap_init_stack2(int max_score) { gap_stack_t *stack; - stack = (gap_stack_t*)xcalloc(1, sizeof(gap_stack_t)); + stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); stack->n_stacks = max_score; - stack->stacks = (gap_stack1_t*)xcalloc(stack->n_stacks, sizeof(gap_stack1_t)); + stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); return stack; } @@ -52,7 +55,7 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, i q = stack->stacks + score; if (q->n_entries == q->m_entries) { q->m_entries = q->m_entries? 
q->m_entries<<1 : 4; - q->stack = (gap_entry_t*)xrealloc(q->stack, sizeof(gap_entry_t) * q->m_entries); + q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); } p = q->stack + q->n_entries; p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; @@ -111,7 +114,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid bwt_aln1_t *aln; m_aln = 4; n_aln = 0; - aln = (bwt_aln1_t*)xcalloc(m_aln, sizeof(bwt_aln1_t)); + aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); // check whether there are too many N for (j = _j = 0; j < len; ++j) @@ -178,7 +181,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); if (n_aln == m_aln) { m_aln <<= 1; - aln = (bwt_aln1_t*)xrealloc(aln, m_aln * sizeof(bwt_aln1_t)); + aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); } p = aln + n_aln; diff --git a/bwtindex.c b/bwtindex.c index 6c1731c..9e3ec15 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -39,6 +39,11 @@ #include "divsufsort.h" #endif +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + int is_bwt(ubyte_t *T, int n); int64_t bwa_seq_len(const char *fn_pac) @@ -62,18 +67,18 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) FILE *fp; // initialization - bwt = (bwt_t*)xcalloc(1, sizeof(bwt_t)); + bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); bwt->seq_len = bwa_seq_len(fn_pac); bwt->bwt_size = (bwt->seq_len + 15) >> 4; fp = xopen(fn_pac, "rb"); // prepare sequence pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 
0 : 1); - buf2 = (ubyte_t*)xcalloc(pac_size, 1); + buf2 = (ubyte_t*)calloc(pac_size, 1); err_fread_noeof(buf2, 1, pac_size, fp); err_fclose(fp); memset(bwt->L2, 0, 5 * 4); - buf = (ubyte_t*)xcalloc(bwt->seq_len + 1, 1); + buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); for (i = 0; i < bwt->seq_len; ++i) { buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; ++bwt->L2[1+buf[i]]; @@ -91,7 +96,7 @@ bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) err_fatal_simple("libdivsufsort is not compiled in."); #endif } - bwt->bwt = (u_int32_t*)xcalloc(bwt->bwt_size, 4); + bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); for (i = 0; i < bwt->seq_len; ++i) bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); free(buf); @@ -127,7 +132,7 @@ void bwt_bwtupdate_core(bwt_t *bwt) n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t*)xcalloc(bwt->bwt_size, 4); // will be the new bwt + buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt c[0] = c[1] = c[2] = c[3] = 0; for (i = k = 0; i < bwt->seq_len; ++i) { if (i % OCC_INTERVAL == 0) { @@ -196,7 +201,7 @@ int bwa_index(int argc, char *argv[]) // the "index" command else if (strcmp(optarg, "is") == 0) algo_type = 3; else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); break; - case 'p': prefix = xstrdup(optarg); break; + case 'p': prefix = strdup(optarg); break; case '6': is_64 = 1; break; default: return 1; } @@ -215,13 +220,13 @@ int bwa_index(int argc, char *argv[]) // the "index" command return 1; } if (prefix == 0) { - prefix = xmalloc(strlen(argv[optind]) + 4); + prefix = malloc(strlen(argv[optind]) + 4); strcpy(prefix, argv[optind]); if (is_64) strcat(prefix, ".64"); } - str = (char*)xcalloc(strlen(prefix) + 10, 1); - str2 = (char*)xcalloc(strlen(prefix) + 10, 1); - str3 = (char*)xcalloc(strlen(prefix) + 10, 1); + str = (char*)calloc(strlen(prefix) + 10, 1); + str2 = (char*)calloc(strlen(prefix) + 10, 1); + str3 = 
(char*)calloc(strlen(prefix) + 10, 1); { // nucleotide indexing gzFile fp = xzopen(argv[optind], "r"); diff --git a/bwtsw2_aux.c b/bwtsw2_aux.c index 074af10..d225187 100644 --- a/bwtsw2_aux.c +++ b/bwtsw2_aux.c @@ -22,6 +22,11 @@ KSEQ_DECLARE(gzFile) #define __left_lt(a, b) ((a).end > (b).end) KSORT_INIT(hit, bsw2hit_t, __left_lt) +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + + extern unsigned char nst_nt4_table[256]; unsigned char nt_comp_table[256] = { @@ -48,7 +53,7 @@ extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); bsw2opt_t *bsw2_init_opt() { - bsw2opt_t *o = (bsw2opt_t*)xcalloc(1, sizeof(bsw2opt_t)); + bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; o->bw = 50; o->max_ins = 20000; @@ -73,11 +78,11 @@ void bsw2_destroy(bwtsw2_t *b) bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) { bwtsw2_t *p; - p = xcalloc(1, sizeof(bwtsw2_t)); + p = calloc(1, sizeof(bwtsw2_t)); p->max = p->n = b->n; if (b->n) { kroundup32(p->max); - p->hits = xcalloc(p->max, sizeof(bsw2hit_t)); + p->hits = calloc(p->max, sizeof(bsw2hit_t)); memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); } return p; @@ -100,10 +105,10 @@ void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq int8_t mat[25]; bwa_fill_scmat(opt->a, opt->b, mat); - query = xcalloc(lq, 1); + query = calloc(lq, 1); // sort according to the descending order of query end ks_introsort(hit, b->n, b->hits); - target = xcalloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); // reverse _query for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i]; // core loop @@ -144,7 +149,7 @@ void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, int8_t mat[25]; bwa_fill_scmat(opt->a, opt->b, mat); - target = xcalloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); + target = calloc(((lq + 1) / 2 * opt->a + opt->r) / 
opt->r + lq, 1); for (i = 0; i < b->n; ++i) { bsw2hit_t *p = b->hits + i; int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; @@ -192,7 +197,7 @@ static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t *seq[2], int64_t l_p } #endif if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping - q->cigar = xrealloc(q->cigar, 4 * (q->n_cigar + 2)); + q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); if (beg != 0) { memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); q->cigar[0] = beg<<4 | 4; @@ -223,7 +228,7 @@ static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) int i; if (b[0]->n + b[1]->n > b[0]->max) { b[0]->max = b[0]->n + b[1]->n; - b[0]->hits = xrealloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); + b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); } for (i = 0; i < b[1]->n; ++i) { bsw2hit_t *p = b[0]->hits + b[0]->n + i; @@ -251,9 +256,9 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 _b = bsw2_core(bns, opt, query, target, pool); bwtl_destroy(query); for (k = 0; k < 2; ++k) { - bb[k] = xcalloc(2, sizeof(void*)); - bb[k][0] = xcalloc(1, sizeof(bwtsw2_t)); - bb[k][1] = xcalloc(1, sizeof(bwtsw2_t)); + bb[k] = calloc(2, sizeof(void*)); + bb[k][0] = calloc(1, sizeof(bwtsw2_t)); + bb[k][1] = calloc(1, sizeof(bwtsw2_t)); } for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand for (j = 0; j < _b[k]->n; ++j) { @@ -261,7 +266,7 @@ static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8 p = bb[_b[k]->hits[j].is_rev][k]; if (p->n == p->max) { p->max = p->max? 
p->max<<1 : 8; - p->hits = xrealloc(p->hits, p->max * sizeof(bsw2hit_t)); + p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); } q = &p->hits[p->n++]; *q = _b[k]->hits[j]; @@ -340,7 +345,7 @@ static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *c uint32_t *cn; bwtint_t kk = 0; nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0; - cn = xcalloc(n_cigar + 3, 4); + cn = calloc(n_cigar + 3, 4); x = coor; y = 0; for (i = j = 0; i < n_cigar; ++i) { int op = cigar[i]&0xf, ln = cigar[i]>>4; @@ -398,9 +403,9 @@ static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8 if (b->n<<1 < b->max) { b->max = b->n; kroundup32(b->max); - b->hits = xrealloc(b->hits, b->max * sizeof(bsw2hit_t)); + b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t)); } - b->aux = xcalloc(b->n, sizeof(bsw2aux_t)); + b->aux = calloc(b->n, sizeof(bsw2aux_t)); // generate CIGAR gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name); // fix CIGAR, generate mapQ, and write chromosomal position @@ -559,7 +564,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t bsw2opt_t opt; bsw2global_t *pool = bsw2_global_init(); bwtsw2_t **buf; - buf = xcalloc(_seq->n, sizeof(void*)); + buf = calloc(_seq->n, sizeof(void*)); for (x = 0; x < _seq->n; ++x) { bsw2seq1_t *p = _seq->seq + x; uint8_t *seq[2], *rseq[2]; @@ -570,10 +575,10 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t if (pool->max_l < l) { // then enlarge working space for aln_extend_core() int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; pool->max_l = l; - pool->aln_mem = xrealloc(pool->aln_mem, (tmp + 2) * 24); + pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); } // set seq[2] and rseq[2] - seq[0] = xcalloc(l * 4, 1); + seq[0] = calloc(l * 4, 1); seq[1] = seq[0] + l; rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l; // convert sequences to 2-bit representation @@ -586,7 +591,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t 
*_opt, const bntseq_t rseq[1][i] = c; } if (l - k < opt.t) { // too few unambiguous bases - buf[x] = xcalloc(1, sizeof(bwtsw2_t)); + buf[x] = calloc(1, sizeof(bwtsw2_t)); free(seq[0]); continue; } // alignment @@ -618,7 +623,7 @@ static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t bsw2seq1_t *p = _seq->seq + x; uint8_t *seq[2]; int i; - seq[0] = xmalloc(p->l * 2); seq[1] = seq[0] + p->l; + seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; for (i = 0; i < p->l; ++i) { int c = nst_nt4_table[(int)p->seq[i]]; if (c >= 4) c = (int)(drand48() * 4); @@ -674,16 +679,16 @@ static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t * int j; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)xcalloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t)); + data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (j = 0; j < opt->n_threads; ++j) { thread_aux_t *p = data + j; p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; p->pac = pac; p->target = target; - p->_seq = xcalloc(1, sizeof(bsw2seq_t)); + p->_seq = calloc(1, sizeof(bsw2seq_t)); p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; p->_seq->n = 0; - p->_seq->seq = xcalloc(p->_seq->max, sizeof(bsw2seq1_t)); + p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); } for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; @@ -728,13 +733,13 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c bsw2seq_t *_seq; bseq1_t *bseq; - pac = xcalloc(bns->l_pac/4+1, 1); + pac = calloc(bns->l_pac/4+1, 1); for (l = 0; l < bns->n_seqs; ++l) err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); fp = xzopen(fn, 
"r"); ks = kseq_init(fp); - _seq = xcalloc(1, sizeof(bsw2seq_t)); + _seq = calloc(1, sizeof(bsw2seq_t)); if (fn2) { fp2 = xzopen(fn2, "r"); ks2 = kseq_init(fp2); @@ -745,7 +750,7 @@ void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, c if (n > _seq->max) { _seq->max = n; kroundup32(_seq->max); - _seq->seq = xrealloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); + _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); } _seq->n = n; for (i = 0; i < n; ++i) { diff --git a/bwtsw2_chain.c b/bwtsw2_chain.c index 6bd320f..ade77e7 100644 --- a/bwtsw2_chain.c +++ b/bwtsw2_chain.c @@ -1,6 +1,9 @@ #include #include "bwtsw2.h" -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif typedef struct { uint32_t tbeg, tend; @@ -49,9 +52,9 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) char *flag; // initialization n[0] = b[0]->n; n[1] = b[1]->n; - z[0] = xcalloc(n[0] + n[1], sizeof(hsaip_t)); + z[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); z[1] = z[0] + n[0]; - chain[0] = xcalloc(n[0] + n[1], sizeof(hsaip_t)); + chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); for (k = j = 0; k < 2; ++k) { for (i = 0; i < b[k]->n; ++i) { bsw2hit_t *p = b[k]->hits + i; @@ -74,7 +77,7 @@ void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) } //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); // filtering - flag = xcalloc(m[0] + m[1], 1); + flag = calloc(m[0] + m[1], 1); ks_introsort(hsaip, m[0] + m[1], chain[0]); for (k = 1; k < m[0] + m[1]; ++k) { hsaip_t *p = chain[0] + k; diff --git a/bwtsw2_core.c b/bwtsw2_core.c index d64f74b..1119601 100644 --- a/bwtsw2_core.c +++ b/bwtsw2_core.c @@ -7,7 +7,10 @@ #include "bwtsw2.h" #include "bwt.h" #include "kvec.h" -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif typedef struct { bwtint_t k, l; @@ -72,7 +75,7 @@ 
typedef struct __mempool_t { inline static bsw2entry_p mp_alloc(mempool_t *mp) { ++mp->cnt; - if (kv_size(mp->pool) == 0) return (bsw2entry_t*)xcalloc(1, sizeof(bsw2entry_t)); + if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t)); else return kv_pop(mp->pool); } inline static void mp_free(mempool_t *mp, bsw2entry_p e) @@ -134,7 +137,7 @@ static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux) if (u->n <= T) return; if (aux->max < u->n) { aux->max = u->n; - aux->array = (bsw2cell_t*)xrealloc(aux->array, aux->max * sizeof(bsw2cell_t)); + aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t)); } a = (int*)aux->array; for (i = n = 0; i != u->n; ++i) @@ -185,7 +188,7 @@ static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2en int i; if (u->n + v->n >= u->max) { u->max = u->n + v->n; - u->array = (bsw2cell_t*)xrealloc(u->array, u->max * sizeof(bsw2cell_t)); + u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t)); } for (i = 0; i != v->n; ++i) { bsw2cell_t *p = v->array + i; @@ -203,7 +206,7 @@ static inline bsw2cell_t *push_array_p(bsw2entry_t *e) { if (e->n == e->max) { e->max = e->max? e->max<<1 : 256; - e->array = (bsw2cell_t*)xrealloc(e->array, sizeof(bsw2cell_t) * e->max); + e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max); } return e->array + e->n; } @@ -251,7 +254,7 @@ static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, i if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit if (b1->max == b1->n) { b1->max = b1->max? 
b1->max<<1 : 4; - b1->hits = xrealloc(b1->hits, b1->max * sizeof(bsw2hit_t)); + b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t)); } q = &b1->hits[b1->n++]; q->k = p->qk; q->l = p->ql; @@ -280,7 +283,7 @@ int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int else if (p->G > 0) ++n; } b->n = b->max = n; - b->hits = xcalloc(b->max, sizeof(bsw2hit_t)); + b->hits = calloc(b->max, sizeof(bsw2hit_t)); for (i = j = 0; i < old_n; ++i) { bsw2hit_t *p = old_hits + i; if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive @@ -400,9 +403,9 @@ bsw2global_t *bsw2_global_init() { bsw2global_t *pool; bsw2stack_t *stack; - pool = xcalloc(1, sizeof(bsw2global_t)); - stack = xcalloc(1, sizeof(bsw2stack_t)); - stack->pool = (mempool_t*)xcalloc(1, sizeof(mempool_t)); + pool = calloc(1, sizeof(bsw2global_t)); + stack = calloc(1, sizeof(bsw2stack_t)); + stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t)); pool->stack = (void*)stack; return pool; } @@ -462,13 +465,13 @@ bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *ta rhash = kh_init(qintv); init_bwtsw2(target, query, stack); heap_size = opt->z; - heap = xcalloc(heap_size, sizeof(int)); + heap = calloc(heap_size, sizeof(int)); // initialize the return struct - b = (bwtsw2_t*)xcalloc(1, sizeof(bwtsw2_t)); + b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); b->n = b->max = target->seq_len * 2; - b->hits = xcalloc(b->max, sizeof(bsw2hit_t)); - b1 = (bwtsw2_t*)xcalloc(1, sizeof(bwtsw2_t)); - b_ret = xcalloc(2, sizeof(void*)); + b->hits = calloc(b->max, sizeof(bsw2hit_t)); + b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); + b_ret = calloc(2, sizeof(void*)); b_ret[0] = b; b_ret[1] = b1; // initialize timer getrusage(0, &last); diff --git a/bwtsw2_pair.c b/bwtsw2_pair.c index df62e3f..24905df 100644 --- a/bwtsw2_pair.c +++ b/bwtsw2_pair.c @@ -7,9 +7,12 @@ #include "bntseq.h" #include "bwtsw2.h" #include "kstring.h" -#include "utils.h" #include "ksw.h" +#ifdef USE_MALLOC_WRAPPERS 
+# include "malloc_wrap.h" +#endif + #define MIN_RATIO 0.8 #define OUTLIER_BOUND 2.0 #define MAX_STDDEV 4.0 @@ -27,7 +30,7 @@ bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) bsw2pestat_t r; memset(&r, 0, sizeof(bsw2pestat_t)); - isize = xcalloc(n, 8); + isize = calloc(n, 8); for (i = k = 0; i < n; i += 2) { bsw2hit_t *t[2]; int l; @@ -116,7 +119,7 @@ void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const b if (end > l_pac) end = l_pac; if (end - beg < l_mseq) return; // generate the sequence - seq = xmalloc(l_mseq + (end - beg)); + seq = malloc(l_mseq + (end - beg)); ref = seq + l_mseq; for (k = beg; k < end; ++k) ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; @@ -195,7 +198,7 @@ void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, b a[which].flag |= BSW2_FLAG_RESCUED; if (p[1]->max == 0) { p[1]->max = 1; - p[1]->hits = xmalloc(sizeof(bsw2hit_t)); + p[1]->hits = malloc(sizeof(bsw2hit_t)); } p[1]->hits[0] = a[which]; p[1]->n = 1; diff --git a/example.c b/example.c index aafe5d4..a6c9bdd 100644 --- a/example.c +++ b/example.c @@ -7,6 +7,10 @@ #include "kseq.h" // for the FASTA/Q parser KSEQ_DECLARE(gzFile) +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + int main(int argc, char *argv[]) { bwaidx_t *idx; diff --git a/is.c b/is.c index 8e94abd..1891668 100644 --- a/is.c +++ b/is.c @@ -27,6 +27,10 @@ #include #include "utils.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + typedef unsigned char ubyte_t; #define chr(i) (cs == sizeof(int) ? 
((const int *)T)[i]:((const unsigned char *)T)[i]) @@ -205,7 +209,7 @@ int is_sa(const ubyte_t *T, int *SA, int n) int is_bwt(ubyte_t *T, int n) { int *SA, i, primary = 0; - SA = (int*)xcalloc(n+1, sizeof(int)); + SA = (int*)calloc(n+1, sizeof(int)); if (is_sa(T, SA, n)) err_fatal_simple("is_sa failed"); diff --git a/kbtree.h b/kbtree.h index bab4f0a..2b76953 100644 --- a/kbtree.h +++ b/kbtree.h @@ -31,7 +31,10 @@ #include #include #include -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif typedef struct { int32_t is_internal:1, n:31; @@ -52,7 +55,7 @@ typedef struct { kbtree_##name##_t *kb_init_##name(int size) \ { \ kbtree_##name##_t *b; \ - b = (kbtree_##name##_t*)xcalloc(1, sizeof(kbtree_##name##_t)); \ + b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ if (b->t < 2) { \ free(b); return 0; \ @@ -61,7 +64,7 @@ typedef struct { b->off_ptr = 4 + b->n * sizeof(key_t); \ b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ b->elen = (b->off_ptr + 3) >> 2 << 2; \ - b->root = (kbnode_t*)xcalloc(1, b->ilen); \ + b->root = (kbnode_t*)calloc(1, b->ilen); \ ++b->n_nodes; \ return b; \ } @@ -70,7 +73,7 @@ typedef struct { int i, max = 8; \ kbnode_t *x, **top, **stack = 0; \ if (b) { \ - top = stack = (kbnode_t**)xcalloc(max, sizeof(kbnode_t*)); \ + top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ *top++ = (b)->root; \ while (top != stack) { \ x = *--top; \ @@ -79,7 +82,7 @@ typedef struct { if (__KB_PTR(b, x)[i]) { \ if (top - stack == max) { \ max <<= 1; \ - stack = (kbnode_t**)xrealloc(stack, max * sizeof(kbnode_t*)); \ + stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ top = stack + (max>>1); \ } \ *top++ = __KB_PTR(b, x)[i]; \ @@ -173,7 +176,7 @@ typedef struct { static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ { \ kbnode_t *z; \ - z = 
(kbnode_t*)xcalloc(1, y->is_internal? b->ilen : b->elen); \ + z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \ ++b->n_nodes; \ z->is_internal = y->is_internal; \ z->n = b->t - 1; \ @@ -211,7 +214,7 @@ typedef struct { r = b->root; \ if (r->n == 2 * b->t - 1) { \ ++b->n_nodes; \ - s = (kbnode_t*)xcalloc(1, b->ilen); \ + s = (kbnode_t*)calloc(1, b->ilen); \ b->root = s; s->is_internal = 1; s->n = 0; \ __KB_PTR(b, s)[0] = r; \ __kb_split_##name(b, s, 0, r); \ @@ -333,13 +336,13 @@ typedef struct { #define __kb_traverse(key_t, b, __func) do { \ int __kmax = 8; \ __kbstack_t *__kstack, *__kp; \ - __kp = __kstack = (__kbstack_t*)xcalloc(__kmax, sizeof(__kbstack_t)); \ + __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ __kp->x = (b)->root; __kp->i = 0; \ for (;;) { \ while (__kp->x && __kp->i <= __kp->x->n) { \ if (__kp - __kstack == __kmax - 1) { \ __kmax <<= 1; \ - __kstack = (__kbstack_t*)xrealloc(__kstack, __kmax * sizeof(__kbstack_t)); \ + __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ __kp = __kstack + (__kmax>>1) - 1; \ } \ (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? 
__KB_PTR(b, __kp->x)[__kp->i] : 0; \ diff --git a/khash.h b/khash.h index e206d35..12e5542 100644 --- a/khash.h +++ b/khash.h @@ -115,7 +115,10 @@ int main() { #include #include #include -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif /* compipler specific configuration */ @@ -161,13 +164,13 @@ typedef khint_t khiter_t; #endif #ifndef kcalloc -#define kcalloc(N,Z) xcalloc(N,Z) +#define kcalloc(N,Z) calloc(N,Z) #endif #ifndef kmalloc -#define kmalloc(Z) xmalloc(Z) +#define kmalloc(Z) malloc(Z) #endif #ifndef krealloc -#define krealloc(P,Z) xrealloc(P,Z) +#define krealloc(P,Z) realloc(P,Z) #endif #ifndef kfree #define kfree(P) free(P) diff --git a/kopen.c b/kopen.c index 82f2812..d238226 100644 --- a/kopen.c +++ b/kopen.c @@ -14,7 +14,9 @@ #include #endif -#include "utils.h" +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif #ifdef _WIN32 #define _KO_NO_NET @@ -82,7 +84,7 @@ static int http_open(const char *fn) // set ->http_host for (p = (char*)fn + 7; *p && *p != '/'; ++p); l = p - fn - 7; - http_host = xcalloc(l + 1, 1); + http_host = calloc(l + 1, 1); strncpy(http_host, fn + 7, l); http_host[l] = 0; for (q = http_host; *q && *q != ':'; ++q); @@ -91,21 +93,21 @@ static int http_open(const char *fn) proxy = getenv("http_proxy"); // set host, port and path if (proxy == 0) { - host = xstrdup(http_host); // when there is no proxy, server name is identical to http_host name. - port = xstrdup(*q? q : "80"); - path = xstrdup(*p? p : "/"); + host = strdup(http_host); // when there is no proxy, server name is identical to http_host name. + port = strdup(*q? q : "80"); + path = strdup(*p? p : "/"); } else { - host = (strstr(proxy, "http://") == proxy)? xstrdup(proxy + 7) : xstrdup(proxy); + host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); for (q = host; *q && *q != ':'; ++q); if (*q == ':') *q++ = 0; - port = xstrdup(*q? q : "80"); - path = xstrdup(fn); + port = strdup(*q? 
q : "80"); + path = strdup(fn); } /* connect; adapted from khttp_connect() in knetfile.c */ l = 0; fd = socket_connect(host, port); - buf = xcalloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. + buf = calloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. l += snprintf(buf + l, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", path, http_host); if (write_bytes(fd, buf, l) != 0) { @@ -152,7 +154,7 @@ static int kftp_get_response(ftpaux_t *aux) while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O if (n >= aux->max_response) { aux->max_response = aux->max_response? aux->max_response<<1 : 256; - aux->response = xrealloc(aux->response, aux->max_response); + aux->response = realloc(aux->response, aux->max_response); } aux->response[n++] = c; if (c == '\n') { @@ -186,10 +188,10 @@ static int ftp_open(const char *fn) for (p = (char*)fn + 6; *p && *p != '/'; ++p); if (*p != '/') return 0; l = p - fn - 6; - port = xstrdup("21"); - host = xcalloc(l + 1, 1); + port = strdup("21"); + host = calloc(l + 1, 1); strncpy(host, fn + 6, l); - retr = xcalloc(strlen(p) + 8, 1); + retr = calloc(strlen(p) + 8, 1); sprintf(retr, "RETR %s\r\n", p); /* connect to ctrl */ @@ -241,8 +243,8 @@ static char **cmd2argv(const char *cmd) for (i = beg + 1, argc = 0; i < end; ++i) if (isspace(cmd[i]) && !isspace(cmd[i-1])) ++argc; - argv = (char**)xcalloc(argc + 2, sizeof(void*)); - argv[0] = str = (char*)xcalloc(end - beg + 1, 1); + argv = (char**)calloc(argc + 2, sizeof(void*)); + argv[0] = str = (char*)calloc(end - beg + 1, 1); strncpy(argv[0], cmd + beg, end - beg); for (i = argc = 1; i < end - beg; ++i) if (isspace(str[i])) str[i] = 0; @@ -266,15 +268,15 @@ void *kopen(const char *fn, int *_fd) koaux_t *aux = 0; *_fd = -1; if (strstr(fn, "http://") == fn) { - aux = xcalloc(1, sizeof(koaux_t)); + aux = calloc(1, sizeof(koaux_t)); aux->type = KO_HTTP; aux->fd = http_open(fn); } else if (strstr(fn, 
"ftp://") == fn) { - aux = xcalloc(1, sizeof(koaux_t)); + aux = calloc(1, sizeof(koaux_t)); aux->type = KO_FTP; aux->fd = ftp_open(fn); } else if (strcmp(fn, "-") == 0) { - aux = xcalloc(1, sizeof(koaux_t)); + aux = calloc(1, sizeof(koaux_t)); aux->type = KO_STDIN; aux->fd = STDIN_FILENO; } else { @@ -308,7 +310,7 @@ void *kopen(const char *fn, int *_fd) exit(1); } else { /* parent process */ close(pfd[1]); - aux = xcalloc(1, sizeof(koaux_t)); + aux = calloc(1, sizeof(koaux_t)); aux->type = KO_PIPE; aux->fd = pfd[0]; aux->pid = pid; @@ -320,7 +322,7 @@ void *kopen(const char *fn, int *_fd) *_fd = open(fn, O_RDONLY); #endif if (*_fd >= 0) { - aux = xcalloc(1, sizeof(koaux_t)); + aux = calloc(1, sizeof(koaux_t)); aux->type = KO_FILE; aux->fd = *_fd; } diff --git a/kseq.h b/kseq.h index 55405a8..642cd33 100644 --- a/kseq.h +++ b/kseq.h @@ -31,7 +31,10 @@ #include #include #include -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r #define KS_SEP_TAB 1 // isspace() && !' 
' @@ -51,9 +54,9 @@ #define __KS_BASIC(type_t, __bufsize) \ static inline kstream_t *ks_init(type_t f) \ { \ - kstream_t *ks = (kstream_t*)xcalloc(1, sizeof(kstream_t)); \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; \ - ks->buf = (unsigned char*)xmalloc(__bufsize); \ + ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ @@ -121,7 +124,7 @@ typedef struct __kstring_t { if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ - str->s = (char*)xrealloc(str->s, str->m); \ + str->s = (char*)realloc(str->s, str->m); \ } \ memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ str->l = str->l + (i - ks->begin); \ @@ -133,7 +136,7 @@ typedef struct __kstring_t { } \ if (str->s == 0) { \ str->m = 1; \ - str->s = (char*)xcalloc(1, 1); \ + str->s = (char*)calloc(1, 1); \ } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ @@ -152,7 +155,7 @@ typedef struct __kstring_t { #define __KSEQ_BASIC(SCOPE, type_t) \ SCOPE kseq_t *kseq_init(type_t fd) \ { \ - kseq_t *s = (kseq_t*)xcalloc(1, sizeof(kseq_t)); \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ @@ -184,7 +187,7 @@ typedef struct __kstring_t { if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ - seq->seq.s = (char*)xmalloc(seq->seq.m); \ + seq->seq.s = (char*)malloc(seq->seq.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ @@ -195,13 +198,13 @@ typedef struct __kstring_t { if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* 
rounded to the next closest 2^k */ \ - seq->seq.s = (char*)xrealloc(seq->seq.s, seq->seq.m); \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ if (c != '+') return seq->seq.l; /* FASTA */ \ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)xrealloc(seq->qual.s, seq->qual.m); \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ diff --git a/ksort.h b/ksort.h index 9f334e2..5851b0d 100644 --- a/ksort.h +++ b/ksort.h @@ -57,7 +57,10 @@ #include #include -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif typedef struct { void *left, *right; @@ -73,7 +76,7 @@ typedef struct { int curr, shift; \ \ a2[0] = array; \ - a2[1] = temp? temp : (type_t*)xmalloc(sizeof(type_t) * n); \ + a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \ for (curr = 0, shift = 0; (1ul< #include #include "kstring.h" -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif int ksprintf(kstring_t *s, const char *fmt, ...) { @@ -13,7 +16,7 @@ int ksprintf(kstring_t *s, const char *fmt, ...) if (l + 1 > s->m - s->l) { s->m = s->l + l + 2; kroundup32(s->m); - s->s = (char*)xrealloc(s->s, s->m); + s->s = (char*)realloc(s->s, s->m); va_start(ap, fmt); l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); } @@ -27,7 +30,7 @@ int ksprintf(kstring_t *s, const char *fmt, ...) 
int main() { kstring_t *s; - s = (kstring_t*)xcalloc(1, sizeof(kstring_t)); + s = (kstring_t*)calloc(1, sizeof(kstring_t)); ksprintf(s, "abcdefg: %d", 100); printf("%s\n", s->s); free(s); diff --git a/kstring.h b/kstring.h index 04194ec..fe7fa95 100644 --- a/kstring.h +++ b/kstring.h @@ -3,7 +3,10 @@ #include #include -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif #ifndef kroundup32 #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) @@ -22,7 +25,7 @@ static inline void ks_resize(kstring_t *s, size_t size) if (s->m < size) { s->m = size; kroundup32(s->m); - s->s = (char*)xrealloc(s->s, s->m); + s->s = (char*)realloc(s->s, s->m); } } @@ -31,7 +34,7 @@ static inline int kputsn(const char *p, int l, kstring_t *s) if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); - s->s = (char*)xrealloc(s->s, s->m); + s->s = (char*)realloc(s->s, s->m); } memcpy(s->s + s->l, p, l); s->l += l; @@ -49,7 +52,7 @@ static inline int kputc(int c, kstring_t *s) if (s->l + 1 >= s->m) { s->m = s->l + 2; kroundup32(s->m); - s->s = (char*)xrealloc(s->s, s->m); + s->s = (char*)realloc(s->s, s->m); } s->s[s->l++] = c; s->s[s->l] = 0; @@ -66,7 +69,7 @@ static inline int kputw(int c, kstring_t *s) if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); - s->s = (char*)xrealloc(s->s, s->m); + s->s = (char*)realloc(s->s, s->m); } for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; s->s[s->l] = 0; @@ -83,7 +86,7 @@ static inline int kputuw(unsigned c, kstring_t *s) if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); - s->s = (char*)xrealloc(s->s, s->m); + s->s = (char*)realloc(s->s, s->m); } for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; s->s[s->l] = 0; @@ -100,7 +103,7 @@ static inline int kputl(long c, kstring_t *s) if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); - s->s = (char*)xrealloc(s->s, s->m); + s->s = (char*)realloc(s->s, s->m); } for (x = 
l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; s->s[s->l] = 0; diff --git a/ksw.c b/ksw.c index c1ff5a6..454c49d 100644 --- a/ksw.c +++ b/ksw.c @@ -27,7 +27,10 @@ #include #include #include "ksw.h" -#include "utils.h" + +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif #ifdef __GNUC__ #define LIKELY(x) __builtin_expect((x),1) @@ -64,7 +67,7 @@ kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t size = size > 1? 2 : 1; p = 8 * (3 - size); // # values per __m128i slen = (qlen + p - 1) / p; // segmented length - q = (kswq_t*)xmalloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory + q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory q->H0 = q->qp + slen * m; q->H1 = q->H0 + slen; @@ -186,7 +189,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append if (n_b == m_b) { m_b = m_b? m_b<<1 : 8; - b = (uint64_t*)xrealloc(b, 8 * m_b); + b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -289,7 +292,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { if (n_b == m_b) { m_b = m_b? 
m_b<<1 : 8; - b = (uint64_t*)xrealloc(b, 8 * m_b); + b = (uint64_t*)realloc(b, 8 * m_b); } b[n_b++] = (uint64_t)imax<<32 | i; } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last @@ -369,8 +372,8 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off; if (h0 < 0) h0 = 0; // allocate memory - qp = xmalloc(qlen * m); - eh = xcalloc(qlen + 1, 8); + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -461,7 +464,7 @@ static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { if (*n_cigar == *m_cigar) { *m_cigar = *m_cigar? (*m_cigar)<<1 : 4; - cigar = xrealloc(cigar, (*m_cigar) << 2); + cigar = realloc(cigar, (*m_cigar) << 2); } cigar[(*n_cigar)++] = len<<4 | op; } else cigar[(*n_cigar)-1] += len<<4; @@ -477,9 +480,9 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, if (n_cigar_) *n_cigar_ = 0; // allocate memory n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix - z = xmalloc(n_col * tlen); - qp = xmalloc(qlen * m); - eh = xcalloc(qlen + 1, 8); + z = malloc(n_col * tlen); + qp = malloc(qlen * m); + eh = calloc(qlen + 1, 8); // generate the query profile for (k = i = 0; k < m; ++k) { const int8_t *p = &mat[k * m]; @@ -620,7 +623,7 @@ int main(int argc, char *argv[]) if (!forward_only) { // reverse if ((int)ksq->seq.m > max_rseq) { max_rseq = ksq->seq.m; - rseq = (uint8_t*)xrealloc(rseq, max_rseq); + rseq = (uint8_t*)realloc(rseq, max_rseq); } for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) rseq[j] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; diff --git a/ksw.h b/ksw.h index 97559fd..f9d22c6 100644 --- a/ksw.h +++ b/ksw.h @@ -3,6 +3,10 @@ #include +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + #define KSW_XBYTE 0x10000 #define KSW_XSTOP 0x20000 #define KSW_XSUBO 0x40000 diff --git a/kvec.h b/kvec.h index 7aaf4a5..83ad483 100644 --- a/kvec.h +++ b/kvec.h @@ -50,6 +50,10 @@ int main() { #include +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #define kvec_t(type) struct { size_t n, m; type *a; } @@ -60,7 +64,7 @@ int main() { #define kv_size(v) ((v).n) #define kv_max(v) ((v).m) -#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m)) +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) #define kv_copy(type, v1, v0) do { \ if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ @@ -71,19 +75,19 @@ int main() { #define kv_push(type, v, x) do { \ if ((v).n == (v).m) { \ (v).m = (v).m? (v).m<<1 : 2; \ - (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m); \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ } \ (v).a[(v).n++] = (x); \ } while (0) #define kv_pushp(type, v) ((((v).n == (v).m)? \ ((v).m = ((v).m? (v).m<<1 : 2), \ - (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ : 0), &(v).a[(v).n++]) #define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ - (v).a = (type*)xrealloc((v).a, sizeof(type) * (v).m), 0) \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ : (v).n <= (size_t)(i)? 
(v).n = (i) + 1 \ : 0), (v).a[(i)]) diff --git a/malloc_wrap.c b/malloc_wrap.c new file mode 100644 index 0000000..100b8cb --- /dev/null +++ b/malloc_wrap.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#ifdef USE_MALLOC_WRAPPERS +/* Don't wrap ourselves */ +# undef USE_MALLOC_WRAPPERS +#endif +#include "malloc_wrap.h" + +void *wrap_calloc(size_t nmemb, size_t size, + const char *file, unsigned int line, const char *func) { + void *p = calloc(nmemb, size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, nmemb * size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +void *wrap_malloc(size_t size, + const char *file, unsigned int line, const char *func) { + void *p = malloc(size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +void *wrap_realloc(void *ptr, size_t size, + const char *file, unsigned int line, const char *func) { + void *p = realloc(ptr, size); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, size, file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} + +char *wrap_strdup(const char *s, + const char *file, unsigned int line, const char *func) { + char *p = strdup(s); + if (NULL == p) { + fprintf(stderr, + "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", + func, strlen(s), file, line, strerror(errno)); + exit(EXIT_FAILURE); + } + return p; +} diff --git a/malloc_wrap.h b/malloc_wrap.h new file mode 100644 index 0000000..a55876a --- /dev/null +++ b/malloc_wrap.h @@ -0,0 +1,47 @@ +#ifndef MALLOC_WRAP_H +#define MALLOC_WRAP_H + +#include /* Avoid breaking the usual definitions */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + + void *wrap_calloc(size_t nmemb, size_t size, + const char *file, unsigned int line, const char *func); + void 
*wrap_malloc(size_t size, + const char *file, unsigned int line, const char *func); + void *wrap_realloc(void *ptr, size_t size, + const char *file, unsigned int line, const char *func); + char *wrap_strdup(const char *s, + const char *file, unsigned int line, const char *func); + +#ifdef __cplusplus +} +#endif + +#ifdef USE_MALLOC_WRAPPERS +# ifdef calloc +# undef calloc +# endif +# define calloc(n, s) wrap_calloc( (n), (s), __FILE__, __LINE__, __func__) + +# ifdef malloc +# undef malloc +# endif +# define malloc(s) wrap_malloc( (s), __FILE__, __LINE__, __func__) + +# ifdef realloc +# undef realloc +# endif +# define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__) + +# ifdef strdup +# undef strdup +# endif +# define strdup(s) wrap_strdup( (s), __FILE__, __LINE__, __func__) + +#endif /* USE_MALLOC_WRAPPERS */ + +#endif /* MALLOC_WRAP_H */ diff --git a/pemerge.c b/pemerge.c index b944819..725885f 100644 --- a/pemerge.c +++ b/pemerge.c @@ -12,6 +12,10 @@ #include "utils.h" KSEQ_DECLARE(gzFile) +#ifdef USE_MALLOC_WRAPPERS +# include "malloc_wrap.h" +#endif + #define MAX_SCORE_RATIO 0.9f #define MAX_ERR 8 @@ -40,7 +44,7 @@ typedef struct { pem_opt_t *pem_opt_init() { pem_opt_t *opt; - opt = xcalloc(1, sizeof(pem_opt_t)); + opt = calloc(1, sizeof(pem_opt_t)); opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; opt->T = opt->a * 10; opt->q_def = 20; @@ -58,8 +62,8 @@ int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) int i, xtra, l, l_seq, sum_q, ret = 0; kswr_t r; - s[0] = xmalloc(x[0].l_seq); q[0] = xmalloc(x[0].l_seq); - s[1] = xmalloc(x[1].l_seq); q[1] = xmalloc(x[1].l_seq); + s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq); + s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq); for (i = 0; i < x[0].l_seq; ++i) { int c = x[0].seq[i]; s[0][i] = c < 0 || c > 127? 4 : c <= 4? 
c : nst_nt4_table[c]; @@ -103,8 +107,8 @@ int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) l = x[0].l_seq - (r.tb - r.qb); // length to merge l_seq = x[0].l_seq + x[1].l_seq - l; - seq = xmalloc(l_seq + 1); - qual = xmalloc(l_seq + 1); + seq = malloc(l_seq + 1); + qual = malloc(l_seq + 1); memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); for (i = 0, sum_q = 0; i < l; ++i) { @@ -174,7 +178,7 @@ static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cn int i, j, n = n_>>1<<1; worker_t *w; - w = xcalloc(opt->n_threads, sizeof(worker_t)); + w = calloc(opt->n_threads, sizeof(worker_t)); for (i = 0; i < opt->n_threads; ++i) { worker_t *p = &w[i]; p->start = i; p->n = n; @@ -185,7 +189,7 @@ static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cn worker(w); } else { pthread_t *tid; - tid = (pthread_t*)xcalloc(opt->n_threads, sizeof(pthread_t)); + tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); free(tid); diff --git a/utils.c b/utils.c index ad2f734..00be7f0 100644 --- a/utils.c +++ b/utils.c @@ -264,47 +264,6 @@ int err_gzclose(gzFile file) return ret; } -void *err_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func) -{ - void *p = calloc(nmemb, size); - if (NULL == p) - { - err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", nmemb * size, file, line, strerror(errno)); - } - return p; -} - -void *err_malloc(size_t size, const char *file, unsigned int line, const char *func) -{ - void *p = malloc(size); - if (NULL == p) - { - err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", size, file, line, strerror(errno)); - } - return p; -} - -void *err_realloc(void *ptr, size_t size, const char 
*file, unsigned int line, const char *func) -{ - void *p = realloc(ptr, size); - if (NULL == p) - { - err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", size, file, line, strerror(errno)); - } - return p; -} - -char *err_strdup(const char *s, const char *file, unsigned int line, const char *func) -{ - char *p = strdup(s); - - if (NULL == p) - { - err_fatal(func, "Failed to allocate %zd bytes at %s line %u: %s\n", strlen(s), file, line, strerror(errno)); - } - return p; -} - /********* * Timer * *********/ diff --git a/utils.h b/utils.h index 8567d3f..5ef6ac4 100644 --- a/utils.h +++ b/utils.h @@ -48,12 +48,6 @@ #define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg) -#define xcalloc(n, s) err_calloc( (n), (s), __FILE__, __LINE__, __func__) -#define xmalloc(s) err_malloc( (s), __FILE__, __LINE__, __func__) -#define xrealloc(p, s) err_realloc((p), (s), __FILE__, __LINE__, __func__) -#define xstrdup(s) err_strdup( (s), __FILE__, __LINE__, __func__) - - typedef struct { uint64_t x, y; } pair64_t; @@ -91,11 +85,6 @@ extern "C" { int err_fclose(FILE *stream); int err_gzclose(gzFile file); - void *err_calloc(size_t nmemb, size_t size, const char *file, unsigned int line, const char *func); - void *err_malloc(size_t size, const char *file, unsigned int line, const char *func); - void *err_realloc(void *ptr, size_t size, const char *file, unsigned int line, const char *func); - char *err_strdup(const char *s, const char *file, unsigned int line, const char *func); - double cputime(); double realtime(); From 5c43a1fdc9dbdf141200c1b2353d5c3502426433 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 3 May 2013 11:38:48 +0100 Subject: [PATCH 403/498] Removed more dependencies on utils.h bamlite.c now includes some wrappers for gzopen/gzread/gzclose that print messages when errors occur. They do not attempt to quit the program but pass on the return code. bwaseqio.c now checks the return codes from bam_open, bam_close and bam_read1. 
Code in bwt_gen.c now checks for IO errors itself instead of using the wrappers. A benefit of this is it can now say which file had a problem. Removed call to err_fatal_simple in is_bwt and unnecessary inclusion of malloc_wrap.h in ksw.h. --- Makefile | 10 +++--- bamlite.c | 46 ++++++++++++++++++++++++++++ bamlite.h | 25 ++++++++++++--- bwaseqio.c | 10 ++++-- bwt_gen.c | 90 +++++++++++++++++++++++++++++++++++++++++++----------- is.c | 3 +- ksw.h | 4 --- 7 files changed, 152 insertions(+), 36 deletions(-) diff --git a/Makefile b/Makefile index 7d78889..d39a787 100644 --- a/Makefile +++ b/Makefile @@ -38,9 +38,9 @@ depend: # DO NOT DELETE THIS LINE -- make depend depends on it. QSufSort.o: QSufSort.h -bamlite.o: bamlite.h utils.h malloc_wrap.h +bamlite.o: bamlite.h malloc_wrap.h bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h -bwa.o: bntseq.h bwa.h bwt.h ksw.h malloc_wrap.h utils.h kseq.h +bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h malloc_wrap.h kseq.h bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h bwamem.o: ksort.h utils.h kbtree.h bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h @@ -51,7 +51,7 @@ bwase.o: bwase.h bntseq.h bwt.h bwtaln.h utils.h kstring.h malloc_wrap.h bwase.o: bwa.h ksw.h bwaseqio.o: bwtaln.h bwt.h utils.h bamlite.h malloc_wrap.h kseq.h bwt.o: utils.h bwt.h kvec.h malloc_wrap.h -bwt_gen.o: QSufSort.h utils.h malloc_wrap.h +bwt_gen.o: QSufSort.h malloc_wrap.h bwt_lite.o: bwt_lite.h malloc_wrap.h bwtaln.o: bwtaln.h bwt.h bwtgap.h utils.h bwa.h bntseq.h malloc_wrap.h bwtgap.o: bwtgap.h bwt.h bwtaln.h malloc_wrap.h @@ -66,11 +66,11 @@ bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h bwtsw2_pair.o: malloc_wrap.h ksw.h example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h -is.o: utils.h malloc_wrap.h +is.o: malloc_wrap.h kopen.o: malloc_wrap.h kstring.o: kstring.h malloc_wrap.h ksw.o: ksw.h 
malloc_wrap.h main.o: utils.h malloc_wrap.o: malloc_wrap.h -pemerge.o: ksw.h malloc_wrap.h kseq.h kstring.h bwa.h bntseq.h bwt.h utils.h +pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h utils.o: utils.h ksort.h malloc_wrap.h kseq.h diff --git a/bamlite.c b/bamlite.c index 851cb6f..3704beb 100644 --- a/bamlite.c +++ b/bamlite.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "bamlite.h" #ifdef USE_MALLOC_WRAPPERS @@ -162,3 +163,48 @@ int bam_read1(bamFile fp, bam1_t *b) if (bam_is_be) swap_endian_data(c, b->data_len, b->data); return 4 + block_len; } + + +#ifdef USE_VERBOSE_ZLIB_WRAPPERS +// Versions of gzopen, gzread and gzclose that print up error messages + +gzFile bamlite_gzopen(const char *fn, const char *mode) { + gzFile fp; + if (strcmp(fn, "-") == 0) { + fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); + if (!fp) { + fprintf(stderr, "Couldn't open %s : %s", + (strstr(mode, "r"))? "stdin" : "stdout", + strerror(errno)); + } + return fp; + } + if ((fp = gzopen(fn, mode)) == 0) { + fprintf(stderr, "Couldn't open %s : %s\n", fn, + errno ? strerror(errno) : "Out of memory"); + } + return fp; +} + +int bamlite_gzread(gzFile file, void *ptr, unsigned int len) { + int ret = gzread(file, ptr, len); + + if (ret < 0) { + int errnum = 0; + const char *msg = gzerror(file, &errnum); + fprintf(stderr, "gzread error: %s\n", + Z_ERRNO == errnum ? strerror(errno) : msg); + } + return ret; +} + +int bamlite_gzclose(gzFile file) { + int ret = gzclose(file); + if (Z_OK != ret) { + fprintf(stderr, "gzclose error: %s\n", + Z_ERRNO == ret ? 
strerror(errno) : zError(ret)); + } + + return ret; +} +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ diff --git a/bamlite.h b/bamlite.h index 640e863..efab7ac 100644 --- a/bamlite.h +++ b/bamlite.h @@ -3,17 +3,26 @@ #include #include -#include "utils.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" #endif +#define USE_VERBOSE_ZLIB_WRAPPERS + typedef gzFile bamFile; -#define bam_open(fn, mode) xzopen(fn, mode) -#define bam_dopen(fd, mode) gzdopen(fd, mode) -#define bam_close(fp) err_gzclose(fp) -#define bam_read(fp, buf, size) err_gzread(fp, buf, size) +#ifdef USE_VERBOSE_ZLIB_WRAPPERS +/* These print error messages on failure */ +# define bam_open(fn, mode) bamlite_gzopen(fn, mode) +# define bam_dopen(fd, mode) gzdopen(fd, mode) +# define bam_close(fp) bamlite_gzclose(fp) +# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size) +#else +# define bam_open(fn, mode) gzopen(fn, mode) +# define bam_dopen(fd, mode) gzdopen(fd, mode) +# define bam_close(fp) gzclose(fp) +# define bam_read(fp, buf, size) gzread(fp, buf, size) +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ typedef struct { int32_t n_targets; @@ -92,6 +101,12 @@ extern "C" { bam_header_t *bam_header_read(bamFile fp); int bam_read1(bamFile fp, bam1_t *b); +#ifdef USE_VERBOSE_ZLIB_WRAPPERS + gzFile bamlite_gzopen(const char *fn, const char *mode); + int bamlite_gzread(gzFile file, void *ptr, unsigned int len); + int bamlite_gzclose(gzFile file); +#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ + #ifdef __cplusplus } #endif diff --git a/bwaseqio.c b/bwaseqio.c index d157945..d850307 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -30,6 +30,7 @@ bwa_seqio_t *bwa_bam_open(const char *fn, int which) bs->is_bam = 1; bs->which = which; bs->fp = bam_open(fn, "r"); + if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); h = bam_header_read(bs->fp); bam_header_destroy(h); return bs; @@ -48,8 +49,9 @@ bwa_seqio_t *bwa_seq_open(const char *fn) void bwa_seq_close(bwa_seqio_t *bs) { if (bs == 0) return; - if (bs->is_bam) 
bam_close(bs->fp); - else { + if (bs->is_bam) { + if (0 != bam_close(bs->fp)) err_fatal_simple("Error closing bam file"); + } else { err_gzclose(bs->ks->f->f); kseq_destroy(bs->ks); } @@ -94,11 +96,12 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com int n_seqs, l, i; long n_trimmed = 0, n_tot = 0; bam1_t *b; + int res; b = bam_init1(); n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); - while (bam_read1(bs->fp, b) >= 0) { + while ((res = bam_read1(bs->fp, b)) >= 0) { uint8_t *s, *q; int go = 0; if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; @@ -130,6 +133,7 @@ static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_com p->name = strdup((const char*)bam1_qname(b)); if (n_seqs == n_needed) break; } + if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); diff --git a/bwt_gen.c b/bwt_gen.c index d68b30f..6139d80 100644 --- a/bwt_gen.c +++ b/bwt_gen.c @@ -27,8 +27,8 @@ #include #include #include +#include #include "QSufSort.h" -#include "utils.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -1448,13 +1448,29 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB packedFile = (FILE*)fopen(inputFileName, "rb"); if (packedFile == NULL) { - fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open inputFileName!\n"); + fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open %s : %s\n", + inputFileName, strerror(errno)); exit(1); } - err_fseek(packedFile, -1, SEEK_END); - packedFileLen = err_ftell(packedFile); - err_fread_noeof(&lastByteLength, sizeof(unsigned char), 1, packedFile); + if (fseek(packedFile, -1, SEEK_END) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + packedFileLen = ftell(packedFile); + if 
(packedFileLen == -1) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't ftell on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(&lastByteLength, sizeof(unsigned char), 1, packedFile) != 1) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? strerror(errno) : "Unexpected end of file"); + exit(1); + } totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); @@ -1468,10 +1484,23 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB } textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte - err_fseek(packedFile, -2, SEEK_CUR); - err_fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); - err_fread_noeof(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile); - err_fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR); + if (fseek(packedFile, -((long)textSizeInByte + 2), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile) != textSizeInByte + 1) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? 
strerror(errno) : "Unexpected end of file"); + exit(1); + } + if (fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); @@ -1484,9 +1513,23 @@ BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxB textToLoad = totalTextLength - processedTextLength; } textSizeInByte = textToLoad / CHAR_PER_BYTE; - err_fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); - err_fread_noeof(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile); - err_fseek(packedFile, -((long)textSizeInByte), SEEK_CUR); + if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } + if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile) != textSizeInByte) { + fprintf(stderr, + "BWTIncConstructFromPacked() : Can't read from %s : %s\n", + inputFileName, + ferror(packedFile)? 
strerror(errno) : "Unexpected end of file"); + exit(1); + } + if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { + fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", + inputFileName, strerror(errno)); + exit(1); + } ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); BWTIncConstruct(bwtInc, textToLoad); processedTextLength += textToLoad; @@ -1531,15 +1574,28 @@ void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *o bwtFile = (FILE*)fopen(bwtFileName, "wb"); if (bwtFile == NULL) { - fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Cannot open BWT code file!\n"); + fprintf(stderr, + "BWTSaveBwtCodeAndOcc(): Cannot open %s for writing: %s\n", + bwtFileName, strerror(errno)); exit(1); } - err_fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile); - err_fwrite(bwt->cumulativeFreq + 1, sizeof(bgint_t), ALPHABET_SIZE, bwtFile); bwtLength = BWTFileSizeInWord(bwt->textLength); - err_fwrite(bwt->bwtCode, sizeof(unsigned int), bwtLength, bwtFile); - err_fclose(bwtFile); + + if (fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile) != 1 + || fwrite(bwt->cumulativeFreq + 1, + sizeof(bgint_t), ALPHABET_SIZE, bwtFile) != ALPHABET_SIZE + || fwrite(bwt->bwtCode, + sizeof(unsigned int), bwtLength, bwtFile) != bwtLength) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error writing to %s : %s\n", + bwtFileName, strerror(errno)); + exit(1); + } + if (fclose(bwtFile) != 0) { + fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error on closing %s : %s\n", + bwtFileName, strerror(errno)); + exit(1); + } } void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) diff --git a/is.c b/is.c index 1891668..46f1772 100644 --- a/is.c +++ b/is.c @@ -25,7 +25,6 @@ */ #include -#include "utils.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -211,7 +210,7 @@ int is_bwt(ubyte_t *T, int n) int *SA, i, primary = 0; SA = (int*)calloc(n+1, sizeof(int)); - if (is_sa(T, SA, n)) err_fatal_simple("is_sa 
failed"); + if (is_sa(T, SA, n)) return -1; for (i = 0; i <= n; ++i) { if (SA[i] == 0) primary = i; diff --git a/ksw.h b/ksw.h index f9d22c6..97559fd 100644 --- a/ksw.h +++ b/ksw.h @@ -3,10 +3,6 @@ #include -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - #define KSW_XBYTE 0x10000 #define KSW_XSTOP 0x20000 #define KSW_XSUBO 0x40000 From 9a6abe51b67e7bdb9eaf429d5a4ed2d74ad45099 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 May 2013 18:57:51 -0400 Subject: [PATCH 404/498] r391: better method to resolve xref alignment The old method does not work when the alignment bridges three chr. This may actually happen often. The new method does not work all the time, either, but should be better than the old one. It is also simpler, arguably. --- Makefile | 6 +++--- bwa.c | 56 +++++++++++++++++++++++++------------------------------- bwamem.c | 4 ---- main.c | 2 +- 4 files changed, 29 insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index d39a787..85bb185 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,13 @@ CC= gcc CFLAGS= -g -Wall -O2 -WRAP_MALLOC= -DUSE_MALLOC_WRAPPERS +WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o +LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ - bwtsw2_chain.o fastmap.o bwtsw2_pair.o malloc_wrap.o + bwtsw2_chain.o fastmap.o bwtsw2_pair.o PROG= bwa INCLUDES= LIBS= -lm -lz -lpthread diff --git a/bwa.c b/bwa.c index a20c027..35d3e95 100644 --- a/bwa.c +++ b/bwa.c @@ -146,57 +146,51 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, 
int64_t *rb, int64_t *re) { - int ib, ie, is_rev; - int64_t fb, fe, mid = -1; + int is_rev; + int64_t cb, ce, fm; + bntann1_t *ra; if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here *qb = *qe = *rb = *re = -1; return -1; // unable to fix - } else { - fb = bns_depos(bns, *rb < bns->l_pac? *rb : *re - 1, &is_rev); - ib = bns_pos2rid(bns, fb); - if (fb - bns->anns[ib].offset + (*re - *rb) <= bns->anns[ib].len) return 0; // no need to fix - fe = bns_depos(bns, *re - 1 < bns->l_pac? *re - 1 : *rb, &is_rev); - ie = bns_pos2rid(bns, fe); - if (ie - ib > 1) { // bridge three or more references - *qb = *qe = *rb = *re = -1; - return -2; // unable to fix - } else { - int l = bns->anns[ib].offset + bns->anns[ib].len - fb; - mid = is_rev? *re - l : *rb + l; - } } - if (mid >= 0) { + fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand + ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point + cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand + ce = cb + ra->len; // chr end + if (cb > *rb || ce < *re) { // fix is needed int i, score, n_cigar, y, NM; uint32_t *cigar; int64_t x; + cb = cb > *rb? cb : *rb; + ce = ce < *re? 
ce : *re; cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { int op = cigar[i]&0xf, len = cigar[i]>>4; if (op == 0) { - if (x <= mid && mid < x + len) { - if (mid - *rb > *re - mid) { // the first part is longer - if (x == mid) { // need to check the previous operation - assert(i); // mid != *rb should always stand - if ((cigar[i-1]&0xf) == 1) *qe = y - (cigar[i-1]>>4), *re = x; - else if ((cigar[i-1]&0xf) == 2) *qe = y, *re = x - (cigar[i-1]>>4); - else abort(); // should not be here - } else *qe = y + (mid - x), *re = mid; - } else *qb = y + (mid - x), *rb = mid; + if (x <= cb && cb < x + len) + *qb = y + (cb - x), *rb = cb; + if (x < ce && ce <= x + len) { + *qe = y + (ce - x), *re = ce; break; } else x += len, y += len; - } else if (op == 1) { // insertion + } else if (op == 1) { y += len; - } else if (op == 2) { // deletion - if (x <= mid && mid < x + len) { - if (mid - *rb > *re - mid) *qe = y, *re = x; - else *qb = y, *rb = x + len; + } else if (op == 2) { + if (x <= cb && cb < x + len) + *qb = y, *rb = x + len; + if (x < ce && ce <= x + len) { + *qe = y, *re = x; break; } else x += len; } else abort(); // should not be here } free(cigar); + if (*qb == *qe || *rb == *re) { // TODO: this may happen in theory, but should be very very rare... + fprintf(stderr, "[E::%s] If you see this message, please let the developer know. 
Sorry.\n", __func__); + exit(1); + } } - return 1; + return 0; } /********************* diff --git a/bwamem.c b/bwamem.c index 779a221..5ee833b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -789,10 +789,6 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); - if (q->rid < 0) { // unfixable cross-reference alignment - --aa.n; - continue; - } q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; diff --git a/main.c b/main.c index cc1a4b4..96d8064 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r389-beta" +#define PACKAGE_VERSION "0.7.4-r391-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 9735d7a31ae6f4dce5073e2769e0cbe3c74f06c2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 May 2013 19:45:16 -0400 Subject: [PATCH 405/498] conform to the latest (unpublished) SAM spec for chimeric alignments --- Makefile | 1 + bwa.c | 6 +----- bwamem.c | 55 ++++++++++++++++++++++++++++++------------------------- bwamem.h | 1 - fastmap.c | 2 -- 5 files changed, 32 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 85bb185..69b2ccd 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ CC= gcc +#CC= clang --analyze CFLAGS= -g -Wall -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar diff --git a/bwa.c b/bwa.c index 35d3e95..f8949f7 100644 --- a/bwa.c +++ b/bwa.c @@ -185,12 +185,8 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, } else abort(); // should not be here } free(cigar); - if (*qb == *qe || *rb == *re) { // TODO: this may happen in theory, but should be very very rare... 
- fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Sorry.\n", __func__); - exit(1); - } } - return 0; + return (*qb == *qe || *rb == *re)? -2 : 0; } /********************* diff --git a/bwamem.c b/bwamem.c index 5ee833b..2fdfd46 100644 --- a/bwamem.c +++ b/bwamem.c @@ -671,7 +671,9 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m kputw(p->mapq, str); kputc('\t', str); // MAPQ if (p->n_cigar) { // aligned for (i = 0; i < p->n_cigar; ++i) { - kputw(p->cigar[i]>>4, str); kputc("MIDSH"[p->cigar[i]&0xf], str); + int c = p->cigar[i]&0xf; + if (c == 3 || c == 4) c = which? 4 : 3; // use hard clipping for supplementary alignments + kputw(p->cigar[i]>>4, str); kputc("MIDSH"[c], str); } } else kputc('*', str); // having a coordinate but unaligned (e.g. when copy_mate is true) } else kputsn("*\t0\t0\t*", 7, str); // without coordinte @@ -698,8 +700,8 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } else if (!p->is_rev) { // the forward strand int i, qb = 0, qe = s->l_seq; if (p->n_cigar) { - if ((p->cigar[0]&0xf) == 4) qb += p->cigar[0]>>4; - if ((p->cigar[p->n_cigar-1]&0xf) == 4) qe -= p->cigar[p->n_cigar-1]>>4; + if (which && ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3)) qb += p->cigar[0]>>4; + if (which && ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3)) qe -= p->cigar[p->n_cigar-1]>>4; } ks_resize(str, str->l + (qe - qb) + 1); for (i = qb; i < qe; ++i) str->s[str->l++] = "ACGTN"[(int)s->seq[i]]; @@ -730,23 +732,25 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - for (i = 0; i < n; ++i) - if (i != which && !(list[i].flag&0x20000)) break; // 0x20000: shadowed multi hit - if (i < n) { // there are 
other primary hits; output them - kputsn("\tXP:Z:", 6, str); - for (i = 0; i < n; ++i) { - const mem_aln_t *r = &list[i]; - int k; - if (i == which || (list[i].flag&0x20000)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit - kputs(bns->anns[r->rid].name, str); kputc(',', str); - kputc("+-"[r->is_rev], str); - kputl(r->pos+1, str); kputc(',', str); - for (k = 0; k < r->n_cigar; ++k) { - kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + if (!(p->flag & 0x20000)) { // not multi-hit + for (i = 0; i < n; ++i) + if (i != which && !(list[i].flag&0x20000)) break; // 0x20000: shadowed multi hit + if (i < n) { // there are other primary hits; output them + kputsn("\tSP:Z:", 6, str); + for (i = 0; i < n; ++i) { + const mem_aln_t *r = &list[i]; + int k; + if (i == which || (list[i].flag&0x20000)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit + kputs(bns->anns[r->rid].name, str); kputc(',', str); + kputl(r->pos+1, str); kputc(',', str); + kputc("+-"[r->is_rev], str); kputc(',', str); + for (k = 0; k < r->n_cigar; ++k) { + kputw(r->cigar[k]>>4, str); kputc("MIDSH"[r->cigar[k]&0xf], str); + } + kputc(',', str); kputw(r->mapq, str); + kputc(',', str); kputw(r->NM, str); + kputc(';', str); } - kputc(',', str); kputw(r->mapq, str); - kputc(',', str); kputw(r->NM, str); - kputc(';', str); } } if (s->comment) { kputc('\t', str); kputs(s->comment, str); } @@ -791,7 +795,8 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score - if ((opt->flag&MEM_F_NO_MULTI) && k && p->secondary < 0) q->flag |= 0x10000; + if (k && p->secondary < 0) // if supplementary + q->flag |= (opt->flag&MEM_F_NO_MULTI)? 
0x10000 : 0x800; if (k && q->mapq > aa.a[0].mapq) q->mapq = aa.a[0].mapq; } if (aa.n == 0) { // no alignments good enough; then write an unaligned record @@ -866,9 +871,9 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x20000; - if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { // unfixable cross-reference alignment - a.rid = -1; a.pos = -1; a.flag |= 0x4; - return a; + if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { + fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__); + exit(1); } w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); w2 = w2 < opt->w? w2 : opt->w; @@ -890,10 +895,10 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); if (clip5) { memmove(a.cigar+1, a.cigar, a.n_cigar * 4); - a.cigar[0] = clip5<<4 | (opt->flag&MEM_F_HARDCLIP? 4 : 3); + a.cigar[0] = clip5<<4 | 3; ++a.n_cigar; } - if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | (opt->flag&MEM_F_HARDCLIP? 
4 : 3); + if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | 3; } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; diff --git a/bwamem.h b/bwamem.h index 76be8e3..be1862a 100644 --- a/bwamem.h +++ b/bwamem.h @@ -11,7 +11,6 @@ struct __smem_i; typedef struct __smem_i smem_i; -#define MEM_F_HARDCLIP 0x1 #define MEM_F_PE 0x2 #define MEM_F_NOPAIRING 0x4 #define MEM_F_ALL 0x8 diff --git a/fastmap.c b/fastmap.c index 592df02..b00ec00 100644 --- a/fastmap.c +++ b/fastmap.c @@ -39,7 +39,6 @@ int main_mem(int argc, char *argv[]) else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; - else if (c == 'H') opt->flag |= MEM_F_HARDCLIP; else if (c == 'a') opt->flag |= MEM_F_ALL; else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; @@ -82,7 +81,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); - fprintf(stderr, " -H hard clipping\n"); fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); From 0e759bc1f5822a4589844abc416ca0d5eda0b3b0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 May 2013 19:55:07 -0400 Subject: [PATCH 406/498] removed a redundant flag --- bwamem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2fdfd46..33ca148 100644 --- a/bwamem.c +++ b/bwamem.c @@ -732,15 +732,15 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) 
{ kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } - if (!(p->flag & 0x20000)) { // not multi-hit + if (!(p->flag & 0x100)) { // not multi-hit for (i = 0; i < n; ++i) - if (i != which && !(list[i].flag&0x20000)) break; // 0x20000: shadowed multi hit + if (i != which && !(list[i].flag&0x100)) break; if (i < n) { // there are other primary hits; output them kputsn("\tSP:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; - if (i == which || (list[i].flag&0x20000)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit + if (i == which || (list[i].flag&0x100)) continue; // proceed if: 1) different from the current; 2) not shadowed multi hit kputs(bns->anns[r->rid].name, str); kputc(',', str); kputl(r->pos+1, str); kputc(',', str); kputc("+-"[r->is_rev], str); kputc(',', str); @@ -793,7 +793,7 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); - q->flag |= extra_flag | (p->secondary >= 0? 0x100 : 0); // flag secondary + q->flag |= extra_flag; // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if (k && p->secondary < 0) // if supplementary q->flag |= (opt->flag&MEM_F_NO_MULTI)? 0x10000 : 0x800; @@ -870,7 +870,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * for (i = 0; i < l_query; ++i) // convert to the nt4 encoding query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? 
mem_approx_mapq_se(opt, ar) : 0; - if (ar->secondary >= 0) a.flag |= 0x20000; + if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__); exit(1); From 9441bb7f2a4f326abe4dca17930b985576661d8a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 22 May 2013 20:02:53 -0400 Subject: [PATCH 407/498] r394: added future plan --- bwamem.c | 1 + main.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 33ca148..bfc2811 100644 --- a/bwamem.c +++ b/bwamem.c @@ -777,6 +777,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) return mapq; } +// TODO (future plan): group hits into a uint64_t[] array. This will be cleaner and more flexible void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m) { kstring_t str; diff --git a/main.c b/main.c index 96d8064..96dde70 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r391-beta" +#define PACKAGE_VERSION "0.7.4-r394-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 3d2450ed970b503c68bbbd43d0bc2984bde86676 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 23 May 2013 12:45:14 -0400 Subject: [PATCH 408/498] r395: bugfix - hard clipping not applied on revaln --- bwamem.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index bfc2811..c0bd0ab 100644 --- a/bwamem.c +++ b/bwamem.c @@ -714,8 +714,8 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } else { // the reverse strand int i, qb = 0, qe = s->l_seq; if (p->n_cigar) { - if ((p->cigar[0]&0xf) == 4) qe -= p->cigar[0]>>4; - if 
((p->cigar[p->n_cigar-1]&0xf) == 4) qb += p->cigar[p->n_cigar-1]>>4; + if (which && ((p->cigar[0]&0xf) == 4 || (p->cigar[0]&0xf) == 3)) qe -= p->cigar[0]>>4; + if (which && ((p->cigar[p->n_cigar-1]&0xf) == 4 || (p->cigar[p->n_cigar-1]&0xf) == 3)) qb += p->cigar[p->n_cigar-1]>>4; } ks_resize(str, str->l + (qe - qb) + 1); for (i = qe-1; i >= qb; --i) str->s[str->l++] = "TGCAN"[(int)s->seq[i]]; diff --git a/main.c b/main.c index 96dde70..72877e4 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r394-beta" +#define PACKAGE_VERSION "0.7.4-r395-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From bde5005f39ccdde63ddc28a91ab7b76d2bb10192 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 23 May 2013 12:48:18 -0400 Subject: [PATCH 409/498] r396: er... the new tag is named SA not SP --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index c0bd0ab..801ea7e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -736,7 +736,7 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m for (i = 0; i < n; ++i) if (i != which && !(list[i].flag&0x100)) break; if (i < n) { // there are other primary hits; output them - kputsn("\tSP:Z:", 6, str); + kputsn("\tSA:Z:", 6, str); for (i = 0; i < n; ++i) { const mem_aln_t *r = &list[i]; int k; diff --git a/main.c b/main.c index 72877e4..5e5a024 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r395-beta" +#define PACKAGE_VERSION "0.7.4-r396-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 599e8407799ec603c3d36e9275cea959ede3a0bf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 May 2013 16:28:18 -0400 Subject: [PATCH 410/498] r397: multi changes/bugfixes to bwa-backtrack 1. Check .sai versioning 2. Keep track of #ins and #del during backtrack 3. 
Use info above to get accurate aligned regions; don't call SW extension any more 4. Identify alignment crossing the for-rev boundary 5. Fixed a bug in printing the XA tag: ungapped alignments missing --- bwape.c | 8 ++++- bwase.c | 101 ++++++++++++++++++++----------------------------------- bwtaln.c | 2 ++ bwtaln.h | 7 ++-- bwtgap.c | 24 +++++++------ bwtgap.h | 3 +- main.c | 2 +- 7 files changed, 68 insertions(+), 79 deletions(-) diff --git a/bwape.c b/bwape.c index 2a7f46e..7dc3d74 100644 --- a/bwape.c +++ b/bwape.c @@ -633,7 +633,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f gap_opt_t opt, opt0; khint_t iter; isize_info_t last_ii; // this is for the last batch of reads - char str[1024]; + char str[1024], magic[2][4]; bwt_t *bwt; uint8_t *pac; @@ -648,6 +648,12 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f g_hash = kh_init(b128); last_ii.avg = -1.0; + err_fread_noeof(magic[0], 1, 4, fp_sa[0]); + err_fread_noeof(magic[1], 1, 4, fp_sa[1]); + if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) { + fprintf(stderr, "[E::%s] Unmatched SAI magic. 
Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); opt0 = opt; diff --git a/bwase.c b/bwase.c index 85165d6..dcf29bf 100644 --- a/bwase.c +++ b/bwase.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "bwase.h" #include "bwtaln.h" #include "bntseq.h" @@ -36,6 +37,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma if (p->score > best) break; if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; + s->ref_shift = (int)p->n_del - (int)p->n_ins; s->score = p->score; s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); } @@ -71,6 +73,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma for (l = q->k; l <= q->l; ++l) { s->multi[z].pos = l; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest -= q->l - q->k + 1; @@ -81,6 +84,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma while (x < p) p -= p * j / (i--); s->multi[z].pos = q->l - i; s->multi[z].gap = q->n_gapo + q->n_gape; + s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; s->multi[z++].mm = q->n_mm; } rest = 0; @@ -107,16 +111,15 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm) return (23 < g_log_n[n])? 
0 : 23 - g_log_n[n]; } -bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand) +bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand) { bwtint_t pos_f; int is_rev; - pos_f = bns_depos(bns, bwt_sa(bwt, sapos), &is_rev); // pos_f + pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate + if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1; + pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base *strand = !is_rev; - /* NB: For gapped alignment, pacpos may not be correct, which will be fixed - * in bwa_refine_gapped_core(). This line also determines the way "x" is - * calculated in bwa_refine_gapped_core() when (ext < 0 && is_end == 0). */ - if (is_rev) pos_f = pos_f + 1 < len? 0 : pos_f - len + 1; // mapped to the forward strand + if (is_rev) pos_f = pos_f + 1 < ref_len? 0 : pos_f - ref_len + 1; // position of the first base return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset } @@ -132,9 +135,11 @@ void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; max_diff = fnr > 0.0? 
bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len, &strand); + //fprintf(stderr, "%d\n", seq->ref_shift); + seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand); seq->strand = strand; seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); + if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH; } void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) @@ -150,9 +155,9 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); for (j = n_multi = 0; j < p->n_multi; ++j) { bwt_multi1_t *q = p->multi + j; - q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len, &strand); + q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + p->ref_shift, &strand); q->strand = strand; - if (q->pos != p->pos) + if (q->pos != p->pos && q->pos != (bwtint_t)-1) p->multi[n_multi++] = *q; } p->n_multi = n_multi; @@ -162,64 +167,25 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se #define SW_BW 50 -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, bwtint_t *_pos, int *n_cigar, int is_rev) +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t rb, int *n_cigar) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; ubyte_t *rseq; - int tle, qle, gtle, gscore; - int64_t k, rb, re, rlen; + int64_t k, re, rlen; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); - if (!is_rev) { // forward strand, the end position is correct - re = *_pos + len; - if (re > l_pac) re = l_pac; - rb = re - (len + SW_BW); - if (rb < 0) rb = 0; - rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - seq_reverse(len, seq, 0); // as we need to do left extension, we have to reverse both query and reference sequences - seq_reverse(rlen, 
rseq, 0); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); - if (gscore > 0) tle = gtle, qle = len; - rb = re - tle; rlen = tle; - seq_reverse(len, seq, 0); - seq_reverse(rlen, rseq, 0); - if (rlen == 0) goto refine_gapped_err; - ksw_global(qle, &seq[len-qle], rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); - if (qle < len) { // write soft clip - cigar = realloc(cigar, (*n_cigar + 1) * 4); - memmove(cigar + 1, cigar, *n_cigar * 4); - cigar[0] = (len - qle)<<4 | FROM_S; - ++(*n_cigar); - } - } else { // reverse strand, the start position is correct - rb = *_pos; re = rb + len + SW_BW; - if (re > l_pac) re = l_pac; - rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - ksw_extend(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, 0, -1, len<<1, &qle, &tle, &gtle, &gscore, 0); - if (gscore > 0) tle = gtle, qle = len; - re = rb + tle; rlen = tle; - if (rlen == 0) goto refine_gapped_err; - ksw_global(qle, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension - if (qle < len) { - cigar = realloc(cigar, (*n_cigar + 1) * 4); - cigar[*n_cigar - 1] = (len - qle)<<4 | FROM_S; - ++(*n_cigar); - } - } - *_pos = rb; - + re = rb + len + ref_shift; + assert(re <= l_pac); + rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); + assert(re - rb == rlen); + ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); free(rseq); return cigar; - -refine_gapped_err: - free(rseq); - *n_cigar = 0; - return 0; } char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, @@ -311,7 +277,7 @@ void bwa_correct_trimmed(bwa_seq_t *s) void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) { ubyte_t *pacseq; - int i, j; + int i, j, k; kstring_t *str; if (!_pacseq) { @@ -322,15 +288,18 @@ void bwa_refine_gapped(const
bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t for (i = 0; i != n_seqs; ++i) { bwa_seq_t *s = seqs + i; seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! - for (j = 0; j < s->n_multi; ++j) { + for (j = k = 0; j < s->n_multi; ++j) { bwt_multi1_t *q = s->multi + j; int n_cigar; - if (q->gap == 0) continue; - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, &q->pos, &n_cigar, q->strand); - q->n_cigar = n_cigar; + if (q->gap) { // gapped alignment + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, q->pos, &n_cigar); + q->n_cigar = n_cigar; + if (q->cigar) s->multi[k++] = *q; + } else s->multi[k++] = *q; } + s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, &s->pos, &s->n_cigar, s->strand); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, s->pos, &s->n_cigar); if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag @@ -339,8 +308,7 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwa_seq_t *s = seqs + i; if (s->type != BWA_TYPE_NO_MATCH) { int nm; - s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, - bns->l_pac, pacseq, str, &nm); + s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? 
s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); s->nm = nm; } } @@ -487,7 +455,6 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in for (i = 0; i < p->n_multi; ++i) { bwt_multi1_t *q = p->multi + i; int k; - if (q->cigar == 0) continue; j = pos_end_multi(q, p->len) - q->pos; nn = bns_cnt_ambi(bns, q->pos, j, &seqid); err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? '-' : '+', @@ -538,6 +505,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f bntseq_t *bns; FILE *fp_sa; gap_opt_t opt; + char magic[4]; // initialization bwase_initialize(); @@ -546,6 +514,11 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f fp_sa = xopen(fn_sa, "r"); m_aln = 0; + err_fread_noeof(magic, 1, 4, fp_sa); + if (strncmp(magic, SAI_MAGIC, 4) != 0) { + fprintf(stderr, "[E::%s] Unmatched SAI magic. Please re-run `aln' with the same version of bwa.\n", __func__); + exit(1); + } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); bwa_print_sam_hdr(bns, rg_line); //bwa_print_sam_PG(); diff --git a/bwtaln.c b/bwtaln.c index e772792..68d0274 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -116,6 +116,7 @@ void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, for (j = 0; j < p->len; ++j) // we need to complement p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 
0 : seed_w, &local_opt, &p->n_aln, stack); + //fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo); // clean up the unused data in the record free(p->name); free(p->seq); free(p->rseq); free(p->qual); p->name = 0; p->seq = p->rseq = p->qual = 0; @@ -173,6 +174,7 @@ void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) } // core loop + err_fwrite(SAI_MAGIC, 1, 4, stdout); err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { tot_seqs += n_seqs; diff --git a/bwtaln.h b/bwtaln.h index 556f259..4616ff5 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -33,14 +33,15 @@ #define FROM_D 2 #define FROM_S 3 +#define SAI_MAGIC "SAI\1" + typedef struct { bwtint_t w; int bid; } bwt_width_t; typedef struct { - uint32_t n_mm:16, n_gapo:8, n_gape:8; - int score; + uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; bwtint_t k, l; } bwt_aln1_t; @@ -57,6 +58,7 @@ typedef uint16_t bwa_cigar_t; typedef struct { uint32_t n_cigar:15, gap:8, mm:8, strand:1; + int ref_shift; bwtint_t pos; bwa_cigar_t *cigar; } bwt_multi1_t; @@ -77,6 +79,7 @@ typedef struct { // alignment information bwtint_t sa, pos; uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ + int ref_shift; int n_cigar; bwa_cigar_t *cigar; // for multi-threading only diff --git a/bwtgap.c b/bwtgap.c index 16d9025..08bc1f4 100644 --- a/bwtgap.c +++ b/bwtgap.c @@ -45,7 +45,7 @@ static void gap_reset_stack(gap_stack_t *stack) stack->n_entries = 0; } -static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, +static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del, int state, int is_diff, const gap_opt_t *opt) { int score; @@ -59,7 +59,9 @@ static inline void gap_push(gap_stack_t *stack, int i, bwtint_t 
k, bwtint_t l, i } p = q->stack + q->n_entries; p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; - p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; p->state = state; + p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; + p->n_ins = n_ins; p->n_del = n_del; + p->state = state; p->last_diff_pos = is_diff? i : 0; ++(q->n_entries); ++(stack->n_entries); @@ -106,7 +108,7 @@ static inline int int_log2(uint32_t v) bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) -{ +{ // $seq is the reverse complement of the input read int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; int best_cnt = 0; @@ -126,7 +128,7 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); gap_reset_stack(stack); // reset stack - gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, opt); + gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt); while (stack->n_entries) { gap_entry_t e; @@ -186,8 +188,10 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid } p = aln + n_aln; p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; + p->n_ins = e.n_ins; p->n_del = e.n_del; p->k = k; p->l = l; p->score = score; + //fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del); ++n_aln; } continue; @@ -214,24 +218,24 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid if (e.state == STATE_M) { // gap open if (e.n_gapo < opt->max_gapo) { // gap open is allowed // insertion - gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, 
e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt); // deletion for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } else if (e.state == STATE_I) { // extention of an insertion if (e.n_gape < opt->max_gape) // gap extention is allowed - gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_I, 1, opt); + gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt); } else if (e.state == STATE_D) { // extention of a deletion if (e.n_gape < opt->max_gape) { // gap extention is allowed if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { for (j = 0; j != 4; ++j) { k = bwt->L2[j] + cnt_k[j] + 1; l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, STATE_D, 1, opt); + if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt); } } } @@ -244,13 +248,13 @@ bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_wid int is_mm = (j != 4 || seq[i] > 3); k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, STATE_M, is_mm, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt); } } else if (seq[i] < 4) { // try exact match only int c = seq[i] & 3; k = bwt->L2[c] + cnt_k[c] + 1; l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt); + if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt); } } diff --git a/bwtgap.h b/bwtgap.h index 8398762..7dd6165 100644 --- a/bwtgap.h +++ b/bwtgap.h @@ -7,8 +7,9 @@ typedef struct { // 
recursion stack u_int32_t info; // score<<21 | i u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; - bwtint_t k, l; // (k,l) is the SA region of [i,n-1] + u_int32_t n_ins:16, n_del:16; int last_diff_pos; + bwtint_t k, l; // (k,l) is the SA region of [i,n-1] } gap_entry_t; typedef struct { diff --git a/main.c b/main.c index 5e5a024..d243144 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r396-beta" +#define PACKAGE_VERSION "0.7.4-r397-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 607e11d43d840ec5abbbbcf685e8b469521b91a6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 May 2013 20:49:36 -0400 Subject: [PATCH 411/498] updated README --- README | 36 --------------------------- README.md | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ bwa.1 | 11 +++++---- bwase.c | 21 +++++++++++----- 4 files changed, 94 insertions(+), 47 deletions(-) delete mode 100644 README create mode 100644 README.md diff --git a/README b/README deleted file mode 100644 index dd1d335..0000000 --- a/README +++ /dev/null @@ -1,36 +0,0 @@ -Released packages can be downloaded from SourceForge.net: - - http://sourceforge.net/projects/bio-bwa/files/ - -Introduction and FAQ are available at: - - http://bio-bwa.sourceforge.net - -Manual page at: - - http://bio-bwa.sourceforge.net/bwa.shtml - -Mailing list: - - bio-bwa-help@lists.sourceforge.net - -To sign up: - - http://sourceforge.net/mail/?group_id=276243 - -Publications (Open Access): - - http://www.ncbi.nlm.nih.gov/pubmed/20080505 - http://www.ncbi.nlm.nih.gov/pubmed/19451168 - -Incomplete list of citations (via HubMed.org): - - http://www.hubmed.org/references.cgi?uids=20080505 - http://www.hubmed.org/references.cgi?uids=19451168 - -Related projects: - - http://pbwa.sourceforge.net/ - http://www.many-core.group.cam.ac.uk/projects/lam.shtml - http://biodoop-seal.sourceforge.net/ - http://gitorious.org/bwa-cuda diff --git a/README.md 
b/README.md new file mode 100644 index 0000000..a46a691 --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +###Getting started + + git clone https://github.com/lh3/bwa.git + cd bwa; make + ./bwa index ref.fa + ./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz + ./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz + +###Introduction + +BWA is a software package for mapping low-divergent sequences against a large +reference genome, such as the human genome. It consists of three algorithms: +BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina +sequence reads up to 100bp, while the rest two for longer sequences ranged from +70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as the support of +long reads and chimeric alignment, but BWA-MEM, which is the latest, is +generally recommended for high-quality queries as it is faster and more +accurate. BWA-MEM also has better performance than BWA-backtrack for 70-100bp +Illumina reads. + +For all the algorithms, BWA first needs to construct the FM-index for the +reference genome (the **index** command). Alignment algorithms are invoked with +different sub-commands: **aln**/**samse**/**sampe** for BWA-backtrack, +**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm. + +###Availability + +BWA is released under [GPLv3][1]. The latest souce code is [freely +available][2] at github. Released packages can [be downloaded ][3] at +SourceForge. After you acquire the source code, simply use `make` to compile +and copy the single executable `bwa` to the destination you want. + +###Seeking helps + +The detailed usage is described in the man page available together with the +source code. You can use `man ./bwa.1` to view the man page in a terminal. The +[HTML version][4] of the man page can be found at the [BWA website][5]. If you +have questions about BWA, you may [sign up the mailing list][6] and then send +the questions to [bio-bwa-help@sourceforge.net][7]. 
You may also ask questions +in forums such as [BioStar][8] and [SEQanswers][9]. + +###Citing BWA + +* Li H. and Durbin R. (2009) Fast and accurate short read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID: + [19451168][10]]. (if you use the BWA-backtrack algorithm) + +* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with + Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID: + [20080505][11]]. (if you use the BWA-SW algorithm) + +* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs + with BWA-MEM. [arXiv:1303.3997v1][12] [q-bio.GN]. (if you use the BWA-MEM + algorithm or the **fastmap** command) + +Please note that the last reference is a preprint hosted at [arXiv.org][13]. I +do not have plan to submit it to a peer-reviewed journal in the near future. + + + +[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License +[2]: https://github.com/lh3/bwa +[3]: http://sourceforge.net/projects/bio-bwa/files/ +[4]: http://bio-bwa.sourceforge.net/bwa.shtml +[5]: http://bio-bwa.sourceforge.net/ +[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help +[7]: mailto:bio-bwa-help@sourceforge.net +[8]: http://biostars.org +[9]: http://seqanswers.com/ +[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168 +[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505 +[12]: http://arxiv.org/abs/1303.3997 +[13]: http://arxiv.org/ diff --git a/bwa.1 b/bwa.1 index d25ba4a..e63fe8d 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "23 April 2013" "bwa-0.7.4" "Bioinformatics tools" +.TH bwa 1 "24 May 2013" "bwa-0.7.5" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -718,12 +718,13 @@ If you use the BWA-SW algorithm, please cite: Li H. and Durbin R. (2010) Fast and accurate long-read alignment with Burrows-Wheeler transform. Bioinformatics, 26, 589-595. 
[PMID: 20080505] .PP -If you use the fastmap component of BWA, please cite: +If you use BWA-MEM or the fastmap component of BWA, please cite: .PP -Li H. (2012) Exploring single-sample SNP and INDEL calling with whole-genome de -novo assembly. Bioinformatics, 28, 1838-1844. [PMID: 22569178] +Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with +BWA-MEM. arXiv:1303.3997v1 [q-bio.GN]. .PP -The BWA-MEM algorithm has not been published yet. +It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed +journal. .SH HISTORY BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW diff --git a/bwase.c b/bwase.c index dcf29bf..5bb8116 100644 --- a/bwase.c +++ b/bwase.c @@ -167,20 +167,29 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se #define SW_BW 50 -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t rb, int *n_cigar) +bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar) { bwa_cigar_t *cigar = 0; uint32_t *cigar32 = 0; ubyte_t *rseq; - int64_t k, re, rlen; + int64_t k, rb, re, rlen; int8_t mat[25]; bwa_fill_scmat(1, 3, mat); - re = rb + len + ref_shift; + rb = *_rb; re = rb + len + ref_shift; assert(re <= l_pac); rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); assert(re - rb == rlen); - ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); // right extension + ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); + assert(*n_cigar > 0); + if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 4; // change endding ins to soft clipping + if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 4; // change beginning ins to soft clipping + if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del + if ((cigar32[0]&0xf) == 2) { // 
delete beginning del + *_rb += cigar32[0]>>4; + --*n_cigar; + memmove(cigar32, cigar32+1, (*n_cigar) * 4); + } cigar = (bwa_cigar_t*)cigar32; for (k = 0; k < *n_cigar; ++k) cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); @@ -292,14 +301,14 @@ void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t bwt_multi1_t *q = s->multi + j; int n_cigar; if (q->gap) { // gapped alignment - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, q->pos, &n_cigar); + q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); q->n_cigar = n_cigar; if (q->cigar) s->multi[k++] = *q; } else s->multi[k++] = *q; } s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, s->pos, &s->n_cigar); + s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar); if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; } // generate MD tag From 02d9bf123f15c763b837c5cadcff8d4315990ddf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 May 2013 20:51:24 -0400 Subject: [PATCH 412/498] Minor formatting issues --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a46a691..4689b6d 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Illumina reads. For all the algorithms, BWA first needs to construct the FM-index for the reference genome (the **index** command). Alignment algorithms are invoked with -different sub-commands: **aln**/**samse**/**sampe** for BWA-backtrack, +different sub-commands: **aln/samse/sampe** for BWA-backtrack, **bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm. 
###Availability From 6fed8fa0b4cb79283b410001fb48b68fa39c07a5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 May 2013 21:17:49 -0400 Subject: [PATCH 413/498] minor changes --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4689b6d..19ab3f7 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ in forums such as [BioStar][8] and [SEQanswers][9]. * Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. [arXiv:1303.3997v1][12] [q-bio.GN]. (if you use the BWA-MEM - algorithm or the **fastmap** command) + algorithm or the **fastmap** command, or want to cite the whole BWA package) Please note that the last reference is a preprint hosted at [arXiv.org][13]. I do not have plan to submit it to a peer-reviewed journal in the near future. From 73619754f89c348a1668bf19bbff3691b50705ac Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 27 May 2013 22:24:35 -0400 Subject: [PATCH 414/498] r401: bugfix - forgot to change sampe some changes to samse should also be applied to sampe --- bwape.c | 8 ++++---- bwase.c | 2 +- main.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bwape.c b/bwape.c index 7dc3d74..08490e7 100644 --- a/bwape.c +++ b/bwape.c @@ -296,7 +296,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw int strand; int max_diff = gopt->fnr > 0.0? 
bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); - p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len, &strand); + p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand); p[j]->strand = strand; } } @@ -345,7 +345,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); for (l = r->k; l <= r->l; ++l) { int strand; - z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand)<<1; + z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1; z->a[l - r->k] |= strand; } } @@ -357,7 +357,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw } else { // then calculate on the fly for (l = r->k; l <= r->l; ++l) { int strand; - x.x = bwa_sa2pos(bns, bwt, l, p[j]->len, &strand); + x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand); x.y = k<<2 | strand<<1 | j; kv_push(pair64_t, d->arr, x); } @@ -377,7 +377,7 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) { int strand; bwt_multi1_t *q = p[j]->multi + k; - q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len, &strand); + q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand); q->strand = strand; if (q->pos != p[j]->pos) p[j]->multi[n_multi++] = *q; diff --git a/bwase.c b/bwase.c index 5bb8116..4c9e0fd 100644 --- a/bwase.c +++ b/bwase.c @@ -155,7 +155,7 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); for (j = n_multi = 0; j < p->n_multi; ++j) { bwt_multi1_t *q = p->multi + j; - q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + p->ref_shift, &strand); + q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand); q->strand = strand; if (q->pos != p->pos && q->pos != (bwtint_t)-1) 
p->multi[n_multi++] = *q; diff --git a/main.c b/main.c index d243144..de50a70 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r397-beta" +#define PACKAGE_VERSION "0.7.4-r401-beta" #endif int bwa_fa2pac(int argc, char *argv[]); From 4e992769d5a2daf5c457e6e204ef23bf572d4626 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 28 May 2013 11:18:42 -0400 Subject: [PATCH 415/498] r402: updated NEWS (prepare for the 0.7.5 release) --- NEWS | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 2 +- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 86f4114..fff9587 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,54 @@ +Release 0.7.5 (28 May, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes in all components: + + * Improved error checking. Patches probived by Rob Davies. + + * Updated README. + + * Bugfix: return code is zero upon errors. + +Changes in BWA-MEM: + + * Changed the way a chimeric alignment is reported (conforming to the upcoming + SAM spec v1.5). With 0.7.5, if the read has a chimeric alignment, the paired + or the top hit uses soft clipping and is marked with neither 0x800 nor 0x100 + bits. All the other hits part of the chimeric alignment will use hard + clipping and be marked with 0x800 if option "-M" is not in use, or marked + with 0x100 otherwise. + + * Other hits part of a chimeric alignment are now reported in the SA tag, + conforming to the SAM spec v1.5. + + * Better method for resolving an alignment bridging two or more short + reference sequences. The current strategy maps the query to the reference + sequence that covers the middle point of the alignment. For most + applications, this change has no effects. + +Changes in BWA-backtrack: + + * Added a magic number to .sai files. This prevents samse/sampe from reading + corrupted (e.g. a .sai file containing LSF log) or incompatible .sai + generated by a different version of bwa. 
+ + * Bugfix: alignments in the XA:Z: tag were wrong. + + * Keep track of #ins and #del during backtracking. This simplifies the code + and reduces errors in rare corner cases. I should have done this in the + early days of bwa. + +In addition, if you use BWA-MEM or the fastmap command of BWA, please cite: + + - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs + with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN]. + +Thank you. + +(0.7.5: 28 May 2013, r402) + + + Release 0.7.4 (23 April, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.md b/README.md index 19ab3f7..d903e38 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ in forums such as [BioStar][8] and [SEQanswers][9]. [20080505][11]]. (if you use the BWA-SW algorithm) * Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs - with BWA-MEM. [arXiv:1303.3997v1][12] [q-bio.GN]. (if you use the BWA-MEM + with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM algorithm or the **fastmap** command, or want to cite the whole BWA package) Please note that the last reference is a preprint hosted at [arXiv.org][13]. I From 01c9cd152b2ac9004a53421c8aefb1d7a58e132a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 28 May 2013 13:40:31 -0400 Subject: [PATCH 416/498] fixed a typo in NEWS --- NEWS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index fff9587..230c3b3 100644 --- a/NEWS +++ b/NEWS @@ -3,7 +3,8 @@ Release 0.7.5 (28 May, 2013) Changes in all components: - * Improved error checking. Patches probived by Rob Davies. + * Improved error checking on memory allocation and file reading/writing. + Patches provided by Rob Davies. * Updated README. @@ -29,7 +30,7 @@ Changes in BWA-MEM: Changes in BWA-backtrack: * Added a magic number to .sai files. This prevents samse/sampe from reading - corrupted (e.g. a .sai file containing LSF log) or incompatible .sai + corrupted .sai (e.g. 
a .sai file containing LSF log) or incompatible .sai generated by a different version of bwa. * Bugfix: alignments in the XA:Z: tag were wrong. From ef18cb91cb2211250898d2ec94203b13328e4a6a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 29 May 2013 11:49:08 -0400 Subject: [PATCH 417/498] Release bwa-0.7.5-r404 --- NEWS | 8 ++++---- main.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/NEWS b/NEWS index 230c3b3..f53e337 100644 --- a/NEWS +++ b/NEWS @@ -1,10 +1,10 @@ -Release 0.7.5 (28 May, 2013) +Release 0.7.5 (29 May, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Changes in all components: - * Improved error checking on memory allocation and file reading/writing. - Patches provided by Rob Davies. + * Improved error checking on memory allocation and file I/O. Patches provided + by Rob Davies. * Updated README. @@ -46,7 +46,7 @@ In addition, if you use BWA-MEM or the fastmap command of BWA, please cite: Thank you. -(0.7.5: 28 May 2013, r402) +(0.7.5: 29 May 2013, r404) diff --git a/main.c b/main.c index de50a70..2c6d08c 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.4-r401-beta" +#define PACKAGE_VERSION "0.7.5-r404" #endif int bwa_fa2pac(int argc, char *argv[]); From 7ec8b5c9e7c77f034925039d69eba20e0470b91f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 30 May 2013 16:20:16 -0400 Subject: [PATCH 418/498] Release bwa-0.7.5a --- NEWS | 10 ++++++++++ bwase.c | 4 ++-- main.c | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index f53e337..29627f0 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,13 @@ +Release 0.7.5a (30 May, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fixed a bug in BWA-backtrack which leads to off-by-one mapping errors in rare +cases. 
+ +(0.7.5a: 30 May 2013, r405) + + + Release 0.7.5 (29 May, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwase.c b/bwase.c index 4c9e0fd..746add5 100644 --- a/bwase.c +++ b/bwase.c @@ -182,8 +182,8 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l assert(re - rb == rlen); ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); assert(*n_cigar > 0); - if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 4; // change endding ins to soft clipping - if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 4; // change beginning ins to soft clipping + if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping + if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del if ((cigar32[0]&0xf) == 2) { // delete beginning del *_rb += cigar32[0]>>4; diff --git a/main.c b/main.c index 2c6d08c..662d54f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5-r404" +#define PACKAGE_VERSION "0.7.5a-r405" #endif int bwa_fa2pac(int argc, char *argv[]); From 20dc9dd41add925d7770b27f9f0588afec5f519c Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 14 Jun 2013 13:57:22 +0100 Subject: [PATCH 419/498] Check that paired reads have the same QNAME This detects desynchronised input files, which occasionally happens due to user error or system failure. Checking the names just after printing them has no real performance implications because the strings are already in cache. (It might be better to check while reading the input, but that would be more complicated in the two-input-files case.) 
--- bwamem_pair.c | 2 ++ bwape.c | 1 + 2 files changed, 3 insertions(+) diff --git a/bwamem_pair.c b/bwamem_pair.c index 06aacff..c218925 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -301,6 +301,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co h[1] = mem_reg2aln(opt, bns, pac, s[1].l_seq, s[1].seq, &a[1].a[z[1]]); h[1].mapq = q_se[1]; h[1].flag |= 0x80 | extra_flag; mem_aln2sam(bns, &str, &s[0], 1, &h[0], 0, &h[1]); s[0].sam = strdup(str.s); str.l = 0; mem_aln2sam(bns, &str, &s[1], 1, &h[1], 0, &h[0]); s[1].sam = str.s; + if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); free(h[1].cigar); } else goto no_pairing; return n; @@ -319,6 +320,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co } mem_reg2sam_se(opt, bns, pac, &s[0], &a[0], 0x41|extra_flag, &h[1]); mem_reg2sam_se(opt, bns, pac, &s[1], &a[1], 0x81|extra_flag, &h[0]); + if (strcmp(s[0].name, s[1].name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", s[0].name, s[1].name); free(h[0].cigar); free(h[1].cigar); return n; } diff --git a/bwape.c b/bwape.c index 08490e7..2c96e06 100644 --- a/bwape.c +++ b/bwape.c @@ -706,6 +706,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2); bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2); + if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); From 128ffc089b02f50fb9845d7dbadcf9accaa61a94 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 14 Jun 2013 14:00:24 +0100 Subject: [PATCH 420/498] Complain when bwa mem is given too many filenames Reads in extra .fq filenames beyond "bwa mem index 
one.fq two.fq" will not be aligned, so complain about such invalid usage instead. --- fastmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastmap.c b/fastmap.c index b00ec00..40ccdf9 100644 --- a/fastmap.c +++ b/fastmap.c @@ -54,7 +54,7 @@ int main_mem(int argc, char *argv[]) else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; - if (optind + 1 >= argc) { + if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Algorithm options:\n\n"); From b88718d8f467121ad3a454758e99d05d5e7bc94a Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 14 Jun 2013 14:03:08 +0100 Subject: [PATCH 421/498] Reformat note for 80 columns, and fix typo --- main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/main.c b/main.c index 662d54f..0a5d8a8 100644 --- a/main.c +++ b/main.c @@ -46,10 +46,11 @@ static int usage() fprintf(stderr, " bwtupdate update .bwt to the new format\n"); fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); fprintf(stderr, "\n"); - fprintf(stderr, "Note: To use BWA, you need to first index the genome with `bwa index'. There are\n"); - fprintf(stderr, " three alignment algorithms in BWA: `mem', `bwasw' and `aln/samse/sampe'. If\n"); - fprintf(stderr, " you are not sure which to use, try `bwa mem' first. Please `man ./bwa.1' for\n"); - fprintf(stderr, " for the manual.\n\n"); + fprintf(stderr, +"Note: To use BWA, you need to first index the genome with `bwa index'.\n" +" There are three alignment algorithms in BWA: `mem', `bwasw', and\n" +" `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n" +" first. 
Please `man ./bwa.1' for the manual.\n\n"); return 1; } From 3b84c03c1e01686525bcb29caf4e3d4f1b94c46e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 28 Aug 2013 15:59:05 -0400 Subject: [PATCH 422/498] r406: allow to use diff clipping penalties for 5'-end or for 3'-end --- bwamem.c | 10 +++++----- bwamem.h | 2 +- fastmap.c | 12 +++++++++--- main.c | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/bwamem.c b/bwamem.c index 801ea7e..06fc8ac 100644 --- a/bwamem.c +++ b/bwamem.c @@ -51,7 +51,7 @@ mem_opt_t *mem_opt_init() o->T = 30; o->zdrop = 100; o->pen_unpaired = 17; - o->pen_clip = 5; + o->pen_clip5 = o->pen_clip3 = 5; o->min_seed_len = 19; o->split_width = 10; o->max_occ = 10000; @@ -572,12 +572,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); + a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, >le, &gscore, &max_off[0]); if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query - if (gscore <= 0 || gscore <= a->score - opt->pen_clip) { // local extension + if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension a->qb = s->qbeg - qle, a->rb = s->rbeg - tle; a->truesc = a->score; } else { // to-end extension @@ -595,12 +595,12 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[1] = opt->w << i; - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, 
opt->mat, opt->q, opt->r, aw[1], opt->pen_clip, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); + a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, >le, &gscore, &max_off[1]); if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above - if (gscore <= 0 || gscore <= a->score - opt->pen_clip) { // local extension + if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension a->qe = qe + qle, a->re = rmax[0] + re + tle; a->truesc += a->score - sc0; } else { // to-end extension diff --git a/bwamem.h b/bwamem.h index be1862a..a81c92b 100644 --- a/bwamem.h +++ b/bwamem.h @@ -20,7 +20,7 @@ typedef struct __smem_i smem_i; typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r int pen_unpaired; // phred-scaled penalty for unpaired reads - int pen_clip; // clipping penalty. This score is not deducted from the DP score. + int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. int w; // band width int zdrop; // Z-dropoff diff --git a/fastmap.c b/fastmap.c index b00ec00..8b4ae3f 100644 --- a/fastmap.c +++ b/fastmap.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "bwa.h" #include "bwamem.h" #include "kvec.h" @@ -35,7 +36,6 @@ int main_mem(int argc, char *argv[]) else if (c == 'O') opt->q = atoi(optarg); else if (c == 'E') opt->r = atoi(optarg); else if (c == 'T') opt->T = atoi(optarg); - else if (c == 'L') opt->pen_clip = atoi(optarg); else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; @@ -48,7 +48,13 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'C') copy_comment = 1; - else if (c == 'R') { + else if (c == 'L') { + char *p; + opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->pen_clip3 = strtol(p+1, &p, 10); + fprintf(stderr, "%d,%d\n", opt->pen_clip5, opt->pen_clip3); + } else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); else return 1; @@ -71,7 +77,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); fprintf(stderr, " -E INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); - fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip); + fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip5); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); diff --git a/main.c b/main.c index 662d54f..06d24bc 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r405" +#define PACKAGE_VERSION "0.7.5a-r406" #endif int bwa_fa2pac(int argc, char *argv[]); From 623da055e1aa0a673286bb64777a7155bead490b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 6 Sep 2013 12:31:47 -0400 Subject: [PATCH 423/498] alternative way to estimate mapQ the old mapQ estimate is too conservative --- bwamem.c | 15 +++++++++++++-- bwamem.h | 2 ++ fastmap.c | 8 ++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/bwamem.c b/bwamem.c 
index 06fc8ac..e0aa3e4 100644 --- a/bwamem.c +++ b/bwamem.c @@ -63,6 +63,8 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_matesw = 100; + o->mapQ_coef_len = 100; + o->mapQ_coef_fac = log(o->mapQ_coef_len); bwa_fill_scmat(o->a, o->b, o->mat); return o; } @@ -768,9 +770,18 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) sub = a->csub > sub? a->csub : sub; if (sub >= a->score) return 0; l = a->qe - a->qb > a->re - a->rb? a->qe - a->qb : a->re - a->rb; - mapq = a->score? (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499) : 0; identity = 1. - (double)(l * opt->a - a->score) / (opt->a + opt->b) / l; - mapq = identity < 0.95? (int)(mapq * identity * identity + .499) : mapq; + if (a->score == 0) { + mapq = 0; + } else if (opt->mapQ_coef_len > 0) { + double tmp; + tmp = 6.02 * (a->score - sub) / opt->a * identity; + if (l > opt->mapQ_coef_len) tmp *= log(l) / opt->mapQ_coef_fac; + mapq = (int)(tmp + .499); + } else { + mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499); + mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; + } if (a->sub_n > 0) mapq -= (int)(4.343 * log(a->sub_n+1) + .499); if (mapq > 60) mapq = 60; if (mapq < 0) mapq = 0; diff --git a/bwamem.h b/bwamem.h index a81c92b..01b690c 100644 --- a/bwamem.h +++ b/bwamem.h @@ -35,6 +35,8 @@ typedef struct { int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain + float mapQ_coef_len; + int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset diff --git a/fastmap.c b/fastmap.c index 598194b..21e47e0 100644 --- a/fastmap.c +++ b/fastmap.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "bwa.h" #include "bwamem.h" #include "kvec.h" @@ -28,7 +29,7 @@ int main_mem(int argc, char *argv[]) void *ko = 0, *ko2 = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:")) >= 0) { + while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -48,7 +49,10 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'C') copy_comment = 1; - else if (c == 'L') { + else if (c == 'Q') { + opt->mapQ_coef_len = atoi(optarg); + opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? 
log(opt->mapQ_coef_len) : 0; + } else if (c == 'L') { char *p; opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) From 451d60f3be09cc8f4e76537290dd116eff172166 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 6 Sep 2013 12:37:38 -0400 Subject: [PATCH 424/498] slight modification --- bwamem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index e0aa3e4..0b8ec48 100644 --- a/bwamem.c +++ b/bwamem.c @@ -775,9 +775,9 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) mapq = 0; } else if (opt->mapQ_coef_len > 0) { double tmp; - tmp = 6.02 * (a->score - sub) / opt->a * identity; - if (l > opt->mapQ_coef_len) tmp *= log(l) / opt->mapQ_coef_fac; - mapq = (int)(tmp + .499); + tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l); + tmp *= identity; + mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499); } else { mapq = (int)(MEM_MAPQ_COEF * (1. - (double)sub / a->score) * log(a->seedcov) + .499); mapq = identity < 0.95? 
(int)(mapq * identity * identity + .499) : mapq; From 1346f03ff12a9e1510d6f8835734a1a08bbd7661 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 6 Sep 2013 14:04:41 -0400 Subject: [PATCH 425/498] use the old mapQ by default the new mapQ overestimate --- bwamem.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0b8ec48..488913a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -63,8 +63,8 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_matesw = 100; - o->mapQ_coef_len = 100; - o->mapQ_coef_fac = log(o->mapQ_coef_len); +// o->mapQ_coef_len = 100; o->mapQ_coef_fac = log(o->mapQ_coef_len); + o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/main.c b/main.c index dd60d1c..fdf48b8 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r406" +#define PACKAGE_VERSION "0.7.5a-r411" #endif int bwa_fa2pac(int argc, char *argv[]); From 1e2cff20baff8ae22ee62360dd565099b7befc76 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 9 Sep 2013 08:57:45 -0400 Subject: [PATCH 426/498] more conservative mapQ --- bwamem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 488913a..44124c7 100644 --- a/bwamem.c +++ b/bwamem.c @@ -776,7 +776,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) } else if (opt->mapQ_coef_len > 0) { double tmp; tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l); - tmp *= identity; + tmp *= a->seedcov < l? (double)a->seedcov / identity : 1; mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499); } else { mapq = (int)(MEM_MAPQ_COEF * (1. 
- (double)sub / a->score) * log(a->seedcov) + .499); From b51a66e4c1dc6722051fd4a7c8b8b6ad098ad106 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 9 Sep 2013 11:36:50 -0400 Subject: [PATCH 427/498] r413: fixed an issue causing redundant alignment I have seen a fosmid aligned to the same position but with two slightly different CIGARs: 30000M and 29900M50D100M, possibly caused by tandem repeats. 0.7.5a will regard them as two distinct alignments and generates a very small mapping quality. However, these two are essentially the same. Although there is ambiguity in aligning the end of the fosmid, we should not penalize the entire alignment with a small mapQ. This commit fixes this issue. More testing is needed, though. --- bwamem.c | 34 +++++++++++++++++++++++++++++----- bwamem.h | 1 + main.c | 2 +- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/bwamem.c b/bwamem.c index 44124c7..ebc2ac2 100644 --- a/bwamem.c +++ b/bwamem.c @@ -63,6 +63,7 @@ mem_opt_t *mem_opt_init() o->chunk_size = 10000000; o->n_threads = 1; o->max_matesw = 100; + o->mask_level_redun = 0.95; // o->mapQ_coef_len = 100; o->mapQ_coef_fac = log(o->mapQ_coef_len); o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); @@ -235,7 +236,7 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) int i, j; for (i = 0; i < chn->n; ++i) { mem_chain_t *p = &chn->a[i]; - err_printf("%d", p->n); + err_printf("CHAIN(%d) n=%d", i, p->n); for (j = 0; j < p->n; ++j) { bwtint_t pos; int is_rev, ref_id; @@ -365,13 +366,34 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) * De-overlap single-end hits * ******************************/ +#define alnreg_slt2(a, b) ((a).re < (b).re) +KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) + #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) -int mem_sort_and_dedup(int n, mem_alnreg_t 
*a) +int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) { - int m, i; + int m, i, j; if (n <= 1) return n; + ks_introsort(mem_ars2, n, a); + for (i = 1; i < n; ++i) { + mem_alnreg_t *p = &a[i]; + if (p->rb >= a[i-1].re) continue; + for (j = i - 1; j >= 0 && p->rb < a[j].re; --j) { + mem_alnreg_t *q = &a[j]; + int64_t or, oq, mr, mq; + if (q->qe == q->qb) continue; // a[j] has been excluded + or = q->re - p->rb; // overlap length on the reference + oq = q->qb < p->qb? q->qe - p->qb : p->qe - q->qb; // overlap length on the query + mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment + mq = q->qe - q->qb < p->qe - p->qb? q->qe - q->qb : p->qe - p->qb; // min qry len in alignment + if (or > mask_level_redun * mr && oq > mask_level_redun * mq) { // one of the hits is redundant + if (q->score < p->score) q->qe = q->qb; + else p->qe = p->qb; + } + } + } ks_introsort(mem_ars, n, a); for (i = 1; i < n; ++i) { // mark identical hits if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) @@ -477,7 +499,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, a.score = x.score; a.csub = x.score2; kv_push(mem_alnreg_t, *av, a); - if (bwa_verbose >= 4) printf("SHORT: [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); + if (bwa_verbose >= 4) printf("chain2aln(short): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); return 0; } @@ -563,6 +585,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->w = aw[0] = aw[1] = opt->w; a->score = a->truesc = -1; + if (bwa_verbose >= 4) err_printf("Extending from seed [%ld,%ld,%ld]\n", (long)s->len, (long)s->qbeg, (long)s->rbeg); if (s->qbeg) { // left extension uint8_t *rs, *qs; int qle, tle, gtle, gscore; @@ -842,12 +865,13 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; int 
ret; + if (bwa_verbose >= 4) err_printf("===> Processing chain(%d) <===\n", i); ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, ®s); free(chn.a[i].seeds); } free(chn.a); - regs.n = mem_sort_and_dedup(regs.n, regs.a); + regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); return regs; } diff --git a/bwamem.h b/bwamem.h index 01b690c..1beaa23 100644 --- a/bwamem.h +++ b/bwamem.h @@ -35,6 +35,7 @@ typedef struct { int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain + float mask_level_redun; float mapQ_coef_len; int mapQ_coef_fac; int max_ins; // when estimating insert size distribution, skip pairs with insert longer than this value diff --git a/main.c b/main.c index fdf48b8..d5b61a7 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r411" +#define PACKAGE_VERSION "0.7.5a-r413" #endif int bwa_fa2pac(int argc, char *argv[]); From ebb7b02e9b5346c05c8bed12f016df812cc88455 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 9 Sep 2013 16:57:55 -0400 Subject: [PATCH 428/498] r414: fixed a bug caused by the last commit --- bwamem.c | 12 ++++++++++-- main.c | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index ebc2ac2..6cc4639 100644 --- a/bwamem.c +++ b/bwamem.c @@ -389,11 +389,19 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) mr = q->re - q->rb < p->re - p->rb? q->re - q->rb : p->re - p->rb; // min ref len in alignment mq = q->qe - q->qb < p->qe - p->qb? 
q->qe - q->qb : p->qe - p->qb; // min qry len in alignment if (or > mask_level_redun * mr && oq > mask_level_redun * mq) { // one of the hits is redundant - if (q->score < p->score) q->qe = q->qb; - else p->qe = p->qb; + if (p->score < q->score) { + p->qe = p->qb; + break; + } else q->qe = q->qb; } } } + for (i = 0, m = 0; i < n; ++i) // exclude identical hits + if (a[i].qe > a[i].qb) { + if (m != i) a[m++] = a[i]; + else ++m; + } + n = m; ks_introsort(mem_ars, n, a); for (i = 1; i < n; ++i) { // mark identical hits if (a[i].score == a[i-1].score && a[i].rb == a[i-1].rb && a[i].qb == a[i-1].qb) diff --git a/main.c b/main.c index d5b61a7..d1e3895 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r413" +#define PACKAGE_VERSION "0.7.5a-r414" #endif int bwa_fa2pac(int argc, char *argv[]); From 7144a0cefce0da81fea16b2e64a3ca285b78731e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 9 Sep 2013 17:51:05 -0400 Subject: [PATCH 429/498] r415: bug in the new (optional) mapQ computation I may use the new method as the default. Testing needed. --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6cc4639..293dd55 100644 --- a/bwamem.c +++ b/bwamem.c @@ -807,7 +807,7 @@ int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a) } else if (opt->mapQ_coef_len > 0) { double tmp; tmp = l < opt->mapQ_coef_len? 1. : opt->mapQ_coef_fac / log(l); - tmp *= a->seedcov < l? (double)a->seedcov / identity : 1; + tmp *= identity * identity; mapq = (int)(6.02 * (a->score - sub) / opt->a * tmp * tmp + .499); } else { mapq = (int)(MEM_MAPQ_COEF * (1. 
- (double)sub / a->score) * log(a->seedcov) + .499); diff --git a/main.c b/main.c index d1e3895..998719f 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r414" +#define PACKAGE_VERSION "0.7.5a-r415" #endif int bwa_fa2pac(int argc, char *argv[]); From c564653b402304ed14330feca4bcd76f26990f43 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 12 Sep 2013 10:41:43 -0400 Subject: [PATCH 430/498] r416: removed a line of debugging code --- fastmap.c | 1 - main.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fastmap.c b/fastmap.c index 21e47e0..05e7d7f 100644 --- a/fastmap.c +++ b/fastmap.c @@ -57,7 +57,6 @@ int main_mem(int argc, char *argv[]) opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->pen_clip3 = strtol(p+1, &p, 10); - fprintf(stderr, "%d,%d\n", opt->pen_clip5, opt->pen_clip3); } else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak } else if (c == 's') opt->split_width = atoi(optarg); diff --git a/main.c b/main.c index 998719f..a215408 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r415" +#define PACKAGE_VERSION "0.7.5a-r416" #endif int bwa_fa2pac(int argc, char *argv[]); From 19d33faa303636d0d2f428ef5401baed2f38f23d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 2 Nov 2013 12:13:11 -0400 Subject: [PATCH 431/498] use kthread for multi-threading Bwa-mem should have better performance with many CPU cores. 
--- Makefile | 2 +- bwamem.c | 74 +++++++++++++++---------------------------------------- kthread.c | 53 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 55 deletions(-) create mode 100644 kthread.c diff --git a/Makefile b/Makefile index 69b2ccd..ff48a20 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/bwamem.c b/bwamem.c index 293dd55..61fa0dd 100644 --- a/bwamem.c +++ b/bwamem.c @@ -952,7 +952,6 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * } typedef struct { - int start, step, n; const mem_opt_t *opt; const bwt_t *bwt; const bntseq_t *bns; @@ -962,84 +961,51 @@ typedef struct { mem_alnreg_v *regs; } worker_t; -static void *worker1(void *data) +static void worker1(void *data, int i, int tid) { worker_t *w = (worker_t*)data; - int i; if (!(w->opt->flag&MEM_F_PE)) { - for (i = w->start; i < w->n; i += w->step) - w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); - } else { // for PE we align the two ends in the same thread in case the 2nd read is of worse quality, in which case some threads may be faster/slower - for (i = w->start; i < w->n>>1; i += w->step) { - w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); - w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); - } + w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); + } 
else { + w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); } - return 0; } -static void *worker2(void *data) +static void worker2(void *data, int i, int tid) { extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; - int i; if (!(w->opt->flag&MEM_F_PE)) { - for (i = w->start; i < w->n; i += w->step) { - mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); - mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); - free(w->regs[i].a); - } + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); + mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + free(w->regs[i].a); } else { - int n = 0; - for (i = w->start; i < w->n>>1; i += w->step) { // not implemented yet - n += mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); - free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); - } - fprintf(stderr, "[M::%s@%d] performed mate-SW for %d reads\n", __func__, w->start, n); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); + free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } - return 0; } void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0) { + extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); int i; - worker_t *w; + worker_t w; mem_alnreg_v *regs; mem_pestat_t pes[4]; - w = calloc(opt->n_threads, sizeof(worker_t)); regs = malloc(n * sizeof(mem_alnreg_v)); for (i = 0; i < opt->n_threads; ++i) { - worker_t *p = &w[i]; - p->start = i; p->step = opt->n_threads; p->n = n; + worker_t *p = &w; p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; 
p->seqs = seqs; p->regs = regs; p->pes = &pes[0]; } - -#ifdef HAVE_PTHREAD - if (opt->n_threads == 1) { -#endif - worker1(w); - if (opt->flag&MEM_F_PE) { // paired-end mode - if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 - else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data - } - worker2(w); -#ifdef HAVE_PTHREAD - } else { - pthread_t *tid; - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker1, &w[i]); - for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - if (opt->flag&MEM_F_PE) { - if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); - else mem_pestat(opt, bns->l_pac, n, regs, pes); - } - for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker2, &w[i]); - for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - free(tid); + kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions + if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided + if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 + else mem_pestat(opt, bns->l_pac, n, regs, pes); // otherwise, infer the insert size distribution from data } -#endif - free(regs); free(w); + kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? 
n>>1 : n); // generate alignment + free(regs); } diff --git a/kthread.c b/kthread.c new file mode 100644 index 0000000..a44426b --- /dev/null +++ b/kthread.c @@ -0,0 +1,53 @@ +#include +#include + +struct kt_for_t; + +typedef struct { + struct kt_for_t *t; + int i; +} ktf_worker_t; + +typedef struct kt_for_t { + int n_threads, n; + ktf_worker_t *w; + void (*func)(void*,int,int); + void *data; +} kt_for_t; + +static inline int steal_work(kt_for_t *t) +{ + int i, k, min = 0x7fffffff, min_i = -1; + for (i = 0; i < t->n_threads; ++i) + if (min > t->w[i].i) min = t->w[i].i, min_i = i; + k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); + return k >= t->n? -1 : k; +} + +static void *ktf_worker(void *data) +{ + ktf_worker_t *w = (ktf_worker_t*)data; + int i; + for (;;) { + i = __sync_fetch_and_add(&w->i, w->t->n_threads); + if (i >= w->t->n) break; + w->t->func(w->t->data, i, w - w->t->w); + } + while ((i = steal_work(w->t)) >= 0) + w->t->func(w->t->data, i, w - w->t->w); + pthread_exit(0); +} + +void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n) +{ + int i; + kt_for_t t; + pthread_t *tid; + t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; + t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); + tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); + for (i = 0; i < n_threads; ++i) + t.w[i].t = &t, t.w[i].i = i; + for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); +} From deb19593aac9c8210fef426e1d2371d71eacec1e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 2 Nov 2013 12:25:53 -0400 Subject: [PATCH 432/498] r418: use the new mapQ estimator by default --- bwamem.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 61fa0dd..c1f190f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -64,8 +64,8 @@ mem_opt_t *mem_opt_init() o->n_threads = 1; o->max_matesw = 100; 
o->mask_level_redun = 0.95; -// o->mapQ_coef_len = 100; o->mapQ_coef_fac = log(o->mapQ_coef_len); - o->mapQ_coef_len = o->mapQ_coef_fac = 0; + o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); +// o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); return o; } diff --git a/main.c b/main.c index a215408..8f76706 100644 --- a/main.c +++ b/main.c @@ -3,7 +3,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r416" +#define PACKAGE_VERSION "0.7.5a-r418" #endif int bwa_fa2pac(int argc, char *argv[]); From ff6faf811a543ca2acebd81ecd0de727a6779a32 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 19 Nov 2013 11:08:45 -0500 Subject: [PATCH 433/498] r419: print the @PG line --- bwa.c | 2 ++ main.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bwa.c b/bwa.c index f8949f7..33edd7f 100644 --- a/bwa.c +++ b/bwa.c @@ -277,9 +277,11 @@ void bwa_idx_destroy(bwaidx_t *idx) void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) { int i; + extern char *bwa_pg; for (i = 0; i < bns->n_seqs; ++i) err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); if (rg_line) err_printf("%s\n", rg_line); + err_printf("%s\n", bwa_pg); } static char *bwa_escape(char *s) diff --git a/main.c b/main.c index 8f76706..a783924 100644 --- a/main.c +++ b/main.c @@ -1,9 +1,10 @@ #include #include +#include "kstring.h" #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r418" +#define PACKAGE_VERSION "0.7.5a-r419" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -24,6 +25,8 @@ int main_mem(int argc, char *argv[]); int main_pemerge(int argc, char *argv[]); +char *bwa_pg; + static int usage() { fprintf(stderr, "\n"); @@ -63,7 +66,11 @@ int main(int argc, char *argv[]) { int i, ret; double t_real; + kstring_t pg = {0,0,0}; t_real = realtime(); + ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]); + for (i = 1; i < argc; ++i) ksprintf(&pg, 
" %s", argv[i]); + bwa_pg = pg.s; if (argc < 2) return usage(); if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); @@ -93,5 +100,6 @@ int main(int argc, char *argv[]) fprintf(stderr, " %s", argv[i]); fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); } + free(bwa_pg); return ret; } From 6e3fa0515ad8a8374f33fbab432f4133011554cc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Nov 2013 09:50:46 -0500 Subject: [PATCH 434/498] r420: inferred bandwidth is not used in the final --- bwamem.c | 4 +++- main.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index c1f190f..689c74e 100644 --- a/bwamem.c +++ b/bwamem.c @@ -920,7 +920,9 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * exit(1); } w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); - w2 = w2 < opt->w? w2 : opt->w; + if (bwa_verbose >= 4) fprintf(stderr, "Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w); + if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; + else w2 = opt->w; a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); diff --git a/main.c b/main.c index a783924..fb90dbd 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r419" +#define PACKAGE_VERSION "0.7.5a-r420" #endif int bwa_fa2pac(int argc, char *argv[]); From ff4762f3c7685cd2f3fbc0666e708284e1307540 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 20 Nov 2013 10:04:16 -0500 Subject: [PATCH 435/498] r421: bw doubling in the final alignment In some cases, the band width used in the final alignment needs to be larger than the band width in extension. 
--- bwamem.c | 8 +++++++- main.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 689c74e..b1e9f4b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -923,7 +923,13 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * if (bwa_verbose >= 4) fprintf(stderr, "Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; else w2 = opt->w; - a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + i = 0; a.cigar = 0; + do { + free(a.cigar); + a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + if (bwa_verbose >= 4) fprintf(stderr, "Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); + w2 <<= 1; + } while (++i < 3 && score < ar->truesc - opt->a); a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); a.is_rev = is_rev; diff --git a/main.c b/main.c index fb90dbd..2afe8f6 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r420" +#define PACKAGE_VERSION "0.7.5a-r421" #endif int bwa_fa2pac(int argc, char *argv[]); From 29aa855432798216a39d9a52c53bbeca7b774e1e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 21 Nov 2013 14:43:50 -0500 Subject: [PATCH 436/498] r422: matesw hits not sorted --- bwamem_pair.c | 2 ++ main.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index c218925..648c67e 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -108,6 +108,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { + extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun); int i, r, skip[4], n = 0; for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 
1 : 0; @@ -166,6 +167,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me } ++n; } + if (n) ma->n = mem_sort_and_dedup(ma->n, ma->a, opt->mask_level_redun); if (rev) free(rev); free(ref); } diff --git a/main.c b/main.c index 2afe8f6..5acd89f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r421" +#define PACKAGE_VERSION "0.7.5a-r422" #endif int bwa_fa2pac(int argc, char *argv[]); From 4219e586238a602b84f4d8be920514e3a736e9af Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 23 Nov 2013 09:36:26 -0500 Subject: [PATCH 437/498] r423: bugfix - SE hits not random --- bwamem.c | 27 ++++++++++++++------------- bwamem.h | 3 ++- bwamem_pair.c | 6 +++--- fastmap.c | 4 +++- main.c | 2 +- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index b1e9f4b..5089d75 100644 --- a/bwamem.c +++ b/bwamem.c @@ -372,6 +372,9 @@ KSORT_INIT(mem_ars2, mem_alnreg_t, alnreg_slt2) #define alnreg_slt(a, b) ((a).score > (b).score || ((a).score == (b).score && ((a).rb < (b).rb || ((a).rb == (b).rb && (a).qb < (b).qb)))) KSORT_INIT(mem_ars, mem_alnreg_t, alnreg_slt) +#define alnreg_hlt(a, b) ((a).score > (b).score || ((a).score == (b).score && (a).hash < (b).hash)) +KSORT_INIT(mem_ars_hash, mem_alnreg_t, alnreg_hlt) + int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) { int m, i, j; @@ -415,13 +418,14 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) return m; } -void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a) // IMPORTANT: must run mem_sort_and_dedup() before calling this function +void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function { // similar to the loop in mem_chain_flt() int i, k, tmp; kvec_t(int) z; if (n == 0) return; kv_init(z); - for (i = 0; i < n; ++i) a[i].sub = 0, 
a[i].secondary = -1; + for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); + ks_introsort(mem_ars_hash, n, a); tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; kv_push(int, z, 0); for (i = 1; i < n; ++i) { @@ -890,7 +894,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * seq = malloc(l_seq); memcpy(seq, seq_, l_seq); // makes a copy of seq_ ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); - mem_mark_primary_se(opt, ar.n, ar.a); + mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); free(seq); return ar; } @@ -967,6 +971,7 @@ typedef struct { const mem_pestat_t *pes; bseq1_t *seqs; mem_alnreg_v *regs; + int64_t n_processed; } worker_t; static void worker1(void *data, int i, int tid) @@ -985,30 +990,26 @@ static void worker2(void *data, int i, int tid) extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { - mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a); + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } else { - mem_sam_pe(w->opt, w->bns, w->pac, w->pes, i, &w->seqs[i<<1], &w->regs[i<<1]); + mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } } -void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0) +void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0) { extern void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n); - int i; worker_t w; 
mem_alnreg_v *regs; mem_pestat_t pes[4]; regs = malloc(n * sizeof(mem_alnreg_v)); - for (i = 0; i < opt->n_threads; ++i) { - worker_t *p = &w; - p->opt = opt; p->bwt = bwt; p->bns = bns; p->pac = pac; - p->seqs = seqs; p->regs = regs; - p->pes = &pes[0]; - } + w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; + w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; + w.pes = &pes[0]; kt_for(opt->n_threads, worker1, &w, (opt->flag&MEM_F_PE)? n>>1 : n); // find mapping positions if (opt->flag&MEM_F_PE) { // infer insert sizes if not provided if (pes0) memcpy(pes, pes0, 4 * sizeof(mem_pestat_t)); // if pes0 != NULL, set the insert-size distribution as pes0 diff --git a/bwamem.h b/bwamem.h index 1beaa23..8b24c51 100644 --- a/bwamem.h +++ b/bwamem.h @@ -54,6 +54,7 @@ typedef struct { int w; // actual band width used in extension int seedcov; // length of regions coverged by seeds int secondary; // index of the parent hit shadowing the current hit; <0 if primary + uint64_t hash; } mem_alnreg_t; typedef struct { size_t n, m; mem_alnreg_t *a; } mem_alnreg_v; @@ -109,7 +110,7 @@ extern "C" { * @param pes0 insert-size info; if NULL, infer from data; if not NULL, it should be an array with 4 elements, * corresponding to each FF, FR, RF and RR orientation. See mem_pestat() for more info. 
*/ - void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int n, bseq1_t *seqs, const mem_pestat_t *pes0); + void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int64_t n_processed, int n, bseq1_t *seqs, const mem_pestat_t *pes0); /** * Find the aligned regions for one query sequence diff --git a/bwamem_pair.c b/bwamem_pair.c index 648c67e..1729b8c 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -235,7 +235,7 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { - extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a); + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); extern int mem_approx_mapq_se(const mem_opt_t *opt, const mem_alnreg_t *a); extern void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a, int extra_flag, const mem_aln_t *m); extern void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const mem_aln_t *list, int which, const mem_aln_t *m); @@ -257,8 +257,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } - mem_mark_primary_se(opt, a[0].n, a[0].a); - mem_mark_primary_se(opt, a[1].n, a[1].a); + mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); + mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { diff --git a/fastmap.c b/fastmap.c index 05e7d7f..1884ebc 100644 --- a/fastmap.c +++ b/fastmap.c @@ -27,6 
+27,7 @@ int main_mem(int argc, char *argv[]) bwaidx_t *idx; char *rg_line = 0; void *ko = 0, *ko2 = 0; + int64_t n_processed = 0; opt = mem_opt_init(); while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:")) >= 0) { @@ -137,7 +138,8 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < n; ++i) size += seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); - mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n, seqs, 0); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, 0); + n_processed += n; for (i = 0; i < n; ++i) { err_fputs(seqs[i].sam, stdout); free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); diff --git a/main.c b/main.c index 5acd89f..6d3e301 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r422" +#define PACKAGE_VERSION "0.7.5a-r423" #endif int bwa_fa2pac(int argc, char *argv[]); From 8b6ec749079b5df9d25f4fb88a77a2530150a45e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 25 Nov 2013 15:48:04 -0500 Subject: [PATCH 438/498] r424: fixed a bw bug in samse/pe --- bwase.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwase.c b/bwase.c index 746add5..540aa6a 100644 --- a/bwase.c +++ b/bwase.c @@ -180,7 +180,7 @@ bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int l assert(re <= l_pac); rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); assert(re - rb == rlen); - ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW, n_cigar, &cigar32); + ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > abs(rlen - len) * 1.5? 
SW_BW : abs(rlen - len) * 1.5, n_cigar, &cigar32); assert(*n_cigar > 0); if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping diff --git a/main.c b/main.c index 6d3e301..4ebdfe8 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r423" +#define PACKAGE_VERSION "0.7.5a-r424" #endif int bwa_fa2pac(int argc, char *argv[]); From f70d80a5a22d30097510ddf87dfda8399237927c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 30 Dec 2013 15:40:18 -0500 Subject: [PATCH 439/498] r427: fixed bugs in backtrack See comments in ksw_global() for details. --- ksw.c | 30 +++++++++++++++++++----------- main.c | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/ksw.c b/ksw.c index 454c49d..db018fa 100644 --- a/ksw.c +++ b/ksw.c @@ -502,26 +502,34 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; for (j = beg; LIKELY(j < end); ++j) { - // This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except: - // 1) not checking h>0; 2) recording direction for backtracking + // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) + // Cells are computed in the following order: + // M(i,j) = H(i-1,j-1) + S(i,j) + // H(i,j) = max{M(i,j), E(i,j), F(i,j)} + // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape + // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape + // We have to separate M(i,j); otherwise the direction may not be recorded correctly. + // However, a CIGAR like "10M3I3D10M" allowed by local() and extend() is disallowed by global(). 
+ // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k. + // In practice, this should happen very rarely given a reasonable scoring system. eh_t *p = &eh[j]; - int32_t h = p->h, e = p->e; + int32_t h, m = p->h, e = p->e; uint8_t d; // direction p->h = h1; - h += q[j]; - d = h >= e? 0 : 1; - h = h >= e? h : e; + m += q[j]; + d = m >= e? 0 : 1; + h = m >= e? m : e; d = h >= f? d : 2; h = h >= f? h : f; h1 = h; - h -= gapoe; + m -= gapoe; e -= gape; - d |= e > h? 1<<2 : 0; - e = e > h? e : h; + d |= e > m? 1<<2 : 0; + e = e > m? e : m; p->e = e; f -= gape; - d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two - f = f > h? f : h; + d |= f > m? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two + f = f > m? f : m; zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell } eh[end].h = h1; eh[end].e = MINUS_INF; diff --git a/main.c b/main.c index 4ebdfe8..3595cbc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r424" +#define PACKAGE_VERSION "0.7.5a-r427" #endif int bwa_fa2pac(int argc, char *argv[]); From 74a1a53499927de79888ebabb3223feb60e93b7b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 30 Dec 2013 15:49:41 -0500 Subject: [PATCH 440/498] print debugging msg to stdout --- bwamem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5089d75..9ab22e3 100644 --- a/bwamem.c +++ b/bwamem.c @@ -924,14 +924,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * exit(1); } w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); - if (bwa_verbose >= 4) fprintf(stderr, "Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w); + if (bwa_verbose >= 4) printf("Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? 
w2 : ar->w; else w2 = opt->w; i = 0; a.cigar = 0; do { free(a.cigar); a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); - if (bwa_verbose >= 4) fprintf(stderr, "Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); + if (bwa_verbose >= 4) printf("Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); a.NM = NM; From 3afcdc7746fa8a4656df7ef4cbbe24a855f87ffb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 30 Dec 2013 16:05:43 -0500 Subject: [PATCH 441/498] debugging code only: print seeds --- bwamem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/bwamem.c b/bwamem.c index 9ab22e3..a609e0d 100644 --- a/bwamem.c +++ b/bwamem.c @@ -215,6 +215,7 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) * s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; + if (bwa_verbose >= 5) printf("SEED l=%d,qb=%d,rb=%ld\n", s.len, s.qbeg, (long)s.rbeg); if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain From 10cb6b05077ad193dcf00adf814b780dbf2ed564 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 30 Dec 2013 16:18:45 -0500 Subject: [PATCH 442/498] r428: allow to change the default chain_drop_ratio --- bwamem.c | 40 +++++++++++++++++++++++----------------- fastmap.c | 4 +++- main.c | 2 +- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/bwamem.c b/bwamem.c index a609e0d..9b05528 100644 --- a/bwamem.c +++ b/bwamem.c @@ -232,12 +232,32 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) * } } +int mem_chain_weight(const mem_chain_t *c) +{ + int64_t end; + int j, w = 0, tmp; + for (j = 
0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->qbeg >= end) w += s->len; + else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + tmp = w; + for (j = 0, end = 0; j < c->n; ++j) { + const mem_seed_t *s = &c->seeds[j]; + if (s->rbeg >= end) w += s->len; + else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; + end = end > s->qbeg + s->len? end : s->qbeg + s->len; + } + return w < tmp? w : tmp; +} + void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) { int i, j; for (i = 0; i < chn->n; ++i) { mem_chain_t *p = &chn->a[i]; - err_printf("CHAIN(%d) n=%d", i, p->n); + err_printf("CHAIN(%d) n=%d w=%d", i, p->n, mem_chain_weight(p)); for (j = 0; j < p->n; ++j) { bwtint_t pos; int is_rev, ref_id; @@ -294,22 +314,8 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i < n_chn; ++i) { mem_chain_t *c = &chains[i]; - int64_t end; - int w = 0, tmp; - for (j = 0, end = 0; j < c->n; ++j) { - const mem_seed_t *s = &c->seeds[j]; - if (s->qbeg >= end) w += s->len; - else if (s->qbeg + s->len > end) w += s->qbeg + s->len - end; - end = end > s->qbeg + s->len? end : s->qbeg + s->len; - } - tmp = w; - for (j = 0, end = 0; j < c->n; ++j) { - const mem_seed_t *s = &c->seeds[j]; - if (s->rbeg >= end) w += s->len; - else if (s->rbeg + s->len > end) w += s->rbeg + s->len - end; - end = end > s->qbeg + s->len? end : s->qbeg + s->len; - } - w = w < tmp? 
w : tmp; + int w; + w = mem_chain_weight(c); a[i].beg = c->seeds[0].qbeg; a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; a[i].w = w; a[i].p = c; a[i].p2 = 0; diff --git a/fastmap.c b/fastmap.c index 1884ebc..40cea8c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -30,7 +30,7 @@ int main_mem(int argc, char *argv[]) int64_t n_processed = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:")) >= 0) { + while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -49,6 +49,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); + else if (c == 'D') opt->chain_drop_ratio = atof(optarg); else if (c == 'C') copy_comment = 1; else if (c == 'Q') { opt->mapQ_coef_len = atoi(optarg); @@ -75,6 +76,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio); fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); diff --git a/main.c b/main.c index 3595cbc..bb11578 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r427" +#define PACKAGE_VERSION "0.7.5a-r428" #endif int bwa_fa2pac(int argc, 
char *argv[]); From c26ba4e3767658b3d0abc979457debf44ae1baf7 Mon Sep 17 00:00:00 2001 From: Bradford Powell Date: Sun, 5 Jan 2014 14:54:48 -0500 Subject: [PATCH 443/498] fix duplicate PG lines in bwape and bwase --- bwape.c | 2 -- bwase.c | 3 --- main.c | 5 ----- 3 files changed, 10 deletions(-) diff --git a/bwape.c b/bwape.c index 2c96e06..82fc50b 100644 --- a/bwape.c +++ b/bwape.c @@ -49,7 +49,6 @@ int bwa_approx_mapQ(const bwa_seq_t *p, int mm); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); bntseq_t *bwa_open_nt(const char *prefix); void bwa_print_sam_SQ(const bntseq_t *bns); -void bwa_print_sam_PG(); pe_opt_t *bwa_init_pe_opt() { @@ -671,7 +670,6 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f // core loop bwa_print_sam_hdr(bns, rg_line); - bwa_print_sam_PG(); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; diff --git a/bwase.c b/bwase.c index 540aa6a..30f306e 100644 --- a/bwase.c +++ b/bwase.c @@ -19,8 +19,6 @@ int g_log_n[256]; -void bwa_print_sam_PG(); - void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) { int i, cnt, best; @@ -530,7 +528,6 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); bwa_print_sam_hdr(bns, rg_line); - //bwa_print_sam_PG(); // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop diff --git a/main.c b/main.c index bb11578..69bf765 100644 --- a/main.c +++ b/main.c @@ -57,11 +57,6 @@ static int usage() return 1; } -void bwa_print_sam_PG() -{ - err_printf("@PG\tID:bwa\tPN:bwa\tVN:%s\n", PACKAGE_VERSION); -} - int main(int argc, char *argv[]) { int i, ret; From ea3dc2f00300131baa7303c92326a7ab099d0288 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 29 Jan 2014 10:51:02 -0500 Subject: [PATCH 444/498] r430: fix a bug producing incorrect 
alignment Ksw uses two rounds of SSE2-SW to find the boundaries of an alignment. If the second round gives a different score from the first round, it will fail. The fix checks if this happens, though I have not dug into an example to understand why this may happen in the first place. --- bwamem.c | 4 +++- bwamem_pair.c | 2 +- main.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 9b05528..27b9c4c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -910,7 +910,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int i, w2, qb, qe, NM, score, is_rev; + int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30); int64_t pos, rb, re; uint8_t *query; @@ -939,6 +939,8 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * free(a.cigar); a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); if (bwa_verbose >= 4) printf("Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); + if (score == last_sc) break; // it is possible that global alignment and local alignment give different scores + last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); a.NM = NM; diff --git a/bwamem_pair.c b/bwamem_pair.c index 1729b8c..f1aa73a 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -147,7 +147,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? 
KSW_XBYTE : 0) | opt->min_seed_len; aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); - if (aln.score >= opt->min_seed_len) { + if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; diff --git a/main.c b/main.c index 69bf765..20cbf83 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r428" +#define PACKAGE_VERSION "0.7.5a-r430" #endif int bwa_fa2pac(int argc, char *argv[]); From f524c7d3d87bcc5a5b920ddb81a1ff4a9e439ea8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 29 Jan 2014 12:05:11 -0500 Subject: [PATCH 445/498] r431: added the MD tag to bwa-mem --- bwa.c | 27 +++++++++++++++++++++------ bwamem.c | 26 ++++++++++++++++++-------- main.c | 22 +--------------------- 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/bwa.c b/bwa.c index 33edd7f..21b71b8 100644 --- a/bwa.c +++ b/bwa.c @@ -6,6 +6,7 @@ #include "bwa.h" #include "ksw.h" #include "utils.h" +#include "kstring.h" #ifdef USE_MALLOC_WRAPPERS # include "malloc_wrap.h" @@ -91,6 +92,8 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa uint8_t tmp, *rseq; int i; int64_t rlen; + kstring_t str; + *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); @@ -122,18 +125,30 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); } {// compute NM - int k, x, y, n_mm = 0, n_gap = 0; - for (k = 0, x = y = 0; k < *n_cigar; ++k) { + int k, x, y, u, n_mm = 0, n_gap = 0; + str.l = str.m = *n_cigar 
* 4; str.s = (char*)cigar; // append MD to CIGAR + for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { int op = cigar[k]&0xf; int len = cigar[k]>>4; if (op == 0) { // match - for (i = 0; i < len; ++i) - if (query[x + i] != rseq[y + i]) ++n_mm; + for (i = 0; i < len; ++i) { + if (query[x + i] != rseq[y + i]) { + kputw(u, &str); kputc("ACGTN"[rseq[y+i]], &str); + ++n_mm; u = 0; + } else ++u; + } x += len; y += len; - } else if (op == 1) x += len, n_gap += len; - else if (op == 2) y += len, n_gap += len; + } else if (op == 2) { // deletion + kputw(u, &str); kputc('^', &str); + for (i = 0; i < len; ++i) + kputc("ACGTN"[rseq[y+i]], &str); + u = 0; + y += len, n_gap += len; + } else if (op == 1) x += len, n_gap += len; // insertion } + kputw(u, &str); kputc(0, &str); *NM = n_mm + n_gap; + cigar = (uint32_t*)str.s; } if (rb >= l_pac) // reverse back query for (i = 0; i < l_query>>1; ++i) diff --git a/bwamem.c b/bwamem.c index 27b9c4c..6f77064 100644 --- a/bwamem.c +++ b/bwamem.c @@ -772,7 +772,10 @@ void mem_aln2sam(const bntseq_t *bns, kstring_t *str, bseq1_t *s, int n, const m } // print optional tags - if (p->n_cigar) { kputsn("\tNM:i:", 6, str); kputw(p->NM, str); } + if (p->n_cigar) { + kputsn("\tNM:i:", 6, str); kputw(p->NM, str); + kputsn("\tMD:Z:", 6, str); kputs((char*)(p->cigar + p->n_cigar), str); + } if (p->score >= 0) { kputsn("\tAS:i:", 6, str); kputw(p->score, str); } if (p->sub >= 0) { kputsn("\tXS:i:", 6, str); kputw(p->sub, str); } if (bwa_rg_id[0]) { kputsn("\tRG:Z:", 6, str); kputs(bwa_rg_id, str); } @@ -910,7 +913,7 @@ mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30); + int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; int64_t pos, rb, re; uint8_t *query; @@ -943,27 +946,34 @@ mem_aln_t 
mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); + l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); a.is_rev = is_rev; - if (a.n_cigar > 0) { + if (a.n_cigar > 0) { // squeeze out leading or trailing deletions if ((a.cigar[0]&0xf) == 2) { pos += a.cigar[0]>>4; --a.n_cigar; - memmove(a.cigar, a.cigar + 1, a.n_cigar * 4); - } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) --a.n_cigar; + memmove(a.cigar, a.cigar + 1, a.n_cigar * 4 + l_MD); + } else if ((a.cigar[a.n_cigar-1]&0xf) == 2) { + --a.n_cigar; + memmove(a.cigar + a.n_cigar, a.cigar + a.n_cigar + 1, l_MD); // MD needs to be moved accordingly + } } if (qb != 0 || qe != l_query) { // add clipping to CIGAR int clip5, clip3; clip5 = is_rev? l_query - qe : qb; clip3 = is_rev? qb : l_query - qe; - a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2)); + a.cigar = realloc(a.cigar, 4 * (a.n_cigar + 2) + l_MD); if (clip5) { - memmove(a.cigar+1, a.cigar, a.n_cigar * 4); + memmove(a.cigar+1, a.cigar, a.n_cigar * 4 + l_MD); // make room for 5'-end clipping a.cigar[0] = clip5<<4 | 3; ++a.n_cigar; } - if (clip3) a.cigar[a.n_cigar++] = clip3<<4 | 3; + if (clip3) { + memmove(a.cigar + a.n_cigar + 1, a.cigar + a.n_cigar, l_MD); // make room for 3'-end clipping + a.cigar[a.n_cigar++] = clip3<<4 | 3; + } } a.rid = bns_pos2rid(bns, pos); a.pos = pos - bns->anns[a.rid].offset; diff --git a/main.c b/main.c index 20cbf83..46e794d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r430" +#define PACKAGE_VERSION "0.7.5a-r431" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -14,17 +14,9 @@ int bwa_bwt2sa(int argc, char *argv[]); int bwa_index(int argc, char *argv[]); int bwt_bwtgen_main(int argc, char *argv[]); -int bwa_aln(int argc, char *argv[]); -int bwa_sai2sam_se(int argc, char *argv[]); -int 
bwa_sai2sam_pe(int argc, char *argv[]); - -int bwa_bwtsw2(int argc, char *argv[]); - int main_fastmap(int argc, char *argv[]); int main_mem(int argc, char *argv[]); -int main_pemerge(int argc, char *argv[]); - char *bwa_pg; static int usage() @@ -37,11 +29,6 @@ static int usage() fprintf(stderr, "Command: index index sequences in the FASTA format\n"); fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); - fprintf(stderr, " aln gapped/ungapped alignment\n"); - fprintf(stderr, " samse generate alignment (single ended)\n"); - fprintf(stderr, " sampe generate alignment (paired ended)\n"); - fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -73,15 +60,8 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); - else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); - else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); - else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; From 
5fdab3ae1399261e2a13a80a2b9229765b47e3b9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 31 Jan 2014 11:12:59 -0500 Subject: [PATCH 446/498] Released bwa-0.7.6-r432 --- NEWS | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 4 +++- bwa.1 | 14 ++++++-------- main.c | 2 +- 4 files changed, 66 insertions(+), 10 deletions(-) diff --git a/NEWS b/NEWS index 29627f0..eb9c37a 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,59 @@ +Release 0.7.6 (31 Januaray, 2014) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes in BWA-MEM: + + * Changed the way mapping quality is estimated. The new method tends to give + the same alignment a higher mapping quality. On paired-end reads, the change + is minor as with pairing, the mapping quality is usually high. For short + single-end reads, the difference is considerable. + + * Improved load balance when many threads are spawned. However, bwa-mem is + still not very thread efficient, probably due to the frequent heap memory + allocation. Further improvement is a little difficult and may affect the + code stability. + + * Allow to use different clipping penalties for 5'- and 3'-ends. This helps + when we do not want to clip one end. + + * Print the @PG line, including the command line options. + + * Improved the band width estimate: a) fixed a bug causing the band + width estimated from extension not used in the final global alignment; b) + try doubled band width if the global alignment score is smaller. + Insufficient band width leads to wrong CIGAR and spurious mismatches/indels. + + * Added a new option -D to fine tune a heuristic on dropping suboptimal hits. + Reducing -D increases accuracy but decreases the mapping speed. If unsure, + leave it to the default. + + * Bugfix: for a repetitive single-end read, the reported hit is not randomly + distributed among equally best hits. + + * Bugfix: missing paired-end hits due to unsorted list of SE hits. + + * Bugfix: incorrect CIGAR caused by a defect in the global alignment. 
+ + * Bugfix: incorrect CIGAR caused by failed SW rescue. + + * Bugfix: alignments largely mapped to the same position are regarded to be + distinct from each other, which leads to underestimated mapping quality. + + * Added the MD tag. + +There are no changes to BWA-backtrack in this release. However, it has a few +known issues yet to be fixed. If you prefer BWA-backtrack, it is still advised to +use bwa-0.6.x. + +While I developed BWA-MEM, I also found a few issues with BWA-SW. It is now +possible to improve BWA-SW with the lessons learned from BWA-MEM. However, as +BWA-MEM is usually better, I will not improve BWA-SW until I find applications +where BWA-SW may excel. + +(0.7.6: 31 January 2014, r432) + + + Release 0.7.5a (30 May, 2013) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/README.md b/README.md index d903e38..009a4ca 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ different sub-commands: **aln/samse/sampe** for BWA-backtrack, BWA is released under [GPLv3][1]. The latest souce code is [freely available][2] at github. Released packages can [be downloaded ][3] at SourceForge. After you acquire the source code, simply use `make` to compile -and copy the single executable `bwa` to the destination you want. +and copy the single executable `bwa` to the destination you want. The only +dependency of BWA is [zlib][14]. ###Seeking helps @@ -71,3 +72,4 @@ do not have plan to submit it to a peer-reviewed journal in the near future. [11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505 [12]: http://arxiv.org/abs/1303.3997 [13]: http://arxiv.org/ +[14]: http://zlib.net/ diff --git a/bwa.1 b/bwa.1 index e63fe8d..5949a1b 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "24 May 2013" "bwa-0.7.5" "Bioinformatics tools" +.TH bwa 1 "31 January 2014" "bwa-0.7.6" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -158,7 +158,7 @@ Number of threads [1] Minimum seed length. Matches shorter than .I INT will be missed. 
The alignment speed is usually insensitive to this value unless -it significantly deviates 20. [19] +it significantly deviates from 20. [19] .TP .BI -w \ INT Band width. Essentially, gaps longer than @@ -210,12 +210,13 @@ Gap extension penalty. A gap of length k costs O + k*E (i.e. .B -O is for opening a zero-length gap). [1] .TP -.BI -L \ INT +.BI -L \ INT[,INT] Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score reaching the end of query. If this score is larger than the best SW score minus the clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag reports the best SW score; clipping penalty is not -deducted. [5] +deducted. If two numbers are provided, the first is for 5'-end clipping and +second for 3'-end clipping. [5] .TP .BI -U \ INT Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as @@ -250,10 +251,6 @@ transfer read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment (the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. .TP -.B -H -Use hard clipping 'H' in the SAM output. This option may dramatically reduce -the redundancy of output when mapping long contig or BAC sequences. -.TP .B -M Mark shorter split hits as secondary (for Picard compatibility). 
.TP @@ -569,6 +566,7 @@ NM Edit distance MD Mismatching positions/bases AS Alignment score BC Barcode sequence +SA Supplementary alignments _ X0 Number of best hits X1 Number of suboptimal hits found by BWA diff --git a/main.c b/main.c index 46e794d..0a22f69 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.5a-r431" +#define PACKAGE_VERSION "0.7.6-r432" #endif int bwa_fa2pac(int argc, char *argv[]); From 7c50bad56782b65dcad4b9fbf5867e8e9b3a8ccc Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 31 Jan 2014 12:58:21 -0500 Subject: [PATCH 447/498] Release bwa-0.7.6a-r433 --- bwa.c | 5 +++-- main.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index 21b71b8..aec04d8 100644 --- a/bwa.c +++ b/bwa.c @@ -128,8 +128,9 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa int k, x, y, u, n_mm = 0, n_gap = 0; str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { - int op = cigar[k]&0xf; - int len = cigar[k]>>4; + int op, len; + cigar = (uint32_t*)str.s; + op = cigar[k]&0xf, len = cigar[k]>>4; if (op == 0) { // match for (i = 0; i < len; ++i) { if (query[x + i] != rseq[y + i]) { diff --git a/main.c b/main.c index 0a22f69..f872917 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6-r432" +#define PACKAGE_VERSION "0.7.6a-r433" #endif int bwa_fa2pac(int argc, char *argv[]); From e2748223f8d70e8b0a928bc10cb6c4842dcffe0b Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Fri, 7 Feb 2014 14:18:44 +0100 Subject: [PATCH 448/498] Added an option to specify the fastq base number --- Makefile | 4 ++-- bwaseqio.c | 6 +++--- bwtaln.c | 1 + bwtaln.h | 1 + bwtpssm.c | 6 ++++-- bwtpssmgap.c | 1 + seq2pssm.c | 4 ++-- 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 
5a2f792..5d9b018 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ CC= gcc CXX= g++ -#CFLAGS= -g -Wall +CFLAGS= -g -Wall #CFLAGS= -pg -Wall -O2 #CFLAGS= -O3 -L/scr/plastilin/pkerp/local/lib #CFLAGS = -pg #CFLAGS = -O3 -pg -CFLAGS =-O3 -Wall +#CFLAGS =-O3 -Wall CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ diff --git a/bwaseqio.c b/bwaseqio.c index edf90e7..a83948d 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -151,7 +151,7 @@ bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, in double match_score = 1, mismatch_score = -2, wild_score =0; int qualscores = 1; // do we use quality scores? kseq_t *seq = bs->ks; - int n_seqs, l, i,j, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; + int n_seqs, l, i,j, is_comp = mode&BWA_MODE_COMPREAD, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; if (l_bc > 15) { @@ -168,8 +168,8 @@ bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, in //exit(1); } - if (is_64 && seq->qual.l) - for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; + if (opt) + for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= (opt->fastq_base - 33); if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; if (l_bc) { // then trim barcode diff --git a/bwtaln.c b/bwtaln.c index 5ecf03b..99b3874 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -40,6 +40,7 @@ gap_opt_t *gap_init_opt() o->use_error_model=0; o->prior = 0.8; o->parclip = 0; + o->fastq_base = 33; return o; } diff --git a/bwtaln.h b/bwtaln.h index 6cb4707..d7984f3 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -128,6 +128,7 @@ typedef struct { char pssm_ratio_provided; float prior; int parclip; + int fastq_base; } gap_opt_t; #define BWA_PET_STD 1 diff --git a/bwtpssm.c b/bwtpssm.c index 6458655..167e42d 100644 --- a/bwtpssm.c +++ b/bwtpssm.c @@ -332,7 +332,7 @@ int bwa_pssm(int argc, char 
*argv[]) opt->max_entries = 400; - while ((c = getopt(argc, argv, "pn:z:y:o:e:i:d:l:k:cLR:m:t:NM:O:E:D:S:G:P:q:f:b012IB:")) >= 0) { + while ((c = getopt(argc, argv, "pn:z:y:o:e:i:d:l:k:cLR:m:t:NM:O:E:D:S:G:P:q:f:F:b012IB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -363,16 +363,18 @@ int bwa_pssm(int argc, char *argv[]) case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; case 'f': xreopen(optarg, "wb", stdout); break; + case 'F': opt->fastq_base=atoi(optarg); break; case 'b': opt->mode |= BWA_MODE_BAM; break; case '0': opt->mode |= BWA_MODE_BAM_SE; break; case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; - case 'I': opt->mode |= BWA_MODE_IL13; break; + case 'I': opt->mode |= BWA_MODE_IL13; opt->fastq_base=64; break; case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; } } + if (opte > 0) { opt->max_gape = opte; opt->mode &= ~BWA_MODE_GAPE; diff --git a/bwtpssmgap.c b/bwtpssmgap.c index afbc4ad..579be32 100644 --- a/bwtpssmgap.c +++ b/bwtpssmgap.c @@ -317,6 +317,7 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const //if (i == 4) //fprintf(stderr, "yay"); + int max_entries = 0; //fprintf(stderr, "pssm #1 id:%d %d \t[%d][%d,%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%lu]\t[%lu,%lu]\t%d\t[%6d, **%6d**, %6d, %6d]\n", mat->id, i, max_entries, gp_heap->empty_left, a, i, seq[i], "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos, curr_score, e.score_offset, mat->thresholds[i], mat->bi[i]); m = max_diff - (e.n_mm + e.n_gapo); diff --git a/seq2pssm.c b/seq2pssm.c index fe72035..625cb7c 100644 --- a/seq2pssm.c +++ b/seq2pssm.c @@ -337,7 +337,7 @@ float *read_ascii_quality_scores(char *filename) { * with it, and each of these will be added to the matrix. 
*/ PSSM error_model_to_pssm(PSSM mat, ubyte_t *seq, ubyte_t *qual, int len, int alphsize, - const float *error_model) { + const float *error_model, int fastq_base) { int i, k, q; @@ -755,7 +755,7 @@ int sequence_to_pssm(bwa_seq_t *s, int alphsize, float psnp, Probs *mc, float sc free_probs(P); if (opt->use_error_model) - error_model_to_pssm(s->mat, s->seq+nf, s->rqual+nf, s->len-nf, alphsize, opt->error_lookup); + error_model_to_pssm(s->mat, s->seq+nf, s->rqual+nf, s->len-nf, alphsize, opt->error_lookup, opt->fastq_base); /* if (debug) From 652b802f367e7b5f7174c00767ef21bc7e17d28e Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Fri, 7 Feb 2014 14:24:13 +0100 Subject: [PATCH 449/498] Removed some print statements --- Makefile | 4 ++-- bwtpssmgap.c | 17 ----------------- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 5d9b018..5a2f792 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ CC= gcc CXX= g++ -CFLAGS= -g -Wall +#CFLAGS= -g -Wall #CFLAGS= -pg -Wall -O2 #CFLAGS= -O3 -L/scr/plastilin/pkerp/local/lib #CFLAGS = -pg #CFLAGS = -O3 -pg -#CFLAGS =-O3 -Wall +CFLAGS =-O3 -Wall CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ diff --git a/bwtpssmgap.c b/bwtpssmgap.c index 579be32..54611d0 100644 --- a/bwtpssmgap.c +++ b/bwtpssmgap.c @@ -295,27 +295,14 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const g_visited++; visited++; - //no more space - /* - while (gp_heap->empty_left < 10) { - //fprintf(stderr, "burning entries\n"); - min_score = gap_destroy_min(gp_heap); - } - */ - gap_pop(gp_heap, mat->id, &e); // get the best entry k = e.k; l = e.l; // SA interval a = e.info>>20&1; i = e.info&0xffff; // strand, length - // fprintf(stderr, "best_found: %f mat->be[mat->length-1]-e.score_offset: %f\n", best_found, mat->be[mat->length-1] + e.score_offset); if (!(opt->mode & BWA_MODE_NONSTOP) && best_found > 
mat->be[mat->length-1] + e.score_offset + desired_mapq) { break; } - //fprintf(stderr, "e.score_offset: %f min_score: %f\n", e.score_offset, min_score); - // - //if (i == 4) - //fprintf(stderr, "yay"); int max_entries = 0; //fprintf(stderr, "pssm #1 id:%d %d \t[%d][%d,%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%lu]\t[%lu,%lu]\t%d\t[%6d, **%6d**, %6d, %6d]\n", mat->id, i, max_entries, gp_heap->empty_left, a, i, seq[i], "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos, curr_score, e.score_offset, mat->thresholds[i], mat->bi[i]); @@ -343,7 +330,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const if (i == 0) { if (curr_score > best_found) { if (num_hits_found >= 2) { - //fprintf(stderr, "moving thresholds\n"); calc_and_set_reverse_thresholds(mat, 1, get_length(mat), curr_score); addMinWidthToThresholds(mat, width); } @@ -356,7 +342,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const hit_found = 1; num_hits_found += 1; e.score_offset = 0; - //e.pssm_score = mat->be[i-1]; } else { continue; // no hit, skip @@ -508,7 +493,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const curr_offset = -((mat->be[i] - mat->be[i-1]) - base_score) + e.score_offset; } - //fprintf(stderr, "base_score: %d\n", base_score); if (curr_offset > min_score) { k = bwt->L2[c] + cnt_k[c] + 1; @@ -518,7 +502,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const if (k <= l) gap_push(gp_heap, mat->id, a, i, k, l, e.n_mm, e.n_gapo, e.n_gape, STATE_M, 0, opt, curr_score + base_score, curr_offset); } } - //fprintf(stderr, "id: %d gp_heap->empty_left: %d gp_heap->size-2: %d\n", mat->id, gp_heap->empty_left , (gp_heap->size - 2)); gap_finish_push(gp_heap); } From 7077f8a65de736f6321facc5d2a52d97455bb195 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Fri, 7 Feb 2014 15:52:45 +0100 Subject: [PATCH 450/498] Reverted some supposed bug fixes --- 
Makefile | 4 ++-- bwaseqio.c | 7 +++---- bwtaln.c | 1 - bwtaln.h | 1 - bwtpssm.c | 5 ++--- seq2pssm.c | 4 ++-- 6 files changed, 9 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 5a2f792..5d9b018 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ CC= gcc CXX= g++ -#CFLAGS= -g -Wall +CFLAGS= -g -Wall #CFLAGS= -pg -Wall -O2 #CFLAGS= -O3 -L/scr/plastilin/pkerp/local/lib #CFLAGS = -pg #CFLAGS = -O3 -pg -CFLAGS =-O3 -Wall +#CFLAGS =-O3 -Wall CXXFLAGS= $(CFLAGS) DFLAGS= -DHAVE_PTHREAD #-D_FILE_OFFSET_BITS=64 OBJS= QSufSort.o bwt_gen.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o \ diff --git a/bwaseqio.c b/bwaseqio.c index a83948d..7ccea82 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -151,7 +151,7 @@ bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, in double match_score = 1, mismatch_score = -2, wild_score =0; int qualscores = 1; // do we use quality scores? kseq_t *seq = bs->ks; - int n_seqs, l, i,j, is_comp = mode&BWA_MODE_COMPREAD, l_bc = mode>>24; + int n_seqs, l, i,j, is_comp = mode&BWA_MODE_COMPREAD, is_64=mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; if (l_bc > 15) { @@ -167,9 +167,8 @@ bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, in continue; //exit(1); } - - if (opt) - for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= (opt->fastq_base - 33); + if (is_64 && seq->qual.l) + for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; if (l_bc) { // then trim barcode diff --git a/bwtaln.c b/bwtaln.c index 99b3874..5ecf03b 100644 --- a/bwtaln.c +++ b/bwtaln.c @@ -40,7 +40,6 @@ gap_opt_t *gap_init_opt() o->use_error_model=0; o->prior = 0.8; o->parclip = 0; - o->fastq_base = 33; return o; } diff --git a/bwtaln.h b/bwtaln.h index d7984f3..6cb4707 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -128,7 +128,6 @@ typedef struct { char pssm_ratio_provided; float prior; 
int parclip; - int fastq_base; } gap_opt_t; #define BWA_PET_STD 1 diff --git a/bwtpssm.c b/bwtpssm.c index 167e42d..e3769e9 100644 --- a/bwtpssm.c +++ b/bwtpssm.c @@ -332,7 +332,7 @@ int bwa_pssm(int argc, char *argv[]) opt->max_entries = 400; - while ((c = getopt(argc, argv, "pn:z:y:o:e:i:d:l:k:cLR:m:t:NM:O:E:D:S:G:P:q:f:F:b012IB:")) >= 0) { + while ((c = getopt(argc, argv, "pn:z:y:o:e:i:d:l:k:cLR:m:t:NM:O:E:D:S:G:P:q:f:b012IB:")) >= 0) { switch (c) { case 'n': if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; @@ -363,12 +363,11 @@ int bwa_pssm(int argc, char *argv[]) case 'c': opt->mode &= ~BWA_MODE_COMPREAD; break; case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; case 'f': xreopen(optarg, "wb", stdout); break; - case 'F': opt->fastq_base=atoi(optarg); break; case 'b': opt->mode |= BWA_MODE_BAM; break; case '0': opt->mode |= BWA_MODE_BAM_SE; break; case '1': opt->mode |= BWA_MODE_BAM_READ1; break; case '2': opt->mode |= BWA_MODE_BAM_READ2; break; - case 'I': opt->mode |= BWA_MODE_IL13; opt->fastq_base=64; break; + case 'I': opt->mode |= BWA_MODE_IL13; break; case 'Y': opt->mode |= BWA_MODE_CFY; break; case 'B': opt->mode |= atoi(optarg) << 24; break; default: return 1; diff --git a/seq2pssm.c b/seq2pssm.c index 625cb7c..fe72035 100644 --- a/seq2pssm.c +++ b/seq2pssm.c @@ -337,7 +337,7 @@ float *read_ascii_quality_scores(char *filename) { * with it, and each of these will be added to the matrix. 
*/ PSSM error_model_to_pssm(PSSM mat, ubyte_t *seq, ubyte_t *qual, int len, int alphsize, - const float *error_model, int fastq_base) { + const float *error_model) { int i, k, q; @@ -755,7 +755,7 @@ int sequence_to_pssm(bwa_seq_t *s, int alphsize, float psnp, Probs *mc, float sc free_probs(P); if (opt->use_error_model) - error_model_to_pssm(s->mat, s->seq+nf, s->rqual+nf, s->len-nf, alphsize, opt->error_lookup, opt->fastq_base); + error_model_to_pssm(s->mat, s->seq+nf, s->rqual+nf, s->len-nf, alphsize, opt->error_lookup); /* if (debug) From 14aa43cca0116ba4dba6fa54d17d868c9ed5d13a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 12 Feb 2014 15:39:02 -0500 Subject: [PATCH 451/498] r434: added the missing bwasw/aln commands! --- main.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/main.c b/main.c index f872917..86de68e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a-r433" +#define PACKAGE_VERSION "0.7.6a-dev-r434" #endif int bwa_fa2pac(int argc, char *argv[]); @@ -14,9 +14,17 @@ int bwa_bwt2sa(int argc, char *argv[]); int bwa_index(int argc, char *argv[]); int bwt_bwtgen_main(int argc, char *argv[]); +int bwa_aln(int argc, char *argv[]); +int bwa_sai2sam_se(int argc, char *argv[]); +int bwa_sai2sam_pe(int argc, char *argv[]); + +int bwa_bwtsw2(int argc, char *argv[]); + int main_fastmap(int argc, char *argv[]); int main_mem(int argc, char *argv[]); +int main_pemerge(int argc, char *argv[]); + char *bwa_pg; static int usage() @@ -29,6 +37,11 @@ static int usage() fprintf(stderr, "Command: index index sequences in the FASTA format\n"); fprintf(stderr, " mem BWA-MEM algorithm\n"); fprintf(stderr, " fastmap identify super-maximal exact matches\n"); + fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); + fprintf(stderr, " aln gapped/ungapped alignment\n"); + fprintf(stderr, " samse generate alignment (single ended)\n"); + 
fprintf(stderr, " sampe generate alignment (paired ended)\n"); + fprintf(stderr, " bwasw BWA-SW for long queries\n"); fprintf(stderr, "\n"); fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); fprintf(stderr, " pac2bwt generate BWT from PAC\n"); @@ -60,8 +73,15 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); + else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); + else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); + else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); + else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); + else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); + else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; From 4adc34eccb6e18c53cbd6668c83f33bf4b0704a4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 18 Feb 2014 10:32:24 -0500 Subject: [PATCH 452/498] r435: bugfix - base not complemented on the rev --- bwa.c | 7 +++++-- main.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bwa.c b/bwa.c index aec04d8..d08899b 100644 --- a/bwa.c +++ b/bwa.c @@ -93,6 +93,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa int i; int64_t rlen; kstring_t str; + const char *int2base; *n_cigar = 0; *NM = -1; if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand @@ -127,6 
+128,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa {// compute NM int k, x, y, u, n_mm = 0, n_gap = 0; str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR + int2base = rb < l_pac? "ACGTN" : "TGCAN"; for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { int op, len; cigar = (uint32_t*)str.s; @@ -134,7 +136,8 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa if (op == 0) { // match for (i = 0; i < len; ++i) { if (query[x + i] != rseq[y + i]) { - kputw(u, &str); kputc("ACGTN"[rseq[y+i]], &str); + kputw(u, &str); + kputc(int2base[rseq[y+i]], &str); ++n_mm; u = 0; } else ++u; } @@ -142,7 +145,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa } else if (op == 2) { // deletion kputw(u, &str); kputc('^', &str); for (i = 0; i < len; ++i) - kputc("ACGTN"[rseq[y+i]], &str); + kputc(int2base[rseq[y+i]], &str); u = 0; y += len, n_gap += len; } else if (op == 1) x += len, n_gap += len; // insertion diff --git a/main.c b/main.c index 86de68e..55d43cf 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a-dev-r434" +#define PACKAGE_VERSION "0.7.6a+dev-r435" #endif int bwa_fa2pac(int argc, char *argv[]); From bdd14d2946a3bda730ffecce3713889460c501f5 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Feb 2014 10:08:43 -0500 Subject: [PATCH 453/498] r436: fix rare MD/NM-CIGAR inconsistencies --- bwa.c | 3 ++- main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bwa.c b/bwa.c index d08899b..42c1b7b 100644 --- a/bwa.c +++ b/bwa.c @@ -125,7 +125,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa // NW alignment *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); } - {// compute NM + {// compute NM and MD int k, x, y, u, n_mm = 0, n_gap = 0; str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD 
to CIGAR int2base = rb < l_pac? "ACGTN" : "TGCAN"; @@ -133,6 +133,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa int op, len; cigar = (uint32_t*)str.s; op = cigar[k]&0xf, len = cigar[k]>>4; + if (op == 2 && (k == 0 || k == *n_cigar - 1)) continue; // skip the leading or trailing deletions if (op == 0) { // match for (i = 0; i < len; ++i) { if (query[x + i] != rseq[y + i]) { diff --git a/main.c b/main.c index 55d43cf..bdcc476 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a+dev-r435" +#define PACKAGE_VERSION "0.7.6a+dev-r436" #endif int bwa_fa2pac(int argc, char *argv[]); From 52391a985560a7e3da4a7fe52e3cceee244cc30a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Feb 2014 10:54:26 -0500 Subject: [PATCH 454/498] r437: print timing for each batch of reads --- bwamem.c | 4 ++++ main.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index 6f77064..19ca561 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1024,7 +1024,9 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn worker_t w; mem_alnreg_v *regs; mem_pestat_t pes[4]; + double ctime, rtime; + ctime = cputime(); rtime = realtime(); regs = malloc(n * sizeof(mem_alnreg_v)); w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; @@ -1036,4 +1038,6 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn } kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? 
n>>1 : n); // generate alignment free(regs); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] Processed %d reads in %.3f CPU sec, %.3f real sec\n", __func__, n, cputime() - ctime, realtime() - rtime); } diff --git a/main.c b/main.c index bdcc476..5a26b71 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a+dev-r436" +#define PACKAGE_VERSION "0.7.6a+dev-r437" #endif int bwa_fa2pac(int argc, char *argv[]); From 17fb85a2275e2c9e2c3e69c68cc42b90eaeff58a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Feb 2014 11:31:54 -0500 Subject: [PATCH 455/498] r438: still an issue in MD It occurs when the global alignment disagrees with the local alignment. --- bwa.c | 13 +++++++------ main.c | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bwa.c b/bwa.c index 42c1b7b..f1d4649 100644 --- a/bwa.c +++ b/bwa.c @@ -133,7 +133,6 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa int op, len; cigar = (uint32_t*)str.s; op = cigar[k]&0xf, len = cigar[k]>>4; - if (op == 2 && (k == 0 || k == *n_cigar - 1)) continue; // skip the leading or trailing deletions if (op == 0) { // match for (i = 0; i < len; ++i) { if (query[x + i] != rseq[y + i]) { @@ -144,11 +143,13 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa } x += len; y += len; } else if (op == 2) { // deletion - kputw(u, &str); kputc('^', &str); - for (i = 0; i < len; ++i) - kputc(int2base[rseq[y+i]], &str); - u = 0; - y += len, n_gap += len; + if (k > 0 && k < *n_cigar) { // don't do the following if D is the first or the last CIGAR + kputw(u, &str); kputc('^', &str); + for (i = 0; i < len; ++i) + kputc(int2base[rseq[y+i]], &str); + u = 0; n_gap += len; + } + y += len; } else if (op == 1) x += len, n_gap += len; // insertion } kputw(u, &str); kputc(0, &str); diff --git a/main.c b/main.c index 5a26b71..5006f99 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include 
"utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a+dev-r437" +#define PACKAGE_VERSION "0.7.6a+dev-r438" #endif int bwa_fa2pac(int argc, char *argv[]); From ce026a07fc8f01a1e45c86fdd40acf6843c378ac Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 19 Feb 2014 13:10:33 -0500 Subject: [PATCH 456/498] r439: expose mem_opt_t::max_matesw --- README.md | 3 ++- fastmap.c | 4 +++- main.c | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 009a4ca..ac1e57e 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ different sub-commands: **aln/samse/sampe** for BWA-backtrack, ###Availability BWA is released under [GPLv3][1]. The latest souce code is [freely -available][2] at github. Released packages can [be downloaded ][3] at +available][2] at github. Released packages can [be downloaded][3] at SourceForge. After you acquire the source code, simply use `make` to compile and copy the single executable `bwa` to the destination you want. The only dependency of BWA is [zlib][14]. @@ -73,3 +73,4 @@ do not have plan to submit it to a peer-reviewed journal in the near future. 
[12]: http://arxiv.org/abs/1303.3997 [13]: http://arxiv.org/ [14]: http://zlib.net/ +[15]: https://github.com/lh3/bwa/tree/mem diff --git a/fastmap.c b/fastmap.c index 40cea8c..72d850c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -30,7 +30,7 @@ int main_mem(int argc, char *argv[]) int64_t n_processed = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:")) >= 0) { + while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -50,6 +50,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'D') opt->chain_drop_ratio = atof(optarg); + else if (c == 'm') opt->max_matesw = atoi(optarg); else if (c == 'C') copy_comment = 1; else if (c == 'Q') { opt->mapQ_coef_len = atoi(optarg); @@ -77,6 +78,7 @@ int main_mem(int argc, char *argv[]) // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio); + fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); diff --git a/main.c b/main.c index 5006f99..c264abe 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a+dev-r438" +#define PACKAGE_VERSION "0.7.6a+dev-r439" #endif int bwa_fa2pac(int argc, char *argv[]); From 
e87981737301bbcfeef68d9b164f8f28ea8ac574 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 20 Feb 2014 13:06:40 -0500 Subject: [PATCH 457/498] r440: a condition not work due to a typo --- bwa.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwa.c b/bwa.c index f1d4649..140d57e 100644 --- a/bwa.c +++ b/bwa.c @@ -143,7 +143,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa } x += len; y += len; } else if (op == 2) { // deletion - if (k > 0 && k < *n_cigar) { // don't do the following if D is the first or the last CIGAR + if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR kputw(u, &str); kputc('^', &str); for (i = 0; i < len; ++i) kputc(int2base[rseq[y+i]], &str); diff --git a/main.c b/main.c index c264abe..803cf10 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a+dev-r439" +#define PACKAGE_VERSION "0.7.6a+dev-r440" #endif int bwa_fa2pac(int argc, char *argv[]); From 1c19bc630f7cade6a27914f687bc4c862b3e567f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 25 Feb 2014 01:05:37 -0500 Subject: [PATCH 458/498] Released bwa-0.7.7-r441 --- NEWS | 17 +++++++++++++++++ bwa.1 | 2 +- main.c | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index eb9c37a..a7c64ed 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,20 @@ +Release 0.7.7 (25 Feburary, 2014) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release fixes incorrect MD tags in the BWA-MEM output. + +A note about short-read mapping to GRCh38. The new human reference genome +GRCh38 contains 60Mbp program generated alpha repeat arrays, some of which are +hard masked as they cannot be localized. These highly repetitive arrays make +BWA-MEM ~50% slower. If you are concerned with the performance of BWA-MEM, you +may consider to use option "-c2000 -m50". 
On simulated data, this setting helps +the performance at a very minor cost on accuracy. I may consider to change the +default in future releases. + +(0.7.7: 25 Feburary 2014, r441) + + + Release 0.7.6 (31 Januaray, 2014) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index 5949a1b..601a529 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "31 January 2014" "bwa-0.7.6" "Bioinformatics tools" +.TH bwa 1 "25 Feburary 2014" "bwa-0.7.7" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool diff --git a/main.c b/main.c index 803cf10..a8df9c0 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.6a+dev-r440" +#define PACKAGE_VERSION "0.7.7-r441" #endif int bwa_fa2pac(int argc, char *argv[]); From 2e9463ebf1c40e40778f04c1447336111b5ce235 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 26 Feb 2014 22:04:19 -0500 Subject: [PATCH 459/498] dev-r442: suppress exact full-length matches --- bwamem.c | 21 ++++++++++++++++++--- bwamem.h | 1 + fastmap.c | 4 +++- main.c | 2 +- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/bwamem.c b/bwamem.c index 19ca561..4a08fed 100644 --- a/bwamem.c +++ b/bwamem.c @@ -111,7 +111,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } -const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) +const bwtintv_v *smem_next2(smem_i *itr, int split_len, int split_width, int start_width) { int i, max, max_i, ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; @@ -119,7 +119,7 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM + itr->start = 
bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, start_width, itr->matches, itr->tmpvec); // search for SMEM if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; @@ -152,6 +152,11 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) return itr->matches; } +const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) +{ + return smem_next2(itr, split_len, split_width, 1); +} + /******************************** * Chaining while finding SMEMs * ********************************/ @@ -200,8 +205,9 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) * { const bwtintv_v *a; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1; split_len = split_len < itr->len? split_len : itr->len; - while ((a = smem_next(itr, split_len, opt->split_width)) != 0) { // to find all SMEM and some internal MEM + while ((a = smem_next2(itr, split_len, opt->split_width, start_width)) != 0) { // to find all SMEM and some internal MEM int i; for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start bwtintv_t *p = &a->a[i]; @@ -425,6 +431,13 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) return m; } +int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen) +{ + if (!(opt->flag & MEM_F_NO_EXACT) || n == 0 || a->truesc != qlen * opt->a) return n; + memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t)); + return n - 1; +} + void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id) // IMPORTANT: must run mem_sort_and_dedup() before calling this function { // similar to the loop in mem_chain_flt() int i, k, tmp; @@ -894,6 +907,8 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse } 
free(chn.a); regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); + if (opt->flag & MEM_F_NO_EXACT) + regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); return regs; } diff --git a/bwamem.h b/bwamem.h index 8b24c51..9686d27 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,6 +16,7 @@ typedef struct __smem_i smem_i; #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 #define MEM_F_NO_RESCUE 0x20 +#define MEM_F_NO_EXACT 0x40 typedef struct { int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r diff --git a/fastmap.c b/fastmap.c index 72d850c..2b2d3da 100644 --- a/fastmap.c +++ b/fastmap.c @@ -30,7 +30,7 @@ int main_mem(int argc, char *argv[]) int64_t n_processed = 0; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "paMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -45,6 +45,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; + else if (c == 'e') opt->flag |= MEM_F_NO_EXACT; else if (c == 'c') opt->max_occ = atoi(optarg); else if (c == 'd') opt->zdrop = atoi(optarg); else if (c == 'v') bwa_verbose = atoi(optarg); @@ -81,6 +82,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); + fprintf(stderr, " -e discard full-length exact matches\n"); fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT gap open 
penalty [%d]\n", opt->q); diff --git a/main.c b/main.c index a8df9c0..4edf9bd 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7-r441" +#define PACKAGE_VERSION "0.7.7+dev-r442" #endif int bwa_fa2pac(int argc, char *argv[]); From 8ede4ffbfa9e03032dba774e52b815dd84210113 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 16 Mar 2014 15:18:22 -0400 Subject: [PATCH 460/498] Fixed clang compiling warnings --- Makefile | 2 +- bwt.h | 8 ++++---- bwt_lite.c | 2 +- bwt_lite.h | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index ff48a20..6490932 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ CC= gcc #CC= clang --analyze -CFLAGS= -g -Wall -O2 +CFLAGS= -g -Wall -O2 -Wno-unused-function WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) diff --git a/bwt.h b/bwt.h index c36bf9b..d2ff0ac 100644 --- a/bwt.h +++ b/bwt.h @@ -96,14 +96,14 @@ extern "C" { void bwt_bwtupdate_core(bwt_t *bwt); - inline bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); - inline void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); + bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); + void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k); // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values void bwt_gen_cnt_table(bwt_t *bwt); - inline void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); - inline void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); + void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); + void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); int 
bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); diff --git a/bwt_lite.c b/bwt_lite.c index 6cd3b1d..9b47270 100644 --- a/bwt_lite.c +++ b/bwt_lite.c @@ -56,7 +56,7 @@ bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) } return b; } -inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) +uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) { uint32_t n, b; if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; diff --git a/bwt_lite.h b/bwt_lite.h index 0096b93..4fadcce 100644 --- a/bwt_lite.h +++ b/bwt_lite.h @@ -17,9 +17,9 @@ extern "C" { #endif bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq); - inline uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); - inline void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); - inline void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); + uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); + void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); + void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); void bwtl_destroy(bwtl_t *bwt); #ifdef __cplusplus From 8929bd1c25af96b0b4456a33652bd81756b6623c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 16 Mar 2014 15:18:58 -0400 Subject: [PATCH 461/498] r443: more verbose debugging information --- bwamem.c | 35 +++++++++++++++++++++++++---------- main.c | 2 +- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index 19ca561..c9b32b0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -215,7 +215,7 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) * s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; - if (bwa_verbose >= 5) printf("SEED l=%d,qb=%d,rb=%ld\n", s.len, s.qbeg, (long)s.rbeg); + if (bwa_verbose >= 5) printf("* Found SEED: 
length=%d,query_beg=%d,ref_beg=%ld\n", s.len, s.qbeg, (long)s.rbeg); if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain @@ -257,14 +257,14 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) int i, j; for (i = 0; i < chn->n; ++i) { mem_chain_t *p = &chn->a[i]; - err_printf("CHAIN(%d) n=%d w=%d", i, p->n, mem_chain_weight(p)); + err_printf("* Found CHAIN(%d): n=%d; weight=%d", i, p->n, mem_chain_weight(p)); for (j = 0; j < p->n; ++j) { bwtint_t pos; int is_rev, ref_id; pos = bns_depos(bns, p->seeds[j].rbeg, &is_rev); if (is_rev) pos -= p->seeds[j].len - 1; bns_cnt_ambi(bns, pos, p->seeds[j].len, &ref_id); - err_printf("\t%d,%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); + err_printf("\t%d;%d,%ld(%s:%c%ld)", p->seeds[j].len, p->seeds[j].qbeg, (long)p->seeds[j].rbeg, bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - bns->anns[ref_id].offset) + 1); } err_putchar('\n'); } @@ -518,7 +518,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, a.score = x.score; a.csub = x.score2; kv_push(mem_alnreg_t, *av, a); - if (bwa_verbose >= 4) printf("chain2aln(short): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); + if (bwa_verbose >= 4) printf("** Added alignment region via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); return 0; } @@ -584,7 +584,9 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int w = max_gap < opt->w? 
max_gap : opt->w; if (qd - rd < w && rd - qd < w) break; } - if (i < av->n) { // the seed is (almost) contained in an existing alignment + if (i < av->n) { // the seed is (almost) contained in an existing alignment; further testing is needed to confirm it is not leading to a different aln + if (bwa_verbose >= 4) + printf("** Seed(%d) [%ld;%ld,%ld] is almost contained in an existing alignment. Confirming whether extension is needed...\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); for (i = k + 1; i < c->n; ++i) { // check overlapping seeds in the same chain const mem_seed_t *t; if (srt[i] == 0) continue; @@ -597,6 +599,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int srt[k] = 0; // mark that seed extension has not been performed continue; } + if (bwa_verbose >= 4) + printf("** Seed(%d) might lead to a different alignment even though it is contained. Extension will be performed.\n", k); } a = kv_pushp(mem_alnreg_t, *av); @@ -604,7 +608,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->w = aw[0] = aw[1] = opt->w; a->score = a->truesc = -1; - if (bwa_verbose >= 4) err_printf("Extending from seed [%ld,%ld,%ld]\n", (long)s->len, (long)s->qbeg, (long)s->rbeg); + if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); if (s->qbeg) { // left extension uint8_t *rs, *qs; int qle, tle, gtle, gscore; @@ -616,8 +620,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[0] = opt->w << i; + if (bwa_verbose >= 4) { + int j; + printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); + printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); + } a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip5, opt->zdrop, 
s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]); - if (bwa_verbose >= 4) { printf("L\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } + if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } // check whether we prefer to reach the end of the query @@ -639,8 +648,13 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int for (i = 0; i < MAX_BAND_TRY; ++i) { int prev = a->score; aw[1] = opt->w << i; + if (bwa_verbose >= 4) { + int j; + printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); + printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); + } a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); - if (bwa_verbose >= 4) { printf("R\t%d < %d; w=%d; max_off=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } + if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } // similar to the above @@ -652,7 +666,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int a->truesc += gscore - sc0; } } else a->qe = l_query, a->re = s->rbeg + s->len; - if (bwa_verbose >= 4) { printf("[%d]\taw={%d,%d}\tscore=%d\t[%d,%d) <=> [%ld,%ld)\n", k, aw[0], aw[1], a->score, a->qb, a->qe, (long)a->rb, (long)a->re); fflush(stdout); } + if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; 
{left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]); // compute seedcov for (i = 0, a->seedcov = 0; i < c->n; ++i) { @@ -887,7 +901,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse for (i = 0; i < chn.n; ++i) { mem_chain_t *p = &chn.a[i]; int ret; - if (bwa_verbose >= 4) err_printf("===> Processing chain(%d) <===\n", i); + if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); free(chn.a[i].seeds); @@ -996,6 +1010,7 @@ typedef struct { static void worker1(void *data, int i, int tid) { worker_t *w = (worker_t*)data; + if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); if (!(w->opt->flag&MEM_F_PE)) { w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); } else { diff --git a/main.c b/main.c index a8df9c0..2fd0138 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7-r441" +#define PACKAGE_VERSION "0.7.7-r443" #endif int bwa_fa2pac(int argc, char *argv[]); From 7d63e76245ba668cf625a327972eb33fc5e95140 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 16 Mar 2014 23:25:04 -0400 Subject: [PATCH 462/498] r444: more debugging output in CIGAR generation Also found a potential issue which should not affect accuracy but may hurt speed. Will investigate later. 
--- bwa.c | 8 ++++++-- bwamem.c | 12 ++++++++---- main.c | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/bwa.c b/bwa.c index 140d57e..0a02905 100644 --- a/bwa.c +++ b/bwa.c @@ -106,6 +106,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; } if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP + // FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance. cigar = malloc(4); cigar[0] = l_query<<4 | 0; *n_cigar = 1; @@ -113,8 +114,6 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa *score += mat[rseq[i]*5 + query[i]]; } else { int w, max_gap, min_w; - //printf("[Q] "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - //printf("[R] "); for (i = 0; i < re - rb; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); // set the band-width max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); max_gap = max_gap > 1? max_gap : 1; @@ -123,6 +122,11 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa min_w = abs(rlen - l_query) + 3; w = w > min_w? 
w : min_w; // NW alignment + if (bwa_verbose >= 4) { + printf("* Global bandwidth: %d\n", w); + printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); + printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); + } *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); } {// compute NM and MD diff --git a/bwamem.c b/bwamem.c index c9b32b0..0ca116f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -948,14 +948,14 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * exit(1); } w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); - if (bwa_verbose >= 4) printf("Band width: infer=%d, opt=%d, alnreg=%d\n", w2, opt->w, ar->w); + if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; - else w2 = opt->w; + else w2 = opt->w; // TODO: check if we need this line. Need to test on many reads. 
i = 0; a.cigar = 0; do { free(a.cigar); a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); - if (bwa_verbose >= 4) printf("Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); + if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); if (score == last_sc) break; // it is possible that global alignment and local alignment give different scores last_sc = score; w2 <<= 1; @@ -1010,11 +1010,13 @@ typedef struct { static void worker1(void *data, int i, int tid) { worker_t *w = (worker_t*)data; - if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); if (!(w->opt->flag&MEM_F_PE)) { + if (bwa_verbose >= 4) printf("=====> Processing read '%s' <=====\n", w->seqs[i].name); w->regs[i] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i].l_seq, w->seqs[i].seq); } else { + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/1 <=====\n", w->seqs[i<<1|0].name); w->regs[i<<1|0] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|0].l_seq, w->seqs[i<<1|0].seq); + if (bwa_verbose >= 4) printf("=====> Processing read '%s'/2 <=====\n", w->seqs[i<<1|1].name); w->regs[i<<1|1] = mem_align1_core(w->opt, w->bwt, w->bns, w->pac, w->seqs[i<<1|1].l_seq, w->seqs[i<<1|1].seq); } } @@ -1024,10 +1026,12 @@ static void worker2(void *data, int i, int tid) extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { + if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); free(w->regs[i].a); } else { + if (bwa_verbose >= 4) printf("=====> 
Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); mem_sam_pe(w->opt, w->bns, w->pac, w->pes, (w->n_processed>>1) + i, &w->seqs[i<<1], &w->regs[i<<1]); free(w->regs[i<<1|0].a); free(w->regs[i<<1|1].a); } diff --git a/main.c b/main.c index 2fd0138..0c355fc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7-r443" +#define PACKAGE_VERSION "0.7.7-r444" #endif int bwa_fa2pac(int argc, char *argv[]); From e6931bec0377e0a0e028443b25d39cabdca23b0e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 17 Mar 2014 00:01:00 -0400 Subject: [PATCH 463/498] r445: unnecessarily large bandwidth in global --- bwamem.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 0ca116f..bac1f7f 100644 --- a/bwamem.c +++ b/bwamem.c @@ -687,7 +687,7 @@ static inline int infer_bw(int l1, int l2, int score, int a, int q, int r) { int w; if (l1 == l2 && l1 * a - score < (q + r - a)<<1) return 0; // to get equal alignment length, we need at least two gaps - w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 1.); + w = ((double)((l1 < l2? l1 : l2) * a - score - q) / r + 2.); if (w < abs(l1 - l2)) w = abs(l1 - l2); return w; } @@ -950,7 +950,7 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; - else w2 = opt->w; // TODO: check if we need this line. Need to test on many reads. +// else w2 = opt->w; // TODO: check if we need this line on long reads. On 1-800bp reads, it does not matter and it should be. 
i = 0; a.cigar = 0; do { free(a.cigar); diff --git a/main.c b/main.c index 0c355fc..8434b00 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7-r444" +#define PACKAGE_VERSION "0.7.7-master-r445" #endif int bwa_fa2pac(int argc, char *argv[]); From 0c783399e82f1438eab751fdd671637ef361135d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 28 Mar 2014 10:54:23 -0400 Subject: [PATCH 464/498] dev-448: different ins/del penalties --- ksw.c | 69 ++++++++++++++++++++++++++++++++++------------------------ ksw.h | 3 +++ main.c | 2 +- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/ksw.c b/ksw.c index db018fa..5eafb1b 100644 --- a/ksw.c +++ b/ksw.c @@ -107,11 +107,11 @@ kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t return q; } -kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) +kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) { int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; - __m128i zero, gapoe, gape, shift, *H0, *H1, *E, *Hmax; + __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax; kswr_t r; #define __max_16(ret, xx) do { \ @@ -128,8 +128,10 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi8(_gapo + _gape); - gape = _mm_set1_epi8(_gape); + oe_del = _mm_set1_epi8(_o_del + _e_del); + e_del = _mm_set1_epi8(_e_del); + oe_ins = _mm_set1_epi8(_o_ins + _e_ins); + e_ins = _mm_set1_epi8(_e_ins); shift = _mm_set1_epi8(q->shift); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; @@ -141,7 +143,7 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, // the core loop for (i = 0; i < tlen; ++i) { int j, k, cmp, imax; - __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian for (j = 0; LIKELY(j < slen); ++j) { @@ -159,13 +161,14 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, max = _mm_max_epu8(max, h); // set max _mm_store_si128(H1 + j, h); // save to H'(i,j) // now compute E'(i+1,j) - h = _mm_subs_epu8(h, gapoe); // h=H'(i,j)-gapo - e = _mm_subs_epu8(e, gape); // e=E'(i,j)-gape - e = _mm_max_epu8(e, h); // e=E'(i+1,j) + e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del + t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del + e = _mm_max_epu8(e, t); // e=E'(i+1,j) _mm_store_si128(E + j, e); // save to E'(i+1,j) // now compute F'(i,j+1) - f = _mm_subs_epu8(f, gape); - f = _mm_max_epu8(f, h); + f = _mm_subs_epu8(f, e_ins); + t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins + f = _mm_max_epu8(f, t); // get H'(i-1,j) and prepare for the next j h = _mm_load_si128(H0 + j); // h=H'(i-1,j) } @@ -176,8 +179,8 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, h = _mm_load_si128(H1 + j); h = _mm_max_epu8(h, f); // h=H'(i,j) _mm_store_si128(H1 + j, 
h); - h = _mm_subs_epu8(h, gapoe); - f = _mm_subs_epu8(f, gape); + h = _mm_subs_epu8(h, oe_ins); + f = _mm_subs_epu8(f, e_ins); cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); if (UNLIKELY(cmp == 0xffff)) goto end_loop16; } @@ -225,11 +228,11 @@ kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, return r; } -kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, int xtra) // the first gap costs -(_o+_e) +kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) { int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; uint64_t *b; - __m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax; + __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; kswr_t r; #define __max_8(ret, xx) do { \ @@ -245,8 +248,10 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000; m_b = n_b = 0; b = 0; zero = _mm_set1_epi32(0); - gapoe = _mm_set1_epi16(_gapo + _gape); - gape = _mm_set1_epi16(_gape); + oe_del = _mm_set1_epi16(_o_del + _e_del); + e_del = _mm_set1_epi16(_e_del); + oe_ins = _mm_set1_epi16(_o_ins + _e_ins); + e_ins = _mm_set1_epi16(_e_ins); H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; slen = q->slen; for (i = 0; i < slen; ++i) { @@ -257,7 +262,7 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, // the core loop for (i = 0; i < tlen; ++i) { int j, k, imax; - __m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector + __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example h = _mm_slli_si128(h, 2); for (j = 0; LIKELY(j < slen); ++j) { @@ -267,12 +272,13 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, h 
= _mm_max_epi16(h, f); max = _mm_max_epi16(max, h); _mm_store_si128(H1 + j, h); - h = _mm_subs_epu16(h, gapoe); - e = _mm_subs_epu16(e, gape); - e = _mm_max_epi16(e, h); + e = _mm_subs_epu16(e, e_del); + t = _mm_subs_epu16(h, oe_del); + e = _mm_max_epi16(e, t); _mm_store_si128(E + j, e); - f = _mm_subs_epu16(f, gape); - f = _mm_max_epi16(f, h); + f = _mm_subs_epu16(f, e_ins); + t = _mm_subs_epu16(h, oe_ins); + f = _mm_max_epi16(f, t); h = _mm_load_si128(H0 + j); } for (k = 0; LIKELY(k < 16); ++k) { @@ -281,8 +287,8 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, h = _mm_load_si128(H1 + j); h = _mm_max_epi16(h, f); _mm_store_si128(H1 + j, h); - h = _mm_subs_epu16(h, gapoe); - f = _mm_subs_epu16(f, gape); + h = _mm_subs_epu16(h, oe_ins); + f = _mm_subs_epu16(f, e_ins); if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; } } @@ -326,30 +332,30 @@ kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _gapo, int _gape, return r; } -static void revseq(int l, uint8_t *s) +static inline void revseq(int l, uint8_t *s) { int i, t; for (i = 0; i < l>>1; ++i) t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; } -kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry) { int size; kswq_t *q; kswr_t r, rr; - kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int); + kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int); q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); if (qry && *qry == 0) *qry = q; func = q->size == 2? 
ksw_i16 : ksw_u8; size = q->size; - r = func(q, tlen, target, gapo, gape, xtra); + r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra); if (qry == 0) free(q); if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end q = ksw_qinit(size, r.qe + 1, query, m, mat); - rr = func(q, tlen, target, gapo, gape, KSW_XSTOP | r.score); + rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score); revseq(r.qe + 1, query); revseq(r.te + 1, target); free(q); if (r.score == rr.score) @@ -357,6 +363,11 @@ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, con return r; } +kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) +{ + return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry); +} + /******************** *** SW extension *** ********************/ diff --git a/ksw.h b/ksw.h index 97559fd..5d45a67 100644 --- a/ksw.h +++ b/ksw.h @@ -61,6 +61,7 @@ extern "C" { * query profile will be deallocated in ksw_align(). 
*/ kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry); + kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry); /** * Banded global alignment @@ -80,6 +81,7 @@ extern "C" { * @return score of the alignment */ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar, uint32_t **cigar); + int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar, uint32_t **cigar); /** * Extend alignment @@ -103,6 +105,7 @@ extern "C" { * @return best semi-local alignment score */ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); + int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); #ifdef __cplusplus } diff --git a/main.c b/main.c index 4edf9bd..64d1d0b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r442" +#define PACKAGE_VERSION "0.7.7+dev-r448" #endif int bwa_fa2pac(int argc, char *argv[]); From 578bb55c38dfa7ea45cb11cf092ac5ba9dddd8fd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 28 Mar 2014 14:15:38 -0400 Subject: [PATCH 465/498] dev-449: unequal ins/del in global() and extend() --- ksw.c | 80 +++++++++++++++++++++++++++++++++++++--------------------- main.c | 2 +- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/ksw.c 
b/ksw.c index 5eafb1b..bb055bb 100644 --- a/ksw.c +++ b/ksw.c @@ -376,11 +376,11 @@ typedef struct { int32_t h, e; } eh_t; -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) +int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) { eh_t *eh; // score array int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, beg, end, max, max_i, max_j, max_gap, max_ie, gscore, max_off; + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; if (h0 < 0) h0 = 0; // allocate memory qp = malloc(qlen * m); @@ -391,25 +391,28 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; } // fill the first row - eh[0].h = h0; eh[1].h = h0 > gapoe? h0 - gapoe : 0; - for (j = 2; j <= qlen && eh[j-1].h > gape; ++j) - eh[j].h = eh[j-1].h - gape; + eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0; + for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j) + eh[j].h = eh[j-1].h - e_ins; // adjust $w if it is too large k = m * m; for (i = 0, max = 0; i < k; ++i) // get the max score max = max > mat[i]? max : mat[i]; - max_gap = (int)((double)(qlen * max + end_bonus - gapo) / gape + 1.); - max_gap = max_gap > 1? max_gap : 1; - w = w < max_gap? w : max_gap; + max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); + max_ins = max_ins > 1? max_ins : 1; + w = w < max_ins? w : max_ins; + max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); + max_del = max_del > 1? max_del : 1; + w = w < max_del? w : max_del; // TODO: is this necessary? 
// DP loop max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; max_off = 0; beg = 0, end = qlen; for (i = 0; LIKELY(i < tlen); ++i) { - int f = 0, h1, m = 0, mj = -1; + int t, f = 0, h1, m = 0, mj = -1; int8_t *q = &qp[target[i] * qlen]; // compute the first column - h1 = h0 - (gapo + gape * (i + 1)); + h1 = h0 - (o_del + e_del * (i + 1)); if (h1 < 0) h1 = 0; // apply the band and the constraint (if provided) if (beg < i - w) beg = i - w; @@ -430,23 +433,31 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, h1 = h; // save H(i,j) to h1 for the next column mj = m > h? mj : j; // record the position where max score is achieved m = m > h? m : h; // m is stored at eh[mj+1] - h -= gapoe; - h = h > 0? h : 0; - e -= gape; - e = e > h? e : h; // computed E(i+1,j) + t = h - oe_del; + t = t > 0? t : 0; + e -= e_del; + e = e > t? e : t; // computed E(i+1,j) p->e = e; // save E(i+1,j) for the next row - f -= gape; - f = f > h? f : h; // computed F(i,j+1) + t = h - oe_ins; + t = t > 0? t : 0; + f -= e_ins; + f = f > t? f : t; // computed F(i,j+1) } eh[end].h = h1; eh[end].e = 0; if (j == qlen) { max_ie = gscore > h1? max_ie : i; gscore = gscore > h1? gscore : h1; } - if (m == 0 || (zdrop > 0 && max - m - abs((i - max_i) - (j - max_j)) * gape > zdrop)) break; // drop to zero, or below Z-dropoff + if (m == 0) break; if (m > max) { max = m, max_i = i, max_j = mj; max_off = max_off > abs(mj - i)? 
max_off : abs(mj - i); + } else if (zdrop > 0) { + if (i - max_i > mj - max_j) { + if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; + } else { + if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break; + } } // update beg and end for the next round for (j = mj; j >= beg && eh[j].h; --j); @@ -464,6 +475,11 @@ int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, return max; } +int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off) +{ + return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off); +} + /******************** * Global alignment * ********************/ @@ -482,11 +498,11 @@ static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, return cigar; } -int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) +int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_) { eh_t *eh; int8_t *qp; // query profile - int i, j, k, gapoe = gapo + gape, score, n_col; + int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col; uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex if (n_cigar_) *n_cigar_ = 0; // allocate memory @@ -502,16 +518,16 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, // fill the first row eh[0].h = 0; eh[0].e = MINUS_INF; for (j = 1; j <= qlen && j <= w; ++j) - eh[j].h = -(gapo + gape * j), eh[j].e = MINUS_INF; + eh[j].h = -(o_ins + e_ins * j), eh[j].e 
= MINUS_INF; for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band // DP loop for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop - int32_t f = MINUS_INF, h1, beg, end; + int32_t f = MINUS_INF, h1, beg, end, t; int8_t *q = &qp[target[i] * qlen]; uint8_t *zi = &z[i * n_col]; beg = i > w? i - w : 0; end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence - h1 = beg == 0? -(gapo + gape * (i + 1)) : MINUS_INF; + h1 = beg == 0? -(o_del + e_del * (i + 1)) : MINUS_INF; for (j = beg; LIKELY(j < end); ++j) { // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) // Cells are computed in the following order: @@ -533,14 +549,15 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, d = h >= f? d : 2; h = h >= f? h : f; h1 = h; - m -= gapoe; - e -= gape; - d |= e > m? 1<<2 : 0; - e = e > m? e : m; + t = m - oe_del; + e -= e_del; + d |= e > t? 1<<2 : 0; + e = e > t? e : t; p->e = e; - f -= gape; - d |= f > m? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two - f = f > m? f : m; + t = m - oe_ins; + f -= e_ins; + d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two + f = f > t? 
f : t; zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell } eh[end].h = h1; eh[end].e = MINUS_INF; @@ -566,6 +583,11 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, return score; } +int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) +{ + return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_); +} + /******************************************* * Main function (not compiled by default) * *******************************************/ diff --git a/main.c b/main.c index 64d1d0b..8cba4ed 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r448" +#define PACKAGE_VERSION "0.7.7+dev-r449" #endif int bwa_fa2pac(int argc, char *argv[]); From 9ce50a4e5e74b61debd2eac81cfcc57f57a2b6e9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 28 Mar 2014 14:54:06 -0400 Subject: [PATCH 466/498] dev-450: support diff ins/del penalties. NO TEST!! 
--- bwa.c | 24 +++++++++++++++----- bwa.h | 2 ++ bwamem.c | 29 ++++++++++++++++-------- bwamem.h | 4 +++- bwamem_pair.c | 6 +++-- fastmap.c | 63 +++++++++++++++++++++++++++------------------------ main.c | 2 +- 7 files changed, 81 insertions(+), 49 deletions(-) diff --git a/bwa.c b/bwa.c index 0a02905..0e9e606 100644 --- a/bwa.c +++ b/bwa.c @@ -86,7 +86,7 @@ void bwa_fill_scmat(int a, int b, int8_t mat[25]) } // Generate CIGAR when the alignment end points are known -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) +uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) { uint32_t *cigar = 0; uint8_t tmp, *rseq; @@ -113,9 +113,11 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa for (i = 0, *score = 0; i < l_query; ++i) *score += mat[rseq[i]*5 + query[i]]; } else { - int w, max_gap, min_w; + int w, max_gap, max_ins, max_del, min_w; // set the band-width - max_gap = (int)((double)(((l_query+1)>>1) * mat[0] - q) / r + 1.); + max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.); + max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.); + max_gap = max_ins > max_del? max_ins : max_del; max_gap = max_gap > 1? max_gap : 1; w = (max_gap + abs(rlen - l_query) + 1) >> 1; w = w < w_? 
w : w_; @@ -127,7 +129,7 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); } - *score = ksw_global(l_query, query, rlen, rseq, 5, mat, q, r, w, n_cigar, &cigar); + *score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar); } {// compute NM and MD int k, x, y, u, n_mm = 0, n_gap = 0; @@ -169,7 +171,12 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa return cigar; } -int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) +uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) +{ + return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); +} + +int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) { int is_rev; int64_t cb, ce, fm; @@ -188,7 +195,7 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, int64_t x; cb = cb > *rb? cb : *rb; ce = ce < *re? 
ce : *re; - cigar = bwa_gen_cigar(mat, q, r, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); + cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { int op = cigar[i]&0xf, len = cigar[i]>>4; if (op == 0) { @@ -214,6 +221,11 @@ int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, return (*qb == *qe || *rb == *re)? -2 : 0; } +int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) +{ + return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re); +} + /********************* * Full index reader * *********************/ diff --git a/bwa.h b/bwa.h index 9d5b2aa..8d46e58 100644 --- a/bwa.h +++ b/bwa.h @@ -32,7 +32,9 @@ extern "C" { void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); + uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); + int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); diff --git a/bwamem.c b/bwamem.c index 7f08e20..33b86ad 100644 --- a/bwamem.c +++ b/bwamem.c @@ -47,7 +47,10 @@ mem_opt_t *mem_opt_init() 
mem_opt_t *o; o = calloc(1, sizeof(mem_opt_t)); o->flag = 0; - o->a = 1; o->b = 4; o->q = 6; o->r = 1; o->w = 100; + o->a = 1; o->b = 4; + o->o_del = o->o_ins = 6; + o->e_del = o->e_ins = 1; + o->w = 100; o->T = 30; o->zdrop = 100; o->pen_unpaired = 17; @@ -446,7 +449,9 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i kv_init(z); for (i = 0; i < n; ++i) a[i].sub = 0, a[i].secondary = -1, a[i].hash = hash_64(id+i); ks_introsort(mem_ars_hash, n, a); - tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; + tmp = opt->a + opt->b; + tmp = opt->o_del + opt->e_del > tmp? opt->o_del + opt->e_del : tmp; + tmp = opt->o_ins + opt->e_ins > tmp? opt->o_ins + opt->e_ins : tmp; kv_push(int, z, 0); for (i = 1; i < n; ++i) { for (k = 0; k < z.n; ++k) { @@ -522,7 +527,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); assert(rlen == re - rb); xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); - x = ksw_align(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->q, opt->r, xtra, 0); + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); free(rseq); if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; @@ -537,7 +542,9 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, static inline int cal_max_gap(const mem_opt_t *opt, int qlen) { - int l = (int)((double)(qlen * opt->a - opt->q) / opt->r + 1.); + int l_del = (int)((double)(qlen * opt->a - opt->o_del) / opt->e_del + 1.); + int l_ins = (int)((double)(qlen * opt->a - opt->o_ins) / opt->e_ins + 1.); + int l = l_del > l_ins? l_del : l_ins; l = l > 1? l : 1; return l < opt->w<<1? 
l : opt->w<<1; } @@ -638,7 +645,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int printf("*** Left ref: "); for (j = 0; j < tmp; ++j) putchar("ACGTN"[(int)rs[j]]); putchar('\n'); printf("*** Left query: "); for (j = 0; j < s->qbeg; ++j) putchar("ACGTN"[(int)qs[j]]); putchar('\n'); } - a->score = ksw_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->q, opt->r, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]); + a->score = ksw_extend2(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]); if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); } if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break; } @@ -666,7 +673,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int printf("*** Right ref: "); for (j = 0; j < rmax[1] - rmax[0] - re; ++j) putchar("ACGTN"[(int)rseq[re+j]]); putchar('\n'); printf("*** Right query: "); for (j = 0; j < l_query - qe; ++j) putchar("ACGTN"[(int)query[qe+j]]); putchar('\n'); } - a->score = ksw_extend(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->q, opt->r, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); + a->score = ksw_extend2(l_query - qe, query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]); if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); } if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break; } @@ -942,7 +949,7 @@ mem_alnreg_v mem_align1(const mem_opt_t 
*opt, const bwt_t *bwt, const bntseq_t * mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; - int i, w2, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; + int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; int64_t pos, rb, re; uint8_t *query; @@ -958,18 +965,20 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - if (bwa_fix_xref(opt->mat, opt->q, opt->r, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { + if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__); exit(1); } - w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->q, opt->r); + tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); + w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); + w2 = w2 > tmp? w2 : tmp; if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; // else w2 = opt->w; // TODO: check if we need this line on long reads. On 1-800bp reads, it does not matter and it should be. 
i = 0; a.cigar = 0; do { free(a.cigar); - a.cigar = bwa_gen_cigar(opt->mat, opt->q, opt->r, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); + a.cigar = bwa_gen_cigar2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, w2, bns->l_pac, pac, qe - qb, (uint8_t*)&query[qb], rb, re, &score, &a.n_cigar, &NM); if (bwa_verbose >= 4) printf("* Final alignment: w2=%d, global_sc=%d, local_sc=%d\n", w2, score, ar->truesc); if (score == last_sc) break; // it is possible that global alignment and local alignment give different scores last_sc = score; diff --git a/bwamem.h b/bwamem.h index 9686d27..5291491 100644 --- a/bwamem.h +++ b/bwamem.h @@ -19,7 +19,9 @@ typedef struct __smem_i smem_i; #define MEM_F_NO_EXACT 0x40 typedef struct { - int a, b, q, r; // match score, mismatch penalty and gap open/extension penalty. A gap of size k costs q+k*r + int a, b; // match score and mismatch penalty + int o_del, e_del; + int o_ins, e_ins; int pen_unpaired; // phred-scaled penalty for unpaired reads int pen_clip5,pen_clip3;// clipping penalty. This score is not deducted from the DP score. int w; // band width diff --git a/bwamem_pair.c b/bwamem_pair.c index f1aa73a..4c9c3de 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -145,7 +145,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me kswr_t aln; mem_alnreg_t b; int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len; - aln = ksw_align(l_ms, seq, len, ref, 5, opt->mat, opt->q, opt->r, xtra, 0); + aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 b.qb = is_rev? 
l_ms - (aln.qe + 1) : aln.qb; @@ -219,7 +219,9 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ y[v.a[i].y&3] = i; } if (u.n) { // found at least one proper pair - int tmp = opt->a + opt->b > opt->q + opt->r? opt->a + opt->b : opt->q + opt->r; + int tmp = opt->a + opt->b; + tmp = tmp > opt->o_del + opt->e_del? tmp : opt->o_del + opt->e_del; + tmp = tmp > opt->o_ins + opt->e_ins? tmp : opt->o_ins + opt->e_ins; ks_introsort_128(u.n, u.a); i = u.a[u.n-1].y >> 32; k = u.a[u.n-1].y << 32 >> 32; z[v.a[i].y&1] = v.a[i].y<<32>>34; // index of the best pair diff --git a/fastmap.c b/fastmap.c index 2b2d3da..e926160 100644 --- a/fastmap.c +++ b/fastmap.c @@ -25,7 +25,7 @@ int main_mem(int argc, char *argv[]) kseq_t *ks, *ks2 = 0; bseq1_t *seqs; bwaidx_t *idx; - char *rg_line = 0; + char *p, *rg_line = 0; void *ko = 0, *ko2 = 0; int64_t n_processed = 0; @@ -35,8 +35,6 @@ int main_mem(int argc, char *argv[]) else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); else if (c == 'B') opt->b = atoi(optarg); - else if (c == 'O') opt->q = atoi(optarg); - else if (c == 'E') opt->r = atoi(optarg); else if (c == 'T') opt->T = atoi(optarg); else if (c == 'U') opt->pen_unpaired = atoi(optarg); else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? opt->n_threads : 1; @@ -56,8 +54,15 @@ int main_mem(int argc, char *argv[]) else if (c == 'Q') { opt->mapQ_coef_len = atoi(optarg); opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? 
log(opt->mapQ_coef_len) : 0; + } else if (c == 'O') { + opt->o_del = opt->o_ins = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->o_ins = strtol(p+1, &p, 10); + } else if (c == 'E') { + opt->e_del = opt->e_ins = strtol(optarg, &p, 10); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + opt->e_ins = strtol(p+1, &p, 10); } else if (c == 'L') { - char *p; opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->pen_clip3 = strtol(p+1, &p, 10); @@ -71,33 +76,33 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); fprintf(stderr, "Algorithm options:\n\n"); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); - fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); - fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); -// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); - fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio); - fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); - fprintf(stderr, " -S skip mate rescue\n"); - fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); - fprintf(stderr, " -e discard full-length exact matches\n"); - fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); - fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); - fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->q); - fprintf(stderr, " -E 
INT gap extension penalty; a gap of size k cost {-O} + {-E}*k [%d]\n", opt->r); - fprintf(stderr, " -L INT penalty for clipping [%d]\n", opt->pen_clip5); - fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); + fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); + fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); + fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); + fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); + fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); +// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); + fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); + fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio); + fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); + fprintf(stderr, " -S skip mate rescue\n"); + fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); + fprintf(stderr, " -e discard full-length exact matches\n"); + fprintf(stderr, " -A INT score for a sequence match [%d]\n", opt->a); + fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); + fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); + fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); + fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); + fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, "\nInput/output options:\n\n"); - fprintf(stderr, " -p first query file 
consists of interleaved paired-end sequences\n"); - fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); + fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); + fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); fprintf(stderr, "\n"); - fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); - fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); - fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); - fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); - fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); + fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); + fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); + fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); + fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); + fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); diff --git a/main.c b/main.c index 8cba4ed..43178dc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r449" +#define PACKAGE_VERSION "0.7.7+dev-r450" #endif int bwa_fa2pac(int argc, char *argv[]); From 417c6d66c714d47e1243d7502fde4864499acbe1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Mar 2014 10:52:45 -0400 Subject: [PATCH 467/498] dev-r451: fixed a few bugs when -A!=1 Something is still wrong. 
--- bwa.1 | 19 +++++++++++++------ bwamem_pair.c | 11 +++++++---- main.c | 2 +- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/bwa.1 b/bwa.1 index 601a529..47bdd61 100644 --- a/bwa.1 +++ b/bwa.1 @@ -1,4 +1,4 @@ -.TH bwa 1 "25 Feburary 2014" "bwa-0.7.7" "Bioinformatics tools" +.TH bwa 1 "31 March 2014" "bwa-0.7.8" "Bioinformatics tools" .SH NAME .PP bwa - Burrows-Wheeler Alignment Tool @@ -148,9 +148,10 @@ not work with split alignments. One may consider to use option .B -M to flag shorter split hits as secondary. -.B OPTIONS: .RS .TP 10 +.B ALGORITHM OPTIONS: +.TP .BI -t \ INT Number of threads [1] .TP @@ -202,11 +203,14 @@ Matching score. [1] .BI -B \ INT Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4] .TP -.BI -O \ INT -Gap open penalty. [6] +.BI -O \ INT[,INT] +Gap open penalty. If two numbers are specified, the first is the penalty of +openning a deletion and the second for openning an insertion. [6] .TP -.BI -E \ INT -Gap extension penalty. A gap of length k costs O + k*E (i.e. +.BI -E \ INT[,INT] +Gap extension penalty. If two numbers are specified, the first is the penalty +of extending a deletion and second for extending an insertion. A gap of length +k costs O + k*E (i.e. .B -O is for opening a zero-length gap). [1] .TP @@ -224,6 +228,9 @@ Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these two scores to determine whether we should force pairing. A larger value leads to more aggressive read pair. [17] + +.TP +.B INPUT/OUTPUT OPTIONS: .TP .B -p Assume the first input query file is interleaved paired-end FASTA/Q. 
See the diff --git a/bwamem_pair.c b/bwamem_pair.c index 4c9c3de..b240f42 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -144,7 +144,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me if (len == re - rb) { // no funny things happening kswr_t aln; mem_alnreg_t b; - int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | opt->min_seed_len; + int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 @@ -235,6 +235,8 @@ int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_ return ret; } +#define raw_mapq(diff, a) ((int)(6.02 * (diff) / (a) + .499)) + int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]) { extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); @@ -276,10 +278,11 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co score_un = a[0].a[0].score + a[1].a[0].score - opt->pen_unpaired; //q_pe = o && subo < o? (int)(MEM_MAPQ_COEF * (1. - (double)subo / o) * log(a[0].a[z[0]].seedcov + a[1].a[z[1]].seedcov) + .499) : 0; subo = subo > score_un? subo : score_un; - q_pe = (o - subo) * 6; + q_pe = raw_mapq(o - subo, opt->a); if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); if (q_pe < 0) q_pe = 0; if (q_pe > 60) q_pe = 60; + //printf("[1] %ld, %d, %d\n", (long)id, q_pe, n_sub); // the following assumes no split hits if (o > score_un) { // paired alignment is preferred mem_alnreg_t *c[2]; @@ -293,8 +296,8 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co q_se[1] = q_se[1] > q_pe? 
q_se[1] : q_pe < q_se[1] + 40? q_pe : q_se[1] + 40; extra_flag |= 2; // cap at the tandem repeat score - q_se[0] = q_se[0] < (c[0]->score - c[0]->csub) * 6? q_se[0] : (c[0]->score - c[0]->csub) * 6; - q_se[1] = q_se[1] < (c[1]->score - c[1]->csub) * 6? q_se[1] : (c[1]->score - c[1]->csub) * 6; + q_se[0] = q_se[0] < raw_mapq(c[0]->score - c[0]->csub, opt->a)? q_se[0] : raw_mapq(c[0]->score - c[0]->csub, opt->a); + q_se[1] = q_se[1] < raw_mapq(c[1]->score - c[1]->csub, opt->a)? q_se[1] : raw_mapq(c[1]->score - c[1]->csub, opt->a); } else { // the unpaired alignment is preferred z[0] = z[1] = 0; q_se[0] = mem_approx_mapq_se(opt, &a[0].a[0]); diff --git a/main.c b/main.c index 43178dc..3a8811e 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r450" +#define PACKAGE_VERSION "0.7.7+dev-r451" #endif int bwa_fa2pac(int argc, char *argv[]); From b7076d902392720313d7da32a62fa63ecd283ba2 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Mar 2014 11:21:03 -0400 Subject: [PATCH 468/498] dev-r452: allow to specify insert size at cmd This is also very useful for debugging. 
--- fastmap.c | 31 +++++++++++++++++++++++++++---- main.c | 2 +- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/fastmap.c b/fastmap.c index e926160..12484bf 100644 --- a/fastmap.c +++ b/fastmap.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include "bwa.h" @@ -28,9 +29,13 @@ int main_mem(int argc, char *argv[]) char *p, *rg_line = 0; void *ko = 0, *ko2 = 0; int64_t n_processed = 0; + mem_pestat_t pes[4], *pes0 = 0; + + memset(pes, 0, 4 * sizeof(mem_pestat_t)); + for (i = 0; i < 4; ++i) pes[i].failed = 1; opt = mem_opt_init(); - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg); @@ -50,6 +55,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'D') opt->chain_drop_ratio = atof(optarg); else if (c == 'm') opt->max_matesw = atoi(optarg); + else if (c == 's') opt->split_width = atoi(optarg); else if (c == 'C') copy_comment = 1; else if (c == 'Q') { opt->mapQ_coef_len = atoi(optarg); @@ -68,7 +74,21 @@ int main_mem(int argc, char *argv[]) opt->pen_clip3 = strtol(p+1, &p, 10); } else if (c == 'R') { if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak - } else if (c == 's') opt->split_width = atoi(optarg); + } else if (c == 'I') { // specify the insert size distribution + pes0 = pes; + pes[1].failed = 0; + pes[1].avg = strtod(optarg, &p); + pes[1].std = pes[1].avg * .1; + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].std = strtod(p+1, &p); + pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499); + pes[1].low = (int)(pes[1].avg - 4. 
* pes[1].std + .499); + if (pes[1].low < 1) pes[1].low = 1; + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].high = (int)(strtod(p+1, &p) + .499); + if (*p != 0 && ispunct(*p) && isdigit(p[1])) + pes[1].low = (int)(strtod(p+1, &p) + .499); + } else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; @@ -102,7 +122,10 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); - fprintf(stderr, " -M mark shorter split hits as secondary (for Picard/GATK compatibility)\n"); + fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); + fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); + fprintf(stderr, " specify the mean, standard deviation (10%% of mean), max (4 sigma from the mean)\n"); + fprintf(stderr, " and min of the insert size distribution. FR orientation only. [inferred]\n"); fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); @@ -149,7 +172,7 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < n; ++i) size += seqs[i].l_seq; if (bwa_verbose >= 3) fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); - mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, 0); + mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); n_processed += n; for (i = 0; i < n; ++i) { err_fputs(seqs[i].sam, stdout); diff --git a/main.c b/main.c index 3a8811e..849d984 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r451" +#define PACKAGE_VERSION "0.7.7+dev-r452" #endif int bwa_fa2pac(int argc, char *argv[]); From b27bdf1ae03e00cb1fc6f346f83ae3cf8fab93bb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Mar 2014 11:52:52 -0400 Subject: 
[PATCH 469/498] dev-453: change of -A scales -TdBOELU These parameters are all proportional to -A. --- bwamem_pair.c | 1 - fastmap.c | 34 +++++++++++++++++++++++++++------- main.c | 2 +- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index b240f42..4a7cdf3 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -282,7 +282,6 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co if (n_sub > 0) q_pe -= (int)(4.343 * log(n_sub+1) + .499); if (q_pe < 0) q_pe = 0; if (q_pe > 60) q_pe = 60; - //printf("[1] %ld, %d, %d\n", (long)id, q_pe, n_sub); // the following assumes no split hits if (o > score_un) { // paired alignment is preferred mem_alnreg_t *c[2]; diff --git a/fastmap.c b/fastmap.c index 12484bf..892436c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -20,7 +20,7 @@ int kclose(void *a); int main_mem(int argc, char *argv[]) { - mem_opt_t *opt; + mem_opt_t *opt, opt0; int fd, fd2, i, c, n, copy_comment = 0; gzFile fp, fp2 = 0; kseq_t *ks, *ks2 = 0; @@ -35,13 +35,15 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < 4; ++i) pes[i].failed = 1; opt = mem_opt_init(); + opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1; + opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1; while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); - else if (c == 'A') opt->a = atoi(optarg); - else if (c == 'B') opt->b = atoi(optarg); - else if (c == 'T') opt->T = atoi(optarg); - else if (c == 'U') opt->pen_unpaired = atoi(optarg); + else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; + else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1; + else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; + else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads =
opt->n_threads > 1? opt->n_threads : 1; else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'a') opt->flag |= MEM_F_ALL; @@ -50,7 +52,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'e') opt->flag |= MEM_F_NO_EXACT; else if (c == 'c') opt->max_occ = atoi(optarg); - else if (c == 'd') opt->zdrop = atoi(optarg); + else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'D') opt->chain_drop_ratio = atof(optarg); @@ -61,14 +63,17 @@ int main_mem(int argc, char *argv[]) opt->mapQ_coef_len = atoi(optarg); opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? log(opt->mapQ_coef_len) : 0; } else if (c == 'O') { + opt0.o_del = opt0.o_ins = 1; opt->o_del = opt->o_ins = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->o_ins = strtol(p+1, &p, 10); } else if (c == 'E') { + opt0.e_del = opt0.e_ins = 1; opt->e_del = opt->e_ins = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->e_ins = strtol(p+1, &p, 10); } else if (c == 'L') { + opt0.pen_clip5 = opt0.pen_clip3 = 1; opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); if (*p != 0 && ispunct(*p) && isdigit(p[1])) opt->pen_clip3 = strtol(p+1, &p, 10); @@ -88,6 +93,9 @@ int main_mem(int argc, char *argv[]) pes[1].high = (int)(strtod(p+1, &p) + .499); if (*p != 0 && ispunct(*p) && isdigit(p[1])) pes[1].low = (int)(strtod(p+1, &p) + .499); + if (bwa_verbose >= 3) + fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", + __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); } else return 1; } @@ -108,7 +116,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -e discard full-length exact matches\n"); - fprintf(stderr, " -A INT score for a 
sequence match [%d]\n", opt->a); + fprintf(stderr, " -A INT score for a sequence match, which scales [-TdBOELU] unless overridden [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); @@ -132,6 +140,18 @@ int main_mem(int argc, char *argv[]) return 1; } + if (opt0.a == 1) { // matching score is changed + if (opt0.b != 1) opt->b *= opt->a; + if (opt0.T != 1) opt->T *= opt->a; + if (opt0.o_del != 1) opt->o_del *= opt->a; + if (opt0.e_del != 1) opt->e_del *= opt->a; + if (opt0.o_ins != 1) opt->o_ins *= opt->a; + if (opt0.e_ins != 1) opt->e_ins *= opt->a; + if (opt0.zdrop != 1) opt->zdrop *= opt->a; + if (opt0.pen_clip5 != 1) opt->pen_clip5 *= opt->a; + if (opt0.pen_clip3 != 1) opt->pen_clip3 *= opt->a; + if (opt0.pen_unpaired != 1) opt->pen_unpaired *= opt->a; + } bwa_fill_scmat(opt->a, opt->b, opt->mat); if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak diff --git a/main.c b/main.c index 849d984..9a3d044 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r452" +#define PACKAGE_VERSION "0.7.7+dev-r453" #endif int bwa_fa2pac(int argc, char *argv[]); From 127c00cc9651bf9b049e217fd845d81d6a146abd Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Mar 2014 12:03:27 -0400 Subject: [PATCH 470/498] dev-454: wording change in command line prompt --- fastmap.c | 5 +++-- main.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fastmap.c b/fastmap.c index 892436c..5d6d9d6 100644 --- a/fastmap.c +++ b/fastmap.c @@ -132,8 +132,9 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -M mark shorter split 
hits as secondary\n\n"); fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); - fprintf(stderr, " specify the mean, standard deviation (10%% of mean), max (4 sigma from the mean)\n"); - fprintf(stderr, " and min of the insert size distribution. FR orientation only. [inferred]\n"); + fprintf(stderr, " specify the mean, standard deviation (10%% of mean if absent), max (4\n"); + fprintf(stderr, " sigma from the mean if absent) and min of the insert size distribution.\n"); + fprintf(stderr, " FR orientation only. [inferred]\n"); fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); free(opt); diff --git a/main.c b/main.c index 9a3d044..5a983c9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r453" +#define PACKAGE_VERSION "0.7.7+dev-r454" #endif int bwa_fa2pac(int argc, char *argv[]); From 3efb7c0e91e5232f4858d9e6033bd5eb9de2348b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 31 Mar 2014 15:27:23 -0400 Subject: [PATCH 471/498] r455: release bwa-0.7.8 --- NEWS | 32 ++++++++++++++++++++++++++++++++ bwa.1 | 7 +++++++ fastmap.c | 4 ++-- main.c | 2 +- 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index a7c64ed..40f4433 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,35 @@ +Release 0.7.8 (31 March, 2014) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Changes in BWA-MEM: + + * Bugfix: off-diagonal X-dropoff (option -d) not working as intended. + Short-read alignment is not affected. + + * Bugfix: unnecessarily large bandwidth used during global alignment, + which reduces the mapping speed by ~5% for short reads. Results are not + affected. + + * Bugfix: when the matching score is not one, paired-end mapping quality is + inaccurate. + + * When the matching score (option -A) is changed, scale all score-related + options accordingly unless overridden by users. 
+ + * Allow to specify different gap open (or extension) penalties for deletions + and insertions separately. + + * Allow to specify the insert size distribution. + + * Better and more detailed debugging information. + +With the default setting, 0.7.8 and 0.7.7 gave identical output on one million +100bp read pairs. + +(0.7.8: 31 March 2014, r455) + + + Release 0.7.7 (25 Feburary, 2014) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bwa.1 b/bwa.1 index 47bdd61..b6354e5 100644 --- a/bwa.1 +++ b/bwa.1 @@ -267,6 +267,13 @@ supported throughout BWA. Ideally, a value 0 for disabling all the output to stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for all normal messages; 4 or higher for debugging. When this option takes value 4, the output is not SAM. [3] +.TP +.BI -I \ FLOAT[,FLOAT[,INT[,INT]]] +Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma +from the mean if absent) and min (4 sigma if absent) of the insert size +distribution. Only applicable to the FR orientation. By default, BWA-MEM infers +these numbers and the pair orientations given enough reads. [inferred] + .RE .TP diff --git a/fastmap.c b/fastmap.c index 5d6d9d6..093fb7b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -132,8 +132,8 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); - fprintf(stderr, " specify the mean, standard deviation (10%% of mean if absent), max (4\n"); - fprintf(stderr, " sigma from the mean if absent) and min of the insert size distribution.\n"); + fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); + fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); fprintf(stderr, " FR orientation only. 
[inferred]\n"); fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); fprintf(stderr, "\n"); diff --git a/main.c b/main.c index 5a983c9..0ae7978 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.7+dev-r454" +#define PACKAGE_VERSION "0.7.8-r455" #endif int bwa_fa2pac(int argc, char *argv[]); From 9a5705289c3c68040257217a768d53655779744c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 3 Apr 2014 13:38:08 -0400 Subject: [PATCH 472/498] added more debugging information I can see a bug, but I do not know where it comes from. --- bwamem.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/bwamem.c b/bwamem.c index 33b86ad..077b835 100644 --- a/bwamem.c +++ b/bwamem.c @@ -42,6 +42,8 @@ * When there are gaps, l should be the length of alignment matches (i.e. the M operator in CIGAR) */ +static const bntseq_t *global_bns = 0; // for debugging only + mem_opt_t *mem_opt_init() { mem_opt_t *o; @@ -182,14 +184,17 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; #define chain_cmp(a, b) (((b).pos < (a).pos) - ((a).pos < (b).pos)) KBTREE_INIT(chn, mem_chain_t, chain_cmp) +// return 1 if the seed is merged into the chain static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p) { int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; - if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) { + if (bwa_verbose >= 5) printf("** contained\n"); return 1; // contained seed; do nothing + } if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different
strand x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; @@ -199,8 +204,10 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; + if (bwa_verbose >= 5) printf("** appended\n"); return 1; - } + } else if (bwa_verbose >= 5) + printf("** new chain: %ld, %ld, %ld, %ld\n", (long)y, (long)abs(x-y), (long)(x - last->len), (long)(y - last->len)); return 0; // request to add a new chain } @@ -224,7 +231,15 @@ static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) * s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; - if (bwa_verbose >= 5) printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld\n", s.len, s.qbeg, (long)s.rbeg); + if (bwa_verbose >= 5) { + bwtint_t pos; + int is_rev, ref_id; + pos = bns_depos(global_bns, s.rbeg, &is_rev); + if (is_rev) pos -= s.len - 1; + bns_cnt_ambi(global_bns, pos, s.len, &ref_id); + printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld; %s:%c%ld\n", s.len, s.qbeg, (long)s.rbeg, \ + global_bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - global_bns->anns[ref_id].offset) + 1); + } if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain @@ -1070,6 +1085,7 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn double ctime, rtime; ctime = cputime(); rtime = realtime(); + global_bns = bns; regs = malloc(n * sizeof(mem_alnreg_v)); w.opt = opt; w.bwt = bwt; w.bns = bns; w.pac = pac; w.seqs = seqs; w.regs = regs; w.n_processed = n_processed; From acfe7613db4bb2191e349874c0aab563308245eb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 3 Apr 2014 15:10:50 -0400 Subject: [PATCH 473/498] dev-457: separated interval 
collection and seeding --- bwamem.c | 149 ++++++++++++++++++++++++++++++++++++------------------- main.c | 2 +- 2 files changed, 100 insertions(+), 51 deletions(-) diff --git a/bwamem.c b/bwamem.c index 077b835..ea936f1 100644 --- a/bwamem.c +++ b/bwamem.c @@ -162,6 +162,65 @@ const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) return smem_next2(itr, split_len, split_width, 1); } +/*************************** + * Collection SA invervals * + ***************************/ + +#define intv_lt(a, b) ((a).info < (b).info) +KSORT_INIT(mem_intv, bwtintv_t, intv_lt) + +typedef struct { + bwtintv_v mem, mem1, *tmpv[2]; +} smem_aux_t; + +static smem_aux_t *smem_aux_init() +{ + smem_aux_t *a; + a = calloc(1, sizeof(smem_aux_t)); + a->tmpv[0] = calloc(1, sizeof(bwtintv_v)); + a->tmpv[1] = calloc(1, sizeof(bwtintv_v)); + return a; +} + +static void smem_aux_destroy(smem_aux_t *a) +{ + free(a->tmpv[0]->a); free(a->tmpv[1]->a); + free(a->mem.a); free(a->mem1.a); + free(a); +} + +static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) +{ + int i, k, x = 0, old_n; + int start_width = (opt->flag & MEM_F_NO_EXACT)? 
2 : 1; + int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); + a->mem.n = 0; + // first pass: find all SMEMs + while (x < len) { + if (seq[x] < 4) { + x = bwt_smem1(bwt, len, seq, x, start_width, &a->mem1, a->tmpv); + for (i = 0; i < a->mem1.n; ++i) { + bwtintv_t *p = &a->mem1.a[i]; + int slen = (uint32_t)p->info - (p->info>>32); // seed length + if (slen >= opt->min_seed_len && p->x[2] <= opt->max_occ) + kv_push(bwtintv_t, a->mem, *p); + } + } else ++x; + } + // second pass: find MEMs inside a long SMEM + old_n = a->mem.n; + for (k = 0; k < old_n; ++k) { + bwtintv_t *p = &a->mem.a[k]; + int start = p->info>>32, end = (int32_t)p->info; + if (end - start < split_len || p->x[2] > opt->split_width) continue; + bwt_smem1(bwt, len, seq, (start + end)>>1, p->x[2]+1, &a->mem1, a->tmpv); + for (i = 0; i < a->mem1.n; ++i) + kv_push(bwtintv_t, a->mem, a->mem1.a[i]); + } + // sort + ks_introsort(mem_intv, a->mem.n, a->mem.a); +} + /******************************** * Chaining while finding SMEMs * ********************************/ @@ -211,51 +270,6 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c return 0; // request to add a new chain } -static void mem_insert_seed(const mem_opt_t *opt, int64_t l_pac, kbtree_t(chn) *tree, smem_i *itr) -{ - const bwtintv_v *a; - int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); - int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1; - split_len = split_len < itr->len? 
split_len : itr->len; - while ((a = smem_next2(itr, split_len, opt->split_width, start_width)) != 0) { // to find all SMEM and some internal MEM - int i; - for (i = 0; i < a->n; ++i) { // go through each SMEM/MEM up to itr->start - bwtintv_t *p = &a->a[i]; - int slen = (uint32_t)p->info - (p->info>>32); // seed length - int64_t k; - if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive - for (k = 0; k < p->x[2]; ++k) { - mem_chain_t tmp, *lower, *upper; - mem_seed_t s; - int to_add = 0; - s.rbeg = tmp.pos = bwt_sa(itr->bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference - s.qbeg = p->info>>32; - s.len = slen; - if (bwa_verbose >= 5) { - bwtint_t pos; - int is_rev, ref_id; - pos = bns_depos(global_bns, s.rbeg, &is_rev); - if (is_rev) pos -= s.len - 1; - bns_cnt_ambi(global_bns, pos, s.len, &ref_id); - printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld; %s:%c%ld\n", s.len, s.qbeg, (long)s.rbeg, \ - global_bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - global_bns->anns[ref_id].offset) + 1); - } - if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip - if (kb_size(tree)) { - kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1; - } else to_add = 1; - if (to_add) { // add the seed as a new chain - tmp.n = 1; tmp.m = 4; - tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); - tmp.seeds[0] = s; - kb_putp(chn, tree, &tmp); - } - } - } - } -} - int mem_chain_weight(const mem_chain_t *c) { int64_t end; @@ -296,16 +310,52 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq) { + int i; mem_chain_v chain; - smem_i *itr; kbtree_t(chn) *tree; + smem_aux_t *aux; kv_init(chain); if (len < opt->min_seed_len) return chain; // if the query is shorter than 
the seed length, no match tree = kb_init(chn, KB_DEFAULT_SIZE); - itr = smem_itr_init(bwt); - smem_set_query(itr, len, seq); - mem_insert_seed(opt, l_pac, tree, itr); + + aux = smem_aux_init(); + mem_collect_intv(opt, bwt, len, seq, aux); + for (i = 0; i < aux->mem.n; ++i) { + bwtintv_t *p = &aux->mem.a[i]; + int slen = (uint32_t)p->info - (p->info>>32); // seed length + int64_t k; + if (slen < opt->min_seed_len || p->x[2] > opt->max_occ) continue; // ignore if too short or too repetitive + for (k = 0; k < p->x[2]; ++k) { + mem_chain_t tmp, *lower, *upper; + mem_seed_t s; + int to_add = 0; + s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference + s.qbeg = p->info>>32; + s.len = slen; + if (bwa_verbose >= 5) { + bwtint_t pos; + int is_rev, ref_id; + pos = bns_depos(global_bns, s.rbeg, &is_rev); + if (is_rev) pos -= s.len - 1; + bns_cnt_ambi(global_bns, pos, s.len, &ref_id); + printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld; %s:%c%ld\n", s.len, s.qbeg, (long)s.rbeg, \ + global_bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - global_bns->anns[ref_id].offset) + 1); + } + if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip + if (kb_size(tree)) { + kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain + if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1; + } else to_add = 1; + if (to_add) { // add the seed as a new chain + tmp.n = 1; tmp.m = 4; + tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); + tmp.seeds[0] = s; + kb_putp(chn, tree, &tmp); + } + } + } + smem_aux_destroy(aux); kv_resize(mem_chain_t, chain, kb_size(tree)); @@ -313,7 +363,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int __kb_traverse(mem_chain_t, tree, traverse_func); #undef traverse_func - smem_itr_destroy(itr); kb_destroy(chn, tree); return chain; } diff --git a/main.c b/main.c index 0ae7978..1923384 100644 --- a/main.c +++ 
b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8-r455" +#define PACKAGE_VERSION "0.7.8+dev-r457" #endif int bwa_fa2pac(int argc, char *argv[]); From b3225581be1808b80b00eddcaf9aa731434afee6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 3 Apr 2014 15:23:48 -0400 Subject: [PATCH 474/498] dev-458: simplified the smem iterator simpler but less powerful. --- bwamem.c | 34 +++------------------------------- bwamem.h | 2 +- fastmap.c | 12 ++++++------ main.c | 2 +- 4 files changed, 11 insertions(+), 39 deletions(-) diff --git a/bwamem.c b/bwamem.c index ea936f1..6abcf00 100644 --- a/bwamem.c +++ b/bwamem.c @@ -116,7 +116,7 @@ void smem_set_query(smem_i *itr, int len, const uint8_t *query) itr->len = len; } -const bwtintv_v *smem_next2(smem_i *itr, int split_len, int split_width, int start_width) +const bwtintv_v *smem_next(smem_i *itr) { int i, max, max_i, ori_start; itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; @@ -124,44 +124,16 @@ const bwtintv_v *smem_next2(smem_i *itr, int split_len, int split_width, int sta while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases if (itr->start == itr->len) return 0; ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, start_width, itr->matches, itr->tmpvec); // search for SMEM + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match bwtintv_t *p = &itr->matches->a[i]; int len = (uint32_t)p->info - (p->info>>32); if (max < len) max = len, max_i = i; } - if (split_len > 0 && max >= split_len && itr->matches->a[max_i].x[2] <= split_width) { // if the longest SMEM is unique and long - int j; - bwtintv_v *a = itr->tmpvec[0]; // reuse
tmpvec[0] for merging - bwtintv_t *p = &itr->matches->a[max_i]; - bwt_smem1(itr->bwt, itr->len, itr->query, ((uint32_t)p->info + (p->info>>32))>>1, itr->matches->a[max_i].x[2]+1, itr->sub, itr->tmpvec); // starting from the middle of the longest MEM - i = j = 0; a->n = 0; - while (i < itr->matches->n && j < itr->sub->n) { // ordered merge - int64_t xi = itr->matches->a[i].info>>32<<32 | (itr->len - (uint32_t)itr->matches->a[i].info); - int64_t xj = itr->sub->a[j].info>>32<<32 | (itr->len - (uint32_t)itr->sub->a[j].info); - if (xi < xj) { - kv_push(bwtintv_t, *a, itr->matches->a[i]); - ++i; - } else if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) { - kv_push(bwtintv_t, *a, itr->sub->a[j]); - ++j; - } else ++j; - } - for (; i < itr->matches->n; ++i) kv_push(bwtintv_t, *a, itr->matches->a[i]); - for (; j < itr->sub->n; ++j) - if ((uint32_t)itr->sub->a[j].info - (itr->sub->a[j].info>>32) >= max>>1 && (uint32_t)itr->sub->a[j].info > ori_start) - kv_push(bwtintv_t, *a, itr->sub->a[j]); - kv_copy(bwtintv_t, *itr->matches, *a); - } return itr->matches; } -const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width) -{ - return smem_next2(itr, split_len, split_width, 1); -} - /*************************** * Collection SA invervals * ***************************/ @@ -410,7 +382,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) int e_min = a[j].end < a[i].end? a[j].end : a[i].end; if (e_min > b_max) { // have overlap int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? 
a[i].end - a[i].beg : a[j].end - a[j].beg; - if (e_min - b_max >= min_l * opt->mask_level) { // significant overlap + if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap if (a[j].p2 == 0) a[j].p2 = a[i].p; if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) break; diff --git a/bwamem.h b/bwamem.h index 5291491..86e7d47 100644 --- a/bwamem.h +++ b/bwamem.h @@ -86,7 +86,7 @@ extern "C" { smem_i *smem_itr_init(const bwt_t *bwt); void smem_itr_destroy(smem_i *itr); void smem_set_query(smem_i *itr, int len, const uint8_t *query); - const bwtintv_v *smem_next(smem_i *itr, int split_len, int split_width); + const bwtintv_v *smem_next(smem_i *itr); mem_opt_t *mem_opt_init(void); void mem_fill_scmat(int a, int b, int8_t mat[25]); diff --git a/fastmap.c b/fastmap.c index 093fb7b..5aac1f7 100644 --- a/fastmap.c +++ b/fastmap.c @@ -37,7 +37,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1; opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1; - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; @@ -58,6 +58,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'D') opt->chain_drop_ratio = atof(optarg); else if (c == 'm') opt->max_matesw = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); + else if (c == 'N') opt->max_chain_gap = atoi(optarg); else if (c == 'C') copy_comment = 1; else if (c == 'Q') { opt->mapQ_coef_len = atoi(optarg); @@ -215,7 +216,7 @@ int main_mem(int argc, char *argv[]) int main_fastmap(int argc, char *argv[]) { - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0, 
split_width = 0; + int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; kseq_t *seq; bwtint_t k; gzFile fp; @@ -223,9 +224,8 @@ int main_fastmap(int argc, char *argv[]) const bwtintv_v *a; bwaidx_t *idx; - while ((c = getopt(argc, argv, "w:l:ps:")) >= 0) { + while ((c = getopt(argc, argv, "w:l:p")) >= 0) { switch (c) { - case 's': split_width = atoi(optarg); break; case 'p': print_seq = 1; break; case 'w': min_iwidth = atoi(optarg); break; case 'l': min_len = atoi(optarg); break; @@ -233,7 +233,7 @@ int main_fastmap(int argc, char *argv[]) } } if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-p] [-s splitWidth=%d] [-l minLen=%d] [-w maxSaSize=%d] \n", split_width, min_len, min_iwidth); + fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); return 1; } @@ -250,7 +250,7 @@ int main_fastmap(int argc, char *argv[]) for (i = 0; i < seq->seq.l; ++i) seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); - while ((a = smem_next(itr, min_len<<1, split_width)) != 0) { + while ((a = smem_next(itr)) != 0) { for (i = 0; i < a->n; ++i) { bwtintv_t *p = &a->a[i]; if ((uint32_t)p->info - (p->info>>32) < min_len) continue; diff --git a/main.c b/main.c index 1923384..673cb8a 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r457" +#define PACKAGE_VERSION "0.7.8+dev-r458" #endif int bwa_fa2pac(int argc, char *argv[]); From b6bd33b26c3de5205dd3e1b614c8bf451539b6db Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 3 Apr 2014 18:58:49 -0400 Subject: [PATCH 475/498] dev-459: don't hard code the drop ratio In the old code, if a secondary alignment is 50% worse, it won't be outputted. 
--- bwamem.c | 3 ++- main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 6abcf00..ab9f349 100644 --- a/bwamem.c +++ b/bwamem.c @@ -917,7 +917,7 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa mem_aln_t *q; if (p->score < opt->T) continue; if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (p->secondary >= 0 && p->score < a->a[p->secondary].score * .5) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->chain_drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); q->flag |= extra_flag; // flag secondary @@ -966,6 +966,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); if (opt->flag & MEM_F_NO_EXACT) regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); + if (bwa_verbose >= 4) err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); return regs; } diff --git a/main.c b/main.c index 673cb8a..d06b3cd 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r458" +#define PACKAGE_VERSION "0.7.8+dev-r459" #endif int bwa_fa2pac(int argc, char *argv[]); From 066ec4aa95d5e426658d512c4668c9b758054b13 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 4 Apr 2014 10:44:34 -0400 Subject: [PATCH 476/498] dev-460: disallow a cigar 20M2D2I30M in extension Global alignment does not allow contiguous insertions and deletions, but local alignment and extension allow such CIGARs. The optimal global alignment may have a lower score than extension, which actually happens often for PacBio data. This commit disallows a CIGAR like 20M2D2I30M to fix this inconsistency. Local alignment has not been changed. 
--- ksw.c | 12 ++++++------ main.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ksw.c b/ksw.c index bb055bb..74123cb 100644 --- a/ksw.c +++ b/ksw.c @@ -425,20 +425,20 @@ int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape eh_t *p = &eh[j]; - int h = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) + int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) p->h = h1; // set H(i,j-1) for the next row - h += q[j]; - h = h > e? h : e; + M += q[j]; // separating H and M to disallow a cigar like "100M3I3D20M" + h = M > e? M : e; h = h > f? h : f; h1 = h; // save H(i,j) to h1 for the next column mj = m > h? mj : j; // record the position where max score is achieved m = m > h? m : h; // m is stored at eh[mj+1] - t = h - oe_del; + t = M - oe_del; t = t > 0? t : 0; e -= e_del; e = e > t? e : t; // computed E(i+1,j) p->e = e; // save E(i+1,j) for the next row - t = h - oe_ins; + t = M - oe_ins; t = t > 0? t : 0; f -= e_ins; f = f > t? f : t; // computed F(i,j+1) @@ -536,7 +536,7 @@ int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape // We have to separate M(i,j); otherwise the direction may not be recorded correctly. - // However, a CIGAR like "10M3I3D10M" allowed by local() and extend() is disallowed by global(). + // However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global(). // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k. // In practice, this should happen very rarely given a reasonable scoring system. 
eh_t *p = &eh[j]; diff --git a/main.c b/main.c index d06b3cd..16fd48d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r459" +#define PACKAGE_VERSION "0.7.8+dev-r460" #endif int bwa_fa2pac(int argc, char *argv[]); From 41f720dfa7def8651a1e740eeafaacf557a43634 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 4 Apr 2014 16:05:41 -0400 Subject: [PATCH 477/498] dev-461: added a heuristic for PacBio data See the comment above mem_test_chain_sw() for details. --- bwamem.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++---- bwamem.h | 1 + fastmap.c | 9 ++++++-- main.c | 2 +- 4 files changed, 70 insertions(+), 7 deletions(-) diff --git a/bwamem.c b/bwamem.c index ab9f349..ae1ce68 100644 --- a/bwamem.c +++ b/bwamem.c @@ -69,6 +69,7 @@ mem_opt_t *mem_opt_init() o->n_threads = 1; o->max_matesw = 100; o->mask_level_redun = 0.95; + o->min_HSP_score = 0; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); // o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); @@ -527,6 +528,62 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i #define MEM_SHORT_LEN 200 #define MAX_BAND_TRY 2 +/* mem_test_chain_sw() uses SSE2-SW to align a short chain with 50bp added to + * each end of the chain. If the SW score is below opt->min_HSP_score, it will + * return 0, informing the caller to discard the chain. This heuristic is + * somewhat similar to BLAST which drops a seed hit if ungapped extension is + * below a certain score (true for old BLAST; don't know how BLAST+ works). + * + * For PacBio data, we need to set high matching score and low gap penalties; + * otherwise we are likely to get fragmented alignments. However, with such + * settings, we can often extend most random seed hits to the end. These + * extensions are wasteful and time consuming. 
By testing the chain with SW, + * we can discard bad chains before performing the expensive extension. + * + * Although probably it is not a bad idea to use this function for + * low-divergence sequences, more testing is needed. For now, I only recommend + * to use mem_test_chain_sw() for PacBio data. It is disabled by default. + */ +int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) +{ + int i, qb, qe; + int64_t rb, re, rlen; + uint8_t *rseq = 0; + kswr_t x; + + if (c->n == 0) return -1; + qb = l_query; qe = 0; + rb = l_pac<<1; re = 0; + for (i = 0; i < c->n; ++i) { + const mem_seed_t *s = &c->seeds[i]; + qb = qb < s->qbeg? qb : s->qbeg; + qe = qe > s->qbeg + s->len? qe : s->qbeg + s->len; + rb = rb < s->rbeg? rb : s->rbeg; + re = re > s->rbeg + s->len? re : s->rbeg + s->len; + } + qb -= MEM_SHORT_EXT; qe += MEM_SHORT_EXT; + qb = qb > 0? qb : 0; + qe = qe < l_query? qe : l_query; + rb -= MEM_SHORT_EXT; re += MEM_SHORT_EXT; + rb = rb > 0? rb : 0; + re = re < l_pac<<1? 
re : l_pac<<1; + if (rb < l_pac && l_pac < re) { + if (c->seeds[0].rbeg < l_pac) re = l_pac; + else rb = l_pac; + } + if ((re - rb) - (qe - qb) > MEM_SHORT_EXT || (qe - qb) - (re - rb) > MEM_SHORT_EXT) return 1; + if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; + if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; + + rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); + assert(rlen == re - rb); + x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); + free(rseq); + if (x.score >= opt->min_HSP_score) return 1; + if (bwa_verbose >= 4) printf("** give up the chain due to small HSP score %d.\n", x.score); + return 0; +} + int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { int i, qb, qe, xtra; @@ -565,14 +622,13 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? 
KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); free(rseq); - if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; - a.rb = rb + x.tb; a.re = rb + x.te + 1; a.qb = qb + x.qb; a.qe = qb + x.qe + 1; a.score = x.score; a.csub = x.score2; + if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); + if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; kv_push(mem_alnreg_t, *av, a); - if (bwa_verbose >= 4) printf("** Added alignment region via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld)\n", a.qb, a.qe, (long)a.rb, (long)a.re); return 0; } @@ -958,7 +1014,8 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse mem_chain_t *p = &chn.a[i]; int ret; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); + if (opt->min_HSP_score > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); + else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); free(chn.a[i].seeds); } diff --git a/bwamem.h b/bwamem.h index 86e7d47..dd68d16 100644 --- a/bwamem.h +++ b/bwamem.h @@ -36,6 +36,7 @@ typedef struct { int max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed int n_threads; // number of threads int chunk_size; // process chunk_size-bp sequences in a batch + int min_HSP_score; // used in mem_test_chain(); disabled by default float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a
chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain float mask_level_redun; diff --git a/fastmap.c b/fastmap.c index 5aac1f7..1db00cb 100644 --- a/fastmap.c +++ b/fastmap.c @@ -37,7 +37,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1; opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1; - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:u:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; @@ -56,6 +56,7 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'D') opt->chain_drop_ratio = atof(optarg); + else if (c == 'u') opt->min_HSP_score = atoi(optarg); else if (c == 'm') opt->max_matesw = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); else if (c == 'N') opt->max_chain_gap = atoi(optarg); @@ -101,6 +102,7 @@ int main_mem(int argc, char *argv[]) else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; + if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -123,6 +125,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); + fprintf(stderr, " -u INT drop a chain if local SW score 
below INT; 0 to disable [%d]\n", opt->min_HSP_score); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -136,7 +139,9 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); fprintf(stderr, " FR orientation only. [inferred]\n"); - fprintf(stderr, "\nNote: Please read the man page for detailed description of the command line and options.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); + fprintf(stderr, " `-k18 -u200 -w200 -c1000 -r10 -A3 -O3 -E1' is recommended for PacBio reads as of early 2014.\n"); fprintf(stderr, "\n"); free(opt); return 1; diff --git a/main.c b/main.c index 16fd48d..9dc0cdf 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r460" +#define PACKAGE_VERSION "0.7.8+dev-r461" #endif int bwa_fa2pac(int argc, char *argv[]); From 114901b005fa505fa87a5ee99a0b6d0e1bccaecb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 4 Apr 2014 17:01:04 -0400 Subject: [PATCH 478/498] dev-r462: refined setting for PacBio; weight flt The recommended setting in the last commit is wrong. If we can extend a random seed hit to the full length, we will force the read aligned through break points, which is wrong. The new setting is better but it may lead to a small fraction of fragmented alignments. In addition, I added a filter on the minimum chain weight and tied min_HSP_score to this filter. It doubles the mapping speed. 
--- bwamem.c | 21 +++++++++++++++++---- bwamem.h | 2 +- fastmap.c | 9 ++++----- main.c | 2 +- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/bwamem.c b/bwamem.c index ae1ce68..5ee153a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -69,7 +69,7 @@ mem_opt_t *mem_opt_init() o->n_threads = 1; o->max_matesw = 100; o->mask_level_redun = 0.95; - o->min_HSP_score = 0; + o->min_chain_weight = 0; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); // o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); @@ -357,6 +357,15 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) flt_aux_t *a; int i, j, n; if (n_chn <= 1) return n_chn; // no need to filter + for (i = j = 0; i < n_chn; ++i) { + mem_chain_t *c = &chains[i]; + int w; + w = mem_chain_weight(c); + if (w >= opt->min_chain_weight) + chains[j++] = *c; + } + n_chn = j; + if (n_chn == 0) return 0; a = malloc(sizeof(flt_aux_t) * n_chn); for (i = 0; i < n_chn; ++i) { mem_chain_t *c = &chains[i]; @@ -526,10 +535,13 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i #define MEM_SHORT_EXT 50 #define MEM_SHORT_LEN 200 + +#define MEM_HSP_COEF 1.5 + #define MAX_BAND_TRY 2 /* mem_test_chain_sw() uses SSE2-SW to align a short chain with 50bp added to - * each end of the chain. If the SW score is below opt->min_HSP_score, it will + * each end of the chain. If the SW score is below min_HSP_score, it will * return 0, informing the caller to discard the chain. This heuristic is * somewhat similar to BLAST which drops a seed hit if ungapped extension is * below a certain score (true for old BLAST; don't know how BLAST+ works). 
@@ -547,6 +559,7 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) { int i, qb, qe; + int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499); int64_t rb, re, rlen; uint8_t *rseq = 0; kswr_t x; @@ -579,7 +592,7 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i assert(rlen == re - rb); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); free(rseq); - if (x.score >= opt->min_HSP_score) return 1; + if (x.score >= min_HSP_score) return 1; if (bwa_verbose >= 4) printf("** give up the chain due to small HSP score %d.\n", x.score); return 0; } @@ -1014,7 +1027,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse mem_chain_t *p = &chn.a[i]; int ret; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - if (opt->min_HSP_score > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); + if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); free(chn.a[i].seeds); diff --git a/bwamem.h b/bwamem.h index dd68d16..27514cd 100644 --- a/bwamem.h +++ b/bwamem.h @@ -30,13 +30,13 @@ typedef struct { int T; // output score threshold; only affecting output int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length + int min_chain_weight; float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor int split_width; // split into a seed if its occurence is smaller than this value int max_occ; // skip a seed if its occurence is larger than this value int
max_chain_gap; // do not chain seed if it is max_chain_gap-bp away from the closest seed int n_threads; // number of threads int chunk_size; // process chunk_size-bp sequences in a batch - int min_HSP_score; // used in mem_test_chain(); disabled by default float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain float mask_level_redun; diff --git a/fastmap.c b/fastmap.c index 1db00cb..c971f9b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -37,7 +37,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1; opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1; - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:u:")) >= 0) { + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg); else if (c == 'w') opt->w = atoi(optarg); else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; @@ -56,10 +56,10 @@ int main_mem(int argc, char *argv[]) else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg); else if (c == 'D') opt->chain_drop_ratio = atof(optarg); - else if (c == 'u') opt->min_HSP_score = atoi(optarg); else if (c == 'm') opt->max_matesw = atoi(optarg); else if (c == 's') opt->split_width = atoi(optarg); else if (c == 'N') opt->max_chain_gap = atoi(optarg); + else if (c == 'W') opt->min_chain_weight = atoi(optarg); else if (c == 'C') copy_comment = 1; else if (c == 'Q') { opt->mapQ_coef_len = atoi(optarg); @@ -102,7 +102,7 @@ int main_mem(int argc, char *argv[]) else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; - if (opt->T < opt->min_HSP_score) opt->T = 
opt->min_HSP_score; +// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -125,7 +125,6 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); - fprintf(stderr, " -u INT drop a chain if local SW score below INT; 0 to disable [%d]\n", opt->min_HSP_score); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -141,7 +140,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " FR orientation only. 
[inferred]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); - fprintf(stderr, " `-k18 -u200 -w200 -c1000 -r10 -A3 -O3 -E1' is recommended for PacBio reads as of early 2014.\n"); + fprintf(stderr, " `-k17 -W40 -w200 -c1000 -r10 -A2 -O2 -E1' is recommended for PacBio reads as of early 2014.\n"); fprintf(stderr, "\n"); free(opt); return 1; diff --git a/main.c b/main.c index 9dc0cdf..1089477 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r461" +#define PACKAGE_VERSION "0.7.8+dev-r462" #endif int bwa_fa2pac(int argc, char *argv[]); From 172ba832412fc1e6a74509a7597a8389766cc15e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 7 Apr 2014 11:29:36 -0400 Subject: [PATCH 479/498] dev-463: added option -x to change multiple params I hate to copy-paste long command line options. --- bwamem.c | 8 +++++- fastmap.c | 84 +++++++++++++++++++++++++++++++++++++------------------ main.c | 2 +- 3 files changed, 65 insertions(+), 29 deletions(-) diff --git a/bwamem.c b/bwamem.c index 5ee153a..8d15f15 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1036,7 +1036,13 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); if (opt->flag & MEM_F_NO_EXACT) regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); - if (bwa_verbose >= 4) err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); + if (bwa_verbose >= 4) { + err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); + for (i = 0; i < regs.n; ++i) { + mem_alnreg_t *p = &regs.a[i]; + printf("** %d, [%d,%d) <=> [%ld,%ld)\n", p->score, p->qb, p->qe, (long)p->rb, (long)p->re); + } + } return regs; } diff --git a/fastmap.c b/fastmap.c index c971f9b..8ffa07b 100644 --- a/fastmap.c +++ b/fastmap.c @@ -18,6 +18,22 @@ extern unsigned char
nst_nt4_table[256]; void *kopen(const char *fn, int *_fd); int kclose(void *a); +static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) +{ + if (opt0->a) { // matching score is changed + if (!opt0->b) opt->b *= opt->a; + if (!opt0->T) opt->T *= opt->a; + if (!opt0->o_del) opt->o_del *= opt->a; + if (!opt0->e_del) opt->e_del *= opt->a; + if (!opt0->o_ins) opt->o_ins *= opt->a; + if (!opt0->e_ins) opt->e_ins *= opt->a; + if (!opt0->zdrop) opt->zdrop *= opt->a; + if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a; + if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a; + if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a; + } +} + int main_mem(int argc, char *argv[]) { mem_opt_t *opt, opt0; @@ -27,6 +43,7 @@ int main_mem(int argc, char *argv[]) bseq1_t *seqs; bwaidx_t *idx; char *p, *rg_line = 0; + const char *mode = 0; void *ko = 0, *ko2 = 0; int64_t n_processed = 0; mem_pestat_t pes[4], *pes0 = 0; @@ -35,11 +52,11 @@ int main_mem(int argc, char *argv[]) for (i = 0; i < 4; ++i) pes[i].failed = 1; opt = mem_opt_init(); - opt0.a = opt0.b = opt0.o_del = opt0.e_del = opt0.o_ins = opt0.e_ins = opt0.pen_unpaired = -1; - opt0.pen_clip5 = opt0.pen_clip3 = opt0.zdrop = opt0.T = -1; - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:")) >= 0) { - if (c == 'k') opt->min_seed_len = atoi(optarg); - else if (c == 'w') opt->w = atoi(optarg); + memset(&opt0, 0, sizeof(mem_opt_t)); + while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:")) >= 0) { + if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; + else if (c == 'x') mode = optarg; + else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1; else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; @@ -51,17 +68,18 @@ int main_mem(int argc, char *argv[]) else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; 
else if (c == 'e') opt->flag |= MEM_F_NO_EXACT; - else if (c == 'c') opt->max_occ = atoi(optarg); + else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); - else if (c == 'r') opt->split_factor = atof(optarg); - else if (c == 'D') opt->chain_drop_ratio = atof(optarg); - else if (c == 'm') opt->max_matesw = atoi(optarg); - else if (c == 's') opt->split_width = atoi(optarg); - else if (c == 'N') opt->max_chain_gap = atoi(optarg); - else if (c == 'W') opt->min_chain_weight = atoi(optarg); + else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; + else if (c == 'D') opt->chain_drop_ratio = atof(optarg), opt0.chain_drop_ratio = 1.; + else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; + else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; + else if (c == 'N') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; + else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'C') copy_comment = 1; else if (c == 'Q') { + opt0.mapQ_coef_len = 1; opt->mapQ_coef_len = atoi(optarg); opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? 
log(opt->mapQ_coef_len) : 0; } else if (c == 'O') { @@ -102,7 +120,6 @@ int main_mem(int argc, char *argv[]) else return 1; } if (opt->n_threads < 1) opt->n_threads = 1; -// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF if (optind + 1 >= argc || optind + 3 < argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); @@ -115,16 +132,19 @@ int main_mem(int argc, char *argv[]) // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio); + fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n"); fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); fprintf(stderr, " -S skip mate rescue\n"); fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); fprintf(stderr, " -e discard full-length exact matches\n"); - fprintf(stderr, " -A INT score for a sequence match, which scales [-TdBOELU] unless overridden [%d]\n", opt->a); + fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a); fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b); fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); + fprintf(stderr, " -x STR read type. 
Setting -x changes multiple parameters unless overriden [null]\n"); + fprintf(stderr, " pacbio: -k17 -W40 -w200 -c1000 -r10 -A2 -B7 -O2 -E1 -L0\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -140,25 +160,35 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " FR orientation only. [inferred]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); - fprintf(stderr, " `-k17 -W40 -w200 -c1000 -r10 -A2 -O2 -E1' is recommended for PacBio reads as of early 2014.\n"); fprintf(stderr, "\n"); free(opt); return 1; } - if (opt0.a == 1) { // matching score is changed - if (opt0.b != 1) opt->b *= opt->a; - if (opt0.T != 1) opt->T *= opt->a; - if (opt0.o_del != 1) opt->o_del *= opt->a; - if (opt0.e_del != 1) opt->e_del *= opt->a; - if (opt0.o_ins != 1) opt->o_ins *= opt->a; - if (opt0.e_ins != 1) opt->e_ins *= opt->a; - if (opt0.zdrop != 1) opt->zdrop *= opt->a; - if (opt0.pen_clip5 != 1) opt->pen_clip5 *= opt->a; - if (opt0.pen_clip3 != 1) opt->pen_clip3 *= opt->a; - if (opt0.pen_unpaired != 1) opt->pen_unpaired *= opt->a; - } + if (mode) { + if (strcmp(mode, "pacbio") == 0) { + if (!opt0.a) opt->a = 2, opt0.a = 1; + update_a(opt, &opt0); + if (!opt0.b) opt->b = 7; + if (!opt0.o_del) opt->o_del = 2; + if (!opt0.e_del) opt->e_del = 1; + if (!opt0.o_ins) opt->o_ins = 2; + if (!opt0.e_ins) opt->e_ins = 1; + if (!opt0.w) opt->w = 200; + if (!opt0.min_seed_len) opt->min_seed_len = 17; + if (!opt0.min_chain_weight) opt->min_chain_weight = 40; + if (!opt0.max_occ) opt->max_occ = 1000; + if (!opt0.pen_clip5) opt->pen_clip5 = 0; + if (!opt0.pen_clip3) opt->pen_clip3 = 0; + if (opt0.split_factor == 0.) 
opt->split_factor = 10.; + } else { + fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode); + return 1; // FIXME memory leak + } + } else update_a(opt, &opt0); +// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF bwa_fill_scmat(opt->a, opt->b, opt->mat); + if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak ko = kopen(argv[optind + 1], &fd); diff --git a/main.c b/main.c index 1089477..dd06b42 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r462" +#define PACKAGE_VERSION "0.7.8+dev-r463" #endif int bwa_fa2pac(int argc, char *argv[]); From b45aeb87e19662010de53672c5474ed173dd263e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Apr 2014 11:40:54 -0400 Subject: [PATCH 480/498] dev-464: preset for pacbio read2read aln --- fastmap.c | 19 +++++++++++++------ main.c | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fastmap.c b/fastmap.c index 8ffa07b..35797fd 100644 --- a/fastmap.c +++ b/fastmap.c @@ -145,6 +145,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. 
Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -w200 -c1000 -r10 -A2 -B7 -O2 -E1 -L0\n"); + fprintf(stderr, " pbread: -k13 -W30 -w200 -c1000 -r10 -A2 -B5 -O2 -E1\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -166,21 +167,27 @@ int main_mem(int argc, char *argv[]) } if (mode) { - if (strcmp(mode, "pacbio") == 0) { + if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0) { if (!opt0.a) opt->a = 2, opt0.a = 1; update_a(opt, &opt0); - if (!opt0.b) opt->b = 7; if (!opt0.o_del) opt->o_del = 2; if (!opt0.e_del) opt->e_del = 1; if (!opt0.o_ins) opt->o_ins = 2; if (!opt0.e_ins) opt->e_ins = 1; if (!opt0.w) opt->w = 200; - if (!opt0.min_seed_len) opt->min_seed_len = 17; - if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (!opt0.max_occ) opt->max_occ = 1000; - if (!opt0.pen_clip5) opt->pen_clip5 = 0; - if (!opt0.pen_clip3) opt->pen_clip3 = 0; if (opt0.split_factor == 0.) 
opt->split_factor = 10.; + if (strcmp(mode, "pbread") == 0) { + if (!opt0.b) opt->b = 5; + if (!opt0.min_seed_len) opt->min_seed_len = 13; + if (!opt0.min_chain_weight) opt->min_chain_weight = 30; + } else { + if (!opt0.b) opt->b = 7; + if (!opt0.min_seed_len) opt->min_seed_len = 17; + if (!opt0.min_chain_weight) opt->min_chain_weight = 40; + if (!opt0.pen_clip5) opt->pen_clip5 = 0; + if (!opt0.pen_clip3) opt->pen_clip3 = 0; + } } else { fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode); return 1; // FIXME memory leak diff --git a/main.c b/main.c index dd06b42..a177e6f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r463" +#define PACKAGE_VERSION "0.7.8+dev-r464" #endif int bwa_fa2pac(int argc, char *argv[]); From f12dfae7729f395d1eb33540f22be2d132c08d0a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Apr 2014 16:29:36 -0400 Subject: [PATCH 481/498] dev-465: a new output format for read overlap Also moved a few functions to bwamem_extra.c. File bwamem.c is becoming far too long. 
--- Makefile | 2 +- bwamem.c | 89 +++++----------------------------------- bwamem.h | 3 +- bwamem_extra.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++ fastmap.c | 15 ++++--- main.c | 2 +- 6 files changed, 134 insertions(+), 86 deletions(-) create mode 100644 bwamem_extra.c diff --git a/Makefile b/Makefile index 6490932..aa51937 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 -Wno-unused-function WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS AR= ar DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC) -LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o malloc_wrap.o +LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o AOBJS= QSufSort.o bwt_gen.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \ is.o bwtindex.o bwape.o kopen.o pemerge.o \ bwtsw2_core.o bwtsw2_main.o bwtsw2_aux.o bwt_lite.o \ diff --git a/bwamem.c b/bwamem.c index 8d15f15..2e5104a 100644 --- a/bwamem.c +++ b/bwamem.c @@ -76,65 +76,6 @@ mem_opt_t *mem_opt_init() return o; } -/*************************** - * SMEM iterator interface * - ***************************/ - -struct __smem_i { - const bwt_t *bwt; - const uint8_t *query; - int start, len; - bwtintv_v *matches; // matches; to be returned by smem_next() - bwtintv_v *sub; // sub-matches inside the longest match; temporary - bwtintv_v *tmpvec[2]; // temporary arrays -}; - -smem_i *smem_itr_init(const bwt_t *bwt) -{ - smem_i *itr; - itr = calloc(1, sizeof(smem_i)); - itr->bwt = bwt; - itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); - itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); - itr->matches = calloc(1, sizeof(bwtintv_v)); - itr->sub = calloc(1, sizeof(bwtintv_v)); - return itr; -} - -void smem_itr_destroy(smem_i *itr) -{ - free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); - free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); - free(itr->matches->a); free(itr->matches); - free(itr->sub->a); free(itr->sub); - free(itr); -} - -void smem_set_query(smem_i 
*itr, int len, const uint8_t *query) -{ - itr->query = query; - itr->start = 0; - itr->len = len; -} - -const bwtintv_v *smem_next(smem_i *itr) -{ - int i, max, max_i, ori_start; - itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; - if (itr->start >= itr->len || itr->start < 0) return 0; - while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases - if (itr->start == itr->len) return 0; - ori_start = itr->start; - itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM - if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here - for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for the longest match - bwtintv_t *p = &itr->matches->a[i]; - int len = (uint32_t)p->info - (p->info>>32); - if (max < len) max = len, max_i = i; - } - return itr->matches; -} - /*************************** * Collection SA invervals * ***************************/ @@ -165,7 +106,7 @@ static void smem_aux_destroy(smem_aux_t *a) static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, const uint8_t *seq, smem_aux_t *a) { int i, k, x = 0, old_n; - int start_width = (opt->flag & MEM_F_NO_EXACT)? 2 : 1; + int start_width = (opt->flag & MEM_F_SELF_OVLP)? 
2 : 1; int split_len = (int)(opt->min_seed_len * opt->split_factor + .499); a->mem.n = 0; // first pass: find all SMEMs @@ -357,7 +298,7 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) flt_aux_t *a; int i, j, n; if (n_chn <= 1) return n_chn; // no need to filter - for (i = j = 0; i < n_chn; ++i) { + for (i = j = 0; i < n_chn; ++i) { // filter out chains with small weight mem_chain_t *c = &chains[i]; int w; w = mem_chain_weight(c); @@ -482,7 +423,7 @@ int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun) int mem_test_and_remove_exact(const mem_opt_t *opt, int n, mem_alnreg_t *a, int qlen) { - if (!(opt->flag & MEM_F_NO_EXACT) || n == 0 || a->truesc != qlen * opt->a) return n; + if (!(opt->flag & MEM_F_SELF_OVLP) || n == 0 || a->truesc != qlen * opt->a) return n; memmove(a, a + 1, (n - 1) * sizeof(mem_alnreg_t)); return n - 1; } @@ -1034,7 +975,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse } free(chn.a); regs.n = mem_sort_and_dedup(regs.n, regs.a, opt->mask_level_redun); - if (opt->flag & MEM_F_NO_EXACT) + if (opt->flag & MEM_F_SELF_OVLP) regs.n = mem_test_and_remove_exact(opt, regs.n, regs.a, l_seq); if (bwa_verbose >= 4) { err_printf("* %ld chains remain after removing duplicated chains\n", regs.n); @@ -1046,19 +987,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse return regs; } -mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) -{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence - mem_alnreg_v ar; - char *seq; - seq = malloc(l_seq); - memcpy(seq, seq_, l_seq); // makes a copy of seq_ - ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); - mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); - free(seq); - return ar; -} -// This routine is only used for the API purpose mem_aln_t 
mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) { mem_aln_t a; @@ -1087,7 +1016,6 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * w2 = w2 > tmp? w2 : tmp; if (bwa_verbose >= 4) printf("* Band width: inferred=%d, cmd_opt=%d, alnreg=%d\n", w2, opt->w, ar->w); if (w2 > opt->w) w2 = w2 < ar->w? w2 : ar->w; -// else w2 = opt->w; // TODO: check if we need this line on long reads. On 1-800bp reads, it does not matter and it should be. i = 0; a.cigar = 0; do { free(a.cigar); @@ -1161,11 +1089,16 @@ static void worker1(void *data, int i, int tid) static void worker2(void *data, int i, int tid) { extern int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], uint64_t id, bseq1_t s[2], mem_alnreg_v a[2]); + extern void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a); worker_t *w = (worker_t*)data; if (!(w->opt->flag&MEM_F_PE)) { if (bwa_verbose >= 4) printf("=====> Finalizing read '%s' <=====\n", w->seqs[i].name); - mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); - mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + if (w->opt->flag & MEM_F_ALN_REG) { + mem_reg2ovlp(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i]); + } else { + mem_mark_primary_se(w->opt, w->regs[i].n, w->regs[i].a, w->n_processed + i); + mem_reg2sam_se(w->opt, w->bns, w->pac, &w->seqs[i], &w->regs[i], 0, 0); + } free(w->regs[i].a); } else { if (bwa_verbose >= 4) printf("=====> Finalizing read pair '%s' <=====\n", w->seqs[i<<1|0].name); diff --git a/bwamem.h b/bwamem.h index 27514cd..7b7a7e8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -16,7 +16,8 @@ typedef struct __smem_i smem_i; #define MEM_F_ALL 0x8 #define MEM_F_NO_MULTI 0x10 #define MEM_F_NO_RESCUE 0x20 -#define MEM_F_NO_EXACT 0x40 +#define MEM_F_SELF_OVLP 0x40 +#define MEM_F_ALN_REG 
0x80 typedef struct { int a, b; // match score and mismatch penalty diff --git a/bwamem_extra.c b/bwamem_extra.c new file mode 100644 index 0000000..58e9f67 --- /dev/null +++ b/bwamem_extra.c @@ -0,0 +1,109 @@ +#include "bwa.h" +#include "bwamem.h" +#include "bntseq.h" +#include "kstring.h" + +/*************************** + * SMEM iterator interface * + ***************************/ + +struct __smem_i { + const bwt_t *bwt; + const uint8_t *query; + int start, len; + bwtintv_v *matches; // matches; to be returned by smem_next() + bwtintv_v *sub; // sub-matches inside the longest match; temporary + bwtintv_v *tmpvec[2]; // temporary arrays +}; + +smem_i *smem_itr_init(const bwt_t *bwt) +{ + smem_i *itr; + itr = calloc(1, sizeof(smem_i)); + itr->bwt = bwt; + itr->tmpvec[0] = calloc(1, sizeof(bwtintv_v)); + itr->tmpvec[1] = calloc(1, sizeof(bwtintv_v)); + itr->matches = calloc(1, sizeof(bwtintv_v)); + itr->sub = calloc(1, sizeof(bwtintv_v)); + return itr; +} + +void smem_itr_destroy(smem_i *itr) +{ + free(itr->tmpvec[0]->a); free(itr->tmpvec[0]); + free(itr->tmpvec[1]->a); free(itr->tmpvec[1]); + free(itr->matches->a); free(itr->matches); + free(itr->sub->a); free(itr->sub); + free(itr); +} + +void smem_set_query(smem_i *itr, int len, const uint8_t *query) +{ + itr->query = query; + itr->start = 0; + itr->len = len; +} + +const bwtintv_v *smem_next(smem_i *itr) +{ + int i, max, max_i, ori_start; + itr->tmpvec[0]->n = itr->tmpvec[1]->n = itr->matches->n = itr->sub->n = 0; + if (itr->start >= itr->len || itr->start < 0) return 0; + while (itr->start < itr->len && itr->query[itr->start] > 3) ++itr->start; // skip ambiguous bases + if (itr->start == itr->len) return 0; + ori_start = itr->start; + itr->start = bwt_smem1(itr->bwt, itr->len, itr->query, ori_start, 1, itr->matches, itr->tmpvec); // search for SMEM + if (itr->matches->n == 0) return itr->matches; // well, in theory, we should never come here + for (i = max = 0, max_i = 0; i < itr->matches->n; ++i) { // look for 
the longest match + bwtintv_t *p = &itr->matches->a[i]; + int len = (uint32_t)p->info - (p->info>>32); + if (max < len) max = len, max_i = i; + } + return itr->matches; +} + +/*********************** + *** Extra functions *** + ***********************/ + +mem_alnreg_v mem_align1(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq_) +{ // the difference from mem_align1_core() is that this routine: 1) calls mem_mark_primary_se(); 2) does not modify the input sequence + extern mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, const uint8_t *pac, int l_seq, char *seq); + extern void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t id); + mem_alnreg_v ar; + char *seq; + seq = malloc(l_seq); + memcpy(seq, seq_, l_seq); // makes a copy of seq_ + ar = mem_align1_core(opt, bwt, bns, pac, l_seq, seq); + mem_mark_primary_se(opt, ar.n, ar.a, lrand48()); + free(seq); + return ar; +} + +void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, bseq1_t *s, mem_alnreg_v *a) +{ + int i; + kstring_t str = {0,0,0}; + for (i = 0; i < a->n; ++i) { + const mem_alnreg_t *p = &a->a[i]; + int is_rev, rid, qb = p->qb, qe = p->qe; + int64_t pos, rb = p->rb, re = p->re; + if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) { + fprintf(stderr, "[E::%s] Internal errors when processing read '%s'. Please let the developer know. Abort. Sorry.\n", __func__, s->name); + exit(1); + } + pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); + rid = bns_pos2rid(bns, pos); + pos -= bns->anns[rid].offset; + kputs(s->name, &str); kputc('\t', &str); + kputw(s->l_seq, &str); kputc('\t', &str); + if (is_rev) qb ^= qe, qe ^= qb, qb ^= qe; // swap + kputw(qb, &str); kputc('\t', &str); kputw(qe, &str); kputc('\t', &str); + kputs(bns->anns[rid].name, &str); kputc('\t', &str); + kputw(bns->anns[rid].len, &str); kputc('\t', &str); + kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); + kputw(p->truesc, &str); kputc('\n', &str); + } + s->sam = str.s; +} + diff --git a/fastmap.c b/fastmap.c index 35797fd..7dc8001 100644 --- a/fastmap.c +++ b/fastmap.c @@ -53,7 +53,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -67,7 +67,8 @@ int main_mem(int argc, char *argv[]) else if (c == 'p') opt->flag |= MEM_F_PE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; - else if (c == 'e') opt->flag |= MEM_F_NO_EXACT; + else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; + else if (c == 'F') opt->flag |= MEM_F_ALN_REG; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); @@ -145,7 +146,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. 
Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -w200 -c1000 -r10 -A2 -B7 -O2 -E1 -L0\n"); - fprintf(stderr, " pbread: -k13 -W30 -w200 -c1000 -r10 -A2 -B5 -O2 -E1\n"); + fprintf(stderr, " pbread: -k13 -W30 -w100 -c1000 -r10 -A2 -B5 -O2 -E1 -aeD.02\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -174,15 +175,18 @@ int main_mem(int argc, char *argv[]) if (!opt0.e_del) opt->e_del = 1; if (!opt0.o_ins) opt->o_ins = 2; if (!opt0.e_ins) opt->e_ins = 1; - if (!opt0.w) opt->w = 200; if (!opt0.max_occ) opt->max_occ = 1000; if (opt0.split_factor == 0.) opt->split_factor = 10.; if (strcmp(mode, "pbread") == 0) { + opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; if (!opt0.b) opt->b = 5; + if (!opt0.w) opt->w = 100; if (!opt0.min_seed_len) opt->min_seed_len = 13; if (!opt0.min_chain_weight) opt->min_chain_weight = 30; + if (opt0.chain_drop_ratio == 0.) 
opt->chain_drop_ratio = .02; } else { if (!opt0.b) opt->b = 7; + if (!opt0.w) opt->w = 200; if (!opt0.min_seed_len) opt->min_seed_len = 17; if (!opt0.min_chain_weight) opt->min_chain_weight = 40; if (!opt0.pen_clip5) opt->pen_clip5 = 0; @@ -220,7 +224,8 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - bwa_print_sam_hdr(idx->bns, rg_line); + if (!(opt->flag & MEM_F_ALN_REG)) + bwa_print_sam_hdr(idx->bns, rg_line); while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { int64_t size = 0; if ((opt->flag & MEM_F_PE) && (n&1) == 1) { diff --git a/main.c b/main.c index a177e6f..0ef8307 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r464" +#define PACKAGE_VERSION "0.7.8+dev-r465" #endif int bwa_fa2pac(int argc, char *argv[]); From c0a308a8b6e28866926290d9339b65a5d750280d Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Apr 2014 17:33:07 -0400 Subject: [PATCH 482/498] dev-466: simplified chain filtering --- bwamem.c | 119 +++++++++++++++++++------------------------------------ main.c | 2 +- 2 files changed, 42 insertions(+), 79 deletions(-) diff --git a/bwamem.c b/bwamem.c index 2e5104a..602f666 100644 --- a/bwamem.c +++ b/bwamem.c @@ -145,7 +145,8 @@ typedef struct { } mem_seed_t; typedef struct { - int n, m; + int n, m, first; + uint32_t w:30, kept:2; int64_t pos; mem_seed_t *seeds; } mem_chain_t; @@ -164,10 +165,8 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; - if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) { - if (bwa_verbose >= 5) printf("** contained\n"); + if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing - } if 
((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand x = p->qbeg - last->qbeg; // always non-negtive y = p->rbeg - last->rbeg; @@ -177,10 +176,8 @@ static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, c c->seeds = realloc(c->seeds, c->m * sizeof(mem_seed_t)); } c->seeds[c->n++] = *p; - if (bwa_verbose >= 5) printf("** appended\n"); return 1; - } else if (bwa_verbose >= 5) - printf("** new chain: %ld, %ld, %ld, %ld\n", (long)y, (long)abs(x-y), (long)(x - last->len), (long)(y - last->len)); + } return 0; // request to add a new chain } @@ -247,15 +244,6 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; - if (bwa_verbose >= 5) { - bwtint_t pos; - int is_rev, ref_id; - pos = bns_depos(global_bns, s.rbeg, &is_rev); - if (is_rev) pos -= s.len - 1; - bns_cnt_ambi(global_bns, pos, s.len, &ref_id); - printf("* Found SEED: length=%d,query_beg=%d,ref_beg=%ld; %s:%c%ld\n", s.len, s.qbeg, (long)s.rbeg, \ - global_bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - global_bns->anns[ref_id].offset) + 1); - } if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain @@ -285,84 +273,59 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int * Filtering chains * ********************/ -typedef struct { - int beg, end, w; - void *p, *p2; -} flt_aux_t; +#define chn_beg(ch) ((ch).seeds->qbeg) +#define chn_end(ch) ((ch).seeds[(ch).n-1].qbeg + (ch).seeds[(ch).n-1].len) #define flt_lt(a, b) ((a).w > (b).w) -KSORT_INIT(mem_flt, flt_aux_t, flt_lt) +KSORT_INIT(mem_flt, mem_chain_t, flt_lt) -int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *chains) +int 
mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) { - flt_aux_t *a; - int i, j, n; - if (n_chn <= 1) return n_chn; // no need to filter - for (i = j = 0; i < n_chn; ++i) { // filter out chains with small weight - mem_chain_t *c = &chains[i]; - int w; - w = mem_chain_weight(c); - if (w >= opt->min_chain_weight) - chains[j++] = *c; - } - n_chn = j; - if (n_chn == 0) return 0; - a = malloc(sizeof(flt_aux_t) * n_chn); - for (i = 0; i < n_chn; ++i) { - mem_chain_t *c = &chains[i]; - int w; - w = mem_chain_weight(c); - a[i].beg = c->seeds[0].qbeg; - a[i].end = c->seeds[c->n-1].qbeg + c->seeds[c->n-1].len; - a[i].w = w; a[i].p = c; a[i].p2 = 0; + int i, k; + kvec_t(int) chains = {0,0,0}; // this keeps int indices of the non-overlapping chains + if (n_chn == 0) return 0; // no need to filter + // compute the weight of each chain and drop chains with small weight + for (i = k = 0; i < n_chn; ++i) { + mem_chain_t *c = &a[i]; + c->first = -1; c->kept = 0; + c->w = mem_chain_weight(c); + if (c->w < opt->min_chain_weight) free(c->seeds); + else a[k++] = *c; } + n_chn = k; ks_introsort(mem_flt, n_chn, a); - { // reorder chains such that the best chain appears first - mem_chain_t *swap; - swap = malloc(sizeof(mem_chain_t) * n_chn); - for (i = 0; i < n_chn; ++i) { - swap[i] = *((mem_chain_t*)a[i].p); - a[i].p = &chains[i]; // as we will memcpy() below, a[i].p is changed - } - memcpy(chains, swap, sizeof(mem_chain_t) * n_chn); - free(swap); - } - for (i = 1, n = 1; i < n_chn; ++i) { - for (j = 0; j < n; ++j) { - int b_max = a[j].beg > a[i].beg? a[j].beg : a[i].beg; - int e_min = a[j].end < a[i].end? a[j].end : a[i].end; + // pairwise chain comparisons + kv_push(int, chains, 0); + for (i = 1; i < n_chn; ++i) { + for (k = 0; k < chains.n; ++k) { + int j = chains.a[k]; + int b_max = chn_beg(a[j]) > chn_beg(a[i])? chn_beg(a[j]) : chn_beg(a[i]); + int e_min = chn_end(a[j]) < chn_end(a[i])? 
chn_end(a[j]) : chn_end(a[i]); if (e_min > b_max) { // have overlap - int min_l = a[i].end - a[i].beg < a[j].end - a[j].beg? a[i].end - a[i].beg : a[j].end - a[j].beg; + int li = chn_end(a[i]) - chn_beg(a[i]); + int lj = chn_end(a[j]) - chn_beg(a[j]); + int min_l = li < lj? li : lj; if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap - if (a[j].p2 == 0) a[j].p2 = a[i].p; + if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) break; } } } - if (j == n) a[n++] = a[i]; // if have no significant overlap with better chains, keep it. + if (k == chains.n) kv_push(int, chains, i); } - for (i = 0; i < n; ++i) { // mark chains to be kept - mem_chain_t *c = (mem_chain_t*)a[i].p; - if (c->n > 0) c->n = -c->n; - c = (mem_chain_t*)a[i].p2; - if (c && c->n > 0) c->n = -c->n; + for (i = 0; i < chains.n; ++i) { + mem_chain_t *c = &a[chains.a[i]]; + c->kept = 2; + if (c->first >= 0) a[c->first].kept = 1; } - free(a); - for (i = 0; i < n_chn; ++i) { // free discarded chains - mem_chain_t *c = &chains[i]; - if (c->n >= 0) { - free(c->seeds); - c->n = c->m = 0; - } else c->n = -c->n; - } - for (i = n = 0; i < n_chn; ++i) { // squeeze out discarded chains - if (chains[i].n > 0) { - if (n != i) chains[n++] = chains[i]; - else ++n; - } + free(chains.a); + for (i = k = 0; i < n_chn; ++i) { // free discarded chains + mem_chain_t *c = &a[i]; + if (c->kept == 0) free(c->seeds); + else a[k++] = a[i]; } - return n; + return k; } /****************************** diff --git a/main.c b/main.c index 0ef8307..7a46a28 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r465" +#define PACKAGE_VERSION "0.7.8+dev-r466" #endif int bwa_fa2pac(int argc, char *argv[]); From 99f6f9a0d1ce4fbb776bff5aa1a976c1d1bcef25 Mon Sep 17 00:00:00 2001 From: Heng Li 
Date: Tue, 8 Apr 2014 21:45:49 -0400 Subject: [PATCH 483/498] dev-467: limit the max #chains to extend --- bwamem.c | 47 +++++++++++++++++++++++++++++++++-------------- bwamem.h | 3 ++- fastmap.c | 16 +++++++++------- main.c | 2 +- 4 files changed, 45 insertions(+), 23 deletions(-) diff --git a/bwamem.c b/bwamem.c index 602f666..bfb45d9 100644 --- a/bwamem.c +++ b/bwamem.c @@ -63,15 +63,15 @@ mem_opt_t *mem_opt_init() o->max_chain_gap = 10000; o->max_ins = 10000; o->mask_level = 0.50; - o->chain_drop_ratio = 0.50; + o->drop_ratio = 0.50; o->split_factor = 1.5; o->chunk_size = 10000000; o->n_threads = 1; o->max_matesw = 100; o->mask_level_redun = 0.95; o->min_chain_weight = 0; + o->max_chain_extend = 1<<30; o->mapQ_coef_len = 50; o->mapQ_coef_fac = log(o->mapQ_coef_len); -// o->mapQ_coef_len = o->mapQ_coef_fac = 0; bwa_fill_scmat(o->a, o->b, o->mat); return o; } @@ -98,7 +98,8 @@ static smem_aux_t *smem_aux_init() static void smem_aux_destroy(smem_aux_t *a) { - free(a->tmpv[0]->a); free(a->tmpv[1]->a); + free(a->tmpv[0]->a); free(a->tmpv[0]); + free(a->tmpv[1]->a); free(a->tmpv[1]); free(a->mem.a); free(a->mem1.a); free(a); } @@ -135,9 +136,9 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co ks_introsort(mem_intv, a->mem.n, a->mem.a); } -/******************************** - * Chaining while finding SMEMs * - ********************************/ +/************ + * Chaining * + ************/ typedef struct { int64_t rbeg; @@ -295,8 +296,10 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) n_chn = k; ks_introsort(mem_flt, n_chn, a); // pairwise chain comparisons + a[0].kept = 3; kv_push(int, chains, 0); for (i = 1; i < n_chn; ++i) { + int large_ovlp = 0; for (k = 0; k < chains.n; ++k) { int j = chains.a[k]; int b_max = chn_beg(a[j]) > chn_beg(a[i])? 
chn_beg(a[j]) : chn_beg(a[i]); @@ -306,25 +309,35 @@ int mem_chain_flt(const mem_opt_t *opt, int n_chn, mem_chain_t *a) int lj = chn_end(a[j]) - chn_beg(a[j]); int min_l = li < lj? li : lj; if (e_min - b_max >= min_l * opt->mask_level && min_l < opt->max_chain_gap) { // significant overlap + large_ovlp = 1; if (a[j].first < 0) a[j].first = i; // keep the first shadowed hit s.t. mapq can be more accurate - if (a[i].w < a[j].w * opt->chain_drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) + if (a[i].w < a[j].w * opt->drop_ratio && a[j].w - a[i].w >= opt->min_seed_len<<1) break; } } } - if (k == chains.n) kv_push(int, chains, i); + if (k == chains.n) { + kv_push(int, chains, i); + a[i].kept = large_ovlp? 2 : 3; + } } for (i = 0; i < chains.n; ++i) { mem_chain_t *c = &a[chains.a[i]]; - c->kept = 2; if (c->first >= 0) a[c->first].kept = 1; } free(chains.a); + for (i = k = 0; i < n_chn; ++i) { // don't extend more than opt->max_chain_extend .kept=1/2 chains + if (a[i].kept == 0 || a[i].kept == 3) continue; + if (++k >= opt->max_chain_extend) break; + } + for (; i < n_chn; ++i) + if (a[i].kept < 3) a[i].kept = 0; for (i = k = 0; i < n_chn; ++i) { // free discarded chains mem_chain_t *c = &a[i]; if (c->kept == 0) free(c->seeds); else a[k++] = a[i]; } + n_chn = k; return k; } @@ -890,7 +903,7 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa mem_aln_t *q; if (p->score < opt->T) continue; if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; - if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->chain_drop_ratio) continue; + if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); q->flag |= extra_flag; // flag secondary @@ -950,8 +963,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse return regs; } - -mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, 
const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) +mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar, const char *name) { mem_aln_t a; int i, w2, tmp, qb, qe, NM, score, is_rev, last_sc = -(1<<30), l_MD; @@ -971,8 +983,10 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { - fprintf(stderr, "[E::%s] If you see this message, please let the developer know. Abort. Sorry.\n", __func__); - exit(1); + if (name) fprintf(stderr, "[E::%s] Internal code inconsistency for read '%s'. Please contact the developer. Sorry.\n", __func__, name); + else fprintf(stderr, "[E::%s] Internal code inconsistency. Please contact the developer. 
Sorry.\n", __func__); + a.rid = -1; a.pos = -1; a.flag |= 0x4; + return a; } tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); @@ -1024,6 +1038,11 @@ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t * return a; } +mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) +{ + return mem_reg2aln2(opt, bns, pac, l_query, query_, ar, 0); +} + typedef struct { const mem_opt_t *opt; const bwt_t *bwt; diff --git a/bwamem.h b/bwamem.h index 7b7a7e8..a6d6aa9 100644 --- a/bwamem.h +++ b/bwamem.h @@ -32,6 +32,7 @@ typedef struct { int flag; // see MEM_F_* macros int min_seed_len; // minimum seed length int min_chain_weight; + int max_chain_extend; float split_factor; // split into a seed if MEM is longer than min_seed_len*split_factor int split_width; // split into a seed if its occurence is smaller than this value int max_occ; // skip a seed if its occurence is larger than this value @@ -39,7 +40,7 @@ typedef struct { int n_threads; // number of threads int chunk_size; // process chunk_size-bp sequences in a batch float mask_level; // regard a hit as redundant if the overlap with another better hit is over mask_level times the min length of the two hits - float chain_drop_ratio; // drop a chain if its seed coverage is below chain_drop_ratio times the seed coverage of a better chain overlapping with the small chain + float drop_ratio; // drop a chain if its seed coverage is below drop_ratio times the seed coverage of a better chain overlapping with the small chain float mask_level_redun; float mapQ_coef_len; int mapQ_coef_fac; diff --git a/fastmap.c b/fastmap.c index 7dc8001..f0f92b8 100644 --- a/fastmap.c +++ b/fastmap.c @@ -53,7 +53,7 @@ int main_mem(int argc, char *argv[]) opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, 
argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:")) >= 0) { + while ((c = getopt(argc, argv, "epaFMCSPHk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'x') mode = optarg; else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; @@ -73,10 +73,11 @@ int main_mem(int argc, char *argv[]) else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; - else if (c == 'D') opt->chain_drop_ratio = atof(optarg), opt0.chain_drop_ratio = 1.; + else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; - else if (c == 'N') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; + else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; + else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; else if (c == 'C') copy_comment = 1; else if (c == 'Q') { @@ -132,7 +133,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); // fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->chain_drop_ratio); + fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); fprintf(stderr, " -W INT discard a chain if seeded bases shorter than 
INT [0]\n"); fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); fprintf(stderr, " -S skip mate rescue\n"); @@ -168,7 +169,7 @@ int main_mem(int argc, char *argv[]) } if (mode) { - if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread") == 0) { + if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0) { if (!opt0.a) opt->a = 2, opt0.a = 1; update_a(opt, &opt0); if (!opt0.o_del) opt->o_del = 2; @@ -177,13 +178,14 @@ int main_mem(int argc, char *argv[]) if (!opt0.e_ins) opt->e_ins = 1; if (!opt0.max_occ) opt->max_occ = 1000; if (opt0.split_factor == 0.) opt->split_factor = 10.; - if (strcmp(mode, "pbread") == 0) { + if (strcmp(mode, "pbread1") == 0) { opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; if (!opt0.b) opt->b = 5; if (!opt0.w) opt->w = 100; if (!opt0.min_seed_len) opt->min_seed_len = 13; if (!opt0.min_chain_weight) opt->min_chain_weight = 30; - if (opt0.chain_drop_ratio == 0.) opt->chain_drop_ratio = .02; + if (!opt0.max_chain_extend) opt->max_chain_extend = 20; + if (opt0.drop_ratio == 0.) 
opt->drop_ratio = .01; } else { if (!opt0.b) opt->b = 7; if (!opt0.w) opt->w = 200; diff --git a/main.c b/main.c index 7a46a28..cbf3bef 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r466" +#define PACKAGE_VERSION "0.7.8+dev-r467" #endif int bwa_fa2pac(int argc, char *argv[]); From d766591c1e8ea661fc758d08eb2fc1edc30c2fcf Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Apr 2014 22:11:36 -0400 Subject: [PATCH 484/498] dev-468: fixed a segfault caused by NULL --- fastmap.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fastmap.c b/fastmap.c index f0f92b8..f3dd2f6 100644 --- a/fastmap.c +++ b/fastmap.c @@ -245,7 +245,7 @@ int main_mem(int argc, char *argv[]) mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); n_processed += n; for (i = 0; i < n; ++i) { - err_fputs(seqs[i].sam, stdout); + if (seqs[i].sam) err_fputs(seqs[i].sam, stdout); free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); } free(seqs); diff --git a/main.c b/main.c index cbf3bef..b100b8d 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r467" +#define PACKAGE_VERSION "0.7.8+dev-r468" #endif int bwa_fa2pac(int argc, char *argv[]); From db58392e9baa7606a6c9d6b48ba4e26f0b89cc7a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 9 Apr 2014 13:20:04 -0400 Subject: [PATCH 485/498] dev-469: fixed wrong command line prompt --- fastmap.c | 6 +++--- main.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fastmap.c b/fastmap.c index f3dd2f6..9bdacb4 100644 --- a/fastmap.c +++ b/fastmap.c @@ -147,7 +147,7 @@ int main_mem(int argc, char *argv[]) fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); fprintf(stderr, " -x STR read type. 
Setting -x changes multiple parameters unless overriden [null]\n"); fprintf(stderr, " pacbio: -k17 -W40 -w200 -c1000 -r10 -A2 -B7 -O2 -E1 -L0\n"); - fprintf(stderr, " pbread: -k13 -W30 -w100 -c1000 -r10 -A2 -B5 -O2 -E1 -aeD.02\n"); + fprintf(stderr, " pbread: -k13 -W30 -w100 -c1000 -r10 -A2 -B5 -O2 -E1 -N20 -FeaD.01\n"); fprintf(stderr, "\nInput/output options:\n\n"); fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); @@ -169,7 +169,7 @@ int main_mem(int argc, char *argv[]) } if (mode) { - if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0) { + if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { if (!opt0.a) opt->a = 2, opt0.a = 1; update_a(opt, &opt0); if (!opt0.o_del) opt->o_del = 2; @@ -178,7 +178,7 @@ int main_mem(int argc, char *argv[]) if (!opt0.e_ins) opt->e_ins = 1; if (!opt0.max_occ) opt->max_occ = 1000; if (opt0.split_factor == 0.) opt->split_factor = 10.; - if (strcmp(mode, "pbread1") == 0) { + if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; if (!opt0.b) opt->b = 5; if (!opt0.w) opt->w = 100; diff --git a/main.c b/main.c index b100b8d..91f28bc 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r468" +#define PACKAGE_VERSION "0.7.8+dev-r469" #endif int bwa_fa2pac(int argc, char *argv[]); From ccbbe48c4f4124623551e1c72ad257a7040b8ad3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 11:43:17 -0400 Subject: [PATCH 486/498] dev-470: don't stop on bwa_fix_xref2() failures Peter Field has sent me an example caused by an alignment bridging three adjacent chromosomes/contigs. 
Bwa-mem always aligns the query to the contig covering the middle point of the alignment. In this example, it chooses the middle contig, which should not be aligned. This leads to weird things failing bwa_fix_xref2(), which cannot be fixed unless we build the contig boundaries into the FM-index. In the old code, bwa-mem halts when bwa_fix_xref2() fails. With this commit, bwa-mem will give a warning instead of halting. --- bwa.c | 7 ++++--- bwamem.c | 27 +++++++++++++++++++++------ bwamem.h | 1 + bwamem_extra.c | 5 +++-- main.c | 2 +- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/bwa.c b/bwa.c index 0e9e606..08881c0 100644 --- a/bwa.c +++ b/bwa.c @@ -178,9 +178,10 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) { - int is_rev; - int64_t cb, ce, fm; + int is_rev, ori_ql = *qe - *qb; + int64_t cb, ce, fm, ori_rl = *re - *rb; bntann1_t *ra; + assert(ori_ql > 0 && ori_rl > 0); if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here *qb = *qe = *rb = *re = -1; return -1; // unable to fix @@ -218,7 +219,7 @@ int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_i } free(cigar); } - return (*qb == *qe || *rb == *re)? -2 : 0; + return (*qe - *qb < .33 * ori_ql || *re - *rb < .33 * ori_rl)? 
-2 : 0; } int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) diff --git a/bwamem.c b/bwamem.c index bfb45d9..9a58968 100644 --- a/bwamem.c +++ b/bwamem.c @@ -905,7 +905,11 @@ void mem_reg2sam_se(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pa if (p->secondary >= 0 && !(opt->flag&MEM_F_ALL)) continue; if (p->secondary >= 0 && p->score < a->a[p->secondary].score * opt->drop_ratio) continue; q = kv_pushp(mem_aln_t, aa); - *q = mem_reg2aln(opt, bns, pac, s->l_seq, s->seq, p); + *q = mem_reg2aln2(opt, bns, pac, s->l_seq, s->seq, p, s->name); + if (q->rid < 0) { + --aa.n; + continue; + } q->flag |= extra_flag; // flag secondary if (p->secondary >= 0) q->sub = -1; // don't output sub-optimal score if (k && p->secondary < 0) // if supplementary @@ -982,11 +986,10 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { - if (name) fprintf(stderr, "[E::%s] Internal code inconsistency for read '%s'. Please contact the developer. Sorry.\n", __func__, name); - else fprintf(stderr, "[E::%s] Internal code inconsistency. Please contact the developer. 
Sorry.\n", __func__); - a.rid = -1; a.pos = -1; a.flag |= 0x4; - return a; + if ((ret = bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re)) < 0) { + if (bwa_verbose >= 2 && name) + fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name); + goto err_reg2aln; } tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); @@ -1002,6 +1005,11 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); + if (score < 0) { + if (bwa_verbose >= 2 && name) + fprintf(stderr, "[W::%s] A hit to read '%s' has been dropped.\n", __func__, name); + goto err_reg2aln; + } l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); @@ -1036,6 +1044,13 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t a.score = ar->score; a.sub = ar->sub > ar->csub? 
ar->sub : ar->csub; free(query); return a; + +err_reg2aln: + free(a.cigar); + memset(&a, 0, sizeof(mem_aln_t)); + a.rid = -1; a.pos = -1; a.flag |= 0x4; + free(query); + return a; } mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) diff --git a/bwamem.h b/bwamem.h index a6d6aa9..7cfe8b8 100644 --- a/bwamem.h +++ b/bwamem.h @@ -148,6 +148,7 @@ extern "C" { * @return CIGAR, strand, mapping quality and forward-strand position */ mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar); + mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_seq, const char *seq, const mem_alnreg_t *ar, const char *name); /** * Infer the insert size distribution from interleaved alignment regions diff --git a/bwamem_extra.c b/bwamem_extra.c index 58e9f67..157026c 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -89,8 +89,9 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int is_rev, rid, qb = p->qb, qe = p->qe; int64_t pos, rb = p->rb, re = p->re; if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) { - fprintf(stderr, "[E::%s] Internal errors when processing read '%s'. Please let the developer know. Abort. Sorry.\n", __func__, s->name); - exit(1); + if (bwa_verbose >= 2) + fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, s->name); + continue; } pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); rid = bns_pos2rid(bns, pos); diff --git a/main.c b/main.c index 91f28bc..dec0b04 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r469" +#define PACKAGE_VERSION "0.7.8+dev-r470" #endif int bwa_fa2pac(int argc, char *argv[]); From 23e0e99ec0744fd70f48dbac8ecbe73d6b219f97 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 11:54:17 -0400 Subject: [PATCH 487/498] dev-471: fixed a compiling error from last commit --- bwamem.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem.c b/bwamem.c index 9a58968..189e58b 100644 --- a/bwamem.c +++ b/bwamem.c @@ -986,7 +986,7 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - if ((ret = bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re)) < 0) { + if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { if (bwa_verbose >= 2 && name) fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name); goto err_reg2aln; diff --git a/main.c b/main.c index dec0b04..6a43b5f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r470" +#define PACKAGE_VERSION "0.7.8+dev-r471" #endif int bwa_fa2pac(int argc, char *argv[]); From 8638cfadc875a44f7db5ee822e71f7cddb5221fa Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 20:54:27 -0400 Subject: [PATCH 488/498] dev-472: get rid of bwa_fix_xref() This function causes all kinds of problems when the reference genome consists of many short reads/contigs/chromosomes. 
Some of the problems are nearly unfixable at the point where bwa_fix_xref() gets called. This commit attempts to fix the problem at the root. It disallows chains spanning multiple contigs and never retrieves sequences bridging two adjacent contigs. Thus all the chaining, extension, SW and global alignments are confined to one contig only. This commit brings many changes. I have tested it on a couple examples including Peter Field's PacBio example. It works well so far. --- bntseq.c | 32 +++++++++++++++++++++ bntseq.h | 3 ++ bwa.c | 51 --------------------------------- bwa.h | 2 -- bwamem.c | 77 ++++++++++++++++++++++---------------------------- bwamem.h | 1 + bwamem_extra.c | 5 ---- bwamem_pair.c | 16 ++++++----- main.c | 2 +- 9 files changed, 79 insertions(+), 110 deletions(-) diff --git a/bntseq.c b/bntseq.c index e1cd323..b63dff4 100644 --- a/bntseq.c +++ b/bntseq.c @@ -329,6 +329,15 @@ int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) return mid; } +int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) +{ + int is_rev, rid_b, rid_e; + if (rb < bns->l_pac && re > bns->l_pac) return -2; + rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); + rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1); + return rid_b == rid_e? 
rid_b : -1; +} + int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) { int left, mid, right, nn; @@ -374,3 +383,26 @@ uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end } else *len = 0; // if bridging the forward-reverse boundary, return nothing return seq; } + +uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) +{ + int64_t far_beg, far_end, len; + int is_rev; + uint8_t *seq; + + if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap + assert(*beg <= mid && mid < *end); + *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); + far_beg = bns->anns[*rid].offset; + far_end = far_beg + bns->anns[*rid].len; + if (is_rev) { // flip to the reverse strand + int64_t tmp = far_beg; + far_beg = (bns->l_pac<<1) - 1 - far_end; + far_end = (bns->l_pac<<1) - 1 - tmp; + } + *beg = *beg > far_beg? *beg : far_beg; + *end = *end < far_end? *end : far_end; + seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); + assert(seq && *end - *beg == len); // assertion failure should never happen + return seq; +} diff --git a/bntseq.h b/bntseq.h index 4061438..6437cf6 100644 --- a/bntseq.h +++ b/bntseq.h @@ -28,6 +28,7 @@ #ifndef BWT_BNTSEQ_H #define BWT_BNTSEQ_H +#include <assert.h> #include <stdint.h> #include <stdio.h> #include <zlib.h> @@ -75,6 +76,8 @@ extern "C" { int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); + uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); + int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); #ifdef __cplusplus } diff --git a/bwa.c b/bwa.c index 08881c0..db3b947 100644 --- a/bwa.c +++ b/bwa.c @@ -176,57 +176,6 @@ uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pa return 
bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); } -int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) -{ - int is_rev, ori_ql = *qe - *qb; - int64_t cb, ce, fm, ori_rl = *re - *rb; - bntann1_t *ra; - assert(ori_ql > 0 && ori_rl > 0); - if (*rb < bns->l_pac && *re > bns->l_pac) { // cross the for-rev boundary; actually with BWA-MEM, we should never come to here - *qb = *qe = *rb = *re = -1; - return -1; // unable to fix - } - fm = bns_depos(bns, (*rb + *re) >> 1, &is_rev); // coordinate of the middle point on the forward strand - ra = &bns->anns[bns_pos2rid(bns, fm)]; // annotation of chr corresponding to the middle point - cb = is_rev? (bns->l_pac<<1) - (ra->offset + ra->len) : ra->offset; // chr start on the mapping strand - ce = cb + ra->len; // chr end - if (cb > *rb || ce < *re) { // fix is needed - int i, score, n_cigar, y, NM; - uint32_t *cigar; - int64_t x; - cb = cb > *rb? cb : *rb; - ce = ce < *re? ce : *re; - cigar = bwa_gen_cigar2(mat, o_del, e_del, o_ins, e_ins, w, bns->l_pac, pac, *qe - *qb, query + *qb, *rb, *re, &score, &n_cigar, &NM); - for (i = 0, x = *rb, y = *qb; i < n_cigar; ++i) { - int op = cigar[i]&0xf, len = cigar[i]>>4; - if (op == 0) { - if (x <= cb && cb < x + len) - *qb = y + (cb - x), *rb = cb; - if (x < ce && ce <= x + len) { - *qe = y + (ce - x), *re = ce; - break; - } else x += len, y += len; - } else if (op == 1) { - y += len; - } else if (op == 2) { - if (x <= cb && cb < x + len) - *qb = y, *rb = x + len; - if (x < ce && ce <= x + len) { - *qe = y, *re = x; - break; - } else x += len; - } else abort(); // should not be here - } - free(cigar); - } - return (*qe - *qb < .33 * ori_ql || *re - *rb < .33 * ori_rl)? 
-2 : 0; -} - -int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re) -{ - return bwa_fix_xref2(mat, q, r, q, r, w, bns, pac, query, qb, qe, rb, re); -} - /********************* * Full index reader * *********************/ diff --git a/bwa.h b/bwa.h index 8d46e58..bbc2525 100644 --- a/bwa.h +++ b/bwa.h @@ -33,8 +33,6 @@ extern "C" { void bwa_fill_scmat(int a, int b, int8_t mat[25]); uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); - int bwa_fix_xref(const int8_t mat[25], int q, int r, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); - int bwa_fix_xref2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w, const bntseq_t *bns, const uint8_t *pac, uint8_t *query, int *qb, int *qe, int64_t *rb, int64_t *re); char *bwa_idx_infer_prefix(const char *hint); bwt_t *bwa_idx_load_bwt(const char *hint); diff --git a/bwamem.c b/bwamem.c index 189e58b..fe83f93 100644 --- a/bwamem.c +++ b/bwamem.c @@ -143,11 +143,11 @@ static void mem_collect_intv(const mem_opt_t *opt, const bwt_t *bwt, int len, co typedef struct { int64_t rbeg; int32_t qbeg, len; -} mem_seed_t; +} mem_seed_t; // unaligned memory typedef struct { - int n, m, first; - uint32_t w:30, kept:2; + int n, m, first, rid; + int w, kept; int64_t pos; mem_seed_t *seeds; } mem_chain_t; @@ -160,12 +160,13 @@ typedef struct { size_t n, m; mem_chain_t *a; } mem_chain_v; KBTREE_INIT(chn, mem_chain_t, chain_cmp) // return 1 if the seed is merged into the chain -static int 
test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p) +static int test_and_merge(const mem_opt_t *opt, int64_t l_pac, mem_chain_t *c, const mem_seed_t *p, int seed_rid) { int64_t qend, rend, x, y; const mem_seed_t *last = &c->seeds[c->n-1]; qend = last->qbeg + last->len; rend = last->rbeg + last->len; + if (seed_rid != c->rid) return 0; // different chr; request a new chain if (p->qbeg >= c->seeds[0].qbeg && p->qbeg + p->len <= qend && p->rbeg >= c->seeds[0].rbeg && p->rbeg + p->len <= rend) return 1; // contained seed; do nothing if ((last->rbeg < l_pac || c->seeds[0].rbeg < l_pac) && p->rbeg >= l_pac) return 0; // don't chain if on different strand @@ -220,9 +221,10 @@ void mem_print_chain(const bntseq_t *bns, mem_chain_v *chn) } } -mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int len, const uint8_t *seq) +mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bns, int len, const uint8_t *seq) { int i; + int64_t l_pac = bns->l_pac; mem_chain_v chain; kbtree_t(chn) *tree; smem_aux_t *aux; @@ -241,19 +243,21 @@ mem_chain_v mem_chain(const mem_opt_t *opt, const bwt_t *bwt, int64_t l_pac, int for (k = 0; k < p->x[2]; ++k) { mem_chain_t tmp, *lower, *upper; mem_seed_t s; - int to_add = 0; + int rid, to_add = 0; s.rbeg = tmp.pos = bwt_sa(bwt, p->x[0] + k); // this is the base coordinate in the forward-reverse reference s.qbeg = p->info>>32; s.len = slen; - if (s.rbeg < l_pac && l_pac < s.rbeg + s.len) continue; // bridging forward-reverse boundary; skip + rid = bns_intv2rid(bns, s.rbeg, s.rbeg + s.len); + if (rid < 0) continue; // bridging multiple reference sequences or the forward-reverse boundary if (kb_size(tree)) { kb_intervalp(chn, tree, &tmp, &lower, &upper); // find the closest chain - if (!lower || !test_and_merge(opt, l_pac, lower, &s)) to_add = 1; + if (!lower || !test_and_merge(opt, l_pac, lower, &s, rid)) to_add = 1; } else to_add = 1; if (to_add) { // add the seed as 
a new chain tmp.n = 1; tmp.m = 4; tmp.seeds = calloc(tmp.m, sizeof(mem_seed_t)); tmp.seeds[0] = s; + tmp.rid = rid; kb_putp(chn, tree, &tmp); } } @@ -473,11 +477,11 @@ void mem_mark_primary_se(const mem_opt_t *opt, int n, mem_alnreg_t *a, int64_t i * low-divergence sequences, more testing is needed. For now, I only recommend * to use mem_test_chain_sw() for PacBio data. It is disabled by default. */ -int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) +int mem_test_chain_sw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c) { - int i, qb, qe; + int i, qb, qe, rid; int min_HSP_score = (int)(opt->min_chain_weight * opt->a * MEM_HSP_COEF + .499); - int64_t rb, re, rlen; + int64_t rb, re, l_pac = bns->l_pac; uint8_t *rseq = 0; kswr_t x; @@ -505,8 +509,8 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - assert(rlen == re - rb); + rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid); + assert(c->rid == rid); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, KSW_XSTART, 0); free(rseq); if (x.score >= min_HSP_score) return 1; @@ -514,10 +518,10 @@ int mem_test_chain_sw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, i return 0; } -int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +int mem_chain2aln_short(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { - int i, qb, qe, xtra; - int64_t rb, re, rlen; + int i, qb, qe, xtra, rid; + 
int64_t rb, re, l_pac = bns->l_pac; uint8_t *rseq = 0; mem_alnreg_t a; kswr_t x; @@ -547,8 +551,8 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, if (qe - qb >= opt->w * 4 || re - rb >= opt->w * 4) return 1; if (qe - qb >= MEM_SHORT_LEN || re - rb >= MEM_SHORT_LEN) return 1; - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - assert(rlen == re - rb); + rseq = bns_fetch_seq(bns, pac, &rb, c->seeds[0].rbeg, &re, &rid); + assert(c->rid == rid); xtra = KSW_XSUBO | KSW_XSTART | ((qe - qb) * opt->a < 250? KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); x = ksw_align2(qe - qb, (uint8_t*)query + qb, re - rb, rseq, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); free(rseq); @@ -556,6 +560,7 @@ int mem_chain2aln_short(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, a.qb = qb + x.qb; a.qe = qb + x.qe + 1; a.score = x.score; a.csub = x.score2; + a.rid = c->rid; if (bwa_verbose >= 4) printf("** Attempted alignment via mem_chain2aln_short(): [%d,%d) <=> [%ld,%ld); score=%d; %d/%d\n", a.qb, a.qe, (long)a.rb, (long)a.re, x.score, a.qe-a.qb, qe-qb); if (x.tb < MEM_SHORT_EXT>>1 || x.te > re - rb - (MEM_SHORT_EXT>>1)) return 1; kv_push(mem_alnreg_t, *av, a); @@ -571,10 +576,10 @@ static inline int cal_max_gap(const mem_opt_t *opt, int qlen) return l < opt->w<<1? 
l : opt->w<<1; } -void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) +void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const uint8_t *query, const mem_chain_t *c, mem_alnreg_v *av) { - int i, k, max_off[2], aw[2]; // aw: actual bandwidth used in extension - int64_t rlen, rmax[2], tmp, max = 0; + int i, k, rid, max_off[2], aw[2]; // aw: actual bandwidth used in extension + int64_t l_pac = bns->l_pac, rmax[2], tmp, max = 0; const mem_seed_t *s; uint8_t *rseq = 0; uint64_t *srt; @@ -598,8 +603,8 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int else rmax[0] = l_pac; } // retrieve the reference sequence - rseq = bns_get_seq(l_pac, pac, rmax[0], rmax[1], &rlen); - assert(rlen == rmax[1] - rmax[0]); + rseq = bns_fetch_seq(bns, pac, &rmax[0], c->seeds[0].rbeg, &rmax[1], &rid); + assert(c->rid == rid); srt = malloc(c->n * 8); for (i = 0; i < c->n; ++i) @@ -649,6 +654,7 @@ void mem_chain2aln(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, int memset(a, 0, sizeof(mem_alnreg_t)); a->w = aw[0] = aw[1] = opt->w; a->score = a->truesc = -1; + a->rid = c->rid; if (bwa_verbose >= 4) err_printf("** ---> Extending from seed(%d) [%ld;%ld,%ld] <---\n", k, (long)s->len, (long)s->qbeg, (long)s->rbeg); if (s->qbeg) { // left extension @@ -939,7 +945,7 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse for (i = 0; i < l_seq; ++i) // convert to 2-bit encoding if we have not done so seq[i] = seq[i] < 4? 
seq[i] : nst_nt4_table[(int)seq[i]]; - chn = mem_chain(opt, bwt, bns->l_pac, l_seq, (uint8_t*)seq); + chn = mem_chain(opt, bwt, bns, l_seq, (uint8_t*)seq); chn.n = mem_chain_flt(opt, chn.n, chn.a); if (bwa_verbose >= 4) mem_print_chain(bns, &chn); @@ -948,9 +954,9 @@ mem_alnreg_v mem_align1_core(const mem_opt_t *opt, const bwt_t *bwt, const bntse mem_chain_t *p = &chn.a[i]; int ret; if (bwa_verbose >= 4) err_printf("* ---> Processing chain(%d) <---\n", i); - if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p); - else ret = mem_chain2aln_short(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); - if (ret > 0) mem_chain2aln(opt, bns->l_pac, pac, l_seq, (uint8_t*)seq, p, &regs); + if (opt->min_chain_weight > 0) ret = mem_test_chain_sw(opt, bns, pac, l_seq, (uint8_t*)seq, p); + else ret = mem_chain2aln_short(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs); + if (ret > 0) mem_chain2aln(opt, bns, pac, l_seq, (uint8_t*)seq, p, &regs); free(chn.a[i].seeds); } free(chn.a); @@ -986,11 +992,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t query[i] = query_[i] < 5? query_[i] : nst_nt4_table[(int)query_[i]]; a.mapq = ar->secondary < 0? mem_approx_mapq_se(opt, ar) : 0; if (ar->secondary >= 0) a.flag |= 0x100; // secondary alignment - if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)query, &qb, &qe, &rb, &re) < 0) { - if (bwa_verbose >= 2 && name) - fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, name); - goto err_reg2aln; - } tmp = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_del, opt->e_del); w2 = infer_bw(qe - qb, re - rb, ar->truesc, opt->a, opt->o_ins, opt->e_ins); w2 = w2 > tmp? 
w2 : tmp; @@ -1005,11 +1006,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t last_sc = score; w2 <<= 1; } while (++i < 3 && score < ar->truesc - opt->a); - if (score < 0) { - if (bwa_verbose >= 2 && name) - fprintf(stderr, "[W::%s] A hit to read '%s' has been dropped.\n", __func__, name); - goto err_reg2aln; - } l_MD = strlen((char*)(a.cigar + a.n_cigar)) + 1; a.NM = NM; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); @@ -1044,13 +1040,6 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t a.score = ar->score; a.sub = ar->sub > ar->csub? ar->sub : ar->csub; free(query); return a; - -err_reg2aln: - free(a.cigar); - memset(&a, 0, sizeof(mem_aln_t)); - a.rid = -1; a.pos = -1; a.flag |= 0x4; - free(query); - return a; } mem_aln_t mem_reg2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int l_query, const char *query_, const mem_alnreg_t *ar) diff --git a/bwamem.h b/bwamem.h index 7cfe8b8..53472fe 100644 --- a/bwamem.h +++ b/bwamem.h @@ -52,6 +52,7 @@ typedef struct { typedef struct { int64_t rb, re; // [rb,re): reference sequence in the alignment int qb, qe; // [qb,qe): query sequence in the alignment + int rid; // reference seq ID int score; // best local SW score int truesc; // actual score corresponding to the aligned region; possibly smaller than $score int sub; // 2nd best SW score diff --git a/bwamem_extra.c b/bwamem_extra.c index 157026c..aee1eb4 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -88,11 +88,6 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_alnreg_t *p = &a->a[i]; int is_rev, rid, qb = p->qb, qe = p->qe; int64_t pos, rb = p->rb, re = p->re; - if (bwa_fix_xref2(opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, opt->w, bns, pac, (uint8_t*)s->seq, &qb, &qe, &rb, &re) < 0) { - if (bwa_verbose >= 2) - fprintf(stderr, "[W::%s] A cross-chr hit of read '%s' has been dropped.\n", __func__, s->name); - 
continue; - } pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); rid = bns_pos2rid(bns, pos); pos -= bns->anns[rid].offset; diff --git a/bwamem_pair.c b/bwamem_pair.c index 4a7cdf3..cec25da 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -106,10 +106,11 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * } } -int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) +int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], const mem_alnreg_t *a, int l_ms, const uint8_t *ms, mem_alnreg_v *ma) { extern int mem_sort_and_dedup(int n, mem_alnreg_t *a, float mask_level_redun); - int i, r, skip[4], n = 0; + int64_t l_pac = bns->l_pac; + int i, r, skip[4], n = 0, rid; for (r = 0; r < 4; ++r) skip[r] = pes[r].failed? 1 : 0; for (i = 0; i < ma->n; ++i) { // check which orinentation has been found @@ -122,7 +123,7 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me for (r = 0; r < 4; ++r) { int is_rev, is_larger; uint8_t *seq, *rev = 0, *ref; - int64_t rb, re, len; + int64_t rb, re; if (skip[r]) continue; is_rev = (r>>1 != (r&1)); // whether to reverse complement the mate is_larger = !(r>>1); // whether the mate has larger coordinate @@ -140,14 +141,15 @@ int mem_matesw(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const me } if (rb < 0) rb = 0; if (re > l_pac<<1) re = l_pac<<1; - ref = bns_get_seq(l_pac, pac, rb, re, &len); - if (len == re - rb) { // no funny things happening + ref = bns_fetch_seq(bns, pac, &rb, (rb+re)>>1, &re, &rid); + if (a->rid == rid) { // no funny things happening kswr_t aln; mem_alnreg_t b; int tmp, xtra = KSW_XSUBO | KSW_XSTART | (l_ms * opt->a < 250? 
KSW_XBYTE : 0) | (opt->min_seed_len * opt->a); - aln = ksw_align2(l_ms, seq, len, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); + aln = ksw_align2(l_ms, seq, re - rb, ref, 5, opt->mat, opt->o_del, opt->e_del, opt->o_ins, opt->e_ins, xtra, 0); memset(&b, 0, sizeof(mem_alnreg_t)); if (aln.score >= opt->min_seed_len && aln.qb >= 0) { // something goes wrong if aln.qb < 0 + b.rid = a->rid; b.qb = is_rev? l_ms - (aln.qe + 1) : aln.qb; b.qe = is_rev? l_ms - aln.qb : aln.qe + 1; b.rb = is_rev? (l_pac<<1) - (rb + aln.te + 1) : rb + aln.tb; @@ -258,7 +260,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co kv_push(mem_alnreg_t, b[i], a[i].a[j]); for (i = 0; i < 2; ++i) for (j = 0; j < b[i].n && j < opt->max_matesw; ++j) - n += mem_matesw(opt, bns->l_pac, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); + n += mem_matesw(opt, bns, pac, pes, &b[i].a[j], s[!i].l_seq, (uint8_t*)s[!i].seq, &a[!i]); free(b[0].a); free(b[1].a); } mem_mark_primary_se(opt, a[0].n, a[0].a, id<<1|0); diff --git a/main.c b/main.c index 6a43b5f..49cc9ae 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r471" +#define PACKAGE_VERSION "0.7.8+dev-r472" #endif int bwa_fa2pac(int argc, char *argv[]); From f02cd42679af4cb15a119e018b9987ecafa881d7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 21:03:13 -0400 Subject: [PATCH 489/498] dev-473: added a few assertions to make sure the new change works as is expected --- bwamem.c | 1 + bwamem_extra.c | 1 + main.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bwamem.c b/bwamem.c index fe83f93..346715c 100644 --- a/bwamem.c +++ b/bwamem.c @@ -1036,6 +1036,7 @@ mem_aln_t mem_reg2aln2(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t } } a.rid = bns_pos2rid(bns, pos); + assert(a.rid == ar->rid); a.pos = pos - bns->anns[a.rid].offset; a.score = ar->score; a.sub = ar->sub 
> ar->csub? ar->sub : ar->csub; free(query); diff --git a/bwamem_extra.c b/bwamem_extra.c index aee1eb4..2eb87d6 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -90,6 +90,7 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int64_t pos, rb = p->rb, re = p->re; pos = bns_depos(bns, rb < bns->l_pac? rb : re - 1, &is_rev); rid = bns_pos2rid(bns, pos); + assert(rid == a->rid); pos -= bns->anns[rid].offset; kputs(s->name, &str); kputc('\t', &str); kputw(s->l_seq, &str); kputc('\t', &str); diff --git a/main.c b/main.c index 49cc9ae..d78832b 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r472" +#define PACKAGE_VERSION "0.7.8+dev-r473" #endif int bwa_fa2pac(int argc, char *argv[]); From e80bccc9239c5a87e0580aff12ff62dfff830d63 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 21:04:02 -0400 Subject: [PATCH 490/498] dev-474: fixed a typo --- bwamem_extra.c | 2 +- main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bwamem_extra.c b/bwamem_extra.c index 2eb87d6..e717d7b 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -90,7 +90,7 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, int64_t pos, rb = p->rb, re = p->re; pos = bns_depos(bns, rb < bns->l_pac? 
rb : re - 1, &is_rev); rid = bns_pos2rid(bns, pos); - assert(rid == a->rid); + assert(rid == p->rid); pos -= bns->anns[rid].offset; kputs(s->name, &str); kputc('\t', &str); kputw(s->l_seq, &str); kputc('\t', &str); diff --git a/main.c b/main.c index d78832b..2cf9471 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r473" +#define PACKAGE_VERSION "0.7.8+dev-r474" #endif int bwa_fa2pac(int argc, char *argv[]); From 07182d906144db796617be4c203834d5b304dc68 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 21:09:06 -0400 Subject: [PATCH 491/498] dev-475: -F outputs unit score, not raw score --- bwamem_extra.c | 3 ++- main.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bwamem_extra.c b/bwamem_extra.c index e717d7b..96cdbcd 100644 --- a/bwamem_extra.c +++ b/bwamem_extra.c @@ -99,7 +99,8 @@ void mem_reg2ovlp(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, kputs(bns->anns[rid].name, &str); kputc('\t', &str); kputw(bns->anns[rid].len, &str); kputc('\t', &str); kputw(pos, &str); kputc('\t', &str); kputw(pos + (re - rb), &str); kputc('\t', &str); - kputw(p->truesc, &str); kputc('\n', &str); + ksprintf(&str, "%.3f", (double)p->truesc / opt->a / (qe - qb > re - rb? qe - qb : re - rb)); + kputc('\n', &str); } s->sam = str.s; } diff --git a/main.c b/main.c index 2cf9471..a64fd5f 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r474" +#define PACKAGE_VERSION "0.7.8+dev-r475" #endif int bwa_fa2pac(int argc, char *argv[]); From 6fda93502f3e326ed04b5cb06c4b17069e71bed0 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 10 Apr 2014 21:38:14 -0400 Subject: [PATCH 492/498] r705: pairing performed on one chr only Change of versioning: the revision number is acquired with: git rev-list --all --count This counts the total number of commits across all branches. 
--- bwamem_pair.c | 7 +++++-- main.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bwamem_pair.c b/bwamem_pair.c index cec25da..bbd2cdb 100644 --- a/bwamem_pair.c +++ b/bwamem_pair.c @@ -58,6 +58,7 @@ void mem_pestat(const mem_opt_t *opt, int64_t l_pac, int n, const mem_alnreg_v * if (r[0]->n == 0 || r[1]->n == 0) continue; if (cal_sub(opt, r[0]) > MIN_RATIO * r[0]->a[0].score) continue; if (cal_sub(opt, r[1]) > MIN_RATIO * r[1]->a[0].score) continue; + if (r[0]->a[0].rid != r[1]->a[0].rid) continue; // not on the same chr dir = mem_infer_dir(l_pac, r[0]->a[0].rb, r[1]->a[0].rb, &is); if (is && is <= opt->max_ins) kv_push(uint64_t, isize[dir], is); } @@ -176,16 +177,18 @@ int mem_matesw(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co return n; } -int mem_pair(const mem_opt_t *opt, int64_t l_pac, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) +int mem_pair(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, const mem_pestat_t pes[4], bseq1_t s[2], mem_alnreg_v a[2], int id, int *sub, int *n_sub, int z[2]) { pair64_v v, u; int r, i, k, y[4], ret; // y[] keeps the last hit + int64_t l_pac = bns->l_pac; kv_init(v); kv_init(u); for (r = 0; r < 2; ++r) { // loop through read number for (i = 0; i < a[r].n; ++i) { pair64_t key; mem_alnreg_t *e = &a[r].a[i]; key.x = e->rb < l_pac? 
e->rb : (l_pac<<1) - 1 - e->rb; // forward position + key.x = (uint64_t)e->rid<<32 | (key.x - bns->anns[e->rid].offset); key.y = (uint64_t)e->score << 32 | i << 2 | (e->rb >= l_pac)<<1 | r; kv_push(pair64_t, v, key); } @@ -267,7 +270,7 @@ int mem_sam_pe(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac, co mem_mark_primary_se(opt, a[1].n, a[1].a, id<<1|1); if (opt->flag&MEM_F_NOPAIRING) goto no_pairing; // pairing single-end hits - if (a[0].n && a[1].n && (o = mem_pair(opt, bns->l_pac, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { + if (a[0].n && a[1].n && (o = mem_pair(opt, bns, pac, pes, s, a, id, &subo, &n_sub, z)) > 0) { int is_multi[2], q_pe, score_un, q_se[2]; // check if an end has multiple hits even after mate-SW for (i = 0; i < 2; ++i) { diff --git a/main.c b/main.c index a64fd5f..07584b9 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8+dev-r475" +#define PACKAGE_VERSION "0.7.8-r705-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From f2b7d67ed9cddeb6c6479f370449215effbb4920 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 13 Apr 2014 12:51:44 -0400 Subject: [PATCH 493/498] output extra debugging information --- bntseq.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bntseq.c b/bntseq.c index b63dff4..403d088 100644 --- a/bntseq.c +++ b/bntseq.c @@ -403,6 +403,10 @@ uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, in *beg = *beg > far_beg? *beg : far_beg; *end = *end < far_end? 
*end : far_end; seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); + if (seq == 0 || *end - *beg != len) { + fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", + __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end); + } assert(seq && *end - *beg == len); // assertion failure should never happen return seq; } From 836d46423962b150372f0a95162f7796ba243238 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 14 Apr 2014 09:55:55 -0400 Subject: [PATCH 494/498] r713: a bug in retrieving ref seq on rev --- bntseq.c | 4 ++-- main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bntseq.c b/bntseq.c index 403d088..eddae84 100644 --- a/bntseq.c +++ b/bntseq.c @@ -397,8 +397,8 @@ uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, in far_end = far_beg + bns->anns[*rid].len; if (is_rev) { // flip to the reverse strand int64_t tmp = far_beg; - far_beg = (bns->l_pac<<1) - 1 - far_end; - far_end = (bns->l_pac<<1) - 1 - tmp; + far_beg = (bns->l_pac<<1) - far_end; + far_end = (bns->l_pac<<1) - tmp; } *beg = *beg > far_beg? *beg : far_beg; *end = *end < far_end? *end : far_end; diff --git a/main.c b/main.c index 07584b9..5f31591 100644 --- a/main.c +++ b/main.c @@ -4,7 +4,7 @@ #include "utils.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.8-r705-dirty" +#define PACKAGE_VERSION "0.7.8-r713-dirty" #endif int bwa_fa2pac(int argc, char *argv[]); From 421bf265fb4b75d348892038d9ce8173cb0c1dea Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Mon, 21 Apr 2014 18:52:13 +0200 Subject: [PATCH 495/498] Removed all of the compilation problems and warnings. 
--- bwape.c | 4 ++-- bwase.c | 2 +- bwase.h | 3 ++- bwaseqio.c | 1 + bwtaln.h | 5 ++++- bwtpssm.c | 3 --- bwtpssm.h | 1 + bwtpssmgap.c | 4 +--- kseq.h | 6 ++---- pssm.c | 2 +- seq2pssm.c | 4 ++-- 11 files changed, 17 insertions(+), 18 deletions(-) diff --git a/bwape.c b/bwape.c index 2902642..63203ce 100644 --- a/bwape.c +++ b/bwape.c @@ -43,9 +43,9 @@ typedef struct { extern int g_log_n[256]; // in bwase.c static kh_b128_t *g_hash; -void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); +void bwa_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); void bwa_pssm_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); +void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s); int bwa_approx_mapQ(const bwa_seq_t *p, int mm); int bwa_pssm_approx_mapQ(const bwa_seq_t *p, int mm); void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); diff --git a/bwase.c b/bwase.c index 6fb97c7..835db14 100644 --- a/bwase.c +++ b/bwase.c @@ -21,7 +21,7 @@ int g_log_n[256]; float exp2f(float e); double exp2(double e); -void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) +void bwa_aln2seq_core(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) { int i, cnt, best; if (n_aln == 0) { diff --git a/bwase.h b/bwase.h index 26a9f68..3f1d2fa 100644 --- a/bwase.h +++ b/bwase.h @@ -16,9 +16,10 @@ extern "C" { // Refine the approximate position of the sequence to an actual placement for the sequence. void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); // Backfill certain alignment properties mainly centering around number of matches. 
- void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); + void bwa_aln2seq(int n_aln, bwt_aln1_t *aln, bwa_seq_t *s); // Calculate the end position of a read given a certain sequence. int64_t pos_end(const bwa_seq_t *p); + void adjust_pssm_score(const bntseq_t *bns, bwa_seq_t *seq, float prior); // bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); diff --git a/bwaseqio.c b/bwaseqio.c index 7f83a37..bd6c6f6 100644 --- a/bwaseqio.c +++ b/bwaseqio.c @@ -3,6 +3,7 @@ #include "bwtaln.h" #include "utils.h" #include "bamlite.h" +#include "seq2pssm.h" #include "kseq.h" KSEQ_DECLARE(gzFile) diff --git a/bwtaln.h b/bwtaln.h index d8daeb4..d92cdcb 100644 --- a/bwtaln.h +++ b/bwtaln.h @@ -2,6 +2,7 @@ #define BWTALN_H #include +#include #include "pssm.h" #include "bwt.h" @@ -39,12 +40,12 @@ typedef struct { bwtint_t w; int bid; + float min_drop; } bwt_width_t; typedef struct { uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; bwtint_t k, l; - int score; float pssm_score; float posterior_p; char pssm; //indicate whether this alignment was made with a PSSM search @@ -113,6 +114,8 @@ typedef struct { #define BWA_MODE_BAM_READ2 0x100 #define BWA_MODE_IL13 0x200 +#define ERROR_MODEL_LENGTH 128 // Encompases all the possible quality scores and then some + typedef struct { int s_mm, s_gapo, s_gape; float p_gapo, p_gape, p_del, p_snp; diff --git a/bwtpssm.c b/bwtpssm.c index e3769e9..889939a 100644 --- a/bwtpssm.c +++ b/bwtpssm.c @@ -251,7 +251,6 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) qualprobs = phred_ascii_quality_scores(qbase); // initialization - g_visited = 0; ks = bwa_open_reads(opt->mode, fn_fa); { // load BWT @@ -309,8 +308,6 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) bwa_free_read_seq(n_seqs, seqs); fprintf(stderr, "[bwa_pssm_core] %d sequences have been processed.\n", tot_seqs); } - fprintf(stderr, "g_visited: %lu\n", 
g_visited); - free(mc->p); free(mc->powers); free(mc->counts); diff --git a/bwtpssm.h b/bwtpssm.h index fcef371..5993295 100644 --- a/bwtpssm.h +++ b/bwtpssm.h @@ -13,6 +13,7 @@ extern "C" { bwa_seq_t *bwa_read_pssm_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual, Probs *mc, float *qualprobs,const gap_opt_t *opt); void bwa_cal_pssm_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt); + bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); /* rgoya: Temporary clone of aln_path2cigar to accomodate for bwa_cigar_t, __cigar_op and __cigar_len while keeping stdaln stand alone */ diff --git a/bwtpssmgap.c b/bwtpssmgap.c index 54611d0..d5218ef 100644 --- a/bwtpssmgap.c +++ b/bwtpssmgap.c @@ -292,7 +292,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const int curr_threshold; min_score = -INT_MAX; - g_visited++; visited++; gap_pop(gp_heap, mat->id, &e); // get the best entry @@ -304,7 +303,6 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const break; } - int max_entries = 0; //fprintf(stderr, "pssm #1 id:%d %d \t[%d][%d,%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%lu]\t[%lu,%lu]\t%d\t[%6d, **%6d**, %6d, %6d]\n", mat->id, i, max_entries, gp_heap->empty_left, a, i, seq[i], "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos, curr_score, e.score_offset, mat->thresholds[i], mat->bi[i]); m = max_diff - (e.n_mm + e.n_gapo); @@ -379,7 +377,7 @@ bwt_aln1_t *bwt_match_pssm(bwt_t *const bwt, int len, const ubyte_t *seq, const memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); } p = aln + n_aln; - p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->a = a; + p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; p->k = k; p->l = l; p->score = score; p->pssm_score = curr_score; diff --git a/kseq.h b/kseq.h index f037098..1905aa6 100644 --- a/kseq.h +++ b/kseq.h @@ -179,7 +179,7 @@ typedef struct __kstring_t { 
#define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ - int c; \ + int c, i, j; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ @@ -205,7 +205,7 @@ typedef struct __kstring_t { seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+' && c != &) return seq->seq.l; /* FASTA */ \ + if (c != '+' && c != '&') return seq->seq.l; /* FASTA */ \ if (c == '+') { \ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ seq->qual.m = seq->seq.m; \ @@ -217,7 +217,6 @@ typedef struct __kstring_t { for (i = 0; i < 4; i++) \ seq->scores[i] = (float *) realloc(seq->scores[i], seq->seq.l* sizeof(float)); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ if (seq->qual.m > 0) { \ @@ -243,7 +242,6 @@ typedef struct __kstring_t { } \ seq->last_char = 0; /* we have not come to the next header line */ \ return seq->seq.l; \ - } #define __KSEQ_TYPE(type_t) \ diff --git a/pssm.c b/pssm.c index 2d055b4..c3618a4 100644 --- a/pssm.c +++ b/pssm.c @@ -137,7 +137,7 @@ PSSM init_matrix_score(int order, int length, int alphabet_size, int *scores, in char errormsg[160]; sprintf(errormsg, "Mismatch between list size (%i) and size calculated from order, length and alphabet size (%i).", nScores, pssm->offsets[length]); - fprintf(stderr,errormsg); + fprintf(stderr, "%s\n", errormsg); return NULL; } diff --git a/seq2pssm.c b/seq2pssm.c index fe72035..80a5707 100644 --- a/seq2pssm.c +++ b/seq2pssm.c @@ -154,7 +154,7 @@ PSSM string_to_pssm(ubyte_t *seq, int len, int alphsize, float match, float mism } /* This function makes a matrix */ - mat = init_matrix_score(0, len, alphsize+1, base, nScores, -0.5); + mat = init_matrix_score(0, len, alphsize+1, base, nScores, 0); return mat; } @@ 
-528,7 +528,7 @@ int mismatch_threshold(PSSM mat, int M) { int order = mat->order; int scorediff[MAXPSSMSIZE], t, hscore; int *scores = mat->scores; - const int infty = 1.e-100; + const int infty = 0; hscore = 0.; for (i = 0; i < mat->length; ++i) From 6333c4435fb87856b0f1b2a92eeb9810c32ac1e4 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Tue, 22 Apr 2014 09:23:55 +0200 Subject: [PATCH 496/498] Added SAI_magic to bwtpssm.c and removed the mismatch number setting --- bwtpssm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bwtpssm.c b/bwtpssm.c index 889939a..1d1a5d1 100644 --- a/bwtpssm.c +++ b/bwtpssm.c @@ -263,6 +263,7 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) mc = markov_chain(bwt->L2, 4); // core loop + err_fwrite(SAI_MAGIC, 1, 4, stdout); fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_pssm_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual, mc, qualprobs, opt)) != 0) { tot_seqs += n_seqs; @@ -418,6 +419,8 @@ int bwa_pssm(int argc, char *argv[]) opt->prior = 0.8; } + /* The maximum number of mismatches is set to 30 when using PSSMs + * if (opt->fnr > 0.0) { int i, k; for (i = 17, k = 0; i <= 250; ++i) { @@ -426,6 +429,8 @@ int bwa_pssm(int argc, char *argv[]) k = l; } } + + */ bwa_pssm_core(argv[optind], argv[optind+1], opt); free(opt); return 0; From 46d725c57877b3dd9687078ebbd402bab82ab3b0 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Tue, 22 Apr 2014 09:56:39 +0200 Subject: [PATCH 497/498] Changed fwrite to err_fwrite --- bwtpssm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bwtpssm.c b/bwtpssm.c index 1d1a5d1..37825fc 100644 --- a/bwtpssm.c +++ b/bwtpssm.c @@ -264,7 +264,7 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) // core loop err_fwrite(SAI_MAGIC, 1, 4, stdout); - fwrite(opt, sizeof(gap_opt_t), 1, stdout); + err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); while ((seqs = bwa_read_pssm_seq(ks, 0x40000, 
&n_seqs, opt->mode, opt->trim_qual, mc, qualprobs, opt)) != 0) { tot_seqs += n_seqs; t = clock(); @@ -301,8 +301,8 @@ void bwa_pssm_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) fprintf(stderr, "[bwa_pssm_core] write to the disk... "); for (i = 0; i < n_seqs; ++i) { bwa_seq_t *p = seqs + i; - fwrite(&p->n_aln, 4, 1, stdout); - if (p->n_aln) fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); + err_fwrite(&p->n_aln, 4, 1, stdout); + if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); } fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); From ac940a3cb44dcd4aa8e1503411ab04c97fcbcc5f Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Tue, 22 Apr 2014 13:05:01 +0200 Subject: [PATCH 498/498] Read in PSSM matrices without crashing --- kseq.h | 2 +- pssm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kseq.h b/kseq.h index 1905aa6..4d09b43 100644 --- a/kseq.h +++ b/kseq.h @@ -193,7 +193,7 @@ typedef struct __kstring_t { seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@' && c != '&') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ diff --git a/pssm.c b/pssm.c index c3618a4..8d78a22 100644 --- a/pssm.c +++ b/pssm.c @@ -42,7 +42,7 @@ PSSM init_matrix(int order, int length, int alphabet_size){ // Check if longer than max-length of PSSMs if(length >= MAXPSSMSIZE) { - fprintf(stderr,"Matrix is to long."); + fprintf(stderr,"Matrix is too long: %d", length); return NULL; }