From b6e6bd1065682cf7bff3ce8d4cfedd89dc744c00 Mon Sep 17 00:00:00 2001 From: kbseah Date: Tue, 3 Mar 2015 15:32:00 +0100 Subject: [PATCH] Added functions winnow and winnow.mark to R package, updated manual --- README.md | 43 ++++++- extract_PE_reads_from_sam_and_fastq.pl | 161 ------------------------- genome.bin.tools_1.2.tar.gz | Bin 0 -> 13273 bytes genome_bin_tools.r | 34 +++++- 4 files changed, 72 insertions(+), 166 deletions(-) delete mode 100755 extract_PE_reads_from_sam_and_fastq.pl create mode 100644 genome.bin.tools_1.2.tar.gz diff --git a/README.md b/README.md index 84c444b..6af83d2 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Contact: Brandon Seah (kbseah@mpi-bremen.de) -Cite: Brandon Seah (2014), genome-bin-tools, Online: https://github.com/kbseah/genome-bin-tools +Cite: Brandon Seah (2014,2015), genome-bin-tools, Online: https://github.com/kbseah/genome-bin-tools ## 0. Introduction @@ -36,6 +36,8 @@ Use your favorite assemblier, like IDBA-UD (http://i.cs.hku.hk/~alse/hkubrg/proj BBmap is convenient because it calculates length and GC along with the coverage per scaffold, and outputs them all to the same file. If you use a different mapping tool, you will have to aggregate all these data together in a tab-separated table with the same header names as the bbmap output. +NOTE: Newer versions of BBmap (after 33.x) insert a comment character `#` in the header line of the covstats file. Remove this character before running the scripts. + For differential coverage binning, you will need a second read library from a different sample where at least some of the genomes present in the first sample should also be present. Map the second read library onto the same assembly and save the coverage values as a different file: ``` @@ -144,7 +146,23 @@ If you see a cluster of scaffolds which you would like to save as a bin, you can `bin` is now an object of class genomestatsbin. Type the name of the bin object to see a summary of the bin. If you imported the marker, SSU, and/or tRNA data, a summary of how many of each are contained in the bin will be reported (this is useful if the marker genes are typical single-copy genes, for example). -### 3c. Fishing for connected contigs using Fastg files (experimental) +### 3c. Taking subsets of genomic bins + +If you want to get a subset of scaffolds based on GC, coverage, or length, use the function winnow() + +```R + > bin2 <- winnow(d,cov=c(200,Inf)) # Return the subset of contigs in d which have coverage above 200 + > bin2 <- winnow(d,gc=c(0.2,0.5),cov=c(100,200),len=c(1000,Inf)) # Return the subset of scaffolds in d that have GC between 20-50%, coverage 100 to 200, and length above 1000 +``` + +If you want to get a subset of scaffolds containing marker genes that belong to a particular taxon, use the function winnow.mark() + +```R + > bin3 <- winnow.mark(d,param="Class",value="Gammaproteobacteria") # Returns all scaffolds containing a marker gene whose value for "Class" is "Gammaproteobacteria +``` +The functions winnow() and winnow.mark() work for both genomestats and diffcovstats objects + +### 3d. Fishing for connected contigs using Fastg files (experimental) Fastg files are generated by newer versions of the SPAdes assembler, and contain contig connectivity information generated during the assembly process. These can be useful, e.g. to "fish" scaffolds that are from the same genome but which were inadvertently left out of the interactively chosen bin. @@ -154,8 +172,8 @@ For now you will have to manually edit the `genome_bin_tools.r` file to specify Fish out a new bin from an old bin using a Fastg file: ```R - > bin2 <- fastg_fishing.genomestatsbin(d,bin1,"path/to/fastg/file.fastg") - > bin2 <- fastg_fishing.genomestatsbin(d,bin1,"path/to/fastg/file.fastg", save=TRUE,file="bin2.scaffolds.list") # Save list of scaffolds in new bin to external file + > bin4 <- fastg_fishing.genomestatsbin(d,bin1,"path/to/fastg/file.fastg") + > bin4 <- fastg_fishing.genomestatsbin(d,bin1,"path/to/fastg/file.fastg", save=TRUE,file="bin2.scaffolds.list") # Save list of scaffolds in new bin to external file ``` You can compare the two bins by plotting them overlaid: @@ -207,3 +225,20 @@ Identical syntax to `fastg_fishing.genomestats`: > Bin2 <- fastg_fishing.genomestats(D,Bin1,"/path/to/file.fastg") ``` +## 5. Mapping and reassembly + +Once you have your final bin, either the shortlist of scaffolds in that bin was exported with the save parameter when that bin was defined, or you can use the native R function write(): + +```R + > write(as.character(bin1$scaff$ID),file="shortlist.file") # for genomestatsbin object + > write(as.character(bin2$diffcov$ID),file="shortlist2.file") # for diffcovstatsbin object +``` + +This gives you a list of scaffold names in your bin. Use a tool like faSomeRecords to retrieve the corresponding Fasta sequence records. Use a mapper like BBmap to get reads that map to those scaffolds, and then reassemble with your favorite assembler. + +``` + $ faSomeRecords original_assembly.fasta shortlist.file shortlist.fasta + $ bbmap.sh ref=shortlist.fasta outputunmapped=f outm=reads_for_reassembly.fastq.gz in=original_reads.fastq +``` + +Good luck! diff --git a/extract_PE_reads_from_sam_and_fastq.pl b/extract_PE_reads_from_sam_and_fastq.pl deleted file mode 100755 index 2a35f10..0000000 --- a/extract_PE_reads_from_sam_and_fastq.pl +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/perl - -## Extract Illumina reads that map to target in SAM file and extract dangling ends from Fastq file for reassembly -## Contact: kbseah@mpi-bremen.de -## Version 2 -- KBS 2014-10-23 -- Added output reporting, option to specify length of read header suffixes, changed indents to spaces -## Version 1 -- KBS 2014-09-23 -- Fixed Fastq header recognition -## Version 0 -- KBS 2014-03-18 - -## HEAD ############################################################################################################# - -use strict; -use warnings; -use Getopt::Long; -use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ; - -my $targetlistfile; # List of scaffolds for which to extract the mapped reads (thereof) -my $samfile; # SAM file of the mapping from the readfile to the scaffolds contained in the targets list -my $readfile; # Fastq-formatted reads -my $is_gzip = 0; # Flag for gzipped read input -my $suffix_length=0; # Length of suffix (in characters) appended by user to the Illumina read headers, e.g. _1 and _2 to distinguish forward and reverse reads -my $targetscaffolds_counter=0; -my $targetheaders_counter=0; -my $scannedreads_counter=0; -my $foundreads_counter=0; -my %targets_hash; -my %reads_hash; - -## MAIN ############################################################################################################# - -if (@ARGV == 0) { usage(); } - -GetOptions ("target|t=s" => \$targetlistfile, - "sam|s=s" => \$samfile, - "reads|r=s" => \$readfile, - "gzipped|g" => \$is_gzip, - "suffix|x=i" => \$suffix_length) -or die("Error in command line arguments\n"); - -print STDERR "Reading list of target scaffolds\n"; -read_target_file(); -print STDERR "Found $targetscaffolds_counter target scaffolds in list\n"; -print STDERR "Getting list of reads that map to targets from SAM file\n"; -scan_sam_file(); -print STDERR "Found $targetheaders_counter read headers to be extracted\n"; -if ($is_gzip == 1) { - print STDERR "The input reads are gzipped (option -g|--gzipped was specified)\n"; - filter_fq_gz_file(); -} -else { filter_fq_file(); } - -## SUBROUTINES ###################################################################################################### - -sub usage { - print STDERR "\n"; - print STDERR "Read extractor for reassembly\n"; - print STDERR "Version 2 - Brandon Seah - 2014-10-23 \n"; - print STDERR " \t Find reads that map to specified targets in SAM file, and then extract dangling ends from Fastq file for reassembly \n"; - print STDERR " \t Assumes that Fastq headers are produced by Illumina machine and start with \@HWI\n"; - print STDERR "\n"; - print STDERR "Usage: \n"; - print STDERR " \$ perl extract_PE_reads_from_sam_and_fastq.pl [options] -t -s -r \n"; - print STDERR "\n"; - print STDERR "Options: \n"; - print STDERR " \t --target|-t FILE Plain text list of target scaffolds (one scaffold name per line, no \"\>\") for which to extract the mapped reads \n"; - print STDERR " \t --sam|-s FILE SAM file with mapping of reads to scaffolds\n"; - print STDERR " \t --reads|-r FILE Fastq or gzipped Fastq formatted reads from which to extract those which map to the target scaffolds\n"; - print STDERR " \t --gzipped|-g Input reads are gzipped (default: no)\n"; - print STDERR " \t --suffix|-x INT Input read header names have a suffix added of length INT (e.g. \_1 or \_2) (default = 0)\n"; - print STDERR " \n"; - print STDERR "Use hyphen if streaming to STDIN, e.g.: \n"; - print STDERR " \$ samtools view BAMFILE | perl extract_PE_reads_from_sam_and_fastq.pl --target TARGET --sam - --reads READS | gzip > output.fq.gz \n"; - print STDERR "\n"; - exit; -} - -sub read_target_file { # Read file containing list of scaffolds to extract the reads that map thereto -open (TARGETSINPUT, "< $targetlistfile") or die ("Cannot open target list file: $!\n"); -while () { - chomp; - if (!exists $targets_hash{$_}) { - $targets_hash{$_} = 1; - $targetscaffolds_counter++; - } -} -close (TARGETSINPUT); -} - -sub scan_sam_file { # Scan SAM file of mapping to find header names of reads that should be extracted - open (SAMINPUT, "< $samfile") or die ("Cannot open SAM file: $!\n"); - while () { - chomp; - if ($_ =~ /^@/) { next; } - my @theline = split('\t',$_); - if ( exists $targets_hash{$theline[2]} ) { - my $header_nosuffix = substr $theline[0], 0, 0-$suffix_length; # Chops off the last $suffix_length characters from the read header name - $reads_hash{$header_nosuffix} = 1; - $targetheaders_counter++; - } - } - close (SAMINPUT); -} - -sub filter_fq_file { # Filter input Fastq file to output reads that map to the target scaffolds -my $writeflag = 0; -open (FASTQ, "< $readfile") or die ("Cannot open Fastq file: $!\n"); -while () { - chomp; - if ($_ =~ /^\@HWI/) { # Important! @ character alone will also match Phred quality score symbol and will cause quality lines to be misread as header lines. @HWI assumes Illumina output - $scannedreads_counter++; - if ($scannedreads_counter > 0 && $scannedreads_counter%500000==0) { # Output a progress message every 500000 reads - print STDERR "Scanned $scannedreads_counter reads from input Fastq file\n"; - } - my $readname = substr $_, 1, 0-$suffix_length; # Remove any suffix characters that have been added to the read headers, and the @ character at beginning of Fastq header line - if (exists $reads_hash{$readname}) { - $writeflag = 1; - print STDOUT $_, "\n"; - $foundreads_counter++; - } - else { $writeflag = 0; } - } - elsif ( $writeflag == 1) { print STDOUT $_, "\n"; } - else { next; } -} -close(FASTQ); -} - -sub filter_fq_gz_file { # Filter input gzipped Fastq file -my $writeflag = 0; -my $theinput = IO::Uncompress::Gunzip->new( $readfile ) or die "Cannot open Fastq.gzip file: $GunzipError\n"; -while (<$theinput>) { - chomp; - if ($_ =~ /^\@HWI/) { - $scannedreads_counter++; - if ($scannedreads_counter>0 && $scannedreads_counter%500000==0) { - print STDERR "Scanned $scannedreads_counter reads from input Fastq.gz file\n"; - } - my $readname = substr $_, 1, 0-$suffix_length; # Remove any suffix characters that have been added to the read headers, and the @ character at beginning of Fastq header line - if (exists $reads_hash{$readname}) { - $writeflag = 1; - print STDOUT $_, "\n"; - $foundreads_counter++; - } - else { $writeflag = 0; } - } - elsif ( $writeflag == 1) { print STDOUT $_, "\n"; } - else { next; } -} -close($theinput); -} - -#sub gunzip_example { -# my $inputfile = $ARGV[0]; # specify file to be unzipped -# my $counter = 0; -# my $theinput = IO::Uncompress::Gunzip->new( $inputfile ) or die "Gunzip failed: $GunzipError\n"; -# while (<$theinput>) { -# chomp; -# my @theline = split('\t',$_); -# if ($theline[1] == 161) { $counter = $counter + 1; } -# } -# print STDOUT $counter, "\n"; -#} \ No newline at end of file diff --git a/genome.bin.tools_1.2.tar.gz b/genome.bin.tools_1.2.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..5a30458aa39c3379a3bd0632abdcf9b7c504893b GIT binary patch literal 13273 zcmV;~GbYR*iwFP!000001MPkNd)i3S;QV_06&;e9z@CgjHs-m^kbN_DChO-iNuDIL zdwY|d9{~+$BP3Q5wwb)%zx}GJyQOY@0JbqsGGccekovo;s_Uz(CfpBZyftwBRvZLg zw14va-SOMgH}6khzxr-VpEi7s4iE9)?!iF^|F!LoP0#Vu?RCEE*oVFD!D0KT*Zr>D z=^nHXzhmw1cJNt5u@k~jJ`E!7Ov}s7oN(q2zx&ocZ=B&LXTqPbNd^#V@0W9UEZ^7O zyRip9PyLvO&M^)=qdd4BdH5-Mq(M z**FN#{&_s2gnF`?QQ zvj5p_-+%S&*XQrvJUf2Ay*}3AfBT?^_-`L}4(+ae0Qi5{>s9!_lMla~2Vwlii2)`2 zM*aW&{`(Whn@^o5A3y%jdb9Rj_4&3xIsAXSzx6&4#e3auIsUiX^87#OK}o>>gWgf^ zJJzl6|33DAI1Pe`&J^EnTR)tbA*PsoS`ix`!&^Hp721- zt)Wdb{c6rv|Iwu8LjLcC8}ddp7r$L!zNQ|qi5B%h&l;DH{V4=U3Tt6{r0U7v500#b1s}?VS`$#%$)~+hf z;|rkw;^l40|9bl0ImpuggI=$q|GTCC+Gbbee@En>M)>G0AD_KAw$x8W_BSK?ROe1e zz6KB##s3r0|1EL-o9ll%-3KRx5khU5T=-6FayxzWPCwRFMy{k82pP5V-}3r5Lr%U zk`m${i0kPfmlASk)QX*f#~Z|B7z|D__NEo_dB6T4u3J;6z{CFgX4;UwaYJ@#Lr{Xo zGjd|5H4dE_Z=9a=#W;b#vQi0OJxusK!T+hpk{f)<{YgAk55=#hSVbrigfl0m6`b*K zLhFYk07e;pxW0Ig+cyq_8KYT6YM6G(Fl}R)wtA?Zh-u2Lz$U0?Tp5B4+ZrtU_0yBO zh=b6R?OhvCrVaV#Dz}^p z+h{G5+VFJkY?7Mmrtp5}rlFAmnv(NwKj6HfTtttEv_vv#d!A#=_3H<15+|EoP0pM)5D|kt8Df zdNKJ-Z^|xh%HFsscn+YvRBWYAZr%s6!|x*aG}~Kp~TQUNs$i)8i%C{YY+ibB|ho6@#O*-`2o+W@~NPv8W-PE8>@AC!oGL? z@6+bR=}Uibq&}V9Df=zu4Bw4i^eN=*-0>D1$CN3J)(d&?TR5tr6A76^lO7V$WHFz6 zAekh+E`0ajd9Gjfps{>P?s@5c;umftyc40w0AwE+5|uO>&9aNKNnm~Sr*eqEZs#8g zr9ir(@+~!-sX=UcNqJ=bCFg(nh@_%sMi46KE7?84a7iF2+`!`fQzBTVZREPyc0Lao zv9j7GsO~e6nO|oi-4qP(5zu4a+};4OA5x?Q3m?&J%(w$cBxUA!>JVVgfv@C#%o>02 zFyInUr@@79E^r1^C1);7Pul#RvUan%o$(*zuvwg0W*o;tVOL}fF>?Kh$2S;czt1{b z7-iWI#W?nc!F*|q^D2)KC}3v<`EqPPsbf=QP?#>I_+m6Dn*deLz;VIgNs!G7@#Vu8 zveJg*&q>U=8Kk}HBuOAwuEL{4oXGe>Z67s8rY>QDTv0Ojsbj9G- z2%78;#)J}UdH`p@L2gs}Ucs6+zAcg4JgF@mRaPHjtd^z}DO$z-vgIvQj}*6T-=Jlp zT!MjEr7TH7A)_P7?!*l=B(>EWpea!L=2=Ta5wL5N&<+bRTB3+Z5RGC6!QNyPNSOK> zg-i7~eVuNkj|t_|F=;9vq-4myZbv1(DI~w9TPuM6O!(fjJg1v+SAi26ohcY^O4srwyp$G8gGVbpGogO^!@_lxPZK(6 zDV3Std#X3f@H&ONEmdn~!9>qyK9*95_(xZzmFdw?DWSWcRK6!g^8eV0VjyO2GzHvj zZ9yBj&i}95J#GJ8^wrI22a_)o0i@Qj|FlJ!OT)JBY2hBbc-L{uFR&UqI5nK&RRAOq7cD1*+{q zlY8QwSo^sXMnDacltKNH@@|pL@Z0EYst0osXkv`%67!*3y5|3-m@x;I;Jsu$!U(o!+5c(f{4k ze{-|%ko@b;<{9dr8N;=Re{tQ4_WucKzwo8M2IbGVyMOD1znuP~3OaOR-tstn#U=29 z@DA^P9@+=F`#*;V75(4U{ugYvY5tQmQwZOdZq<_JK%}D)_oGFmRKdEqpZE&4gjztn z%Qm~ZRgJ*B!-yWFc%W8A*bz1bJQO!XAB9(r_Cn}? zh5MN90@xR*-xQi>p}UF$Kpp3jJhr66nQq;|dXGdcc}u74HD22J*vx2&r@;jb&4<$o zyKq7{pL}mgX2c+j`G^gG?)z$vNW{1>$OZEjHM{AsU|TLri=^WcWBUi0VnzE?Clae1 zM6o`M=&xcR&LwqUG7j2&)yvHkYOknyw1L7BsXh}p#v5Xddfk$m{dGgwXb`YDAqS*k zLfJ;}hXxJmRLs&G$L~5ot(d>Ch!|kiB)CN68zEpLsT%#2SS{gKEu4dK zXsQ`;^etv-dCmB-ol!UQAeyK4R{z&3!&h_&0vnPAJ&qX2U!40R!Z{9@Rdw28AD9M_)!OzsM1dc z-rRWIPM}RDdXQ>(^$9%im^nB5K;iX^q6Mz=@LeX=3F$4;)p9@qY^eWVMA2-E#fTg zrU{J3&{;&p-VK0nhm$FGdDLRBk=Y_&N@S^|{zoVViJ!_k_KBXL0F=&fT@Py6OOB2o zKVhdMV1C@OI~14k62~yb12Y&_WNICCpHVe%XB|J z^?^U9faW~iHQ|wH38gL#PzWn@cMsM%hxos^{aJ2wv0QySu=BrpwYR)J}%z%h^cZCsk`M1T>#S}Wtfr0!fd ziseGp#kG_j$8pKgu{#Ola!2R?R>HsBfdB7x>?;1tuFwD5hJP`es`&qJfdAi0_?NZu zU*H$zf9V``yS+}A@_!sv`Je6q|92SvrQim@okV{TXMRcSmp^UTm%>;XRY;gBBuo_& zrV0sDg@ma>!hB01VYVJ#rHY19MMK#j8j7ZL`Z8jnWF+(|6iW3ekpJ`Uv$xa%uCxEO z?QH%ZOa))b|GW6)H+z-$ciwM0amx{Rs)e90-G7v%k#KifdbWnV08YAqIS}H-ycJE6 z!vFm9ug;v!MI!a4@FrC)$q{Pp(2a6(Or~#ViHrD|plEmr{-Jb4PmG$<-)x|q<Kjl+&{Ry zDbbfbm-n0Rbt9lLhqexGt}2JQ>QjFHXEI^leg228-ObtmI#vAloqRHzZ5;lqGrbn1 zaP@+rgLGb;+A7UylkQ0I5R+2r*<4YKBhTePi0i{e7^2FNZlbZbM4p7Ry-K2b&O!-L{Zn=-21kxoYHgg`-tXCTXx~ zlF{_}*F{5%@E_NIn@K>|+5dWNJIDVYRrf!4ga6sh&IJ7etc-*4xZhSRj0*V0l+vYN zhaXAwCns`;!v8H=t-hKJrQJ|+I3jyA={)3f(sao0rkG)gU^PiWei?htfP3lzUfcv8 zbU5Y0N_P%SSOAQzxTwh;z*xR!c*DT!(^V&!oEJ>`^-KJA3ElL`mkQ%2TX<&%+3F6w zS{A>u;Z>{EQu=UW*=a>dfvjoClROoLxhsTDmp$#v{m>b?$si;zGQ6_!7J7Z-csvGP za@fELaC8M5cnfqE3qp({24C=A5^1j=vJ*Z=XH4zJ?TcfOrKkzEwc;JAuh98rlOVW{vtAkE3B# zBq~p?+tL+B)lzy*m|B1{#Ug_hcGvpg8*DH6v}r@nj?`37zRC5K8KYdOhBNvXP!f9C zfDLK%+U7mLhwRxLgauVTtq&cf`>X{hw+bHG>bx5A%YO9lP_L;1jd$8@i=a1sB{E9> z1>cxJKXY8aFM`38n5pA{IQo#jQX};I`NhShbP56A40$yeahJ*qDZ>kls}TxVPH?Pk&cDV`;s>!{b^5nlC2 zxBL~<50VlDB6C|SGSRKErfv6(zQCd=L+~= zRy@;T2>kGfHOv>5Xtr5EAy@&4Fe^Wj>QVrOcIzl166v02ieHajtT@CZy#S6*fW((Y zA#f0PLld&@3G%#`E2S?L0lo-yL7+l|iV|=hS%e69cYt)!Z(luQIDF4pwy@?XSPWz( zy^cHaVYyod%Hgd9oxErLJPcotl1L^7I-&1J4x0oQY=WsIUPVz{4WUtaF^=8 zuS4mFUx5-wZoQVIy-=B3afS?Z@^RSLuc<1%Nej@pEAr&-`1D`nxu#7VZzX-0G z3PLw=VixH4C%`mCoRri(6?7HqjVt zB7LrxJd;h?NbbzZoD%8_t)Jfc~UIs=F((dl9W>{ zkRn&hkV+t{JD4YMY zeblbvKkWwp3pYDM0<_aZ2|1DoNZwLnJ%(h2^$ce2AHWSFx=Tel3w(uPXoJPVs*w2e1?Lzn+@GKzc*tP@(*- z@jrwu*`@>NTK@kq=l|WS?tkqF|8tvdBEOPhhfdt6zs>|{)T@2)6GuBLd+d1LljIR~ z1E`A8(vm`~QUBn-6E|z~Mdg*-3VZJ#|L^$xhvPnX{@Ydj=X=8c-1)yT34oe94XDuj zYsUXA8i3cI{|EW|AJzH4L;No_0I$S-tyXpVf5Z5H=l);W`(N;&Tb=*+ga6k%{>_Tj z>HjU@zucScX#m#h|2nz+4|cCQ|9A2!+Uy+uYo79y^AFB(^zEn{b8eRb($7%@AUQ5AJY#z{pE2#{gF!q0}z#lP*2mq z2tK0z59+qg9GylqpiUEm`RHPUd?+Xjg`xRgHu4&;?2GVg4CqiKvSGUu6&?<=*}F2~ z`HN#lv5(TbSri-_NL#<3i;KE6ViPKFo+ z{|AfEevRB?e=e*y>u4JguYb-8N%CGGb$9L(KOZZ;5gZ)i%Wpki?#{ zbYC*B^qt8&s_MIwf2;3M-k%lg+NG84pXha_c7BJ=H|zH}EAHR@CW~B>;^>!r!VG9H z25>Cm7px2zTE>vAER}!~50H(xs$ErQ`gYELx=^@_{O`c+Yv;dxg!$hOj;i}VJ3Rke zD(kP#1k8bU&XNn?lm)F{f8hXYJ`aPK2Lqz4u2a9+;cuS(S%k);-bL496^MKw1F{#L z`+Al*{4Fcd1UQV5)#+;;J(Pjih=gu+`lrqslt?7d8Z}egXg6h5`^Q4L$uTUtLlNbn z;ewTE-hka;H{v&Ne2>(a#rekMV>3FvygA+P(~2}dVvn>Ce?ErbPS@-n57_E_tv)wD z|7VWBzvVvgkneT7rTRbk$n*cOE&Tru4-Vk`?^fsk{p`PpwM+S64Ihxvo$!A>4E81T zzr7cCb?8YAdTsuuO z5diBa!%L@ZUnXEqlDF7kQt28Sj2Ku!ii-`b;OE*eSj?e71*`m2$A8Gixy%*&>2oKi578ReidXSH|vK;kA2_ z><73h72+w)lK@YTfJ4x7BES_9H?j?SmWh+2_lv z&$U&a13xC5OcHBA>H{hDgSc8qVPtZ7B-kW$=D^CeYF9~jG*9UAU5t^!pRo7%I{@Ss ziGgHJK}<%7%bz5aVz3P^02=Z8jS-)Rd?-SFdK}llL5Y9>ys34TuHlscxd{)SVCqJh zv~n1PzDg#or@ElM9#L%eNF}q>C!y~JwEQxdplRj_JAvnG8~Q7bUnKe#+o+EiH&k!& z9ZoKrm5~u=T%4Mb!T|Y^MbJwN+ueG zF)yIqB?IAaw7~QvPLBN-@Y<@?{uwMNZ;68xY~%tA#YlZnaneqT4TzRh?9;x|oHa5@s~31)aDl%NuA(P31=4b8SD&TH zQO_V3B!fb_JoFYLY#^7x+hRN(wyso@dLmG0_kAWq9fi~?oVk_2nXeIng^JKcxwx@* z(o>pRRPTqv5Q|>MQ2V+)P<5ngNjGx8uRfnRvBN}MD{Xm;PqPt~m82_^e^(F#W5LQA zmy33d)}m-p)))qAzD7$JIjO8ETBU2Wr3n4$SP>FkZz6&F!qH+&3++;JDqC0&&HR?7URWmBXStoB*u){@GuOcJq%(XWg zMQQc&`b90>v|6q-!E^=^Q|%zEiqfMS$-f>49XPkea+xgp z&d((XT(XJJ>_49_MjhzTd}6b!@7dYe>sNqiuU|bsJ5zmP(l{UwKCirh?o0n;_yFHf z{j*m8bJ$h!KYBgX1-9)@2Z~ktpL@{%Xd7Oj{waEEayccBOLRXab&H)6?_2Yei(gct z_!0TZ#wnL9<1mlKYs2VCEa8`-57PH`N9v$jF_1L$^Uum1b;?k72U;g>JeAU^QaatQ z(y16Z%5_eK;^s9}P9^>AOyg86E>{YtZ&cw#`&FiSsWeQLhUv@GFzGB(p@J#HuwK!S z{v)=9Qd7JG%0({3c)k9ZT78L$ZrIY%~by7wo)11HlwKY*x{jT&-Ccsu|s6U+= zs!;#46|bL-^*`NiuaoM3@O`IuP~HF9rT%BFTTkJ`4IFw_dh={gp)*6lSsYey$PPn} zV!7d|BhKKQ%I1Y+lM00a(<2rg(q~FQH)O=9kaKl?n8ZwwA>vo!iy$?l0u6LVKvf2z z6S^Gz+(BR*i9p$j!XeQ@!Q@qnu9l|7G49E!FCEODBfi>}WPDqKiesW1@s0Au;MZoN z3Zl)+xdo-~t&`^?58^bM4E_T&>GrOll%5(VkoS<>X0IEq_yk2PM?P#w?E&`8+H3@KLf zAqqudfsClu#nMObOaNPe%XsH{=MLfRz=?oU&?hqC@fmaqOjCF!))@J{GdiV8N14EN z*)ZePL)7}mlE8cUd+(l`ed#J6aly8EJoRfl#p0cV!rAxU^MBOWJuo{W}9!?+YlTTCi)%M>-3F5OVj$l)*{ zmkY6c3f#F`epCM-=sxB&Kw?m`^p?o#MX-}-EKg4i^+aiZ>c&!{_bupGrwf{vqO?uK z$dwHyD6MNMN}D6hS=nBSe0e=s+9qlZ^GY!xdAgCd!4~Q$hYQb1J3z!xUE%;*w^2l} zwVDW!-a#TCFG4~Md=`;>zI9E*pIc3EL(YZOevA08#GEuTH^3@c@>cTS*oorF+1QPy z0I~TvfTJzTe{J3Vf6)65-X0xP_kVXN|E=8cM21^aYD>c0FD{*`!kPsC`havoBh4@I zu#nEEx^MCV&E4!593GLPoS63>Xmup)gC$K!W&}@kpirYoQFh&^l#axWE!nAj?4G;v z(h&I++e8eeW|$PCM55DTd3(IE#8 z^1XwT+4Jy_6naQxtTnrEva&GLHx3-(1bO-0yQXnpaZb2q8yoUBGK)vb6zB5`Xb!d% z0Jp>>{?|=x;%XW>gE#?}0J=>~;mHXfDvM6i(i6vWr7CU!=Jy}LD$41jK7DXS3F!s_w1XSs` zD!^^#fcS-sUxY9oyYL70lKaFZOT%vD{=ubafM{`vltD@_2~x1l%+cASw|19yx1Djj zI!9Y_weTDB%)TzoCf!9T;a0{j*liT>(T-CGzS64;vl#(%i;xK4>`ws>6jVqaZ+Rw4 zML96p&tPL+@2N!r5GaH79E$OV)|bWVdv4vFWqmJT`|)g!|Fz_yb|%$hdjqz|_Cgu? zPt36;kNuXiWC@(L8?t{ZGZ0IvN&2SA3r%Thw5O(Kwu)jnTSD|9`|agR3x5WS)E!A? zb}x?L&?p#U>YkRG!qUDrF@cHhD*IRU`EupIY#i8Y+XL6ie+NgY{C9YS^55Z6mH+wP znO&>l;RQx zs&h927SR2J?)BuP4wq!TQReze()N_kgz`OMSU&(Ku>jurv*!`tPlJ8J@G;BI>ccRk zn``p5ER0Y8&cg`CDPMG2ZG3nFe7<~Qw>yV>o%WvnzwpINcgX!ny?gQI<(`d?uWBa- zF%NvC!cW-k`90PU75-~BclQP%hdx>(-n<=SnVcr-CeD?{?-{&YmB;0)rvEKMCXkwL zb^xr)|J3XD68#@Nw>w96<^QpB`k!KVvf&ARC*Q^vao8>4nU#9|n>VfWBY}MDTXc7B z79(>TumLLUI0)8W z5PYKRx<2V0A22KhMDlT*+*kgDvFWG5g>=IwB^@ITz=)$*tMqo`AkdDHffFW@y?CS1 z)y>IjO1=xF^{Wzry7YK9W$FSlyaNBXb^T|p{=d_)Q~iGr<3IO0-AezzbNnxj>r(V= zeMFa4;au+c>Q7#6e7&1LhR%D#i$A&nz!`|}E$c4u=*h2A|El$OcCwdt^p$0JB`1Gt zCs*WZ<)xj>{fVNRF%-0+x&c$&fVtlrFv*n_aYz?lc+u(BZtlAnJ#PQL%e61O5a)KQ z>n_zrmYrT?dEv%?TMQ5%X`|<)l-PnIpt{|1Pq$mHd!xlj16A1ys?YB9Ke=$qx03=_ z%YU72_aM>#_d0F#|7st!tNbrJl>hSavlFRq^EbM$SV_R`ZApD^Fv_BH%?Jkn9K5G~F2`KIypTW<` zFm)P`V@RTJ0vDCMBPJ;Mg>gc;A1$J!u+Y3|3t%X70TnZH#0xS7$0Vg-=LXjgPIe7v963AKXbvWXo#eQ*S zE((Q!Lkk4)IUl+lqz`Rs5=e~+DtI-+YyO%_GDveQ5n~R+A5xlbT2?}Dy{XAicY-^{ zRWX<3#jy~}R=c-JjrHoLQjs7NR&jOYQj8OrwtMQ2Ty*cthoe-H12q+a2+bD;Q8abj zFj_qa5k_#+p5mfj_?j*f8lm^|OAr=!K|CmdJu;=*f?2{-PKbm50yvJB8u^ieZMxb* z?LxkcLno5zCV%X?!+7P&3;LxG6&Ykn)le5mCI$3Y%nGHHZ5AL0VGb~3&AKbHnC%H) zFnR;v!y|XZv-@kxJFxas zq%5$3=*y^TK4AkmXW8W*9ahUd=aNSzzL2}_dqhX!!wMb$NatS&^GrNesk%^$BtFmR zd#W@tj}<9+tTm2D2Q>bFpjV`pxPUE`)u_Sxi{rXhb}7pKv$(7}U-TMu03ewsK0-!f zh66uI28@6PuiQi!C%qG_k>CWR+?I7~-Gq&>1gaSd6<1(~JpUxXCE7p7Cw7ONOr`%Q zdfj5cR1uhy++ZCq6IDcLu5z$(QmYWQO~UV$@%B@g_GaLmx+R|Vn3@!bd=^2p^C+62LzY;;MCVIz_{tg!OJ zNS8@Df@%?;cP5}1zfd7beVDHowV zrHm-MXh}}T0O+vE2s_ad@c5E?iCj1moe<_ab38_0D=j915B+q)0FVA z@IU;(9=K70lH(|@m4|5#Csv=?ZOw6-=EyO`Z1nF? zZ%yn$bHCGWE0{@NiU7OpFJ?RVq2kEGKzZS_UOy>3v}sb}Y{gK$nysa7<(oD9%@r0k zn+ks6Zz2}sKbaOJ*e>Y}$-wFj?t;J>efd*Nl6))zT%&Hc5tI(-?;-u|(%&Qc+iTaG z1uZn{aO8yUWE$5svEP`F-c_?;@?*R>MRj~351Td8UayLP?7`;1zB_=rCWItm`ELp) z0)XMK0Wn5>1ANLM28Sa2pX9ZLoS&6W$;5JJdXkqH5)!Uqw1}6T_F?Qn(PKN)cDf_l zCTJB4(Arout!NIs%8`YN3d@|nD29L8+GSb2H1fJg%j`;?9qB_$iJlMS8Q0CCB{UKS z7ZRmKuAUgWLliD6pXoFN9%j@qpIfrs%}Ib+|55fvi-}9^){&*rfpotztz3^JeCZTG zu;urQ^H&Me@Iao|a@F)@!`H~H>2wx;H?w$xhR1|<?n;VM-(6z0Ft6D!jDS~QvOHzW; zJJbvJH|@$;$D+sOJA9}%xKp~pDHW-1&h5hfyFDl1HTGY-*G}VqbUGOSvE4beEBo(` z?Y~8?yBohKF#g_Agb-=_y_SowP>~kpWQ4AZQ!?I#**D{Zdp$+XeVBTe^@K^POO?N7 z`+ad-ppJn@In!-+@+y>N#uO|qyafe~BSF@?hKW~e@lRvn{YnhHm1VZF%o>*2Ke0`A zDXg5R)2)oLtBkR4(xj)^16?IwE=Z+VoKGhWX(cZ`ltrDw2t(Jm(X!i{Xuk;qYbNB{ z7VN7f>0&kWYBpi_mu6l4c8#lMSf@XI63kn$`8BDsyE(64Qwpz}GyH~8wv6lF)aoj3 zBI!0)da~N$iVcc!X#>;SHC4ICMW=#6Ec1HY5*@BK2&tG}m*2C{{Go~&`sbQvyC_f- zmurUQZ1Eqhl|)}GR*T>$XLU0dZ38=!+#O{uDY6l4WJOX7R$xKeM1}vQ*pD*HQCW|6 zX+7G+#=+pv#;RegF;(Ps^2N4` zyrCkyH|vcP#fFJwM+(dmE36QC3xryQ0_%g7ni>i=quB27?OPiP_5T+jMg$kz2>r21 z{Ewpp5&yf_JveOJN8Rt*og?^PrT^cZ{$Ff(qQF-_x1_N@7Cvh-sX2&dhI}1vbd4+5 zhO_+_361X9-dc3A~SKIAY1gYAZ=9zuq^A=$#9}ZKQh2tSgkr zUe1xqb9PST>9;W0$~Z9xGrTFNMm8ltNduNj_$lt+0m zM-9s97-Pm$jE$G4@7|LVfnF$Znaw*G5+Ws3*u!Ib{`e|W(k>}!i;^I`eRB#S$wwgL z{jIs!U;9juuJJcM7!1`4!f*p7hT0Gll5O{U-mkdsXic3+@F zF14B4B9i5sfpmk!bvvxz1LF=!<%Rdn#(&Yy?OnwGv~4?){|^s482_`~tKxs|T>ck~ znyCbk&+bnXpk2M$o>D-w^x}dLfAhD)0g(%3RszFyV{lG!Sm Xt@>1->QjBH4}AVV0|L9z0Hgr`1^Y?z literal 0 HcmV?d00001 diff --git a/genome_bin_tools.r b/genome_bin_tools.r index b790f27..04752f3 100644 --- a/genome_bin_tools.r +++ b/genome_bin_tools.r @@ -2,7 +2,8 @@ ## Tools for interactive genome binning in R -## Version 1 - 2014-10-28 - Converted previous version to object-oriented paradigm +## Version 1 - 2014-10-28 - Converted previous version to object-oriented paradigm +## Version 1.2 - 2015-03-03 - Added functions winnow and winnow.mark ## Contact: kbseah@mpi-bremen.de ## Required packages: @@ -185,6 +186,37 @@ plot.genomestatsbin <- function(x, ... ) { # inherit the same plot plot.genomestats (x, ...) } +winnow <- function(x,gc=c(0,1),cov=c(0,Inf),cov2=c(0,Inf),len=c(0,Inf),save=FALSE,file="bin_scaffolds.list") { + # "Winnow" a genomestats(bin) or diffcovstats(bin) object by GC% + # Only return those contigs which are in that GC range + if (class(x) == "genomestatsbin" | class(x) =="genomestats") { + scafflist <- as.character(x$scaff$ID[which(x$scaff$Ref_GC >gc[1] & x$scaff$Ref_GC < gc[2] & x$scaff$Avg_fold>cov[1] & x$scaff$Avg_foldlen[1] & x$scaff$Length gc[1] & x$diffcov$Ref_GC < gc[2] & x$diffcov$Avg_fold_1>cov[1] & x$diffcov$Avg_fold_1cov2[1] & x$diffcov$Avg_fold_2len[1] & x$diffcov$Length