From f00ea478e683fea438b9613124324c6a30b72c64 Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <gmarcais@cs.cmu.edu> Date: Tue, 20 Mar 2018 15:35:02 -0400 Subject: [PATCH 1/8] Distribute 'swig/python/jellyfish.py' --- swig/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/swig/Makefile.am b/swig/Makefile.am index 7daa5cf2..e71804f1 100644 --- a/swig/Makefile.am +++ b/swig/Makefile.am @@ -22,6 +22,7 @@ BUILT_SOURCES += $(PYTHON_BUILT) if PYTHON_DEPRECATED pythonglobaldir = $(PYTHON_SITE_PKG) pythonglobal_SCRIPTS = swig/python/jellyfish.py +EXTRA_DIST += $(pythonglobal_SCRIPTS) endif pythonextdir = $(PYTHON_SITE_PKG)/dna_jellyfish pythonext_SCRIPTS = swig/python/__init__.pyc From c615c6d66725667d2f113f3bbc795aef5d295dbb Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <Guillaume Marcais guillaume@marcais.net> Date: Wed, 28 Mar 2018 18:48:43 -0400 Subject: [PATCH 2/8] No warning error in SWIG or script compilation. --- Makefile.simple | 40 ---------------------------------------- swig/Makefile.am | 4 ++++ 2 files changed, 4 insertions(+), 40 deletions(-) delete mode 100644 Makefile.simple diff --git a/Makefile.simple b/Makefile.simple deleted file mode 100644 index 0e1730b7..00000000 --- a/Makefile.simple +++ /dev/null @@ -1,40 +0,0 @@ -# This Makefile is meant to be run in the traget directories. There -# name start with a '_' by convention and contain a config.h. - -CC = g++ -CPPFLAGS = -Wall -Werror -g -O2 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -I. -# Uncomment following to use SSE -# CPPFLAGS += -msse -msse2 -DSSE -LDFLAGS = -lpthread -TARGETS = jellyfish -# old TARGETS = mer_counter dump_stats dump_stats_compacted hash_merge -SOURCES = $(patsubst %,%.cc,$(TARGETS)) mer_count_thread.cc - -.SECONDARY: - -#VPATH=..:../lib - -%.d: %.cc - $(CC) -M -MM -MG -MP $(CPPFLAGS) $< > $@.tmp - sed -e 's/$*.o/& $@/g' $@.tmp > $@ && rm $@.tmp - -all: $(TARGETS) - -jellyfish: dump_stats_compacted.o square_binary_matrix.o hash_merge.o storage.o misc.o mer_count_thread.o mer_counter.o - -# compare_dump3: compare_dump3.o -# dump_stats_compacted: dump_stats_compacted.o square_binary_matrix.o -total_mers_single_fasta: total_mers_single_fasta.o -mer_counter_C: mer_counter_C.o - -# hash_merge: hash_merge.o storage.o misc.o -# mer_counter: mer_counter.o mer_count_thread.o storage.o misc.o -# dump_mers: dump_mers.o storage.o misc.o -dump_stats: dump_stats.o storage.o misc.o - -clean: - rm -f $(TARGETS) *.o *.d - -# Is the following OK? Shouldn't we include more than that? -include $(SOURCES:.cc=.d) --include local.mk diff --git a/swig/Makefile.am b/swig/Makefile.am index e71804f1..b6d5660a 100644 --- a/swig/Makefile.am +++ b/swig/Makefile.am @@ -1,6 +1,7 @@ # SWIG SWIG_SRC = swig/jellyfish.i swig/hash_counter.i swig/hash_set.i \ swig/mer_dna.i swig/mer_file.i swig/string_mers.i +SWIG_CXXFLAGS = -Wno-error if HAVE_SWIG SWIG_V_GEN = $(swig_v_GEN_$(V)) @@ -29,6 +30,7 @@ pythonext_SCRIPTS = swig/python/__init__.pyc pythonext_LTLIBRARIES = swig/python/_dna_jellyfish.la swig_python__dna_jellyfish_la_SOURCES = swig/python/swig_wrap.cpp $(SWIG_SRC) swig_python__dna_jellyfish_la_CPPFLAGS = $(PYTHON_CPPFLAGS) -I$(srcdir)/include +swig_python_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS) swig_python__dna_jellyfish_la_LDFLAGS = -module swig_python__dna_jellyfish_la_LIBADD = libjellyfish-2.0.la CLEANFILES += $(PYTHON_BUILT) $(pythonext_SCRIPTS) @@ -49,6 +51,7 @@ rubyextdir = $(RUBY_EXT_LIB) rubyext_LTLIBRARIES = swig/ruby/jellyfish.la swig_ruby_jellyfish_la_SOURCES = swig/ruby/swig_wrap.cpp $(SWIG_SRC) swig_ruby_jellyfish_la_CPPFLAGS = $(RUBY_EXT_CFLAGS) -I$(srcdir)/include +swig_ruby_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS) swig_ruby_jellyfish_la_LDFLAGS = -module swig_ruby_jellyfish_la_LIBADD = libjellyfish-2.0.la CLEANFILES += $(RUBY_BUILT) @@ -63,6 +66,7 @@ perlext_SCRIPTS = swig/perl5/jellyfish.pm perlext_LTLIBRARIES = swig/perl5/jellyfish.la swig_perl5_jellyfish_la_SOURCES = swig/perl5/swig_wrap.cpp $(SWIG_SRC) swig_perl5_jellyfish_la_CPPFLAGS = $(PERL_EXT_CPPFLAGS) -I$(PERL_EXT_INC) -I$(srcdir)/include +swig_perl5_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS) swig_perl5_jellyfish_la_LDFLAGS = -module swig_perl5_jellyfish_la_LIBADD = libjellyfish-2.0.la CLEANFILES += $(PERL_BUILT) From 6f9ee226a3bfd06aa0c4f6c4d232f4336af25a31 Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <gmarcais@cs.cmu.edu> Date: Wed, 28 Mar 2018 18:53:16 -0400 Subject: [PATCH 3/8] Added missing jellyfish.py --- swig/python/jellyfish.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 swig/python/jellyfish.py diff --git a/swig/python/jellyfish.py b/swig/python/jellyfish.py new file mode 100644 index 00000000..ab988134 --- /dev/null +++ b/swig/python/jellyfish.py @@ -0,0 +1,4 @@ +import warnings + +warnings.warn("Module 'jellyfish' is deprecated. Use 'dna_jellyfish'", DeprecationWarning) +import dna_jellyfish as jellyfish From 667ef2dc8d4444d3a72190ef674ed280d9272b10 Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <Guillaume Marcais guillaume@marcais.net> Date: Thu, 29 Mar 2018 17:15:24 -0400 Subject: [PATCH 4/8] Check for posix_memalign or aligned_alloc --- configure.ac | 3 +++ lib/rectangular_binary_matrix.cc | 21 +++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index aed45e19..8e7b826c 100644 --- a/configure.ac +++ b/configure.ac @@ -156,6 +156,9 @@ AM_COND_IF([PERL_BINDING], [AS_IF([test x$enable_perl_binding != xyes], [PERL_EXT_LIB=$enable_perl_binding])] [AX_PERL_EXT([$prefix])]) +# Check for aligned memory allocation functions +AC_CHECK_FUNCS([posix_memalign aligned_alloc]) + AC_OUTPUT diff --git a/lib/rectangular_binary_matrix.cc b/lib/rectangular_binary_matrix.cc index b932d1c9..c2e39469 100644 --- a/lib/rectangular_binary_matrix.cc +++ b/lib/rectangular_binary_matrix.cc @@ -14,6 +14,7 @@ along with Jellyfish. If not, see <http://www.gnu.org/licenses/>. */ +#include <cstdlib> #include <new> #include <stdexcept> #include <vector> @@ -21,6 +22,23 @@ #include <assert.h> #include <jellyfish/rectangular_binary_matrix.hpp> +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#ifdef HAVE_POSIX_MEMALIGN +inline int __mem_align(void** memptr, size_t alignement, size_t size) { + return posix_memalign(memptr, alignement, size); +} +#elif HAVE_ALIGNED_ALLOC +inline int __mem_align(void** memptr, size_t alignment, size_t size) { + *memptr = aligned_alloc(alignment, size); + return memptr == NULL; +} +#else +#error No function to allocate aligned memory +#endif + uint64_t *jellyfish::RectangularBinaryMatrix::alloc(unsigned int r, unsigned int c) { if(r > (sizeof(uint64_t) * 8) || r == 0 || c == 0) { std::ostringstream err; @@ -31,8 +49,7 @@ uint64_t *jellyfish::RectangularBinaryMatrix::alloc(unsigned int r, unsigned int // Make sure the number of words allocated is a multiple of // 8. Necessary for loop unrolling of vector multiplication size_t alloc_columns = (c / 8 + (c % 8 != 0)) * 8; - // if(posix_memalign(&mem, sizeof(uint64_t) * 2, alloc_columns * sizeof(uint64_t))) - if(!(mem = aligned_alloc(sizeof(uint64_t) * 2, alloc_columns * sizeof(uint64_t)))) + if(__mem_align(&mem, sizeof(uint64_t) * 2, alloc_columns * sizeof(uint64_t))) throw std::bad_alloc(); memset(mem, '\0', sizeof(uint64_t) * alloc_columns); return (uint64_t *)mem; From 591d4d1aec41154578a366d5dd8b7baf96454350 Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <gmarcais@cs.cmu.edu> Date: Fri, 30 Mar 2018 13:23:21 -0400 Subject: [PATCH 5/8] Bump version to 2.2.10 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 8e7b826c..e4fc8b68 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([jellyfish], [2.2.9], [gmarcais@umd.edu]) +AC_INIT([jellyfish], [2.2.10], [gmarcais@umd.edu]) AC_CANONICAL_HOST AC_CONFIG_MACRO_DIR([m4]) AM_INIT_AUTOMAKE([subdir-objects foreign parallel-tests color-tests]) From 21ca89987a2ae70800afe7d1bff172d0d5fa4945 Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <gmarcais@cs.cmu.edu> Date: Tue, 3 Apr 2018 15:33:41 -0400 Subject: [PATCH 6/8] Build extension from the tarball with setup.py, extconf.rb and Makefile.PL --- swig/Makefile.am | 5 +++-- swig/perl5/Makefile.PL | 28 ++++++++++++++++++++-------- swig/python/setup.py | 38 +++++++++++++++----------------------- swig/ruby/extconf.rb | 24 +++++++++++++++++++----- 4 files changed, 57 insertions(+), 38 deletions(-) diff --git a/swig/Makefile.am b/swig/Makefile.am index b6d5660a..49692685 100644 --- a/swig/Makefile.am +++ b/swig/Makefile.am @@ -40,7 +40,7 @@ pythonc_v_GEN_0 = @echo " PYTHONC " $@; %/__init__.pyc: %/dna_jellyfish.py $(PYTHONC_V_GEN)$(PYTHON) -c 'import py_compile, sys; py_compile.compile(sys.argv[1], sys.argv[2])' $< $@ swig/python/dna_jellyfish.py: swig/python/swig_wrap.cpp -EXTRA_DIST += $(PYTHON_BUILT) +EXTRA_DIST += $(PYTHON_BUILT) swig/python/setup.py endif # Ruby support @@ -55,6 +55,7 @@ swig_ruby_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS) swig_ruby_jellyfish_la_LDFLAGS = -module swig_ruby_jellyfish_la_LIBADD = libjellyfish-2.0.la CLEANFILES += $(RUBY_BUILT) +EXTRA_DIST += swig/ruby/extconf.rb endif # Perl5 support @@ -71,5 +72,5 @@ swig_perl5_jellyfish_la_LDFLAGS = -module swig_perl5_jellyfish_la_LIBADD = libjellyfish-2.0.la CLEANFILES += $(PERL_BUILT) swig/perl5/jellyfish.pm: swig/perl5/swig_wrap.cpp -EXTRA_DIST += $(PERL_BUILT) +EXTRA_DIST += $(PERL_BUILT) swig/perl5/Makefile.PL endif diff --git a/swig/perl5/Makefile.PL b/swig/perl5/Makefile.PL index f8461f9c..db18249b 100644 --- a/swig/perl5/Makefile.PL +++ b/swig/perl5/Makefile.PL @@ -4,17 +4,29 @@ use strict; use warnings; use ExtUtils::MakeMaker; -mkdir('lib'); -system('swig', '-c++', '-perl5', '-o', 'jellyfish_wrap.cxx', '../jellyfish.i'); +sub pkgconfig { + my ($cmd) = @_; + my $res = `pkg-config $cmd`; + if($?) { + print(STDERR "\nCan't get compilation information for Jellyfish, check you PKG_CONFIG_PATH\n"); + exit(1); + } + chomp $res; + return $res; +} -my $jf_cflags = `pkg-config --cflags jellyfish-2.0`; -my $jf_libs = `pkg-config --libs jellyfish-2.0`; -my $jf_rpath = `pkg-config --libs-only-L jellyfish-2.0 | sed -e 's/-L/-Wl,-rpath,/g'`; +my $pkg = "jellyfish-2.0"; +my $jf_cflags = pkgconfig("--cflags $pkg"); +my $jf_libs = pkgconfig("--libs $pkg"); +my $jf_rpath = pkgconfig("--libs-only-L $pkg"); +$jf_rpath =~ s/-L/-Wl,-rpath,/g; + +print("$jf_cflags\n$jf_libs\n$jf_rpath\n"); WriteMakefile(NAME => 'jellyfish', - CC => 'g++-4.4', - LD => 'g++-4.4', + CC => 'g++', + LD => 'g++', CCFLAGS => "-std=c++0x $jf_cflags", LIBS => "$jf_libs", - OBJECT => 'jellyfish_wrap.o', + OBJECT => 'swig_wrap.o', VERSION => '0.0.1'); diff --git a/swig/python/setup.py b/swig/python/setup.py index 0177ef51..dadde60c 100644 --- a/swig/python/setup.py +++ b/swig/python/setup.py @@ -1,36 +1,28 @@ #! /usr/bin/env python import os +import sys import re from distutils.core import setup, Extension -# Don't use Extension support for swig, it is more annoying than -# anything! (1)It does not support very well having the .i in a -# different directory. (2)It runs swig every single time, even if it -# ran successfully before. So (3)doing first build, then install as -# root rebuilds the modules as root. -swig_time = os.path.getmtime('../jellyfish.i') -older = True -try: - older = os.path.getmtime('jellyfish_wrap.cxx') < swig_time or os.path.getmtime('jellyfish.py') < swig_time -except FileNotFoundError: - older = True +def pkgconfig(s): + pipe = os.popen("pkg-config {}".format(s)) + res = pipe.read().rstrip().split() + ret = pipe.close() + if ret is not None: + sys.stderr.write("\nCan't get compilation information for Jellyfish, check you PKG_CONFIG_PATH\n") + exit(1) + return res -if older: - print("Running swig: swig -c++ -python -o jellyfish_wrap.cxx ../jellyfish.i") - os.system("swig -c++ -python -o jellyfish_wrap.cxx ../jellyfish.i") - -jf_include = [re.sub(r'-I', '', x) for x in os.popen("pkg-config --cflags-only-I jellyfish-2.0").read().rstrip().split()] -jf_cflags = os.popen("pkg-config --cflags-only-other").read().rstrip().split() - -jf_libs = [re.sub(r'-l', '', x) for x in os.popen("pkg-config --libs-only-l jellyfish-2.0").read().rstrip().split()] -jf_libdir = [re.sub(r'-L', '', x) for x in os.popen("pkg-config --libs-only-L jellyfish-2.0").read().rstrip().split()] +jf_libs = [re.sub(r'-l', '', x) for x in pkgconfig("--libs-only-l jellyfish-2.0")] +jf_include = [re.sub(r'-I', '', x) for x in pkgconfig("--cflags-only-I jellyfish-2.0")] +jf_cflags = pkgconfig("--cflags-only-other jellyfish-2.0") +jf_libdir = [re.sub(r'-L', '', x) for x in pkgconfig("--libs-only-L jellyfish-2.0")] jf_rpath = [re.sub(r'^', '-Wl,-rpath,', x) for x in jf_libdir] -jf_ldflags = os.popen("pkg-config --libs-only-other jellyfish-2.0").read().rstrip().split() - +jf_ldflags = pkgconfig("--libs-only-other jellyfish-2.0") jellyfish_module = Extension('_dna_jellyfish', - sources = ['jellyfish_wrap.cxx'], + sources = ['swig_wrap.cpp'], include_dirs = jf_include, libraries = jf_libs, library_dirs = jf_libdir, diff --git a/swig/ruby/extconf.rb b/swig/ruby/extconf.rb index ea4aa2e5..5c430dd2 100644 --- a/swig/ruby/extconf.rb +++ b/swig/ruby/extconf.rb @@ -2,11 +2,25 @@ require 'mkmf' -swig = find_executable('swig') -system(swig, '-c++', '-ruby', '-o', 'jellyfish_wrap.cxx', '../jellyfish.i') +def pkgconfig(s) + res = `pkg-config #{s}` + if $? != 0 + STDERR.puts("\nCan't get compilation information for Jellyfish, check you PKG_CONFIG_PATH\n"); + exit(1) + end + res.chomp +end -$defs << `pkg-config --cflags jellyfish-2.0`.chomp << '-std=c++0x' -$libs << `pkg-config --libs jellyfish-2.0`.chomp -$libs << `pkg-config --libs-only-L jellyfish-2.0 | sed -e 's/-L/-Wl,-rpath,/g'`.chomp +$defs << pkgconfig("--cflags jellyfish-2.0") << '-std=c++0x' +libs = [pkgconfig("--libs jellyfish-2.0"), + pkgconfig("--libs-only-L jellyfish-2.0").gsub(/-L/, "-Wl,-rpath,")] + +if Array === $libs + $libs += libs +elsif String === $libs + $libs += " " + libs.join(" ") +else + $libs = libs.join(" ") +end create_makefile('jellyfish') From af5904b2d3f570d841cd4a08d5b2090d14279b00 Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <gmarcais@cs.cmu.edu> Date: Tue, 3 Apr 2018 15:44:25 -0400 Subject: [PATCH 7/8] Updated installation documentation of extensions. --- swig/Readme.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/swig/Readme.md b/swig/Readme.md index 37d21ca3..a429fffc 100644 --- a/swig/Readme.md +++ b/swig/Readme.md @@ -7,6 +7,41 @@ output files and use the Jellyfish hash within these scripting languages, which is much more convenient than in C++, although somewhat slower. +Installation +============ + +See the installation instructions on the +[main page](../../tree/master) on how to install the bindings at the +same times as Jellyfish. + +Additionally, it is possible to install the bindings from the +distribution tarball. If Jellyfish is already installed (that is +`pkg-config --exists jellyfish-2.0 && echo OK` prints `OK`), then the +bindings can be installed as follows. (`$PREFIX` should be set to the +desired installation location) + +Python: +```Shell +cd swig/python +python setup.py build +python setup.py install --prefix=$PREFIX +``` + +Ruby: +```Shell +cd swig/ruby +ruby extconf.rb +make +make install prefix=$PREFIX +``` + +Perl 5: +```Shell +cd swig/perl5 +perl Makefile.PL SITEPREFIX=$PREFIX +make +make install +``` Examples ======== From d7fa74a94fb82fdd24d9bca29a8fc801843160ad Mon Sep 17 00:00:00 2001 From: Guillaume Marcais <gmarcais@cs.cmu.edu> Date: Mon, 16 Apr 2018 08:44:09 -0400 Subject: [PATCH 8/8] Added doc/Readme.md --- doc/Readme.md | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 334 insertions(+) create mode 100644 doc/Readme.md diff --git a/doc/Readme.md b/doc/Readme.md new file mode 100644 index 00000000..8609a4e0 --- /dev/null +++ b/doc/Readme.md @@ -0,0 +1,334 @@ +# Getting started + + +## Counting all k-mers + + +The basic command to count all k-mers is as follows: + +``` +jellyfish count -m 21 -s 100M -t 10 -C reads.fasta +``` + +This will count canonical (`-C`) 21-mers (`-m 21`), using a hash with +100 million elements (`-s 100M`) and 10 threads (`-t 10`) in the +sequences in the file `reads.fasta`. The output is written in the file +`mer counts.jf` by default (change with `-o` switch). + +To compute the histogram of the k-mer occurrences, use the +`histo subcommand (see section [histo](#histo)): + +``` +jellyfish histo mer_counts.jf +``` + +To query the counts of a particular k-mer, use the +`query` subcommand (see section [query](#query)): + +``` +jellyfish query mer_counts.jf AACGTTG +``` + +To output all the counts for all the k-mers in the file, use the +`dump` subcommand (see section [dump](#dump)): + +``` +jellyfish dump mer_counts.jf > mer_counts_dumps.fa +``` + +To get some information on how, when and where this jellyfish file was +generated, use the `info` subcommand (see section [info](#info)): + +``` +jellyfish info mer_counts.jf +``` + +For more detail information, see the relevant sections in this +document. All commands understand `--help` and will produce some +information about the switches available. + +### Counting k-mers in sequencing reads + +In sequencing reads, it is unknown which strands of the DNA is +sequenced. As a consequence, a k-mer or its reverse complement are +essentially equivalent. The canonical representative of a k-mer `m` is +by definition `m` or the reverse complement of `m`, whichever comes +first lexicographically. The `-C` switch instructs to save in the hash +only canonical k-mers, while the count is the number of occurrences +of both a k-mer and it reverse complement. + +The size parameter (given with `-s`) is an indication of the number +k-mers that will be stored in the hash. For sequencing reads, this +size should be the size of the genome plus the k-mers generated by +sequencing errors. For example, if the error rate is e (e.g.Illumina +reads, usually e~1%), with an estimated genome size of G and +a coverage of c, the number of expected k-mers is G+Gcek. + +> NOTE: unlike in Jellyfish 1, this `-s` parameter is only +> an estimation. If the size given is too small to fit all the k-mers, +> the hash size will be increased automatically or partial results will +> be written to disk and finally merged automatically. Running +> `jellyfish merge` should never be necessary, as now +> jellyfish now takes care of this task on its own. + +If the low frequency k-mers (k-mers occurring only once), which are +mostly due to sequencing errors, are not of interest, one might +consider counting only high-frequency k-mers (see +section [Counting high frequency k-mers](#Counting-high-frequency-k-mers)), +which uses less memory and is potentially faster. + +### Counting k-mers in a genome + +In an actual genome or finished sequence, a k-mer and its reverse +complement are not equivalent, hence using the `-C` switch does not +make sense. In addition, the size for the hash can be set directly to +the size of the genome. + +## Counting high-frequency k-mers + +Jellyfish offers two way to count only high-frequency k-mers (meaning +only k-mers with count >1), which reduces significantly the memory +usage. Both methods are based on using Bloom filters. The first method +is a one pass method, which provides approximate count for some +percentage of the k-mers. The second method is a two pass method which +provides exact count. In both methods, most of the low-frequency +k-mers are not reported. + +### One pass method + +Adding the `--bf-size` switch make jellyfish first insert all k-mers +first into a Bloom filter and only insert into the hash the k-mers +which have already been seen at least once. The argument to +`--bf-size` should the total number of k-mer expected in the data set +while the `--size` argument should be the number of $k$-mers occurring +more than once. For example: + +``` +jellyfish count -m 25 -s 3G --bf-size 100G -t 16 homo_sapiens.fa +``` + +would be appropriate for counting 25-mers in human reads at 30x +coverage. The approximate memory usage is 9 bits per k-mer in the +Bloom filter. + +The count reported for each k-mer (by `jellyfish dump` or `jellyfish +query`) is one less than the actual count. Meaning, the count 1 k-mer +are not reported, count 2 k-mer are reported to have count 1, etc. + +The drawback of this method is some percentage of the $k$-mer that +should not be reported (because they occur only once) are +reported. This is due to the random nature of the Bloom filter data +structure. The percentage is <1% by default and can be changed with +the `--bf-fp switch. + +### Two passes method + +In the two pass method, first a Bloom counter is created from the +reads with `jellyfish bc`. Then this Bloom counter is given to the +`jelllyfish count` command and only the k-mers which have been seen +twice in the first pass will be inserted in the hash. For example, +with a human data set similar that in +section [One pass method](#One-pass-method): + +``` +jellyfish bc -m 25 -s 100G -t 16 -o homo_sapiens.bc homo_sapiens.fa +jellyfish count -m 25 -s 3G -t 16 --bc homo_sapiens.bc homo_sapiens.fa +``` + +The advantage of this method is that the counts reported for the +k-mers are all correct. Most count 1 k-mer are not reported, except +for a small percentage (set by the false positive rate, `-f` switch of +the `bc` subcommand) of them which are reported (correctly with count +1). All other k-mers are reported with the correct count. + +The drawback of this method is that it requires to parse the entire +reads data set twice and the memory usage of the Bloom counter is +greater than that of the Bloom filter (slightly less than twice as +much). + +# FAQ + +## How to read compressed files (or other format)? + +Jellyfish only reads FASTA or FASTQ formatted input files. By reading +from pipes, jellyfish can read compressed files, like this: + +``` +zcat *.fastq.gz | jellyfish count /dev/fd/0 ... +``` + +or by using the `<()` redirection provided by the shell (e.g. bash, zsh): + +``` +jellyfish count <(zcat file1.fastq.gz) <(zcat file2.fasta.gz) ... +``` + +## How to read multiple files at once? + +Often, jellyfish can parse an input sequence file faster than `gzip` +or `fastq-dump` (to parse SRA files) can output the sequence. This +leads to many threads in jellyfish going partially unused. Jellyfish +can be instructed to open multiple file at once. For example, to read +two short read archive files simultaneously: + +``` +jellyfish count -F 2 <(fastq-dump -Z file1.sra) <(fastq-dump -Z file2.sra) ... +``` + +Another way is to use "generators". First, create a file containing, +one per line, commands to generate sequence. Then pass this file to +jellyfish and the number of generators to run +simultaneously. Jellyfish will spawn subprocesses running the commands +passed and read their standard output for sequence. By default, the +commands are run using the shell in the SHELL environment variable, +and this can be changed by the `-S` switch. Multiple generators will +be run simultaneously as specified by the `-G` switch. For example: + +``` +ls *.fasta.gz | xargs -n 1 echo gunzip -c > generators +jellyfish count -g generators -G 4 ... +``` + +The first command created the command list into the 'generators' file, +each command unzipping one FASTA file in the current directory. The +second command runs jellyfish with 4 concurrent generators. + +## How to reduce the output size? + +The output file was design to be easy to read, but the file generated +can be rather large. By default, a 4 bytes counter value is saved for +every k-mer (i.e.a maximum count of over 4 billion). Instead, a +counter size of 2 bytes or 1 byte can be used with the switch +`--out-counter-len`, which reduces significantly the output size. + +The count of k-mers which cannot be represented with the given number +of bytes will have a value equal to the maximum value that can be +represented. Meaning, if the counter field uses 1 byte, any +k-mers with count greater or equal to 255 will be reported of having +a count of 255. + +Also, low frequency and high frequency k-mers can be skipped using the +`--L and `--U` switches respectively. Although it might be more +appropriate to filter out the low frequency $k$-mers using Bloom +filters, as shown in +section [Counting-high-frequency-k-mers](#Counting-high-frequency-k-mers). + +## How much memory is needed? + +The memory needed to count k-mers, given the various parameters of the +`count` subcommand is obtained by using the +`mem` subcommand. It understand all the same switches, but +only the `--mer-len` (`-m`), `--size` (`-s`), +`--counter-len` (`-c`) and `--reprobes` (`-p`) +switches are taken into account. + +For example, for 24-mers, with an initial hash size of 1 billion, and +default parameters otherwise, the memory usage is: + +``` +jellyfish mem -m 24 -s 1G +4521043056 (4G) +``` + +Conversely, if the `--size` switch is not given but the +`--mem` switch is, then the maximum initial hash size that would +fit in the given memory is returned. For example, this is the maximum +hash size for 31-mers in 8 Gb of RAM: + +``` +jellyfish mem -m 31 --mem 8g +1073741824 (1G) +``` + +# Subcommands + + +## histo + +The `histo` subcommand outputs the histogram of k-mers +frequencies. The last bin, with value one above the high setting set +by the `h` switch (10000 by default), is a catch all: all k-mers with +a count greater than the high setting are tallied in that one bin. If +the low setting is set (`-l` switch), then the first bin, with value +one below the low setting, is also similarly a catch all. + +By default, the bins with a zero count are skipped. This can be changed +with the `f` switch. + +## dump + +The `dump` subcommand outputs a list of all the k-mers in +the file associated with their count. By default, the output is in FASTA +format, where the header line contains the count of the k-mer and the +sequence part is the sequence of the k-mer. This format has the +advantage that the output contains the sequence of $k$-mers and can be +directly fed into another program expecting the very common FASTA +format. A more convenient column format (for human beings) is selected +with the `-c` switch. + +Low frequency and high frequency k-mers can be skipped with the `-L` +and `-U` switches respectively. + +In the output of the `dump` subcommand, the k-mers are sorted +according to the hash function used by Jellyfish. The output can be +considered to be "fairly pseudo-random". By "fairly" we mean that NO +guarantee is made about the actual randomness of this order, it is +just good enough for the hash table to work properly. And by +"pseudo-random" we mean that the order is actually deterministic: +given the same hash function, the output will be always the same and +two different files generated with the same hash function can be +merged easily. + +## query + +The `query` subcommand outputs the k-mers and their counts for some +subset of k-mers. It will outputs the counts of all the k-mers +passed on the command line or of all the k-mers in the sequence read +from the FASTA or FASTQ formatted file passed to the switch +`-s` (this switch can be given multiple times). + +## info + +The `info` subcommand outputs some information about the jellyfish +file and the command used to generated it, in which directory and at +what time the command was run. Hopefully, the information given should +be enough to rerun jellyfish under the same conditions and reproduce +the output file. In particular, the `-c` switch outputs the command, +properly escaped and ready to run in a shell. + +The header is saved in JSON format and contains more information than is +written by the default. The full header in JSON format can be written +out using the `-j` switch. + +## merge + +The `merge` subcommand is of little direct use with version version 2 +of Jellyfish. When intermediary files were written to disk, because +not all k-mers would fit in memory, they can be merged into one file +containing the final result with the `merge` subcommand. The `count` +subcommand will merge intermediary files automatically as needed. + +## stats + +The `stats` subcommand computes some statistics about the +mers. Although these statistics could be computed from the histogram, it +provides quick summary information. The fields are: + +* _Unique_: The number of k-mer occuring exactly once +* _Distinct_: The number of k-mers, ignoring their multiplicity + (i.e.the cardinality of the set of $k$-mers) +* _Total_: The number of k-mers with multiplicity (i.e.the sum of the + number of occurence of all the mers) +* _Max\_count_: The maximum of the number of occurrences + +## mem + +The `mem` subcommand shows how much memory a count subcommand will +need or conversely how large of a hash size will fit in a given amount +of memory. + +## cite + +The `cite` subcommand prints the citation for the jellyfish +paper. With the `-b`, it is formatted in Bibtex format. How +convenient!