From f00ea478e683fea438b9613124324c6a30b72c64 Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <gmarcais@cs.cmu.edu>
Date: Tue, 20 Mar 2018 15:35:02 -0400
Subject: [PATCH 1/8] Distribute 'swig/python/jellyfish.py'

---
 swig/Makefile.am | 1 +
 1 file changed, 1 insertion(+)

diff --git a/swig/Makefile.am b/swig/Makefile.am
index 7daa5cf2..e71804f1 100644
--- a/swig/Makefile.am
+++ b/swig/Makefile.am
@@ -22,6 +22,7 @@ BUILT_SOURCES += $(PYTHON_BUILT)
 if PYTHON_DEPRECATED
 pythonglobaldir = $(PYTHON_SITE_PKG)
 pythonglobal_SCRIPTS = swig/python/jellyfish.py
+EXTRA_DIST += $(pythonglobal_SCRIPTS)
 endif
 pythonextdir = $(PYTHON_SITE_PKG)/dna_jellyfish
 pythonext_SCRIPTS = swig/python/__init__.pyc

From c615c6d66725667d2f113f3bbc795aef5d295dbb Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <Guillaume Marcais guillaume@marcais.net>
Date: Wed, 28 Mar 2018 18:48:43 -0400
Subject: [PATCH 2/8] No warning error in SWIG or script compilation.

---
 Makefile.simple  | 40 ----------------------------------------
 swig/Makefile.am |  4 ++++
 2 files changed, 4 insertions(+), 40 deletions(-)
 delete mode 100644 Makefile.simple

diff --git a/Makefile.simple b/Makefile.simple
deleted file mode 100644
index 0e1730b7..00000000
--- a/Makefile.simple
+++ /dev/null
@@ -1,40 +0,0 @@
-# This Makefile is meant to be run in the traget directories. There
-# name start with a '_' by convention and contain a config.h.
-
-CC = g++
-CPPFLAGS = -Wall -Werror -g -O2 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -I.
-# Uncomment following to use SSE
-# CPPFLAGS += -msse -msse2 -DSSE
-LDFLAGS = -lpthread
-TARGETS = jellyfish 
-# old TARGETS = mer_counter dump_stats dump_stats_compacted hash_merge
-SOURCES = $(patsubst %,%.cc,$(TARGETS)) mer_count_thread.cc
-
-.SECONDARY:
-
-#VPATH=..:../lib
-
-%.d: %.cc
-	$(CC) -M -MM -MG -MP $(CPPFLAGS) $< > $@.tmp
-	sed -e 's/$*.o/& $@/g' $@.tmp > $@ && rm $@.tmp
-
-all: $(TARGETS)
-
-jellyfish: dump_stats_compacted.o square_binary_matrix.o hash_merge.o storage.o misc.o mer_count_thread.o mer_counter.o
-
-# compare_dump3: compare_dump3.o
-# dump_stats_compacted: dump_stats_compacted.o square_binary_matrix.o
-total_mers_single_fasta: total_mers_single_fasta.o
-mer_counter_C: mer_counter_C.o
-
-# hash_merge: hash_merge.o storage.o misc.o
-# mer_counter: mer_counter.o mer_count_thread.o storage.o misc.o
-# dump_mers: dump_mers.o storage.o misc.o
-dump_stats: dump_stats.o storage.o misc.o
-
-clean:
-	rm -f $(TARGETS) *.o *.d
-
-# Is the following OK? Shouldn't we include more than that?
-include $(SOURCES:.cc=.d)
--include local.mk
diff --git a/swig/Makefile.am b/swig/Makefile.am
index e71804f1..b6d5660a 100644
--- a/swig/Makefile.am
+++ b/swig/Makefile.am
@@ -1,6 +1,7 @@
 # SWIG
 SWIG_SRC = swig/jellyfish.i swig/hash_counter.i swig/hash_set.i	\
            swig/mer_dna.i swig/mer_file.i swig/string_mers.i
+SWIG_CXXFLAGS = -Wno-error
 
 if HAVE_SWIG
 SWIG_V_GEN = $(swig_v_GEN_$(V))
@@ -29,6 +30,7 @@ pythonext_SCRIPTS = swig/python/__init__.pyc
 pythonext_LTLIBRARIES = swig/python/_dna_jellyfish.la
 swig_python__dna_jellyfish_la_SOURCES = swig/python/swig_wrap.cpp $(SWIG_SRC)
 swig_python__dna_jellyfish_la_CPPFLAGS = $(PYTHON_CPPFLAGS) -I$(srcdir)/include
+swig_python_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS)
 swig_python__dna_jellyfish_la_LDFLAGS = -module
 swig_python__dna_jellyfish_la_LIBADD = libjellyfish-2.0.la
 CLEANFILES += $(PYTHON_BUILT) $(pythonext_SCRIPTS)
@@ -49,6 +51,7 @@ rubyextdir = $(RUBY_EXT_LIB)
 rubyext_LTLIBRARIES = swig/ruby/jellyfish.la
 swig_ruby_jellyfish_la_SOURCES = swig/ruby/swig_wrap.cpp $(SWIG_SRC)
 swig_ruby_jellyfish_la_CPPFLAGS = $(RUBY_EXT_CFLAGS) -I$(srcdir)/include
+swig_ruby_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS)
 swig_ruby_jellyfish_la_LDFLAGS = -module
 swig_ruby_jellyfish_la_LIBADD = libjellyfish-2.0.la
 CLEANFILES += $(RUBY_BUILT)
@@ -63,6 +66,7 @@ perlext_SCRIPTS = swig/perl5/jellyfish.pm
 perlext_LTLIBRARIES = swig/perl5/jellyfish.la
 swig_perl5_jellyfish_la_SOURCES = swig/perl5/swig_wrap.cpp $(SWIG_SRC)
 swig_perl5_jellyfish_la_CPPFLAGS = $(PERL_EXT_CPPFLAGS) -I$(PERL_EXT_INC) -I$(srcdir)/include
+swig_perl5_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS)
 swig_perl5_jellyfish_la_LDFLAGS = -module
 swig_perl5_jellyfish_la_LIBADD = libjellyfish-2.0.la
 CLEANFILES += $(PERL_BUILT)

From 6f9ee226a3bfd06aa0c4f6c4d232f4336af25a31 Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <gmarcais@cs.cmu.edu>
Date: Wed, 28 Mar 2018 18:53:16 -0400
Subject: [PATCH 3/8] Added missing jellyfish.py

---
 swig/python/jellyfish.py | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 swig/python/jellyfish.py

diff --git a/swig/python/jellyfish.py b/swig/python/jellyfish.py
new file mode 100644
index 00000000..ab988134
--- /dev/null
+++ b/swig/python/jellyfish.py
@@ -0,0 +1,4 @@
+import warnings
+
+warnings.warn("Module 'jellyfish' is deprecated. Use 'dna_jellyfish'", DeprecationWarning)
+import dna_jellyfish as jellyfish

From 667ef2dc8d4444d3a72190ef674ed280d9272b10 Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <Guillaume Marcais guillaume@marcais.net>
Date: Thu, 29 Mar 2018 17:15:24 -0400
Subject: [PATCH 4/8] Check for posix_memalign or aligned_alloc

---
 configure.ac                     |  3 +++
 lib/rectangular_binary_matrix.cc | 21 +++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index aed45e19..8e7b826c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -156,6 +156,9 @@ AM_COND_IF([PERL_BINDING],
            [AS_IF([test x$enable_perl_binding != xyes], [PERL_EXT_LIB=$enable_perl_binding])]
            [AX_PERL_EXT([$prefix])])
 
+# Check for aligned memory allocation functions
+AC_CHECK_FUNCS([posix_memalign aligned_alloc])
+
 AC_OUTPUT
 
 
diff --git a/lib/rectangular_binary_matrix.cc b/lib/rectangular_binary_matrix.cc
index b932d1c9..c2e39469 100644
--- a/lib/rectangular_binary_matrix.cc
+++ b/lib/rectangular_binary_matrix.cc
@@ -14,6 +14,7 @@
     along with Jellyfish.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#include <cstdlib>
 #include <new>
 #include <stdexcept>
 #include <vector>
@@ -21,6 +22,23 @@
 #include <assert.h>
 #include <jellyfish/rectangular_binary_matrix.hpp>
 
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#ifdef HAVE_POSIX_MEMALIGN
+inline int __mem_align(void** memptr, size_t alignement, size_t size) {
+  return posix_memalign(memptr, alignement, size);
+}
+#elif HAVE_ALIGNED_ALLOC
+inline int __mem_align(void** memptr, size_t alignment, size_t size) {
+  *memptr = aligned_alloc(alignment, size);
+  return memptr == NULL;
+}
+#else
+#error No function to allocate aligned memory
+#endif
+
 uint64_t *jellyfish::RectangularBinaryMatrix::alloc(unsigned int r, unsigned int c) {
   if(r > (sizeof(uint64_t) * 8) || r == 0 || c == 0) {
     std::ostringstream err;
@@ -31,8 +49,7 @@ uint64_t *jellyfish::RectangularBinaryMatrix::alloc(unsigned int r, unsigned int
   // Make sure the number of words allocated is a multiple of
   // 8. Necessary for loop unrolling of vector multiplication
   size_t alloc_columns = (c / 8 + (c % 8 != 0)) * 8;
-  //  if(posix_memalign(&mem, sizeof(uint64_t) * 2, alloc_columns * sizeof(uint64_t)))
-  if(!(mem = aligned_alloc(sizeof(uint64_t) * 2, alloc_columns * sizeof(uint64_t))))
+  if(__mem_align(&mem, sizeof(uint64_t) * 2, alloc_columns * sizeof(uint64_t)))
     throw std::bad_alloc();
   memset(mem, '\0', sizeof(uint64_t) * alloc_columns);
   return (uint64_t *)mem;

From 591d4d1aec41154578a366d5dd8b7baf96454350 Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <gmarcais@cs.cmu.edu>
Date: Fri, 30 Mar 2018 13:23:21 -0400
Subject: [PATCH 5/8] Bump version to 2.2.10

---
 configure.ac | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 8e7b826c..e4fc8b68 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([jellyfish], [2.2.9], [gmarcais@umd.edu])
+AC_INIT([jellyfish], [2.2.10], [gmarcais@umd.edu])
 AC_CANONICAL_HOST
 AC_CONFIG_MACRO_DIR([m4])
 AM_INIT_AUTOMAKE([subdir-objects foreign parallel-tests color-tests])

From 21ca89987a2ae70800afe7d1bff172d0d5fa4945 Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <gmarcais@cs.cmu.edu>
Date: Tue, 3 Apr 2018 15:33:41 -0400
Subject: [PATCH 6/8] Build extension from the tarball with setup.py,
 extconf.rb and Makefile.PL

---
 swig/Makefile.am       |  5 +++--
 swig/perl5/Makefile.PL | 28 ++++++++++++++++++++--------
 swig/python/setup.py   | 38 +++++++++++++++-----------------------
 swig/ruby/extconf.rb   | 24 +++++++++++++++++++-----
 4 files changed, 57 insertions(+), 38 deletions(-)

diff --git a/swig/Makefile.am b/swig/Makefile.am
index b6d5660a..49692685 100644
--- a/swig/Makefile.am
+++ b/swig/Makefile.am
@@ -40,7 +40,7 @@ pythonc_v_GEN_0 = @echo "  PYTHONC " $@;
 %/__init__.pyc: %/dna_jellyfish.py
 	$(PYTHONC_V_GEN)$(PYTHON) -c 'import py_compile, sys; py_compile.compile(sys.argv[1], sys.argv[2])' $< $@
 swig/python/dna_jellyfish.py: swig/python/swig_wrap.cpp
-EXTRA_DIST += $(PYTHON_BUILT)
+EXTRA_DIST += $(PYTHON_BUILT) swig/python/setup.py
 endif
 
 # Ruby support
@@ -55,6 +55,7 @@ swig_ruby_jellyfish_la_CXXFLAGS = $(AM_CXXFLAGS) $(SWIG_CXXFLAGS)
 swig_ruby_jellyfish_la_LDFLAGS = -module
 swig_ruby_jellyfish_la_LIBADD = libjellyfish-2.0.la
 CLEANFILES += $(RUBY_BUILT)
+EXTRA_DIST += swig/ruby/extconf.rb
 endif
 
 # Perl5 support
@@ -71,5 +72,5 @@ swig_perl5_jellyfish_la_LDFLAGS = -module
 swig_perl5_jellyfish_la_LIBADD = libjellyfish-2.0.la
 CLEANFILES += $(PERL_BUILT)
 swig/perl5/jellyfish.pm: swig/perl5/swig_wrap.cpp
-EXTRA_DIST += $(PERL_BUILT)
+EXTRA_DIST += $(PERL_BUILT) swig/perl5/Makefile.PL
 endif
diff --git a/swig/perl5/Makefile.PL b/swig/perl5/Makefile.PL
index f8461f9c..db18249b 100644
--- a/swig/perl5/Makefile.PL
+++ b/swig/perl5/Makefile.PL
@@ -4,17 +4,29 @@ use strict;
 use warnings;
 use ExtUtils::MakeMaker;
 
-mkdir('lib');
-system('swig', '-c++', '-perl5', '-o', 'jellyfish_wrap.cxx', '../jellyfish.i');
+sub pkgconfig {
+  my ($cmd) = @_;
+  my $res = `pkg-config $cmd`;
+  if($?) {
+    print(STDERR "\nCan't get compilation information for Jellyfish, check you PKG_CONFIG_PATH\n");
+    exit(1);
+  }
+  chomp $res;
+  return $res;
+}
 
-my $jf_cflags = `pkg-config --cflags jellyfish-2.0`;
-my $jf_libs   = `pkg-config --libs jellyfish-2.0`;
-my $jf_rpath  = `pkg-config --libs-only-L jellyfish-2.0 | sed -e 's/-L/-Wl,-rpath,/g'`;
+my $pkg = "jellyfish-2.0";
+my $jf_cflags = pkgconfig("--cflags $pkg");
+my $jf_libs   = pkgconfig("--libs $pkg");
+my $jf_rpath  = pkgconfig("--libs-only-L $pkg");
+$jf_rpath =~ s/-L/-Wl,-rpath,/g;
+
+print("$jf_cflags\n$jf_libs\n$jf_rpath\n");
 
 WriteMakefile(NAME      => 'jellyfish',
-              CC        => 'g++-4.4',
-              LD        => 'g++-4.4',
+              CC        => 'g++',
+              LD        => 'g++',
               CCFLAGS   => "-std=c++0x $jf_cflags",
               LIBS      => "$jf_libs",
-              OBJECT    => 'jellyfish_wrap.o',
+              OBJECT    => 'swig_wrap.o',
               VERSION   => '0.0.1');
diff --git a/swig/python/setup.py b/swig/python/setup.py
index 0177ef51..dadde60c 100644
--- a/swig/python/setup.py
+++ b/swig/python/setup.py
@@ -1,36 +1,28 @@
 #! /usr/bin/env python
 
 import os
+import sys
 import re
 from distutils.core import setup, Extension
 
-# Don't use Extension support for swig, it is more annoying than
-# anything!  (1)It does not support very well having the .i in a
-# different directory. (2)It runs swig every single time, even if it
-# ran successfully before. So (3)doing first build, then install as
-# root rebuilds the modules as root.
-swig_time = os.path.getmtime('../jellyfish.i')
-older = True
-try:
-    older = os.path.getmtime('jellyfish_wrap.cxx') < swig_time or os.path.getmtime('jellyfish.py') < swig_time
-except FileNotFoundError:
-    older = True
+def pkgconfig(s):
+    pipe = os.popen("pkg-config {}".format(s))
+    res = pipe.read().rstrip().split()
+    ret = pipe.close()
+    if ret is not None:
+        sys.stderr.write("\nCan't get compilation information for Jellyfish, check you PKG_CONFIG_PATH\n")
+        exit(1)
+    return res
 
-if older:
-    print("Running swig: swig -c++ -python -o jellyfish_wrap.cxx ../jellyfish.i")
-    os.system("swig -c++ -python -o jellyfish_wrap.cxx ../jellyfish.i")
-
-jf_include = [re.sub(r'-I', '', x) for x in os.popen("pkg-config --cflags-only-I jellyfish-2.0").read().rstrip().split()]
-jf_cflags  = os.popen("pkg-config --cflags-only-other").read().rstrip().split()
-
-jf_libs    = [re.sub(r'-l', '', x) for x in os.popen("pkg-config --libs-only-l jellyfish-2.0").read().rstrip().split()]
-jf_libdir  = [re.sub(r'-L', '', x) for x in os.popen("pkg-config --libs-only-L jellyfish-2.0").read().rstrip().split()]
+jf_libs    = [re.sub(r'-l', '', x) for x in pkgconfig("--libs-only-l jellyfish-2.0")]
+jf_include = [re.sub(r'-I', '', x) for x in pkgconfig("--cflags-only-I jellyfish-2.0")]
+jf_cflags  = pkgconfig("--cflags-only-other jellyfish-2.0")
+jf_libdir  = [re.sub(r'-L', '', x) for x in pkgconfig("--libs-only-L jellyfish-2.0")]
 jf_rpath   = [re.sub(r'^', '-Wl,-rpath,', x) for x in jf_libdir]
-jf_ldflags = os.popen("pkg-config --libs-only-other jellyfish-2.0").read().rstrip().split()
-
+jf_ldflags = pkgconfig("--libs-only-other jellyfish-2.0")
 
 jellyfish_module = Extension('_dna_jellyfish',
-                             sources = ['jellyfish_wrap.cxx'],
+                             sources = ['swig_wrap.cpp'],
                              include_dirs = jf_include,
                              libraries = jf_libs,
                              library_dirs = jf_libdir,
diff --git a/swig/ruby/extconf.rb b/swig/ruby/extconf.rb
index ea4aa2e5..5c430dd2 100644
--- a/swig/ruby/extconf.rb
+++ b/swig/ruby/extconf.rb
@@ -2,11 +2,25 @@
 
 require 'mkmf'
 
-swig = find_executable('swig')
-system(swig, '-c++', '-ruby', '-o', 'jellyfish_wrap.cxx', '../jellyfish.i')
+def pkgconfig(s)
+  res = `pkg-config #{s}`
+  if $? != 0
+    STDERR.puts("\nCan't get compilation information for Jellyfish, check you PKG_CONFIG_PATH\n");
+    exit(1)
+  end
+  res.chomp
+end
 
-$defs << `pkg-config --cflags jellyfish-2.0`.chomp << '-std=c++0x'
-$libs << `pkg-config --libs jellyfish-2.0`.chomp
-$libs << `pkg-config --libs-only-L jellyfish-2.0 | sed -e 's/-L/-Wl,-rpath,/g'`.chomp
+$defs << pkgconfig("--cflags jellyfish-2.0") << '-std=c++0x'
+libs = [pkgconfig("--libs jellyfish-2.0"),
+        pkgconfig("--libs-only-L jellyfish-2.0").gsub(/-L/, "-Wl,-rpath,")]
+
+if Array === $libs
+  $libs += libs
+elsif String === $libs
+  $libs += " " + libs.join(" ")
+else
+  $libs = libs.join(" ")
+end
 
 create_makefile('jellyfish')

From af5904b2d3f570d841cd4a08d5b2090d14279b00 Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <gmarcais@cs.cmu.edu>
Date: Tue, 3 Apr 2018 15:44:25 -0400
Subject: [PATCH 7/8] Updated installation documentation of extensions.

---
 swig/Readme.md | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/swig/Readme.md b/swig/Readme.md
index 37d21ca3..a429fffc 100644
--- a/swig/Readme.md
+++ b/swig/Readme.md
@@ -7,6 +7,41 @@ output files and use the Jellyfish hash within these scripting
 languages, which is much more convenient than in C++, although
 somewhat slower.
 
+Installation
+============
+
+See the installation instructions on the
+[main page](../../tree/master) on how to install the bindings at the
+same times as Jellyfish.
+
+Additionally, it is possible to install the bindings from the
+distribution tarball. If Jellyfish is already installed (that is
+`pkg-config --exists jellyfish-2.0 && echo OK` prints `OK`), then the
+bindings can be installed as follows. (`$PREFIX` should be set to the
+desired installation location)
+
+Python:
+```Shell
+cd swig/python
+python setup.py build
+python setup.py install --prefix=$PREFIX
+```
+
+Ruby:
+```Shell
+cd swig/ruby
+ruby extconf.rb
+make
+make install prefix=$PREFIX
+```
+
+Perl 5:
+```Shell
+cd swig/perl5
+perl Makefile.PL SITEPREFIX=$PREFIX
+make
+make install
+```
 
 Examples
 ========

From d7fa74a94fb82fdd24d9bca29a8fc801843160ad Mon Sep 17 00:00:00 2001
From: Guillaume Marcais <gmarcais@cs.cmu.edu>
Date: Mon, 16 Apr 2018 08:44:09 -0400
Subject: [PATCH 8/8] Added doc/Readme.md

---
 doc/Readme.md | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100644 doc/Readme.md

diff --git a/doc/Readme.md b/doc/Readme.md
new file mode 100644
index 00000000..8609a4e0
--- /dev/null
+++ b/doc/Readme.md
@@ -0,0 +1,334 @@
+# Getting started
+
+
+## Counting all k-mers
+
+
+The basic command to count all k-mers is as follows:
+
+```
+jellyfish count -m 21 -s 100M -t 10 -C reads.fasta
+```
+
+This will count canonical (`-C`) 21-mers (`-m 21`), using a hash with
+100 million elements (`-s 100M`) and 10 threads (`-t 10`) in the
+sequences in the file `reads.fasta`. The output is written in the file
+`mer counts.jf` by default (change with `-o` switch).
+
+To compute the histogram of the k-mer occurrences, use the
+`histo subcommand (see section [histo](#histo)):
+
+```
+jellyfish histo mer_counts.jf
+```
+
+To query the counts of a particular k-mer, use the
+`query` subcommand (see section [query](#query)):
+
+```
+jellyfish query mer_counts.jf AACGTTG
+```
+
+To output all the counts for all the k-mers in the file, use the
+`dump` subcommand (see section [dump](#dump)):
+
+```
+jellyfish dump mer_counts.jf > mer_counts_dumps.fa
+```
+
+To get some information on how, when and where this jellyfish file was
+generated, use the `info` subcommand (see section [info](#info)):
+
+```
+jellyfish info mer_counts.jf
+```
+
+For more detail information, see the relevant sections in this
+document.  All commands understand `--help` and will produce some
+information about the switches available.
+
+### Counting k-mers in sequencing reads
+
+In sequencing reads, it is unknown which strands of the DNA is
+sequenced. As a consequence, a k-mer or its reverse complement are
+essentially equivalent. The canonical representative of a k-mer `m` is
+by definition `m` or the reverse complement of `m`, whichever comes
+first lexicographically. The `-C` switch instructs to save in the hash
+only canonical k-mers, while the count is the number of occurrences
+of both a k-mer and it reverse complement.
+
+The size parameter (given with `-s`) is an indication of the number
+k-mers that will be stored in the hash. For sequencing reads, this
+size should be the size of the genome plus the k-mers generated by
+sequencing errors. For example, if the error rate is e (e.g.Illumina
+reads, usually e~1%), with an estimated genome size of G and
+a coverage of c, the number of expected k-mers is G+Gcek.
+
+> NOTE: unlike in Jellyfish 1, this `-s` parameter is only
+> an estimation. If the size given is too small to fit all the k-mers,
+> the hash size will be increased automatically or partial results will
+> be written to disk and finally merged automatically. Running
+> `jellyfish merge` should never be necessary, as now
+> jellyfish now takes care of this task on its own.
+
+If the low frequency k-mers (k-mers occurring only once), which are
+mostly due to sequencing errors, are not of interest, one might
+consider counting only high-frequency k-mers (see
+section [Counting high frequency k-mers](#Counting-high-frequency-k-mers)),
+which uses less memory and is potentially faster.
+
+### Counting k-mers in a genome
+
+In an actual genome or finished sequence, a k-mer and its reverse
+complement are not equivalent, hence using the `-C` switch does not
+make sense. In addition, the size for the hash can be set directly to
+the size of the genome.
+
+## Counting high-frequency k-mers
+
+Jellyfish offers two way to count only high-frequency k-mers (meaning
+only k-mers with count >1), which reduces significantly the memory
+usage. Both methods are based on using Bloom filters. The first method
+is a one pass method, which provides approximate count for some
+percentage of the k-mers. The second method is a two pass method which
+provides exact count. In both methods, most of the low-frequency
+k-mers are not reported.
+
+### One pass method
+
+Adding the `--bf-size` switch make jellyfish first insert all k-mers
+first into a Bloom filter and only insert into the hash the k-mers
+which have already been seen at least once. The argument to
+`--bf-size` should the total number of k-mer expected in the data set
+while the `--size` argument should be the number of $k$-mers occurring
+more than once. For example:
+
+```
+jellyfish count -m 25 -s 3G --bf-size 100G -t 16 homo_sapiens.fa
+```
+
+would be appropriate for counting 25-mers in human reads at 30x
+coverage. The approximate memory usage is 9 bits per k-mer in the
+Bloom filter.
+
+The count reported for each k-mer (by `jellyfish dump` or `jellyfish
+query`) is one less than the actual count.  Meaning, the count 1 k-mer
+are not reported, count 2 k-mer are reported to have count 1, etc.
+
+The drawback of this method is some percentage of the $k$-mer that
+should not be reported (because they occur only once) are
+reported. This is due to the random nature of the Bloom filter data
+structure. The percentage is <1% by default and can be changed with
+the `--bf-fp switch.
+
+### Two passes method
+
+In the two pass method, first a Bloom counter is created from the
+reads with `jellyfish bc`. Then this Bloom counter is given to the
+`jelllyfish count` command and only the k-mers which have been seen
+twice in the first pass will be inserted in the hash. For example,
+with a human data set similar that in
+section [One pass method](#One-pass-method):
+
+```
+jellyfish bc -m 25 -s 100G -t 16 -o homo_sapiens.bc homo_sapiens.fa
+jellyfish count -m 25 -s 3G -t 16 --bc homo_sapiens.bc homo_sapiens.fa
+```
+
+The advantage of this method is that the counts reported for the
+k-mers are all correct. Most count 1 k-mer are not reported, except
+for a small percentage (set by the false positive rate, `-f` switch of
+the `bc` subcommand) of them which are reported (correctly with count
+1). All other k-mers are reported with the correct count.
+
+The drawback of this method is that it requires to parse the entire
+reads data set twice and the memory usage of the Bloom counter is
+greater than that of the Bloom filter (slightly less than twice as
+much).
+
+# FAQ
+
+## How to read compressed files (or other format)?
+
+Jellyfish only reads FASTA or FASTQ formatted input files. By reading
+from pipes, jellyfish can read compressed files, like this:
+
+```
+zcat *.fastq.gz | jellyfish count /dev/fd/0 ...
+```
+
+or by using the `<()` redirection provided by the shell (e.g. bash, zsh):
+
+```
+jellyfish count <(zcat file1.fastq.gz) <(zcat file2.fasta.gz) ...
+```
+
+## How to read multiple files at once?
+
+Often, jellyfish can parse an input sequence file faster than `gzip`
+or `fastq-dump` (to parse SRA files) can output the sequence. This
+leads to many threads in jellyfish going partially unused. Jellyfish
+can be instructed to open multiple file at once. For example, to read
+two short read archive files simultaneously:
+
+```
+jellyfish count -F 2 <(fastq-dump -Z file1.sra) <(fastq-dump -Z file2.sra) ...
+```
+
+Another way is to use "generators". First, create a file containing,
+one per line, commands to generate sequence. Then pass this file to
+jellyfish and the number of generators to run
+simultaneously. Jellyfish will spawn subprocesses running the commands
+passed and read their standard output for sequence. By default, the
+commands are run using the shell in the SHELL environment variable,
+and this can be changed by the `-S` switch. Multiple generators will
+be run simultaneously as specified by the `-G` switch. For example:
+
+```
+ls *.fasta.gz | xargs -n 1 echo gunzip -c > generators
+jellyfish count -g generators -G 4 ...
+```
+
+The first command created the command list into the 'generators' file,
+each command unzipping one FASTA file in the current directory. The
+second command runs jellyfish with 4 concurrent generators.
+
+## How to reduce the output size?
+
+The output file was design to be easy to read, but the file generated
+can be rather large. By default, a 4 bytes counter value is saved for
+every k-mer (i.e.a maximum count of over 4 billion).  Instead, a
+counter size of 2 bytes or 1 byte can be used with the switch
+`--out-counter-len`, which reduces significantly the output size.
+
+The count of k-mers which cannot be represented with the given number
+of bytes will have a value equal to the maximum value that can be
+represented. Meaning, if the counter field uses 1 byte, any
+k-mers with count greater or equal to 255 will be reported of having
+a count of 255.
+
+Also, low frequency and high frequency k-mers can be skipped using the
+`--L and `--U` switches respectively. Although it might be more
+appropriate to filter out the low frequency $k$-mers using Bloom
+filters, as shown in
+section [Counting-high-frequency-k-mers](#Counting-high-frequency-k-mers).
+
+## How much memory is needed?
+
+The memory needed to count k-mers, given the various parameters of the
+`count` subcommand is obtained by using the
+`mem` subcommand. It understand all the same switches, but
+only the `--mer-len` (`-m`), `--size` (`-s`),
+`--counter-len` (`-c`) and `--reprobes` (`-p`)
+switches are taken into account.
+
+For example, for 24-mers, with an initial hash size of 1 billion, and
+default parameters otherwise, the memory usage is:
+
+```
+jellyfish mem -m 24 -s 1G
+4521043056 (4G)
+```
+
+Conversely, if the `--size` switch is not given but the
+`--mem` switch is, then the maximum initial hash size that would
+fit in the given memory is returned. For example, this is the maximum
+hash size for 31-mers in 8 Gb of RAM:
+
+```
+jellyfish mem -m 31 --mem 8g
+1073741824 (1G)
+```
+
+# Subcommands
+
+
+## histo
+
+The `histo` subcommand outputs the histogram of k-mers
+frequencies. The last bin, with value one above the high setting set
+by the `h` switch (10000 by default), is a catch all: all k-mers with
+a count greater than the high setting are tallied in that one bin. If
+the low setting is set (`-l` switch), then the first bin, with value
+one below the low setting, is also similarly a catch all.
+
+By default, the bins with a zero count are skipped. This can be changed
+with the `f` switch.
+
+## dump
+
+The `dump` subcommand outputs a list of all the k-mers in
+the file associated with their count. By default, the output is in FASTA
+format, where the header line contains the count of the k-mer and the
+sequence part is the sequence of the k-mer. This format has the
+advantage that the output contains the sequence of $k$-mers and can be
+directly fed into another program expecting the very common FASTA
+format. A more convenient column format (for human beings) is selected
+with the `-c` switch.
+
+Low frequency and high frequency k-mers can be skipped with the `-L`
+and `-U` switches respectively.
+
+In the output of the `dump` subcommand, the k-mers are sorted
+according to the hash function used by Jellyfish. The output can be
+considered to be "fairly pseudo-random". By "fairly" we mean that NO
+guarantee is made about the actual randomness of this order, it is
+just good enough for the hash table to work properly. And by
+"pseudo-random" we mean that the order is actually deterministic:
+given the same hash function, the output will be always the same and
+two different files generated with the same hash function can be
+merged easily.
+
+## query
+
+The `query` subcommand outputs the k-mers and their counts for some
+subset of k-mers. It will outputs the counts of all the k-mers
+passed on the command line or of all the k-mers in the sequence read
+from the FASTA or FASTQ formatted file passed to the switch
+`-s` (this switch can be given multiple times).
+
+## info
+
+The `info` subcommand outputs some information about the jellyfish
+file and the command used to generated it, in which directory and at
+what time the command was run. Hopefully, the information given should
+be enough to rerun jellyfish under the same conditions and reproduce
+the output file. In particular, the `-c` switch outputs the command,
+properly escaped and ready to run in a shell.
+
+The header is saved in JSON format and contains more information than is
+written by the default. The full header in JSON format can be written
+out using the `-j` switch.
+
+## merge
+
+The `merge` subcommand is of little direct use with version version 2
+of Jellyfish. When intermediary files were written to disk, because
+not all k-mers would fit in memory, they can be merged into one file
+containing the final result with the `merge` subcommand. The `count`
+subcommand will merge intermediary files automatically as needed.
+
+## stats
+
+The `stats` subcommand computes some statistics about the
+mers. Although these statistics could be computed from the histogram, it
+provides quick summary information. The fields are:
+
+* _Unique_: The number of k-mer occuring exactly once
+* _Distinct_: The number of k-mers, ignoring their multiplicity
+    (i.e.the cardinality of the set of $k$-mers)
+* _Total_: The number of k-mers with multiplicity (i.e.the sum of the
+    number of occurence of all the mers)
+* _Max\_count_: The maximum of the number of occurrences
+
+## mem
+
+The `mem` subcommand shows how much memory a count subcommand will
+need or conversely how large of a hash size will fit in a given amount
+of memory.
+
+## cite
+
+The `cite` subcommand prints the citation for the jellyfish
+paper. With the `-b`, it is formatted in Bibtex format. How
+convenient!