open2c · agalitsyna · Jul 28, 2022 · Nov 8, 2022 · Nov 8, 2022 · Nov 9, 2022
diff --git a/VERSION b/VERSION
@@ -1 +1,2 @@
-version='0.3.3'
+version='0.3.4_dev'
+
diff --git a/configs/local.config b/configs/local.config
@@ -41,7 +41,7 @@ process {
     //
 
     withName: map_parse_sort_chunks {
-        cpus = 8
+        cpus = 5
     }
 
     withName: merge_dedup_splitbam {
@@ -64,7 +64,7 @@ process {
 }
 
 executor {
-    cpus = 40
+    cpus = 20
 }
 
 

diff --git a/distiller.nf b/distiller.nf
@@ -14,6 +14,8 @@ nextflow.enable.dsl=1
 MIN_RES = params['bin'].resolutions.collect { it as int }.min()
 ASSEMBLY_NAME = params['input'].genome.assembly_name
 
+SINGLE_END = params['input'].get('single_end', false)
+
 pairsgz_decompress_command = 'bgzip -cd -@ 3'
 
 switch(params.compression_format) {
@@ -136,30 +138,41 @@ def fastqDumpCmd(file_or_srr, library, run, srr_start=0, srr_end=-1, threads=1,
     def bgzip_threads = Math.max(1,((threads as int)-2).intdiv(2))
     def cmd = ''
 
-    if (use_custom_split) {
-        cmd = """
-            #cp -r $HOME/.ncbi/ . # Fix for sra-tools requiring ncbi folder locally
-            HOME=`readlink -e ./`
-            fastq-dump ${file_or_srr} -Z --split-spot ${srr_start_flag} ${srr_end_flag} \
-                        | pyfilesplit --lines 4 \
-                            >(bgzip -c -@{bgzip_threads} > ${library}.${run}.1.fastq.gz) \
-                            >(bgzip -c -@{bgzip_threads} > ${library}.${run}.2.fastq.gz) \
-                            | cat """
-
-
-    } else {
+    if (SINGLE_END) {
         cmd = """
             #cp -r $HOME/.ncbi/ .  # Fix for sra-tools requiring ncbi folder locally
-            HOME=`readlink -e ./`
-            fastq-dump ${file_or_srr} --gzip --split-spot --split-3 ${srr_start_flag} ${srr_end_flag} 
-            mv *_1.fastq.gz ${library}.${run}.1.fastq.gz
-            mv *_2.fastq.gz ${library}.${run}.2.fastq.gz
+            #HOME=`readlink -e ./`
+            fastq-dump ${file_or_srr} --gzip --split-spot ${srr_start_flag} ${srr_end_flag}
+            mv *.fastq.gz ${library}.${run}.1.fastq.gz
+            touch ${library}.${run}.2.fastq.gz
         """
+    } else {
+        if (use_custom_split) {
+            cmd = """
+                #cp -r $HOME/.ncbi/ . # Fix for sra-tools requiring ncbi folder locally
+                #HOME=`readlink -e ./`
+                fastq-dump ${file_or_srr} -Z --split-spot ${srr_start_flag} ${srr_end_flag} \
+                            | pyfilesplit --lines 4 \
+                                >(bgzip -c -@{bgzip_threads} > ${library}.${run}.1.fastq.gz) \
+                                >(bgzip -c -@{bgzip_threads} > ${library}.${run}.2.fastq.gz) \
+                                | cat """
+
+
+        } else {
+            cmd = """
+                cp -r $HOME/.ncbi/ .  # Fix for sra-tools requiring ncbi folder locally
+                HOME=`readlink -e ./`
+                fastq-dump ${file_or_srr} --gzip --split-spot --split-3 ${srr_start_flag} ${srr_end_flag}
+                mv *_1.fastq.gz ${library}.${run}.1.fastq.gz
+                mv *_2.fastq.gz ${library}.${run}.2.fastq.gz
+            """
+        }
     }
 
     return cmd
 }
 
+/* TODO: check three functions below whether they work in single-end mode  */
 
 def sraDownloadTruncateCmd(sra_query, library, run, truncate_fastq_reads=0,
                            chunksize=0, threads=1, use_custom_split=true) {
@@ -284,6 +297,7 @@ String fastqLocalTruncateChunkCmd(path, library, run, side,
     return cmd
 }
 
+/* Single-end and minimap modifications start here */
 
 process download_truncate_chunk_fastqs{
     tag "library:${library} run:${run}"
@@ -439,14 +453,12 @@ process fastqc{
 
 }
 
-
-
-BWA_INDEX = Channel.from([[
-             params.input.genome.bwa_index_wildcard_path
+INDEX = Channel.from([[
+             params.input.genome.genome_index_wildcard_path
                 .split('/')[-1]
                 .replaceAll('\\*$', "")
                 .replaceAll('\\.$', ""),
-             file(params.input.genome.bwa_index_wildcard_path),
+             file(params.input.genome.genome_index_wildcard_path),
             ]])
 
 /*
@@ -458,7 +470,7 @@ process map_parse_sort_chunks {
 
     input:
     set val(library), val(run), val(chunk), file(fastq1), file(fastq2) from LIB_RUN_CHUNK_FASTQS
-    set val(bwa_index_base), file(bwa_index_files) from BWA_INDEX.first()
+    set val(genome_index_base), file(genome_index_files) from INDEX.first()
     file(chrom_sizes) from CHROM_SIZES_FOR_PARSING.first()
 
     output:
@@ -480,29 +492,121 @@ process map_parse_sort_chunks {
         params['parse'].get('keep_unparsed_bams','false').toBoolean() ?
         "| tee >(samtools view -bS > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.bam)" : "" )
     def parsing_options = params['parse'].get('parsing_options','')
-    def bwa_threads = (task.cpus as int)
+    def mapping_threads = (task.cpus as int)
     def sorting_threads = (task.cpus as int)
 
-    def mapping_command = (
-        trim_options ? 
-        "fastp ${trim_options} \
-        --json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
-        --html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
-        -i ${fastq1} -I ${fastq2} --stdout | \
-        bwa mem -p -t ${bwa_threads} ${mapping_options} -SP ${bwa_index_base} \
-        - ${keep_unparsed_bams_command}" : \
-        \
-        "bwa mem -t ${bwa_threads} ${mapping_options} -SP ${bwa_index_base} \
-        ${fastq1} ${fastq2} ${keep_unparsed_bams_command}"
-        )
+    mapping_method = params['map'].get('method','bwa mem')
 
+    def fastq = ""
+    def trim_fastq = ""
+    if (SINGLE_END) {
+        if (mapping_method=='bowtie2') {
+            fastq = "-U ${fastq1}"
+        } else {
+            fastq = "${fastq1}"
+        }
+        trim_fastq = "-i ${fastq1}"
+    } else {
+        if (mapping_method=='bowtie2') {
+            fastq = "-1 ${fastq1} -2 ${fastq2}"
+        } else {
+            fastq = "${fastq1} ${fastq2}"
+        }
+        trim_fastq = "-i ${fastq1} -I ${fastq2}"
+    }
+
+    def mapping_command = ""
+    def parse_method = params['parse'].get('method', 'parse')
+    single_end_flag = SINGLE_END.toBoolean() ? "--single-end" : ""
+    if (mapping_method=='bwa mem') { /* bwa mem branch */
+            mapping_command = (
+                trim_options ?
+                "fastp ${trim_options} \
+                --json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
+                --html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
+                ${trim_fastq} --stdout | \
+                bwa mem -p -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
+                - ${keep_unparsed_bams_command}" : \
+                \
+                "bwa mem -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
+                ${fastq} ${keep_unparsed_bams_command}"
+            )
+        } else if (mapping_method=='bwa mem2') { /* bwa mem2 branch */
+            mapping_command = (
+                trim_options ?
+                "fastp ${trim_options} \
+                --json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
+                --html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
+                ${trim_fastq} --stdout | \
+                bwa mem2 -p -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
+                - ${keep_unparsed_bams_command}" : \
+                \
+                "bwa mem2 -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
+                ${fastq} ${keep_unparsed_bams_command}"
+            )
+        } else if (mapping_method=='minimap2') { /* minimap2 branch */
+            mapping_command = (
+                trim_options ?
+                "fastp ${trim_options} \
+                --json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
+                --html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
+                ${trim_fastq} --stdout | \
+                minimap2 -a -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
+                - ${keep_unparsed_bams_command}" : \
+                \
+                "minimap2 -a -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
+                ${fastq} ${keep_unparsed_bams_command}"
+            )
+        } else if (mapping_method=='bwa aln') { /* bwa aln short reads branch, not supported for single-end */
+            if (trim_options){
+                mapping_command = "fastp ${trim_options} \
+                    --json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
+                    --html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
+                    -i ${fastq1} -I ${fastq2} \
+                    -o ${fastq1}.trimmed -O ${fastq2}.trimmed; \
+                \
+                bwa aln -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
+                    ${fastq1}.trimmed > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai; \
+                \
+                bwa aln -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
+                    ${fastq2}.trimmed > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai; \
+                \
+                bwa sampe ${genome_index_base} \
+                      ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai \
+                      ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai ${fastq1} ${fastq2}"
+            } else {
+                mapping_command = "bwa aln -t ${mapping_threads} ${genome_index_base} \
+                    ${fastq1} > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai; \
+                \
+                bwa aln -t ${mapping_threads} ${genome_index_base} \
+                    ${fastq2} > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai; \
+                \
+                bwa sampe ${genome_index_base} \
+                      ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai \
+                      ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai ${fastq1} ${fastq2} "
+            }
+        } else if (mapping_method=="bowtie2") { /* bowtie2 branch applicable for long reads or short reads */
+            /* --reorder */
+            mapping_command = (
+                trim_options ?
+                "fastp ${trim_options} \
+                --json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
+                --html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
+                ${trim_fastq} --stdout | \
+                bowtie2 -p ${mapping_threads} --reorder ${mapping_options} -x ${genome_index_base} \
+                - ${keep_unparsed_bams_command}" : \
+                \
+                "bowtie2 -p ${mapping_threads} --reorder ${mapping_options} -x ${genome_index_base} \
+                ${fastq} ${keep_unparsed_bams_command}"
+            )
+        }
 
     """
     TASK_TMP_DIR=\$(mktemp -d -p ${task.distillerTmpDir} distiller.tmp.XXXXXXXXXX)
     touch ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.bam
 
     ${mapping_command} \
-    | pairtools parse ${dropsam_flag} ${dropreadid_flag} ${dropseq_flag} \
+    | pairtools ${parse_method} ${single_end_flag} ${dropsam_flag} ${dropreadid_flag} ${dropseq_flag} \
       ${parsing_options} \
       -c ${chrom_sizes} \
       | pairtools sort --nproc ${sorting_threads} \
@@ -533,7 +637,7 @@ process merge_dedup_splitbam {
 
     output:
     set library, "${library}.${ASSEMBLY_NAME}.nodups.pairs.gz",
-                 "${library}.${ASSEMBLY_NAME}.nodups.pairs.gz.px2",
+                 /*"${library}.${ASSEMBLY_NAME}.nodups.pairs.gz.px2",*/
                  "${library}.${ASSEMBLY_NAME}.nodups.bam",
                  "${library}.${ASSEMBLY_NAME}.dups.pairs.gz",
                  "${library}.${ASSEMBLY_NAME}.dups.bam",
@@ -595,7 +699,7 @@ process merge_dedup_splitbam {
         touch ${library}.${ASSEMBLY_NAME}.dups.bam
 
         rm -rf \$TASK_TMP_DIR
-        pairix ${library}.${ASSEMBLY_NAME}.nodups.pairs.gz
+        #pairix ${library}.${ASSEMBLY_NAME}.nodups.pairs.gz
         """
 }
 

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -7,15 +7,15 @@ RUN apt-get update --fix-missing && \
 
 
 # Set the locale
-RUN locale-gen en_US.UTF-8  
-ENV LANG en_US.UTF-8  
-ENV LANGUAGE en_US:en  
-ENV LC_ALL en_US.UTF-8     
+RUN locale-gen en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US:en
+ENV LC_ALL en_US.UTF-8
 
 # Install conda
-RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
     bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda3 -b && \
-    rm Miniconda3-latest-Linux-x86_64.sh 
+    rm Miniconda3-latest-Linux-x86_64.sh
 ENV PATH=/miniconda3/bin:${PATH}
 
 # Install conda dependencies
@@ -26,10 +26,8 @@ RUN conda config --set always_yes yes --set changeps1 no && \
     conda config --add channels conda-forge && \
     conda config --add channels defaults && \
     conda config --add channels bioconda && \
-    conda config --add channels golobor && \ 
     conda config --get && \
     conda update -q conda && \
     conda info -a && \
     conda env update -q -n root --file environment.yml && \
-    conda clean --tarballs --index-cache --lock
-
+    conda clean --tarballs --index-cache