Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

distiller with an option to process with minimap2 #177

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
version='0.3.3'
version='0.3.4_dev'

4 changes: 2 additions & 2 deletions configs/local.config
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ process {
//

withName: map_parse_sort_chunks {
cpus = 8
cpus = 5
}

withName: merge_dedup_splitbam {
Expand All @@ -64,7 +64,7 @@ process {
}

executor {
cpus = 40
cpus = 20
}


Expand Down
180 changes: 142 additions & 38 deletions distiller.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ nextflow.enable.dsl=1
MIN_RES = params['bin'].resolutions.collect { it as int }.min()
ASSEMBLY_NAME = params['input'].genome.assembly_name

SINGLE_END = params['input'].get('single_end', false)

pairsgz_decompress_command = 'bgzip -cd -@ 3'

switch(params.compression_format) {
Expand Down Expand Up @@ -136,30 +138,41 @@ def fastqDumpCmd(file_or_srr, library, run, srr_start=0, srr_end=-1, threads=1,
def bgzip_threads = Math.max(1,((threads as int)-2).intdiv(2))
def cmd = ''

if (use_custom_split) {
cmd = """
#cp -r $HOME/.ncbi/ . # Fix for sra-tools requiring ncbi folder locally
HOME=`readlink -e ./`
fastq-dump ${file_or_srr} -Z --split-spot ${srr_start_flag} ${srr_end_flag} \
| pyfilesplit --lines 4 \
>(bgzip -c -@{bgzip_threads} > ${library}.${run}.1.fastq.gz) \
>(bgzip -c -@{bgzip_threads} > ${library}.${run}.2.fastq.gz) \
| cat """


} else {
if (SINGLE_END) {
cmd = """
#cp -r $HOME/.ncbi/ . # Fix for sra-tools requiring ncbi folder locally
HOME=`readlink -e ./`
fastq-dump ${file_or_srr} --gzip --split-spot --split-3 ${srr_start_flag} ${srr_end_flag}
mv *_1.fastq.gz ${library}.${run}.1.fastq.gz
mv *_2.fastq.gz ${library}.${run}.2.fastq.gz
#HOME=`readlink -e ./`
fastq-dump ${file_or_srr} --gzip --split-spot ${srr_start_flag} ${srr_end_flag}
mv *.fastq.gz ${library}.${run}.1.fastq.gz
touch ${library}.${run}.2.fastq.gz
"""
} else {
if (use_custom_split) {
cmd = """
#cp -r $HOME/.ncbi/ . # Fix for sra-tools requiring ncbi folder locally
#HOME=`readlink -e ./`
fastq-dump ${file_or_srr} -Z --split-spot ${srr_start_flag} ${srr_end_flag} \
| pyfilesplit --lines 4 \
>(bgzip -c -@{bgzip_threads} > ${library}.${run}.1.fastq.gz) \
>(bgzip -c -@{bgzip_threads} > ${library}.${run}.2.fastq.gz) \
| cat """


} else {
cmd = """
cp -r $HOME/.ncbi/ . # Fix for sra-tools requiring ncbi folder locally
HOME=`readlink -e ./`
fastq-dump ${file_or_srr} --gzip --split-spot --split-3 ${srr_start_flag} ${srr_end_flag}
mv *_1.fastq.gz ${library}.${run}.1.fastq.gz
mv *_2.fastq.gz ${library}.${run}.2.fastq.gz
"""
}
}

return cmd
}

/* TODO: check three functions below whether they work in single-end mode */

def sraDownloadTruncateCmd(sra_query, library, run, truncate_fastq_reads=0,
chunksize=0, threads=1, use_custom_split=true) {
Expand Down Expand Up @@ -284,6 +297,7 @@ String fastqLocalTruncateChunkCmd(path, library, run, side,
return cmd
}

/* Single-end and minimap modifications start here */

process download_truncate_chunk_fastqs{
tag "library:${library} run:${run}"
Expand Down Expand Up @@ -439,14 +453,12 @@ process fastqc{

}



BWA_INDEX = Channel.from([[
params.input.genome.bwa_index_wildcard_path
INDEX = Channel.from([[
params.input.genome.genome_index_wildcard_path
.split('/')[-1]
.replaceAll('\\*$', "")
.replaceAll('\\.$', ""),
file(params.input.genome.bwa_index_wildcard_path),
file(params.input.genome.genome_index_wildcard_path),
]])

/*
Expand All @@ -458,7 +470,7 @@ process map_parse_sort_chunks {

input:
set val(library), val(run), val(chunk), file(fastq1), file(fastq2) from LIB_RUN_CHUNK_FASTQS
set val(bwa_index_base), file(bwa_index_files) from BWA_INDEX.first()
set val(genome_index_base), file(genome_index_files) from INDEX.first()
file(chrom_sizes) from CHROM_SIZES_FOR_PARSING.first()

output:
Expand All @@ -480,29 +492,121 @@ process map_parse_sort_chunks {
params['parse'].get('keep_unparsed_bams','false').toBoolean() ?
"| tee >(samtools view -bS > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.bam)" : "" )
def parsing_options = params['parse'].get('parsing_options','')
def bwa_threads = (task.cpus as int)
def mapping_threads = (task.cpus as int)
def sorting_threads = (task.cpus as int)

def mapping_command = (
trim_options ?
"fastp ${trim_options} \
--json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
--html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
-i ${fastq1} -I ${fastq2} --stdout | \
bwa mem -p -t ${bwa_threads} ${mapping_options} -SP ${bwa_index_base} \
- ${keep_unparsed_bams_command}" : \
\
"bwa mem -t ${bwa_threads} ${mapping_options} -SP ${bwa_index_base} \
${fastq1} ${fastq2} ${keep_unparsed_bams_command}"
)
mapping_method = params['map'].get('method','bwa mem')

def fastq = ""
def trim_fastq = ""
if (SINGLE_END) {
if (mapping_method=='bowtie2') {
fastq = "-U ${fastq1}"
} else {
fastq = "${fastq1}"
}
trim_fastq = "-i ${fastq1}"
} else {
if (mapping_method=='bowtie2') {
fastq = "-1 ${fastq1} -2 ${fastq2}"
} else {
fastq = "${fastq1} ${fastq2}"
}
trim_fastq = "-i ${fastq1} -I ${fastq2}"
}

def mapping_command = ""
def parse_method = params['parse'].get('method', 'parse')
single_end_flag = SINGLE_END.toBoolean() ? "--single-end" : ""
if (mapping_method=='bwa mem') { /* bwa mem branch */
mapping_command = (
trim_options ?
"fastp ${trim_options} \
--json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
--html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
${trim_fastq} --stdout | \
bwa mem -p -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
- ${keep_unparsed_bams_command}" : \
\
"bwa mem -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
${fastq} ${keep_unparsed_bams_command}"
)
} else if (mapping_method=='bwa mem2') { /* bwa mem2 branch */
mapping_command = (
trim_options ?
"fastp ${trim_options} \
--json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
--html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
${trim_fastq} --stdout | \
bwa mem2 -p -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
- ${keep_unparsed_bams_command}" : \
\
"bwa mem2 -t ${mapping_threads} ${mapping_options} -SP ${genome_index_base} \
${fastq} ${keep_unparsed_bams_command}"
)
} else if (mapping_method=='minimap2') { /* minimap2 branch */
mapping_command = (
trim_options ?
"fastp ${trim_options} \
--json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
--html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
${trim_fastq} --stdout | \
minimap2 -a -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
- ${keep_unparsed_bams_command}" : \
\
"minimap2 -a -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
${fastq} ${keep_unparsed_bams_command}"
)
} else if (mapping_method=='bwa aln') { /* bwa aln short reads branch, not supported for single-end */
if (trim_options){
mapping_command = "fastp ${trim_options} \
--json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
--html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
-i ${fastq1} -I ${fastq2} \
-o ${fastq1}.trimmed -O ${fastq2}.trimmed; \
\
bwa aln -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
${fastq1}.trimmed > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai; \
\
bwa aln -t ${mapping_threads} ${mapping_options} ${genome_index_base} \
${fastq2}.trimmed > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai; \
\
bwa sampe ${genome_index_base} \
${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai \
${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai ${fastq1} ${fastq2}"
} else {
mapping_command = "bwa aln -t ${mapping_threads} ${genome_index_base} \
${fastq1} > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai; \
\
bwa aln -t ${mapping_threads} ${genome_index_base} \
${fastq2} > ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai; \
\
bwa sampe ${genome_index_base} \
${library}.${run}.${ASSEMBLY_NAME}.${chunk}.1.sai \
${library}.${run}.${ASSEMBLY_NAME}.${chunk}.2.sai ${fastq1} ${fastq2} "
}
} else if (mapping_method=="bowtie2") { /* bowtie2 branch applicable for long reads or short reads */
/* --reorder */
mapping_command = (
trim_options ?
"fastp ${trim_options} \
--json ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.json \
--html ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.fastp.html \
${trim_fastq} --stdout | \
bowtie2 -p ${mapping_threads} --reorder ${mapping_options} -x ${genome_index_base} \
- ${keep_unparsed_bams_command}" : \
\
"bowtie2 -p ${mapping_threads} --reorder ${mapping_options} -x ${genome_index_base} \
${fastq} ${keep_unparsed_bams_command}"
)
}

"""
TASK_TMP_DIR=\$(mktemp -d -p ${task.distillerTmpDir} distiller.tmp.XXXXXXXXXX)
touch ${library}.${run}.${ASSEMBLY_NAME}.${chunk}.bam

${mapping_command} \
| pairtools parse ${dropsam_flag} ${dropreadid_flag} ${dropseq_flag} \
| pairtools ${parse_method} ${single_end_flag} ${dropsam_flag} ${dropreadid_flag} ${dropseq_flag} \
${parsing_options} \
-c ${chrom_sizes} \
| pairtools sort --nproc ${sorting_threads} \
Expand Down Expand Up @@ -533,7 +637,7 @@ process merge_dedup_splitbam {

output:
set library, "${library}.${ASSEMBLY_NAME}.nodups.pairs.gz",
"${library}.${ASSEMBLY_NAME}.nodups.pairs.gz.px2",
/*"${library}.${ASSEMBLY_NAME}.nodups.pairs.gz.px2",*/
"${library}.${ASSEMBLY_NAME}.nodups.bam",
"${library}.${ASSEMBLY_NAME}.dups.pairs.gz",
"${library}.${ASSEMBLY_NAME}.dups.bam",
Expand Down Expand Up @@ -595,7 +699,7 @@ process merge_dedup_splitbam {
touch ${library}.${ASSEMBLY_NAME}.dups.bam

rm -rf \$TASK_TMP_DIR
pairix ${library}.${ASSEMBLY_NAME}.nodups.pairs.gz
#pairix ${library}.${ASSEMBLY_NAME}.nodups.pairs.gz
"""
}

Expand Down
16 changes: 7 additions & 9 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@ RUN apt-get update --fix-missing && \


# Set the locale
RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

# Install conda
RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda3 -b && \
rm Miniconda3-latest-Linux-x86_64.sh
rm Miniconda3-latest-Linux-x86_64.sh
ENV PATH=/miniconda3/bin:${PATH}

# Install conda dependencies
Expand All @@ -26,10 +26,8 @@ RUN conda config --set always_yes yes --set changeps1 no && \
conda config --add channels conda-forge && \
conda config --add channels defaults && \
conda config --add channels bioconda && \
conda config --add channels golobor && \
conda config --get && \
conda update -q conda && \
conda info -a && \
conda env update -q -n root --file environment.yml && \
conda clean --tarballs --index-cache --lock

conda clean --tarballs --index-cache
Loading