Make it so reads are only collapsed for historical samples #25

Merged 4 commits on Feb 2, 2024
4 changes: 2 additions & 2 deletions .test/config/config.yaml
@@ -35,8 +35,8 @@ populations: []
analyses:
# mapping options
mapping:
historical_only_collapsed: true
historical_aligner: "aln"
historical_only_collapsed: false
historical_collapsed_aligner: "aln"
# filtering
pileup-mappability: true
repeatmasker:
4 changes: 3 additions & 1 deletion README.md
@@ -131,7 +131,9 @@ two datasets in different working directories.
Some QC will not be available for users starting at BAM files. No read
processing QC can be produced, so this should be done beforehand. While mapping
percentages are calculated, these may not entirely represent the truth, as they
may not account for anything already fully removed from the bam.
may not account for anything already fully removed from the bam. In this case,
they also can't be separated into collapsed and uncollapsed categories, and
are instead reported only as the total mapping percentage.
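
For illustration, a sample that enters at the BAM stage would appear in the
endogenous content table like this (sample name and value are hypothetical;
`NA` marks the categories that can't be recovered from a finished BAM):

```
sample	perc.collapsed.map	perc.uncollapsed.map	perc.total.map
hist1	NA	NA	87.42
```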

### Running on a cluster

5 changes: 5 additions & 0 deletions config/README.md
@@ -180,6 +180,11 @@ settings for each analysis are set in the next section.
help avoid mapping contaminants, as longer fragments are likely from more
recent, non-endogenous DNA. However, in the event you want to map both,
you can set this to `false`. (`true`/`false`)
- `historical_collapsed_aligner:` Aligner used to map collapsed historical
sample reads. `aln` is recommended, but `mem` can be selected instead.
Uncollapsed historical reads will be mapped with `mem` if
`historical_only_collapsed` is set to `false`, regardless of what is set
here. (`aln`/`mem`) A consolidated sketch of this block follows this
file's diff.
- `genmap:` Filter out sites with low mappability estimated by Genmap
(`true`/`false`)
- `repeatmasker:` (NOTE: Only one of the three options should be filled/true)
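Putting the two options together, a minimal sketch of the resulting
`mapping:` block (values mirror the shipped `config/config.yaml`; comments
are explanatory only):

```yaml
analyses:
  mapping:
    # map only collapsed read pairs for historical samples
    historical_only_collapsed: true
    # aligner for collapsed historical reads ("aln" or "mem"); uncollapsed
    # historical reads always go through mem when the flag above is false
    historical_collapsed_aligner: "aln"
```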
3 changes: 1 addition & 2 deletions config/config.yaml
@@ -38,7 +38,7 @@ analyses:
# mapping options
mapping:
historical_only_collapsed: true
historical_aligner: "aln"
historical_collapsed_aligner: "aln"
# filtering
pileup-mappability: true
repeatmasker:
@@ -96,7 +96,6 @@ params:
filt-on-depth-classes: true
fastp:
extra: "-p -g" # don't put --merge or --overlap_len_require here, they're implicit
min_overlap_mod: 30
min_overlap_hist: 15
bwa_aln:
extra: "-l 16500 -n 0.01 -o 2"
49 changes: 42 additions & 7 deletions workflow/rules/1.0_preprocessing.smk
@@ -22,11 +22,48 @@ rule fastp_mergedout:
output:
trimmed=temp(
expand(
"results/preprocessing/fastp/{{sample}}_{{unit}}_{{lib}}.{read}.fastq.gz",
"results/preprocessing/fastp/{{sample}}_{{unit}}_{{lib}}.{read}.uncollapsed.fastq.gz",
read=["R1", "R2"],
)
),
merged=temp("results/preprocessing/fastp/{sample}_{unit}_{lib}.merged.fastq.gz"),
html=report(
"results/preprocessing/qc/fastp/{sample}_{unit}_{lib}_merged.html",
category="Quality Control",
subcategory="Trimming Reports",
labels={
"Sample": "{sample}",
"Unit": "{unit}",
"Lib": "{lib}",
"Type": "fastp Report",
},
),
json="results/preprocessing/qc/fastp/{sample}_{unit}_{lib}_merged.json",
log:
"logs/preprocessing/fastp/{sample}_{unit}_{lib}.merged.log",
benchmark:
"benchmarks/preprocessing/fastp/{sample}_{unit}_{lib}.merged.log"
params:
extra=lambda w: config["params"]["fastp"]["extra"]
+ f" --merge --overlap_len_require {config['params']['fastp']['min_overlap_hist']}",
threads: lambda wildcards, attempt: attempt * 2
resources:
runtime=lambda wildcards, attempt: attempt * 480,
wrapper:
"v2.5.0/bio/fastp"


rule fastp_pairedout:
"""Process reads with fastp, don't collapse overlapping read pairs"""
input:
unpack(get_raw_fastq),
output:
trimmed=temp(
expand(
"results/preprocessing/fastp/{{sample}}_{{unit}}_{{lib}}.{read}.paired.fastq.gz",
read=["R1", "R2"],
)
),
html=report(
"results/preprocessing/qc/fastp/{sample}_{unit}_{lib}_paired.html",
category="Quality Control",
@@ -40,16 +77,14 @@
),
json="results/preprocessing/qc/fastp/{sample}_{unit}_{lib}_paired.json",
log:
"logs/preprocessing/fastp/{sample}_{unit}_{lib}.log",
"logs/preprocessing/fastp/{sample}_{unit}_{lib}.paired.log",
benchmark:
"benchmarks/preprocessing/fastp/{sample}_{unit}_{lib}.log"
"benchmarks/preprocessing/fastp/{sample}_{unit}_{lib}.paired.log"
params:
extra=lambda w: config["params"]["fastp"]["extra"]
+ f" --merge --overlap_len_require {get_min_overlap(w)}",
threads: 2
extra=lambda w: config["params"]["fastp"]["extra"],
threads: lambda wildcards, attempt: attempt * 2
resources:
runtime=lambda wildcards, attempt: attempt * 480,
mem_mb=lambda wildcards, input, attempt: int(attempt * (input.size_mb / 2)),
wrapper:
"v2.5.0/bio/fastp"

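Roughly, the two rules now ask fastp for the following (a sketch only; the
actual command is assembled by the `v2.5.0/bio/fastp` wrapper, the file names
here are illustrative, and `15` is the `min_overlap_hist` value from
`config.yaml`):

```bash
# fastp_mergedout (historical samples): collapse overlapping pairs; pairs
# that can't be merged are kept as uncollapsed R1/R2
fastp -p -g --merge --overlap_len_require 15 \
    -i S_U_L_R1.fastq.gz -I S_U_L_R2.fastq.gz \
    --merged_out S_U_L.merged.fastq.gz \
    -o S_U_L.R1.uncollapsed.fastq.gz -O S_U_L.R2.uncollapsed.fastq.gz

# fastp_pairedout (modern samples): trim only, never merge
fastp -p -g \
    -i S_U_L_R1.fastq.gz -I S_U_L_R2.fastq.gz \
    -o S_U_L.R1.paired.fastq.gz -O S_U_L.R2.paired.fastq.gz
```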
35 changes: 18 additions & 17 deletions workflow/rules/2.0_mapping.smk
@@ -47,18 +47,23 @@ rule bwa_samse_merged:
"v2.6.0/bio/bwa/samse"


rule bwa_mem_merged:
"""Map collapsed read pairs for historical samples to reference genome"""
rule bwa_mem_paired:
"""Map trimmed paired reads from modern samples to reference genome"""
input:
reads="results/preprocessing/fastp/{sample}_{unit}_{lib}.merged.fastq.gz",
reads=expand(
"results/preprocessing/fastp/{{sample}}_{{unit}}_{{lib}}.{read}.{{pairing}}.fastq.gz",
read=["R1", "R2"],
),
ref="results/ref/{ref}/{ref}.fa",
idx=rules.bwa_index.output,
output:
temp("results/mapping/mapped/{sample}_{unit}_{lib}.{ref}.mem.merged.bam"),
bam=temp("results/mapping/mapped/{sample}_{unit}_{lib}.{ref}.mem.{pairing}.bam"),
log:
"logs/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.merged.log",
"logs/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.{pairing}.log",
benchmark:
"benchmarks/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.merged.log"
"benchmarks/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.{pairing}.log"
wildcard_constraints:
pairing="paired|uncollapsed",
params:
extra=lambda w: f"-R {get_read_group(w)}",
sorting="samtools",
@@ -69,21 +74,18 @@
"v2.6.0/bio/bwa/mem"


rule bwa_mem_paired:
"""Map trimmed paired reads from modern samples to reference genome"""
rule bwa_mem_merged:
"""Map collapsed reads from historical samples to reference genome"""
input:
reads=expand(
"results/preprocessing/fastp/{{sample}}_{{unit}}_{{lib}}.{read}.fastq.gz",
read=["R1", "R2"],
),
reads="results/preprocessing/fastp/{sample}_{unit}_{lib}.merged.fastq.gz",
ref="results/ref/{ref}/{ref}.fa",
idx=rules.bwa_index.output,
output:
bam=temp("results/mapping/mapped/{sample}_{unit}_{lib}.{ref}.mem.paired.bam"),
bam=temp("results/mapping/mapped/{sample}_{unit}_{lib}.{ref}.mem.merged.bam"),
log:
"logs/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.paired.log",
"logs/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.merged.log",
benchmark:
"benchmarks/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.paired.log"
"benchmarks/mapping/bwa_mem/{sample}_{unit}_{lib}.{ref}.merged.log"
params:
extra=lambda w: f"-R {get_read_group(w)}",
sorting="samtools",
@@ -127,10 +129,9 @@ rule mark_duplicates:
extra=config["params"]["picard"]["MarkDuplicates"],
shadow:
"minimal"
threads: 1
threads: lambda wildcards, attempt: attempt * 4
resources:
runtime=lambda wildcards, attempt: attempt * 1440,
mem_mb=lambda wildcards, attempt, input: int(attempt * (input.size_mb * 3)),
wrapper:
"v1.17.2/bio/picard/markduplicates"

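For orientation, the mapping paths after this change reduce to the following
bwa calls (a sketch with simplified read groups and file names; the wrappers
handle read groups and samtools sorting, and the `bwa aln` flags are the
`bwa_aln` defaults from `config.yaml`):

```bash
# collapsed historical reads with historical_collapsed_aligner: "aln"
bwa aln -l 16500 -n 0.01 -o 2 ref.fa S.merged.fastq.gz > S.merged.sai
bwa samse -r "@RG\tID:S\tSM:S" ref.fa S.merged.sai S.merged.fastq.gz |
    samtools sort -o S.aln.merged.bam

# collapsed historical reads with "mem"
bwa mem -R "@RG\tID:S\tSM:S" ref.fa S.merged.fastq.gz |
    samtools sort -o S.mem.merged.bam

# paired (modern) and uncollapsed (historical) reads share one rule via
# the {pairing} wildcard
bwa mem -R "@RG\tID:S\tSM:S" ref.fa S.R1.paired.fastq.gz S.R2.paired.fastq.gz |
    samtools sort -o S.mem.paired.bam
```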
2 changes: 1 addition & 1 deletion workflow/rules/2.1_sample_qc.smk
@@ -117,7 +117,7 @@ rule compile_endo_cont:
runtime=lambda wildcards, attempt: attempt * 15,
shell:
"""
(printf "sample\tperc.merged.map\tperc.paired.map\tperc.total.map\n" > {output}
(printf "sample\tperc.collapsed.map\tperc.uncollapsed.map\tperc.total.map\n" > {output}
cat {input} >> {output}) 2> {log}
"""

34 changes: 15 additions & 19 deletions workflow/rules/common.smk
@@ -352,7 +352,12 @@ def get_sample_bams(wildcards):
reads = units.loc[units["sample"] == wildcards.sample]
combos = reads[["sample", "unit", "lib"]].agg("_".join, axis=1)
if wildcards.sample in samples.index[samples.time == "historical"]:
aligner = config["analyses"]["mapping"]["historical_aligner"]
if wildcards.pairing == "paired":
return expand(
"results/mapping/mapped/{combo}.{{ref}}.mem.uncollapsed.bam",
combo=combos,
)
aligner = config["analyses"]["mapping"]["historical_collapsed_aligner"]
else:
aligner = "mem"
return expand(
@@ -364,20 +369,15 @@

## Select which duplicate removal process the bam goes through
def get_dedup_bams(wildcards):
if config["analyses"]["mapping"]["historical_only_collapsed"]:
s = wildcards.sample
if s in samples.index[samples.time == "historical"]:
return ["results/mapping/dedup/{sample}.{ref}.merged.rmdup.bam"]
elif s in samples.index[samples.time == "modern"]:
return [
"results/mapping/dedup/{sample}.{ref}.paired.rmdup.bam",
"results/mapping/dedup/{sample}.{ref}.merged.rmdup.bam",
]
else:
s = wildcards.sample
if s in samples.index[samples.time == "historical"]:
if config["analyses"]["mapping"]["historical_only_collapsed"]:
return "results/mapping/dedup/{sample}.{ref}.merged.rmdup.bam"
return [
"results/mapping/dedup/{sample}.{ref}.paired.rmdup.bam",
"results/mapping/dedup/{sample}.{ref}.merged.rmdup.bam",
]
return "results/mapping/dedup/{sample}.{ref}.paired.rmdup.bam"


## Determine what bam to use in analyses. This decides whether to use user provided
@@ -425,18 +425,14 @@ def get_endo_cont_stat(wildcards):
"paired": "results/mapping/user-provided-bams/{sample}.{ref}.user-processed.flagstat",
"merged": "results/mapping/user-provided-bams/{sample}.{ref}.user-processed.flagstat",
}
if (config["analyses"]["mapping"]["historical_only_collapsed"]) and (
s in samples.index[samples.time == "historical"]
):
return {
"paired": "results/mapping/mapped/{sample}.{ref}.merged.flagstat",
"merged": "results/mapping/mapped/{sample}.{ref}.merged.flagstat",
}
else:
if s in samples.index[samples.time == "historical"]:
if config["analyses"]["mapping"]["historical_only_collapsed"]:
return {"merged": "results/mapping/mapped/{sample}.{ref}.merged.flagstat"}
return {
"paired": "results/mapping/mapped/{sample}.{ref}.paired.flagstat",
"merged": "results/mapping/mapped/{sample}.{ref}.merged.flagstat",
}
return {"paired": "results/mapping/mapped/{sample}.{ref}.paired.flagstat"}


# ANGSD
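To make the branching above easy to eyeball, here is a self-contained sketch
of the same decision table (illustrative only; the real `get_dedup_bams`
resolves `wildcards` against the samples table):

```python
def dedup_bams_for(time, historical_only_collapsed):
    """Which deduplicated bam(s) feed downstream steps (sketch of get_dedup_bams)."""
    if time == "historical":
        if historical_only_collapsed:
            # only collapsed (merged) reads were mapped
            return "results/mapping/dedup/{sample}.{ref}.merged.rmdup.bam"
        # both uncollapsed and collapsed reads were mapped
        return [
            "results/mapping/dedup/{sample}.{ref}.paired.rmdup.bam",
            "results/mapping/dedup/{sample}.{ref}.merged.rmdup.bam",
        ]
    # modern samples are never collapsed
    return "results/mapping/dedup/{sample}.{ref}.paired.rmdup.bam"


assert dedup_bams_for("historical", True).endswith("merged.rmdup.bam")
assert len(dedup_bams_for("historical", False)) == 2
```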
39 changes: 25 additions & 14 deletions workflow/scripts/calc_endocont.sh
@@ -3,38 +3,49 @@
(merged="${snakemake_input[merged]}"
paired="${snakemake_input[paired]}"

tot_merged=$(grep -E "^[0-9]+ \+ [0-9]+ in total" $merged | awk '{print $1}')
map_merged=$(grep -E "^[0-9]+ \+ [0-9]+ mapped" $merged | awk '{print $1}')
if [ $tot_merged == 0 ]; then
perc_merged="No merged reads..."
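# the merged flagstat input is unset for samples with no collapsed reads
# (e.g. modern samples), hence the NA fallback below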
if [[ -z "$merged" ]]; then
tot_merged=0
map_merged=0
perc_merged="NA"
else
perc_merged=$(echo $tot_merged $map_merged | awk '{printf "%.2f", $2/$1*100}')
tot_merged=$(grep -E "^[0-9]+ \+ [0-9]+ in total" $merged | awk '{print $1}')
map_merged=$(grep -E "^[0-9]+ \+ [0-9]+ mapped" $merged | awk '{print $1}')
if [ $tot_merged == 0 ]; then
perc_merged="No reads collapsed..."
else
perc_merged=$(echo $tot_merged $map_merged | awk '{printf "%.2f", $2/$1*100}')
fi
fi

if [ "$merged" == "$paired" ]; then
tot_paired="NA"
map_paired="NA"
if [[ -z "$paired" ]]; then
tot_paired=0
map_paired=0
perc_paired="NA"
tot_tot="$tot_merged"
map_tot="$map_merged"
else
tot_paired=$(grep -E "^[0-9]+ \+ [0-9]+ in total" $paired | awk '{print $1}')
map_paired=$(grep -E "^[0-9]+ \+ [0-9]+ mapped" $paired | awk '{print $1}')
if [ $tot_paired == 0 ]; then
perc_paired="No paired reads..."
perc_paired="No uncollapsed reads..."
else
perc_paired=$(echo $tot_paired $map_paired | awk '{printf "%.2f", $2/$1*100}')
fi
tot_tot=$(echo $tot_merged"+"$tot_paired | bc)
map_tot=$(echo $map_merged"+"$map_paired | bc)
fi

tot_tot=$(echo $tot_merged"+"$tot_paired | bc)
map_tot=$(echo $map_merged"+"$map_paired | bc)
if [ $tot_tot == 0 ]; then
perc_tot="No reads..."
perc_tot="No reads, mapped or unmapped..."
else
perc_tot=$(echo $tot_tot $map_tot | awk '{printf "%.2f", $2/$1*100}')
fi

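# user-provided bams supply the same flagstat file for both inputs, so only
# a total mapping percentage is meaningful in that case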
if [[ -n "$paired" && -n "$merged" ]]; then
if [ "$merged" == "$paired" ]; then
perc_merged="NA"
perc_tot=$perc_paired
perc_paired="NA"
fi
fi

echo -e "${snakemake_wildcards[sample]}\t${perc_merged}\t${perc_paired}\t${perc_tot}" \
> "${snakemake_output[endo]}") 2> "${snakemake_log[0]}"
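
As a quick sanity check of the awk percentage arithmetic used throughout this
script (numbers made up):

```bash
echo 1000 874 | awk '{printf "%.2f", $2/$1*100}'   # prints 87.40
```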