diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 180b573..3a9db55 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -24,10 +24,6 @@ exclude_ind: [] excl_pca-admix: [] -#==================== Downsampling Configuration ======================# - -downsample_cov: - #====================== Analysis Selection ============================# populations: [] @@ -68,6 +64,29 @@ analyses: inbreeding_ngsf-hmm: true ibs_matrix: true +#==================== Downsampling Configuration ======================# + +subsample_dp: 2 + +subsample_redo_filts: true + +subsample_analyses: + estimate_ld: true + ld_decay: true + pca_pcangsd: true + admix_ngsadmix: true + relatedness: + ngsrelate: true + ibsrelate_ibs: true + ibsrelate_sfs: true + thetas_angsd: true + heterozygosity_angsd: true + fst_angsd: + populations: true + individuals: true + inbreeding_ngsf-hmm: true + ibs_matrix: true + #=========================== Filter Sets ==============================# filter_beds: @@ -109,6 +128,7 @@ params: extra_beagle: "" snp_pval: "1e-6" min_maf: 0.05 + mindepthind_heterozygosity: 3 ngsld: max_kb_dist_est-ld: 200 max_kb_dist_decay: 100 diff --git a/README.md b/README.md index e58268c..20d7677 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,12 @@ Additionally, several data filtering options are available: - Removal of regions with low mappability for fragments of a specified size - Removal of regions with extreme high or low depth - Removal of regions with a certain amount of missing data +- Multiple filter sets from user provided BED files that can be intersected + with other enabled filters (for instance, performing analyses on neutral + sites and genic regions separately) + +All the above analyses can also be performed with sample depth subsampled to +a uniform level to account for differences in depth between samples. ## Getting Started diff --git a/config/README.md b/config/README.md index 68edfe3..225c295 100644 --- a/config/README.md +++ b/config/README.md @@ -282,6 +282,30 @@ settings for each analysis are set in the next section. - `ibs_matrix:` Estimate pairwise identity by state distance between all samples using ANGSD. (`true`/`false`) +#### Downsampling Section + +As this workflow is aimed at low coverage samples, its likely there might be +considerable variance in sample depth. For this reason, it may be good to +subsample all your samples to a similar depth to examine if variation in depth +is influencing results. To do this, set an integer value here to subsample all +your samples down to and run specific analyses. + +- `subsample_dp:` A mean depth to subsample your reads to. This will be done + per sample, and subsample from all the reads. If a sample already has the + same, or lower, depth than this number, it will just be used as is in the + analysis. (INT) + +- `subsample_redo_filts:` Make a separate filtered sites file using the + subsampled bams to calculate depth based filters. If left disabled, the + depth filters will be determined from the full coverage files. + (`true`/`false`) + +- `subsample_analyses:` Individually enable analyses to be performed with the + subsampled data. These are the same as the ones above in the analyses + section. Enabling here will only run the analysis for the subsampled data, + if you want to run it for the full data as well, you need to enable it in the + analyses section as well. (`true`/`false`) + #### Filter Sets By default, this workflow will perform all analyses requested in the above diff --git a/config/config.yaml b/config/config.yaml index b54b27f..5a2dd6b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -24,12 +24,6 @@ exclude_ind: [] excl_pca-admix: [] -#==================== Downsampling Configuration ======================# - -# untested, not recommended for now - -downsample_cov: - #====================== Analysis Selection ============================# populations: [] @@ -70,6 +64,29 @@ analyses: inbreeding_ngsf-hmm: false ibs_matrix: false +#==================== Downsampling Configuration ======================# + +subsample_dp: + +subsample_redo_filts: + +subsample_analyses: + estimate_ld: false + ld_decay: false + pca_pcangsd: false + admix_ngsadmix: false + relatedness: + ngsrelate: false + ibsrelate_ibs: false + ibsrelate_sfs: false + thetas_angsd: false + heterozygosity_angsd: false + fst_angsd: + populations: false + individuals: false + inbreeding_ngsf-hmm: false + ibs_matrix: false + #=========================== Filter Sets ==============================# filter_beds: @@ -109,6 +126,7 @@ params: extra_beagle: "" snp_pval: "1e-6" min_maf: 0.05 + mindepthind_heterozygosity: 3 ngsld: max_kb_dist_est-ld: 4000 max_kb_dist_decay: 100 diff --git a/workflow/Snakefile b/workflow/Snakefile index fb3dacb..c717d1b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -29,8 +29,9 @@ include: "rules/9.0_IBS.smk" # Set wildcard restraints -if config["downsample_cov"]: - dp = config["downsample_cov"] +if config["subsample_dp"]: + dp = config["subsample_dp"] + subdp = f".dp{dp}" subsample = [f".dp{dp}", ""] else: subsample = [""] @@ -55,10 +56,28 @@ wildcard_constraints: # Accumulate desired output files from config file all_outputs = [ - "results/datasets/{dataset}/qc/{dataset}.{ref}_all{dp}.sampleqc.html", - "results/datasets/{dataset}/filters/combined/{dataset}.{ref}{dp}_{sites}-filts.html", + "results/datasets/{dataset}/qc/{dataset}.{ref}_all.sampleqc.html", + "results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.html", ] +if config["subsample_dp"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/qc/{{dataset}}.{{ref}}_all{dp}.sampleqc.html", + dp=subdp, + ) + ) + + +if config["subsample_redo_filts"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/filters/combined/{{dataset}}.{{ref}}{dp}_{{sites}}-filts.html", + dp=subdp, + ) + ) + + if config["analyses"]["qualimap"]: if len(pipebams) > 0: all_outputs.extend( @@ -87,59 +106,63 @@ if config["analyses"]["damageprofiler"]: ) ) + +# Non-subsampled analyses + if config["analyses"]["estimate_ld"]: all_outputs.extend( expand( - "results/datasets/{{dataset}}/analyses/ngsLD/{{dataset}}.{{ref}}_{population}{{dp}}_{{sites}}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz", + "results/datasets/{{dataset}}/analyses/ngsLD/{{dataset}}.{{ref}}_{population}_{{sites}}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz", population=pop_list + ["all"], maxkb=config["params"]["ngsld"]["max_kb_dist_est-ld"], rndsmp=config["params"]["ngsld"]["rnd_sample_est-ld"], ) ) + if config["analyses"]["ld_decay"]: all_outputs.extend( expand( - "results/datasets/{{dataset}}/plots/LD_decay/{{dataset}}.{{ref}}_{population}{{dp}}_{{sites}}-filts.LDdecay.svg", + "results/datasets/{{dataset}}/plots/LD_decay/{{dataset}}.{{ref}}_{population}_{{sites}}-filts.LDdecay.svg", population=pop_list + ["all"], ) ) if config["analyses"]["relatedness"]["ngsrelate"]: all_outputs.append( - "results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts_relate.html" + "results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_all_{sites}-filts_relate.html" ) if config["analyses"]["relatedness"]["ibsrelate_sfs"]: all_outputs.append( - "results/datasets/{dataset}/analyses/kinship/ibsrelate_sfs/{dataset}.{ref}_all{dp}_{sites}-filts.kinship.html" + "results/datasets/{dataset}/analyses/kinship/ibsrelate_sfs/{dataset}.{ref}_all_{sites}-filts.kinship.html" ) if config["analyses"]["relatedness"]["ibsrelate_ibs"]: all_outputs.append( - "results/datasets/{dataset}/analyses/kinship/ibsrelate_ibs/{dataset}.{ref}_all{dp}_{sites}-filts.kinship.html" + "results/datasets/{dataset}/analyses/kinship/ibsrelate_ibs/{dataset}.{ref}_all_{sites}-filts.kinship.html" ) if config["analyses"]["pca_pcangsd"]: if config["excl_pca-admix"]: all_outputs.extend( [ - "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts_pc1-2.svg", - "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts_pc3-4.svg", + "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all_excl_pca-admix_{sites}-filts_pc1-2.svg", + "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all_excl_pca-admix_{sites}-filts_pc3-4.svg", ] ) else: all_outputs.extend( [ - "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all{dp}_{sites}-filts_pc1-2.svg", - "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all{dp}_{sites}-filts_pc3-4.svg", + "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all_{sites}-filts_pc1-2.svg", + "results/datasets/{dataset}/plots/pca/{dataset}.{ref}_all_{sites}-filts_pc3-4.svg", ] ) if config["analyses"]["thetas_angsd"]: all_outputs.extend( expand( - "results/datasets/{{dataset}}/analyses/thetas/{{dataset}}.{{ref}}_all{{dp}}_{{sites}}-filts.window_{win}_{step}.{stat}.mean.html", + "results/datasets/{{dataset}}/analyses/thetas/{{dataset}}.{{ref}}_all_{{sites}}-filts.window_{win}_{step}.{stat}.mean.html", stat=["watterson", "pi", "tajima"], win=config["params"]["thetas"]["win_size"], step=config["params"]["thetas"]["win_step"], @@ -148,11 +171,11 @@ if config["analyses"]["thetas_angsd"]: if config["analyses"]["fst_angsd"]["populations"]: all_outputs.append( - "results/datasets/{dataset}/plots/fst/{dataset}.{ref}_poppairs{dp}_{sites}-filts.fst.global.pdf", + "results/datasets/{dataset}/plots/fst/{dataset}.{ref}_poppairs_{sites}-filts.fst.global.pdf", ) all_outputs.extend( expand( - "results/datasets/{{dataset}}/analyses/fst/{{dataset}}.{{ref}}_poppairs{{dp}}_{{sites}}-filts.fst.window_{win}_{step}.tsv", + "results/datasets/{{dataset}}/analyses/fst/{{dataset}}.{{ref}}_poppairs_{{sites}}-filts.fst.window_{win}_{step}.tsv", win=config["params"]["fst"]["win_size"], step=config["params"]["fst"]["win_step"], ) @@ -160,11 +183,11 @@ if config["analyses"]["fst_angsd"]["populations"]: if config["analyses"]["fst_angsd"]["individuals"]: all_outputs.append( - "results/datasets/{dataset}/plots/fst/{dataset}.{ref}_indpairs{dp}_{sites}-filts.fst.global.pdf", + "results/datasets/{dataset}/plots/fst/{dataset}.{ref}_indpairs_{sites}-filts.fst.global.pdf", ) all_outputs.extend( expand( - "results/datasets/{{dataset}}/analyses/fst/{{dataset}}.{{ref}}_indpairs{{dp}}_{{sites}}-filts.fst.window_{win}_{step}.tsv", + "results/datasets/{{dataset}}/analyses/fst/{{dataset}}.{{ref}}_indpairs_{{sites}}-filts.fst.window_{win}_{step}.tsv", win=config["params"]["fst"]["win_size"], step=config["params"]["fst"]["win_step"], ) @@ -173,14 +196,14 @@ if config["analyses"]["fst_angsd"]["individuals"]: if config["analyses"]["heterozygosity_angsd"]: all_outputs.extend( [ - "results/datasets/{dataset}/plots/heterozygosity/{dataset}.{ref}_all{dp}_{sites}-filts_heterozygosity.pdf", - "results/datasets/{dataset}/analyses/heterozygosity/{dataset}.{ref}_all{dp}_{sites}-filts_heterozygosity.html", + "results/datasets/{dataset}/plots/heterozygosity/{dataset}.{ref}_all_{sites}-filts_heterozygosity.pdf", + "results/datasets/{dataset}/analyses/heterozygosity/{dataset}.{ref}_all_{sites}-filts_heterozygosity.html", ] ) if config["analyses"]["inbreeding_ngsf-hmm"]: all_outputs.append( - "results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all{dp}_{sites}-filts.froh.pdf" + "results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all_{sites}-filts.froh.pdf" ) if config["analyses"]["admix_ngsadmix"]: @@ -188,8 +211,8 @@ if config["analyses"]["admix_ngsadmix"]: all_outputs.extend( expand( [ - "results/datasets/{{dataset}}/plots/evaladmix/{{dataset}}.{{ref}}_all_excl_pca-admix{{dp}}_{{sites}}-filts_K{kvalue}_evaladmix.html", - "results/datasets/{{dataset}}/plots/ngsadmix/{{dataset}}.{{ref}}_all_excl_pca-admix{{dp}}_{{sites}}-filts_K{kvalue}.svg", + "results/datasets/{{dataset}}/plots/evaladmix/{{dataset}}.{{ref}}_all_excl_pca-admix_{{sites}}-filts_K{kvalue}_evaladmix.html", + "results/datasets/{{dataset}}/plots/ngsadmix/{{dataset}}.{{ref}}_all_excl_pca-admix_{{sites}}-filts_K{kvalue}.svg", ], kvalue=config["params"]["ngsadmix"]["kvalues"], ) @@ -198,8 +221,8 @@ if config["analyses"]["admix_ngsadmix"]: all_outputs.extend( expand( [ - "results/datasets/{{dataset}}/plots/evaladmix/{{dataset}}.{{ref}}_all{{dp}}_{{sites}}-filts_K{kvalue}_evaladmix.html", - "results/datasets/{{dataset}}/plots/ngsadmix/{{dataset}}.{{ref}}_all{{dp}}_{{sites}}-filts_K{kvalue}.svg", + "results/datasets/{{dataset}}/plots/evaladmix/{{dataset}}.{{ref}}_all_{{sites}}-filts_K{kvalue}_evaladmix.html", + "results/datasets/{{dataset}}/plots/ngsadmix/{{dataset}}.{{ref}}_all_{{sites}}-filts_K{kvalue}.svg", ], kvalue=config["params"]["ngsadmix"]["kvalues"], ) @@ -207,7 +230,170 @@ if config["analyses"]["admix_ngsadmix"]: if config["analyses"]["ibs_matrix"]: all_outputs.append( - "results/datasets/{dataset}/analyses/IBS/{dataset}.{ref}_all{dp}_{sites}-filts.ibsMat" + "results/datasets/{dataset}/analyses/IBS/{dataset}.{ref}_all_{sites}-filts.ibsMat" + ) + +# Downsampled analyses + +if config["subsample_analyses"]["estimate_ld"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/ngsLD/{{dataset}}.{{ref}}_{population}{dp}_{{sites}}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz", + population=pop_list + ["all"], + dp=subdp, + maxkb=config["params"]["ngsld"]["max_kb_dist_est-ld"], + rndsmp=config["params"]["ngsld"]["rnd_sample_est-ld"], + ) + ) + + +if config["subsample_analyses"]["ld_decay"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/plots/LD_decay/{{dataset}}.{{ref}}_{population}{dp}_{{sites}}-filts.LDdecay.svg", + population=pop_list + ["all"], + dp=subdp, + ) + ) + +if config["subsample_analyses"]["relatedness"]["ngsrelate"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/kinship/ngsrelate/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts_relate.html", + dp=subdp, + ) + ) + +if config["subsample_analyses"]["relatedness"]["ibsrelate_sfs"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/kinship/ibsrelate_sfs/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.kinship.html", + dp=subdp, + ) + ) + +if config["subsample_analyses"]["relatedness"]["ibsrelate_ibs"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/kinship/ibsrelate_ibs/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.kinship.html", + dp=subdp, + ) + ) + +if config["subsample_analyses"]["pca_pcangsd"]: + if config["excl_pca-admix"]: + all_outputs.extend( + expand( + [ + "results/datasets/{{dataset}}/plots/pca/{{dataset}}.{{ref}}_all_excl_pca-admix{dp}_{{sites}}-filts_pc1-2.svg", + "results/datasets/{{dataset}}/plots/pca/{{dataset}}.{{ref}}_all_excl_pca-admix{dp}_{{sites}}-filts_pc3-4.svg", + ], + dp=subdp, + ) + ) + else: + all_outputs.extend( + expand( + [ + "results/datasets/{{dataset}}/plots/pca/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts_pc1-2.svg", + "results/datasets/{{dataset}}/plots/pca/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts_pc3-4.svg", + ], + dp=subdp, + ) + ) + +if config["subsample_analyses"]["thetas_angsd"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/thetas/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.window_{win}_{step}.{stat}.mean.html", + stat=["watterson", "pi", "tajima"], + win=config["params"]["thetas"]["win_size"], + step=config["params"]["thetas"]["win_step"], + dp=subdp, + ) + ) + +if config["subsample_analyses"]["fst_angsd"]["populations"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/plots/fst/{{dataset}}.{{ref}}_poppairs{dp}_{{sites}}-filts.fst.global.pdf", + dp=subdp, + ) + ) + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/fst/{{dataset}}.{{ref}}_poppairs{dp}_{{sites}}-filts.fst.window_{win}_{step}.tsv", + win=config["params"]["fst"]["win_size"], + step=config["params"]["fst"]["win_step"], + dp=subdp, + ) + ) + +if config["subsample_analyses"]["fst_angsd"]["individuals"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/plots/fst/{{dataset}}.{{ref}}_indpairs{dp}_{{sites}}-filts.fst.global.pdf", + dp=subdp, + ) + ) + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/fst/{{dataset}}.{{ref}}_indpairs{dp}_{{sites}}-filts.fst.window_{win}_{step}.tsv", + win=config["params"]["fst"]["win_size"], + step=config["params"]["fst"]["win_step"], + dp=subdp, + ) + ) + +if config["subsample_analyses"]["heterozygosity_angsd"]: + all_outputs.extend( + expand( + [ + "results/datasets/{{dataset}}/plots/heterozygosity/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts_heterozygosity.pdf", + "results/datasets/{{dataset}}/analyses/heterozygosity/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts_heterozygosity.html", + ], + dp=subdp, + ) + ) + +if config["subsample_analyses"]["inbreeding_ngsf-hmm"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/plots/inbreeding/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.froh.pdf", + dp=subdp, + ) + ) + +if config["subsample_analyses"]["admix_ngsadmix"]: + if config["excl_pca-admix"]: + all_outputs.extend( + expand( + [ + "results/datasets/{{dataset}}/plots/evaladmix/{{dataset}}.{{ref}}_all_excl_pca-admix{dp}_{{sites}}-filts_K{kvalue}_evaladmix.html", + "results/datasets/{{dataset}}/plots/ngsadmix/{{dataset}}.{{ref}}_all_excl_pca-admix{dp}_{{sites}}-filts_K{kvalue}.svg", + ], + kvalue=config["params"]["ngsadmix"]["kvalues"], + dp=subdp, + ) + ) + else: + all_outputs.extend( + expand( + [ + "results/datasets/{{dataset}}/plots/evaladmix/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts_K{kvalue}_evaladmix.html", + "results/datasets/{{dataset}}/plots/ngsadmix/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts_K{kvalue}.svg", + ], + kvalue=config["params"]["ngsadmix"]["kvalues"], + dp=subdp, + ) + ) + +if config["subsample_analyses"]["ibs_matrix"]: + all_outputs.extend( + expand( + "results/datasets/{{dataset}}/analyses/IBS/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.ibsMat", + dp=subdp, + ) ) @@ -219,6 +405,5 @@ rule all: sample=samples.index, ref=config["reference"]["name"], dataset=config["dataset"], - dp=subsample, sites=filters, ), diff --git a/workflow/rules/2.0_mapping.smk b/workflow/rules/2.0_mapping.smk index 9384f62..fc8e018 100644 --- a/workflow/rules/2.0_mapping.smk +++ b/workflow/rules/2.0_mapping.smk @@ -303,6 +303,7 @@ rule bam_clipoverlap_userbams: ruleorder: symlink_bams > samtools_index +ruleorder: samtools_subsample > samtools_index rule samtools_index: @@ -347,23 +348,24 @@ rule samtools_subsample: coverage """ input: - bam=get_bamlist_bams, - bai=get_bamlist_bais, - depth="results/datasets/{dataset}/qc/ind_depth/filtered/{dataset}_{population}.depth.sum", - bed="results/datasets/{dataset}/genotyping/filters/beds/{dataset}_filts.bed", + bam="results/datasets/{dataset}/bams/{sample}.{ref}.bam", + bai="results/datasets/{dataset}/bams/{sample}.{ref}.bam.bai", + depth="results/mapping/qc/ind_depth/unfiltered/{dataset}.{ref}_{sample}_allsites-unfilt.depth.sum", output: - bam="results/datasets/{dataset}/bams/{population}{dp}.bam", - bai="results/datasets/{dataset}/bams/{population}{dp}.bam.bai", + bam="results/datasets/{dataset}/bams/{sample}.{ref}{dp}.bam", + bai="results/datasets/{dataset}/bams/{sample}.{ref}{dp}.bam.bai", log: - "logs/mapping/samtools/subsample/{dataset}_{population}{dp}.log", + "logs/mapping/samtools/subsample/{dataset}_{sample}.{ref}{dp}.log", benchmark: - "benchmarks/mapping/samtools/subsample/{dataset}_{population}{dp}.log" + "benchmarks/mapping/samtools/subsample/{dataset}_{sample}.{ref}{dp}.log" + wildcard_constraints: + dp=f".dp{config['subsample_dp']}", conda: "../envs/samtools.yaml" shadow: "minimal" params: - dp=config["downsample_cov"], + dp=config["subsample_dp"], resources: runtime=lambda wildcards, attempt: attempt * 720, shell: @@ -374,8 +376,8 @@ rule samtools_subsample: if [ `awk 'BEGIN {{print ('$prop' <= 1.0)}}'` = 1 ]; then propdec=$(echo $prop | awk -F "." '{{print $2}}') - samtools view -h -s ${{RANDOM}}.${{propdec}} -q 30 -L {input.bed} -@ {threads} \ - -b {input.bam} > {output.bam} 2> {log} + samtools view -h -s 0.${{propdec}} -@ {threads} -b {input.bam} \ + > {output.bam} 2> {log} samtools index {output.bam} 2>> {log} else echo "WARNING: Depth of sample is lower than subsample depth." \ diff --git a/workflow/rules/2.1_sample_qc.smk b/workflow/rules/2.1_sample_qc.smk index 5de0e6d..0c81c9d 100644 --- a/workflow/rules/2.1_sample_qc.smk +++ b/workflow/rules/2.1_sample_qc.smk @@ -162,12 +162,11 @@ rule ind_filtered_depth: depth of positions that will go into likelihood calculations """ input: + unpack(filt_depth), bamlist="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", bams=get_bamlist_bams, bais=get_bamlist_bais, ref="results/ref/{ref}/{ref}.fa", - sites="results/datasets/{dataset}/filters/combined/{dataset}.{ref}{dp}_{sites}-filts.sites", - idx="results/datasets/{dataset}/filters/combined/{dataset}.{ref}{dp}_{sites}-filts.sites.idx", output: sample_hist="results/datasets/{dataset}/qc/ind_depth/filtered/{dataset}.{ref}_{population}{dp}_{sites}-filts.depthSample", global_hist=temp( diff --git a/workflow/rules/3.1_safs.smk b/workflow/rules/3.1_safs.smk index dcb07df..daa40fa 100644 --- a/workflow/rules/3.1_safs.smk +++ b/workflow/rules/3.1_safs.smk @@ -6,13 +6,12 @@ rule angsd_doSaf_pop: Generate a site allele frequency file for a given population and genome chunk. """ input: + unpack(filt_depth), bam="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", bams=get_bamlist_bams, bais=get_bamlist_bais, ref="results/ref/{ref}/{ref}.fa", regions="results/datasets/{dataset}/filters/chunks/{ref}_chunk{chunk}.rf", - sites="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites", - idx="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites.idx", output: saf=temp( "results/datasets/{dataset}/safs/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.saf.gz" @@ -105,16 +104,15 @@ rule realSFS_catsaf: rule angsd_doSaf_sample: """ - Generate a site allele frequency file for a given downsampled population and genome + Generate a site allele frequency file for a given subsampled population and genome chunk. """ input: + unpack(filt_depth), bam="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", bams=get_bamlist_bams, bais=get_bamlist_bais, ref="results/ref/{ref}/{ref}.fa", - sites="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites", - idx="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites.idx", output: saf="results/datasets/{dataset}/safs/{dataset}.{ref}_{population}{dp}_{sites}-filts.saf.gz", safidx="results/datasets/{dataset}/safs/{dataset}.{ref}_{population}{dp}_{sites}-filts.saf.idx", @@ -132,6 +130,7 @@ rule angsd_doSaf_sample: gl_model=config["params"]["angsd"]["gl_model"], extra=config["params"]["angsd"]["extra"], extra_saf=config["params"]["angsd"]["extra_saf"], + mindepthind=config["params"]["angsd"]["mindepthind_heterozygosity"], mapQ=config["mapQ"], baseQ=config["baseQ"], out=lambda w, output: os.path.splitext(output.arg)[0], @@ -142,5 +141,6 @@ rule angsd_doSaf_sample: (angsd -doSaf 1 -bam {input.bam} -GL {params.gl_model} -ref {input.ref} \ -nThreads {threads} {params.extra} -minMapQ {params.mapQ} \ -minQ {params.baseQ} -sites {input.sites} -anc {input.ref} \ - {params.extra_saf} -out {params.out}) &> {log} + -setMinDepthInd {params.mindepthind} {params.extra_saf} \ + -out {params.out}) &> {log} """ diff --git a/workflow/rules/3.2_beagles.smk b/workflow/rules/3.2_beagles.smk index f438c16..74c7fe9 100644 --- a/workflow/rules/3.2_beagles.smk +++ b/workflow/rules/3.2_beagles.smk @@ -9,13 +9,12 @@ rule angsd_doGlf2: population beagle files, even if a population is fixed for a certain allele. """ input: + unpack(filt_depth), bam="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", bams=get_bamlist_bams, bais=get_bamlist_bais, ref="results/ref/{ref}/{ref}.fa", regions="results/datasets/{dataset}/filters/chunks/{ref}_chunk{chunk}.rf", - sites="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites", - idx="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites.idx", output: beagle=temp( "results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.beagle.gz" diff --git a/workflow/rules/5.0_relatedness.smk b/workflow/rules/5.0_relatedness.smk index 9c6531b..710e5d6 100644 --- a/workflow/rules/5.0_relatedness.smk +++ b/workflow/rules/5.0_relatedness.smk @@ -51,13 +51,12 @@ rule compile_kinship_stats_sfs: rule doGlf1_ibsrelate: input: + unpack(filt_depth), bam="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", bams=get_bamlist_bams, bais=get_bamlist_bais, ref="results/ref/{ref}/{ref}.fa", regions="results/datasets/{dataset}/filters/chunks/{ref}_chunk{chunk}.rf", - sites="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites", - idx="results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites.idx", output: glf=temp( "results/datasets/{dataset}/glfs/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.glf.gz" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 6a8c55a..2502fde 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -454,6 +454,18 @@ def get_endo_cont_stat(wildcards): # return "results/datasets/{dataset}/glfs/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_allsites-filts.glf.gz" +def filt_depth(wildcards): + if config["subsample_redo_filts"]: + return { + "sites": "results/datasets/{dataset}/filters/combined/{dataset}.{ref}{dp}_{sites}-filts.sites", + "idx": "results/datasets/{dataset}/filters/combined/{dataset}.{ref}{dp}_{sites}-filts.sites.idx", + } + return { + "sites": "results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites", + "idx": "results/datasets/{dataset}/filters/combined/{dataset}.{ref}_{sites}-filts.sites.idx", + } + + ## Get random sampling proportion depending on if LD decay is being calculated ## or if LD pruning is being done def get_ngsld_sampling(wildcards):