diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 5c359dd..0688b17 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -149,14 +149,12 @@ params: extra_saf: "" # goes to all -doSaf runs extra_beagle: "" # goes to all -doGlf 2 runs domajorminor: 1 # Currently only compatible with values 1, 2, 4, 5 - domaf: 1 # Currently only compatible with values 1, 2, 3 - popmafmaj: "population" # "all" or "population" + domaf: 1 snp_pval: "1e-6" min_maf: -1 # Set -1 to disable mindepthind_heterozygosity: 1 ngsrelate: - ibsrelate-only-extra: "" - freqbased-extra: "" + prune: true ngsld: max_kb_dist_est-ld: 200 rnd_sample_est-ld: 0.001 diff --git a/config/config.yaml b/config/config.yaml index 5b6922f..bd25994 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -148,14 +148,12 @@ params: extra_saf: "" # goes to all -doSaf runs extra_beagle: "" # goes to all -doGlf 2 runs domajorminor: 1 # Currently only compatible with values 1, 2, 4, 5 - domaf: 1 # Currently only compatible with values 1, 2, 3 - popmafmaj: "population" # "all" or "population" + domaf: 1 snp_pval: "1e-6" min_maf: -1 # Set -1 to disable mindepthind_heterozygosity: 1 ngsrelate: - ibsrelate-only-extra: "" - freqbased-extra: "" + prune: false ngsld: max_kb_dist_est-ld: 4000 rnd_sample_est-ld: 0.001 diff --git a/workflow/Snakefile b/workflow/Snakefile index 6594381..e191e3a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -59,7 +59,6 @@ wildcard_constraints: dp=".{0}|.dp[1-9][0-9]*", chunk="[0-9]+", sites="|".join(filters), - maj="all|pop|anc|ref", # Accumulate desired output files from config file diff --git a/workflow/rules/3.2_beagles.smk b/workflow/rules/3.2_beagles.smk index e38aa14..979f9f9 100644 --- a/workflow/rules/3.2_beagles.smk +++ b/workflow/rules/3.2_beagles.smk @@ -9,7 +9,7 @@ rule angsd_doGlf2: population beagle files, even if a population is fixed for a certain allele. """ input: - unpack(get_sitesfile), + unpack(filt_depth), unpack(get_anc_ref), bam="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", bams=get_bamlist_bams, @@ -18,13 +18,13 @@ rule angsd_doGlf2: reffai="results/ref/{ref}/{ref}.fa.fai", regions="results/datasets/{dataset}/filters/chunks/{ref}_chunk{chunk}.rf", output: - beagle="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.beagle.gz", - maf="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.mafs.gz", - arg="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.arg", + beagle="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.beagle.gz", + maf="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.mafs.gz", + arg="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.arg", log: - "logs/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.log", + "logs/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.log", benchmark: - "benchmarks/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.log" + "benchmarks/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.log" container: angsd_container params: @@ -33,9 +33,10 @@ rule angsd_doGlf2: extra_beagle=config["params"]["angsd"]["extra_beagle"], mapQ=config["mapQ"], baseQ=config["baseQ"], + snp_pval=config["params"]["angsd"]["snp_pval"], maf=config["params"]["angsd"]["domaf"], - snp_pval_maf=get_snppval_maf, - majmin=get_majmin, + minmaf=config["params"]["angsd"]["min_maf"], + majmin=config["params"]["angsd"]["domajorminor"], counts=get_docounts, trans=get_trans, nind=get_nind, @@ -48,12 +49,12 @@ rule angsd_doGlf2: shell: """ angsd -doGlf 2 -bam {input.bam} -GL {params.gl_model} -ref {input.ref} \ - {params.majmin} -doMaf {params.maf} {params.snp_pval_maf} \ - -nThreads {threads} {params.extra} -minMapQ {params.mapQ} \ - -minQ {params.baseQ} -sites {input.sites} -anc {input.anc} \ - {params.extra_beagle} -rf {input.regions} {params.minind} \ - -setMinDepthInd {params.mininddp} {params.counts} \ - -rmTrans {params.trans} -out {params.out} &> {log} + -doMajorMinor {params.majmin} -doMaf {params.maf} -minMaf {params.minmaf} \ + -SNP_pval {params.snp_pval} -nThreads {threads} {params.extra} \ + -minMapQ {params.mapQ} -minQ {params.baseQ} -sites {input.sites} \ + -anc {input.anc} {params.extra_beagle} -rf {input.regions} {params.minind} \ + -setMinDepthInd {params.mininddp} {params.counts} -rmTrans {params.trans} \ + -out {params.out} &> {log} """ @@ -63,15 +64,15 @@ rule merge_beagle: """ input: lambda w: expand( - "results/datasets/{{dataset}}/beagles/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.{{maj}}maj.beagle.gz", + "results/datasets/{{dataset}}/beagles/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.beagle.gz", chunk=chunklist, ), output: - beagle="results/datasets/{dataset}/beagles/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.beagle.gz", + beagle="results/datasets/{dataset}/beagles/{dataset}.{ref}_{population}{dp}_{sites}-filts.beagle.gz", log: - "logs/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_merge-beagle.log", + "logs/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge-beagle.log", benchmark: - "benchmarks/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_merge-beagle.log" + "benchmarks/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge-beagle.log" container: shell_container resources: @@ -93,15 +94,15 @@ rule merge_maf: """ input: lambda w: expand( - "results/datasets/{{dataset}}/beagles/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.{{maj}}maj.mafs.gz", + "results/datasets/{{dataset}}/beagles/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.mafs.gz", chunk=chunklist, ), output: - maf="results/datasets/{dataset}/beagles/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.mafs.gz", + maf="results/datasets/{dataset}/mafs/{dataset}.{ref}_{population}{dp}_{sites}-filts.mafs.gz", log: - "logs/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_merge-mafs.log", + "logs/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge-mafs.log", benchmark: - "benchmarks/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_merge-mafs.log" + "benchmarks/{dataset}/angsd/doGlf2/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge-mafs.log" container: shell_container resources: @@ -123,13 +124,13 @@ rule snpset: dataset. """ input: - "results/datasets/{dataset}/beagles/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.mafs.gz", + "results/datasets/{dataset}/mafs/{dataset}.{ref}_{population}{dp}_{sites}-filts.mafs.gz", output: - "results/datasets/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_snps.sites", + "results/datasets/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts_snps.sites", log: - "logs/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_snps.log", + "logs/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts_snps.log", benchmark: - "benchmarks/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_snps.log" + "benchmarks/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts_snps.log" container: shell_container shell: diff --git a/workflow/rules/4.0_estimate_LD.smk b/workflow/rules/4.0_estimate_LD.smk index d950c04..5d9f610 100644 --- a/workflow/rules/4.0_estimate_LD.smk +++ b/workflow/rules/4.0_estimate_LD.smk @@ -8,19 +8,19 @@ rule ngsLD_estLD: Estimates pairwise linkage disequilibrium between SNPs. """ input: - beagle="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.beagle.gz", + beagle="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.beagle.gz", bamlist="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", output: ld=temp( - "results/datasets/{dataset}/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz" + "results/datasets/{dataset}/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz" ), pos=temp( - "results/datasets/{dataset}/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.pos", + "results/datasets/{dataset}/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.pos", ), log: - "logs/{dataset}/ngsLD/estLD/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.log", + "logs/{dataset}/ngsLD/estLD/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.log", benchmark: - "benchmarks/{dataset}/ngsLD/estLD/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.log" + "benchmarks/{dataset}/ngsLD/estLD/{path}/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.log" container: ngsld_container threads: lambda wildcards, attempt: attempt @@ -50,9 +50,8 @@ rule ngsLD_estLD: rule combine_LD_files: input: ldgz=expand( - "results/datasets/{{dataset}}/analyses/ngsLD/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.{maj}maj.ld_maxkbdist-{{maxkb}}_rndsample-{{rndsmp}}.gz", + "results/datasets/{{dataset}}/analyses/ngsLD/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.ld_maxkbdist-{{maxkb}}_rndsample-{{rndsmp}}.gz", chunk=chunklist, - maj=get_maj, ), output: ldgz="results/datasets/{dataset}/analyses/ngsLD/{dataset}.{ref}_{population}{dp}_{sites}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz", diff --git a/workflow/rules/4.1_linkage_pruning.smk b/workflow/rules/4.1_linkage_pruning.smk index 3230951..975726b 100644 --- a/workflow/rules/4.1_linkage_pruning.smk +++ b/workflow/rules/4.1_linkage_pruning.smk @@ -7,13 +7,13 @@ rule ngsLD_prune_sites: Prunes SNPs to produce a list of SNPs in linkage equilibrium. """ input: - ld="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.ld_maxkbdist-{maxkb}_rndsample-1.gz", + ld="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.ld_maxkbdist-{maxkb}_rndsample-1.gz", output: - sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.sites", + sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.sites", log: - "logs/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj_maxkbdist-{maxkb}_minr2-{r2}.log", + "logs/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj_maxkbdist-{maxkb}_minr2-{r2}.log" + "benchmarks/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_maxkbdist-{maxkb}_minr2-{r2}.log" container: ngsld_container threads: 4 @@ -36,21 +36,21 @@ rule prune_chunk_beagle: Subsets beagle file to pruned SNPs. """ input: - beagle="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.beagle.gz", - sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.sites", + beagle="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.beagle.gz", + sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.sites", output: - prunedgz="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + prunedgz="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", log: - "logs/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", + "logs/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" + "benchmarks/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" container: shell_container shadow: "minimal" threads: lambda wildcards, attempt: attempt params: - pruned="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle", + pruned="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle", resources: runtime=lambda wildcards, attempt: attempt * 120, shell: @@ -78,15 +78,15 @@ rule merge_pruned_beagles: """ input: pruned=lambda w: expand( - "results/datasets/{{dataset}}/beagles/pruned/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.{{maj}}maj.pruned_maxkbdist-{{maxkb}}_minr2-{{r2}}.beagle.gz", + "results/datasets/{{dataset}}/beagles/pruned/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.pruned_maxkbdist-{{maxkb}}_minr2-{{r2}}.beagle.gz", chunk=chunklist, ), output: - beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", log: - "logs/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_merge.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", + "logs/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj_merge.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" + "benchmarks/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" wildcard_constraints: population="|".join( ["all"] diff --git a/workflow/rules/4.2_linkage_decay.smk b/workflow/rules/4.2_linkage_decay.smk index 7cdd98b..49fd718 100644 --- a/workflow/rules/4.2_linkage_decay.smk +++ b/workflow/rules/4.2_linkage_decay.smk @@ -1,19 +1,19 @@ rule combine_LDdecay_files: input: ldgz=expand( - "results/datasets/{{dataset}}/analyses/ngsLD/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.{{maj}}maj.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz", + "results/datasets/{{dataset}}/analyses/ngsLD/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.ld_maxkbdist-{maxkb}_rndsample-{rndsmp}.gz", chunk=chunklist, maxkb=config["params"]["ngsld"]["max_kb_dist_decay"], rndsmp=config["params"]["ngsld"]["rnd_sample_decay"], ), output: ldgz=temp( - "results/datasets/{dataset}/analyses/ngsLD/decay/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.LDdecay.gz" + "results/datasets/{dataset}/analyses/ngsLD/decay/{dataset}.{ref}_{population}{dp}_{sites}-filts.LDdecay.gz" ), log: - "logs/{dataset}/ngsLD/combine_LDdecay_files/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.ld_decay.log", + "logs/{dataset}/ngsLD/combine_LDdecay_files/{dataset}.{ref}_{population}{dp}_{sites}-filts.ld_decay.log", benchmark: - "benchmarks/{dataset}/ngsLD/combine_LDdecay_files/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.ld_decay.log" + "benchmarks/{dataset}/ngsLD/combine_LDdecay_files/{dataset}.{ref}_{population}{dp}_{sites}-filts.ld_decay.log" container: shell_container shell: @@ -24,10 +24,7 @@ rule combine_LDdecay_files: rule fit_LD_decay: input: - lambda w: expand( - "results/datasets/{{dataset}}/analyses/ngsLD/decay/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj.LDdecay.gz", - maj=get_maj, - ), + "results/datasets/{dataset}/analyses/ngsLD/decay/{dataset}.{ref}_{population}{dp}_{sites}-filts.LDdecay.gz", output: plot=report( "results/datasets/{dataset}/plots/LD_decay/{dataset}.{ref}_{population}{dp}_{sites}-filts.LDdecay.pdf", diff --git a/workflow/rules/5.0_relatedness.smk b/workflow/rules/5.0_relatedness.smk index 56d2d33..0695616 100644 --- a/workflow/rules/5.0_relatedness.smk +++ b/workflow/rules/5.0_relatedness.smk @@ -164,10 +164,7 @@ rule ngsrelate_ibsrelate_only: does not require allele frequencies. """ input: - beagle=lambda w: expand( - "results/datasets/{{dataset}}/beagles/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj.beagle.gz", - maj=get_maj, - ), + unpack(get_ngsrelate_input), inds="results/datasets/{dataset}/poplists/{dataset}_{population}{dp}.indiv.list", output: ngsrelate=temp( @@ -184,7 +181,6 @@ rule ngsrelate_ibsrelate_only: threads: lambda wildcards, attempt: attempt * 4 params: nind=get_nind, - extra=config["params"]["ngsrelate"]["ibsrelate-only-extra"], resources: runtime=lambda wildcards, attempt: attempt * 360, shell: @@ -194,73 +190,43 @@ rule ngsrelate_ibsrelate_only: echo $nsites {params.nind} cut -f1 {input.inds} | tail -n +2 > {output.samples} ngsRelate -G {input.beagle} -n {params.nind} -L $nsites \ - -O {output.ngsrelate} -z {output.samples} {params.extra} + -O {output.relate} -z {output.samples} cut -f3-5,30-35 {output.ngsrelate} > {output.ibsrelate}) &> {log} """ -rule ngsrelate_freqbased: - """ - Estimates inbreeding and relatedness measures using NGSrelate. This will use - allele frequencies, enabling all coefficients, including IBS relate ones, - all will be kept. - """ - input: - beagle=lambda w: expand( - "results/datasets/{{dataset}}/beagles/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj.beagle.gz", - maj=get_maj, - ), - mafs=lambda w: expand( - "results/datasets/{{dataset}}/beagles/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj.mafs.gz", - maj=get_maj, - ), - inds="results/datasets/{dataset}/poplists/{dataset}_{population}{dp}.indiv.list", - output: - relate="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts_ngsrelate-freq.tsv", - freq="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts_ngsrelate-freq.freqs", - samples="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts_samples.ngsrelate-freq.list", - wildcard_constraints: - population="|".join(pop_list), - log: - "logs/{dataset}/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts.ngsrelate-freq.log", - container: - ngsrelate_container - threads: lambda wildcards, attempt: attempt * 4 - params: - nind=get_nind, - extra=config["params"]["ngsrelate"]["freqbased-extra"], - resources: - runtime=lambda wildcards, attempt: attempt * 360, - shell: - r""" - (echo "nind" - echo {params.nind} - cut -f1 {input.inds} | tail -n +2 > {output.samples} - zcat {input.mafs} | cut -f7 | sed 1d > {output.freq} - ngsRelate -G {input.beagle} -n {params.nind} -O {output.relate} \ - -z {output.samples} -f {output.freq} {params.extra}) &> {log} - """ - - -rule ngsrelate_freqbased_merge: - input: - relates=expand( - "results/datasets/{{dataset}}/analyses/kinship/ngsrelate/{{dataset}}.{{ref}}_{population}{{dp}}_{{sites}}-filts_ngsrelate-freq.tsv", - population=pop_list, - ), - output: - "results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts_ngsrelate-freq.tsv", - log: - "logs/{dataset}/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts.ngsrelate-freq.log", - container: - shell_container - shell: - """ - (head -n 1 {input.relates[0]} > {output} - for i in {input.relates}; do - tail -n+2 $i >> {output} - done) 2> {log} - """ +# rule ngsrelate_freqbased: +# """ +# Estimates inbreeding and relatedness measures using NGSrelate. This will use +# allele frequencies, enabling all coefficients, including IBS relate ones. +# All will be kept. +# """ +# input: +# unpack(get_ngsrelate_input), +# inds="results/datasets/{dataset}/poplists/{dataset}_{population}{dp}.indiv.list", +# output: +# relate="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts_relate.tsv", +# samples="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts_samples.list", +# wildcard_constraints: +# population="all", +# log: +# "logs/{dataset}/kinship/ngsrelate/{dataset}.{ref}_{population}{dp}_{sites}-filts.log", +# container: +# ngsrelate_container +# threads: lambda wildcards, attempt: attempt * 4 +# params: +# nind=get_nind, +# resources: +# runtime=lambda wildcards, attempt: attempt * 360, +# shell: +# r""" +# (nsites=$(zcat {input.beagle} | tail -n +2 | wc -l) +# echo "nsites nind" +# echo $nsites {params.nind} +# cut -f1 {input.inds} | tail -n +2 > {output.samples} +# ngsRelate -G {input.beagle} -n {params.nind} -L $nsites -O {output.relate} \ +# -z {output.samples}) &> {log} +# """ rule ngsrelate_summary: @@ -281,8 +247,6 @@ rule ngsrelate_summary: "Type": "Table", }, ), - wildcard_constraints: - method="ngsrelate-freq|ibsrelate-nofreq", log: "logs/{dataset}/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts_{method}.tsv2html.log", container: diff --git a/workflow/rules/6.0_pca.smk b/workflow/rules/6.0_pca.smk index 58c70da..30b1b6d 100644 --- a/workflow/rules/6.0_pca.smk +++ b/workflow/rules/6.0_pca.smk @@ -8,15 +8,15 @@ rule remove_excl_pca_admix: results), while allowing them in all other analyses. """ input: - "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", output: - "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", wildcard_constraints: population="all", log: - "logs/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", + "logs/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" + "benchmarks/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_{population}_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" container: shell_container params: @@ -32,11 +32,10 @@ rule pca_pcangsd: Produces covariance matrix from SNP genotype likelihood data with PCAngsd. """ input: - beagle=lambda w: expand( - "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + beagle=expand( + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], - maj=get_maj, ), output: cov="results/datasets/{dataset}/analyses/pcangsd/{dataset}.{ref}_{population}{dp}_{sites}-filts.cov", diff --git a/workflow/rules/6.1_admixture.smk b/workflow/rules/6.1_admixture.smk index 3c00eca..358a86a 100644 --- a/workflow/rules/6.1_admixture.smk +++ b/workflow/rules/6.1_admixture.smk @@ -10,10 +10,9 @@ rule ngsAdmix: """ input: beagle=expand( - "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], - maj=get_maj, ), output: qopt="results/datasets/{dataset}/analyses/ngsadmix/{dataset}.{ref}_{population}{dp}_{sites}-filts_K{kvalue}.qopt", @@ -74,10 +73,9 @@ rule evalAdmix: """ input: beagle=expand( - "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], - maj=get_maj, ), qopt="results/datasets/{dataset}/analyses/ngsadmix/{dataset}.{ref}_{population}{dp}_{sites}-filts_K{kvalue}.qopt", fopt="results/datasets/{dataset}/analyses/ngsadmix/{dataset}.{ref}_{population}{dp}_{sites}-filts_K{kvalue}.fopt.gz", diff --git a/workflow/rules/8.0_inbreeding.smk b/workflow/rules/8.0_inbreeding.smk index c8f0405..9c44f34 100644 --- a/workflow/rules/8.0_inbreeding.smk +++ b/workflow/rules/8.0_inbreeding.smk @@ -7,15 +7,14 @@ rule ngsf_hmm: Estimate IBD tracts within individual genomes. """ input: - beagle=lambda w: expand( - "results/datasets/{{dataset}}/beagles/{folder}{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj{pruning}.beagle.gz", + beagle=expand( + "results/datasets/{{dataset}}/beagles/{folder}{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts{pruning}.beagle.gz", folder="pruned/" if config["params"]["ngsf-hmm"]["prune"] else "", pruning=( f".pruned_maxkbdist-{config['params']['ngsf-hmm']['max_kb_dist_pruning_pop']}_minr2-{config['params']['ngsf-hmm']['pruning_min-weight_pop']}" if config["params"]["ngsf-hmm"]["prune"] else "" ), - maj=get_maj, ), output: ibd="results/datasets/{dataset}/analyses/ngsF-HMM/{dataset}.{ref}_{population}{dp}_{sites}-filts.ibd", diff --git a/workflow/rules/9.0_IBS.smk b/workflow/rules/9.0_IBS.smk index 4e093e8..0d64818 100644 --- a/workflow/rules/9.0_IBS.smk +++ b/workflow/rules/9.0_IBS.smk @@ -9,14 +9,8 @@ rule angsd_doIBS: bamlist="results/datasets/{dataset}/bamlists/{dataset}.{ref}_{population}{dp}.bamlist", bams=get_bamlist_bams, bais=get_bamlist_bais, - sites=lambda w: expand( - "results/datasets/{{dataset}}/filters/snps/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj_snps.sites", - maj=get_maj, - ), - idx=lambda w: expand( - "results/datasets/{{dataset}}/filters/snps/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.{maj}maj_snps.sites.idx", - maj=get_maj, - ), + sites="results/datasets/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts_snps.sites", + idx="results/datasets/{dataset}/filters/snps/{dataset}.{ref}_{population}{dp}_{sites}-filts_snps.sites.idx", output: ibs="results/datasets/{dataset}/analyses/IBS/{dataset}.{ref}_{population}{dp}_{sites}-filts.ibs.gz", ibsmat="results/datasets/{dataset}/analyses/IBS/{dataset}.{ref}_{population}{dp}_{sites}-filts.ibsMat", diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 7df114c..95c837e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -709,90 +709,17 @@ def get_minind(wildcards): # Determine if docounts is needed for beagle/maf calculation to keep it from # slowing things down when it is not. It is only needed if the major and minor -# alleles are being inferred from counts (-doMajorMinor 2). This would also be -# needed if using -doMaf 8, but that is not supported currently by the pipeline. -def get_docounts(wildcards): - if str(config["params"]["angsd"]["domajorminor"]) == "2": +# alleles are being inferred from counts (-doMajorMinor 2) or the minor allele +# frequency is being inferred by counts (-doMaf 8, >8 possible if count +# inference is combined with other inferences) +def get_docounts(wildcard): + if (int(config["params"]["angsd"]["domajorminor"]) == 2) or ( + int(config["params"]["angsd"]["domaf"]) >= 8 + ): return "-doCounts 1" return "" -# Determine how what -doMajorMinor value to pass to Beagle/MAF calculation. The -# primary way minor and major will be inferred is set in the config, however, -# sometimes the pipeline may set these from a -sites file, for instance when -# polarizing major/minor in pops to the dataset major/minor. -def get_majmin(wildcards): - maj = wildcards.maj - pop = wildcards.population - polarize = config["params"]["angsd"]["popmafmaj"] - configmaj = config["params"]["angsd"]["domajorminor"] - # If desired major is reference, always use -doMajorMinor 4 - if maj == "ref": - return "-doMajorMinor 4" - # If desired major is ancestral, always use -doMajorMinor 5 - if maj == "anc": - return "-doMajorMinor 5" - # If desired major is major of all samples, use -doMajorMinor 3 for pops - if (pop != "all") & (polarize == "all"): - return "-doMajorMinor 3" - # In all other cases, use the configured value - return f"-doMajorMinor {configmaj}" - - -# Determine which sites file to use for Beagle/MAF calculation. In almost all -# cases, this is the filtered sites lists. In the event the user wants -# population Beagle/MAFs to be polarized to the major across all samples, the -# sites list produced by the all samples Beagle will be used. -def get_sitesfile(wildcards): - wildmaj = wildcards.maj - pop = wildcards.population - polarize = config["params"]["angsd"]["popmafmaj"] - configmaj = str(config["params"]["angsd"]["domajorminor"]) - if (pop != "all") & (polarize == "all") & (configmaj in ["1", "2"]): - return { - "sites": "results/datasets/{dataset}/filters/snps/{dataset}.{ref}_all{dp}_{sites}-filts.{maj}maj_snps.sites", - "idx": "results/datasets/{dataset}/filters/snps/{dataset}.{ref}_all{dp}_{sites}-filts.{maj}maj_snps.sites.idx", - } - return filt_depth(wildcards) - - -# In the event we do polarize to the major allele across all samples within the -# population Beagle/MAF files, it may mean that fixed sites within the -# populations are interesting. We'll override the user's set -minMaf and -# -SNP_pval options in that case to keep any site that was variable in the -# dataset, even if not in one population. -def get_snppval_maf(wildcards): - pop = wildcards.population - snppval = config["params"]["angsd"]["snp_pval"] - minmaf = config["params"]["angsd"]["min_maf"] - polarize = config["params"]["angsd"]["popmafmaj"] - configmaj = str(config["params"]["angsd"]["domajorminor"]) - if (pop != "all") & (polarize == "all") & (configmaj in ["1", "2"]): - return "-SNP_pval 1 -minMaf -1" - return f"-SNP_pval {snppval} -minMaf {minmaf}" - - -# Determine which major to set in the filename -def get_maj(wildcards): - pop = wildcards.population - configmaj = str(config["params"]["angsd"]["domajorminor"]) - polarize = config["params"]["angsd"]["popmafmaj"] - if configmaj not in ["1", "2", "4", "5"]: - raise ValueError( - f"Config invalid - you have set a 'domajorminor' parameter " - f"incompatible with this workflow. You can only select from 1, 2, " - f"4, and 5 for -doMajorMinor at this time. Please change the " - f"'domajorminor' entry in the config file to one of these values." - ) - if configmaj == "4": - return "ref" - if configmaj == "5": - return "anc" - if (pop in pop_list) & (polarize == "population"): - return "pop" - return "all" - - # Determine whether transitions should be removed based on user configuration def get_trans(wildcards): if config["params"]["angsd"]["rmtrans"]: @@ -842,6 +769,21 @@ def get_excl_ind_cols(wildcards): # Kinship +## Get beagle file for input to ngsrelate (either pruned or not) +def get_ngsrelate_input(wildcards): + if config["params"]["ngsrelate"]["prune"]: + return { + "beagle": expand( + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], + r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], + ), + } + return { + "beagle": "results/datasets/{dataset}/beagles/{dataset}.{ref}_{population}{dp}_{sites}-filts.beagle.gz" + } + + ## Get all possible kinship estimate pairings def get_kinship(wildcards): sam = get_samples_from_pop("all")