diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 00bcbea..e36026d 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -130,20 +130,27 @@ params: domajorminor: 1 # Currently only compatible with values 1, 2, 4, 5 domaf: 1 snp_pval: "1e-6" - min_maf: 0.05 + min_maf: -1 # Set -1 to disable mindepthind_heterozygosity: 3 ngsld: max_kb_dist_est-ld: 200 - max_kb_dist_decay: 100 - max_kb_dist_pruning: 50 rnd_sample_est-ld: 0.001 + max_kb_dist_decay: 100 rnd_sample_decay: 0.9 fit_LDdecay_extra: "--fit_level 3" fit_LDdecay_n_correction: true - pruning_min-weight: 0.1 + max_kb_dist_pruning_dataset: 50 # used for PCA, Admix, NGSrelate, not ngsF-HMM + pruning_min-weight_dataset: 0.1 # used for PCA, Admix, NGSrelate, not ngsF-HMM + ngsf-hmm: + estimate_in_pops: false + prune: false + max_kb_dist_pruning_pop: 50 # used only for ngsf-hmm + pruning_min-weight_pop: 0.8 # used only for ngsf-hmm + min_roh_length: 50000 + roh_bins: [ 100000, 250000, 500000, 1000000, 2500000, 5000000 ] # in ascending order, bins are between adjacent values and between min_roh_length and value 1 and the last value and infinity realsfs: fold: 1 # Should only be set to 1, unless ancestral reference is given above - sfsboot: 100 + sfsboot: 100 # Does nothing for now fst: whichFst: 1 win_size: 10000 diff --git a/config/README.md b/config/README.md index f152fc9..c3824d7 100644 --- a/config/README.md +++ b/config/README.md @@ -451,17 +451,16 @@ or a pull request and I'll gladly put it in. for other options. I have not tested much beyond `1` and `8`, please open an issue if you have problems. - `min_maf:` The minimum minor allele frequency required to call a SNP. - (float, [docs](http://www.popgen.dk/angsd/index.php/Allele_Frequencies)) + This is set when generating the beagle file, so will filter SNPs for + PCAngsd, NGSadmix, ngsF-HMM, and NGSrelate. If you would like each tool + to handle filtering for maf on its own you can set this to `-1` + (disabled). (float, [docs](http://www.popgen.dk/angsd/index.php/Allele_Frequencies)) - `ngsld:` Settings for ngsLD ([docs](https://github.com/fgvieira/ngsLD)) - `max_kb_dist_est-ld:` For the LD estimates generated when setting `estimate_ld: true` above, set the maximum distance between sites in kb that LD will be estimated for (`--max_kb_dist` in ngsLD, integer) - `max_kb_dist_decay:` The same as `max_kb_dist_est-ld:`, but used when estimating LD decay when setting `ld_decay: true` above (integer) - - `max_kb_dist_pruning:` The same as `max_kb_dist_est-ld:`, but used when - linkage pruning SNPs as inputs for PCA, Admix, and Inbreeding analyses. - Any positions above this distance will be assumed to be in linkage - equilibrium during the pruning process (integer) - `rnd_sample_est-ld:` For the LD estimates generated when setting `estimate_ld: true` above, randomly sample this proportion of pairwise linkage estimates rather than estimating all (`--rnd_sample` in ngsLD, @@ -474,8 +473,41 @@ or a pull request and I'll gladly put it in. size corrected r^2 model be used? (`true`/`false`, `true` is the equivalent of passing a sample size to `fit_LDdecay.R` in ngsLD using `--n_ind`) - - `pruning_min-weight:` The minimum r^2 to assume two positions are in - linkage disequilibrium when pruning (float) + - `max_kb_dist_pruning_dataset:` The same as `max_kb_dist_est-ld:`, but + used when linkage pruning SNPs as inputs for PCAngsd, NGSadmix, and + NGSrelate analyses. Pruning is performed on the whole dataset. Any + positions above this distance will be assumed to be in linkage + equilibrium during the pruning process. (integer) + - `pruning_min-weight_dataset:` The minimum r^2 to assume two positions are + in linkage disequilibrium when pruning for PCAngsd, NGSadmix, and + NGSrelate analyses. (float) + - `ngsf-hmm:` Settings for ngsF-HMM + - `estimate_in_pops:` Set to `true` to run ngsF-HMM separately for each + population in your dataset. Set to `false` to run for whole dataset at + once. ngsF-HMM assumes Hardy-Weinberg Equilibrium (aside from inbreeding) + in the input data, so select the option that most reflects this. You can + use PCA and Admixture analyses to help determine this. (`true`/`false`) + - `prune:` Whether or not to prune SNPs for LD before running the analysis. + ngsF-HMM assumes independent sites, so it is preferred to set this to + `true` to satisfy that expectation. (`true`/`false`) + - `max_kb_dist_pruning_pop:` The maximum distance between sites in kb + that will be treated as in LD when pruning for the ngsF-HMM input. (INT) + - `pruning_min-weight_pop:` The minimum r^2 to assume two positions are in + linkage disequilibrium when pruning for the ngsF-HMM input. Note, that + this likely will be substantially higher for individual populations than + for the whole dataset, as background LD should be higher when no + substructure is present. (float) + - `min_roh_length:` Minimum ROH size in base pairs to include in inbreeding + coefficient calculation. Set if short ROH might be considered low + confidence for your data. (integer) + - `roh_bins:` A list of integers that describe the size classes in base + pairs you would like to partition the inbreeding coefficient by. This can + help visualize how much of the coefficient comes from ROH of certain size + classes (and thus, ages). List should be in ascending order and the first + entry should be greater than `min_roh_length`. The first bin will group + ROH between `min_roh_length` and the first entry, subsequent bins will + group ROH with sizes between adjacent entries in the list, and the final + bin will group all ROH larger than the final entry in the list. (list) - `realSFS:` Settings for realSFS - `fold:` Whether or not to fold the produced SFS. Set to 1 if you have not provided an ancestral-state reference (0 or 1, [docs](http://www.popgen.dk/angsd/index.php/SFS_Estimation)) @@ -508,6 +540,6 @@ or a pull request and I'll gladly put it in. assessment. (integer) - `extra:` Additional arguments to pass to NGSadmix (for instance, increasing `-maxiter`). (string, [docs](http://www.popgen.dk/software/index.php/NgsAdmix)) - - `ibs:` Settings for identity by state calculation with ANGSD - - `-doIBS:` Whether to use a random (1) or consensus (2) base in IBS + - `ibs:` Settings for identity by state calculation with ANGSD + - `-doIBS:` Whether to use a random (1) or consensus (2) base in IBS distance calculation ([docs](http://www.popgen.dk/angsd/index.php/PCA_MDS)) diff --git a/config/config.yaml b/config/config.yaml index e6d4f78..297acbf 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -129,20 +129,27 @@ params: domajorminor: 1 # Currently only compatible with values 1, 2, 4, 5 domaf: 1 snp_pval: "1e-6" - min_maf: 0.05 + min_maf: -1 # Set -1 to disable mindepthind_heterozygosity: 3 ngsld: max_kb_dist_est-ld: 4000 - max_kb_dist_decay: 100 - max_kb_dist_pruning: 100 rnd_sample_est-ld: 0.001 + max_kb_dist_decay: 100 rnd_sample_decay: 0.001 fit_LDdecay_extra: "--fit_level 100 --fit_boot 100 --plot_size 3,6 --plot_data --plot_no_legend --plot_y_lim 0.5" fit_LDdecay_n_correction: true - pruning_min-weight: 0.1 + max_kb_dist_pruning_dataset: 100 # used for PCA, Admix, NGSrelate, not ngsF-HMM + pruning_min-weight_dataset: 0.1 # used for PCA, Admix, NGSrelate, not ngsF-HMM + ngsf-hmm: + estimate_in_pops: true + prune: true + max_kb_dist_pruning_pop: 100 # used only for ngsf-hmm + pruning_min-weight_pop: 0.8 # used only for ngsf-hmm + min_roh_length: 50000 + roh_bins: [ 100000, 250000, 500000, 1000000, 2500000, 5000000 ] # in ascending order, bins are between adjacent values and between min_roh_length and value 1 and the last value and infinity realsfs: fold: 1 # Should only be set to 1, unless ancestral reference is given above - sfsboot: 100 + sfsboot: 100 # Does nothing for now fst: whichFst: 1 win_size: 50000 diff --git a/workflow/Snakefile b/workflow/Snakefile index c717d1b..555d556 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -203,7 +203,7 @@ if config["analyses"]["heterozygosity_angsd"]: if config["analyses"]["inbreeding_ngsf-hmm"]: all_outputs.append( - "results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all_{sites}-filts.froh.pdf" + "results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all_{sites}-filts.ind_froh.tsv" ) if config["analyses"]["admix_ngsadmix"]: @@ -359,7 +359,7 @@ if config["subsample_analyses"]["heterozygosity_angsd"]: if config["subsample_analyses"]["inbreeding_ngsf-hmm"]: all_outputs.extend( expand( - "results/datasets/{{dataset}}/plots/inbreeding/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.froh.pdf", + "results/datasets/{{dataset}}/plots/inbreeding/{{dataset}}.{{ref}}_all{dp}_{{sites}}-filts.ind_froh.tsv", dp=subdp, ) ) diff --git a/workflow/rules/4.1_linkage_pruning.smk b/workflow/rules/4.1_linkage_pruning.smk index 8965df3..d100749 100644 --- a/workflow/rules/4.1_linkage_pruning.smk +++ b/workflow/rules/4.1_linkage_pruning.smk @@ -7,29 +7,23 @@ rule ngsLD_prune_sites: Prunes SNPs to produce a list of SNPs in linkage equilibrium. """ input: - ld=expand( - "results/datasets/{{dataset}}/beagles/pruned/ngsLD/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{{chunk}}_{{sites}}-filts.ld_maxkbdist-{maxkb}_rndsample-1.gz", - maxkb=config["params"]["ngsld"]["max_kb_dist_pruning"], - ), + ld="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.ld_maxkbdist-{maxkb}_rndsample-1.gz", output: - sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_pruned.sites", + sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.sites", log: - "logs/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.log", + "logs/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.log" + "benchmarks/{dataset}/ngsLD/prune_sites/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_maxkbdist-{maxkb}_minr2-{r2}.log" container: ngsld_container threads: 4 resources: runtime="10d", - params: - maxdist=lambda w: str(config["params"]["ngsld"]["max_kb_dist_pruning"]) + "000", - minweight=config["params"]["ngsld"]["pruning_min-weight"], shell: """ if [ -s {input.ld} ]; then zcat {input.ld} | prune_graph --weight-field 'column_7' \ - --weight-filter 'column_3 <= {params.maxdist} && column_7 >= {params.minweight}' \ + --weight-filter 'column_3 <= {wildcards.maxkb}000 && column_7 >= {wildcards.r2}' \ --out {output.sites} else > {output.sites} @@ -43,20 +37,20 @@ rule prune_chunk_beagle: """ input: beagle="results/datasets/{dataset}/beagles/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.beagle.gz", - sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_pruned.sites", + sites="results/datasets/{dataset}/beagles/pruned/ngsLD/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.sites", output: - prunedgz="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_pruned.beagle.gz", + prunedgz="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", log: - "logs/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.log", + "logs/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.log" + "benchmarks/{dataset}/ngsLD/prune_beagle/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" conda: "../envs/shell.yaml" shadow: "minimal" threads: lambda wildcards, attempt: attempt params: - pruned="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts_pruned.beagle", + pruned="results/datasets/{dataset}/beagles/pruned/chunks/{dataset}.{ref}_{population}{dp}_chunk{chunk}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle", resources: runtime=lambda wildcards, attempt: attempt * 120, shell: @@ -84,15 +78,15 @@ rule merge_pruned_beagles: """ input: pruned=lambda w: expand( - "results/datasets/{{dataset}}/beagles/pruned/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts_pruned.beagle.gz", + "results/datasets/{{dataset}}/beagles/pruned/chunks/{{dataset}}.{{ref}}_{{population}}{{dp}}_chunk{chunk}_{{sites}}-filts.pruned_maxkbdist-{{maxkb}}_minr2-{{r2}}.beagle.gz", chunk=chunklist, ), output: - beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_pruned.beagle.gz", + beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", log: - "logs/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge_pruned.log", + "logs/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge_pruned.log" + "benchmarks/{dataset}/ngsLD/merge_pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_merge.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" wildcard_constraints: population="|".join( ["all"] diff --git a/workflow/rules/5.0_relatedness.smk b/workflow/rules/5.0_relatedness.smk index 710e5d6..c5d9b89 100644 --- a/workflow/rules/5.0_relatedness.smk +++ b/workflow/rules/5.0_relatedness.smk @@ -162,7 +162,11 @@ rule ngsrelate: Estimates inbreeding and relatedness measures using NGSrelate. """ input: - beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_all{dp}_{sites}-filts_pruned.beagle.gz", + beagle=expand( + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_all{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], + r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], + ), inds="results/datasets/{dataset}/poplists/{dataset}_all.indiv.list", output: relate="results/datasets/{dataset}/analyses/kinship/ngsrelate/{dataset}.{ref}_all{dp}_{sites}-filts_relate.tsv", diff --git a/workflow/rules/6.0_pca.smk b/workflow/rules/6.0_pca.smk index 61f23b5..efa0584 100644 --- a/workflow/rules/6.0_pca.smk +++ b/workflow/rules/6.0_pca.smk @@ -8,13 +8,13 @@ rule remove_excl_pca_admix: results), while allowing them in all other analyses. """ input: - "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_all{dp}_{sites}-filts_pruned.beagle.gz", + "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_all{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", output: - "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts_pruned.beagle.gz", + "results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", log: - "logs/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.log", + "logs/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log", benchmark: - "benchmarks/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.log" + "benchmarks/{dataset}/ngsLD/excl_pca_admix_beagle/{dataset}.{ref}_all_excl_pca-admix{dp}_{sites}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.log" conda: "../envs/shell.yaml" params: @@ -30,7 +30,11 @@ rule pca_pcangsd: Produces covariance matrix from SNP genotype likelihood data with PCAngsd. """ input: - beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_pruned.beagle.gz", + beagle=expand( + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], + r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], + ), output: cov="results/datasets/{dataset}/analyses/pcangsd/{dataset}.{ref}_{population}{dp}_{sites}-filts.cov", log: diff --git a/workflow/rules/6.1_admixture.smk b/workflow/rules/6.1_admixture.smk index 27c70de..d5de210 100644 --- a/workflow/rules/6.1_admixture.smk +++ b/workflow/rules/6.1_admixture.smk @@ -9,7 +9,11 @@ rule ngsAdmix: replicates have been performed. """ input: - beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_pruned.beagle.gz", + beagle=expand( + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], + r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], + ), output: qopt="results/datasets/{dataset}/analyses/ngsadmix/{dataset}.{ref}_{population}{dp}_{sites}-filts_K{kvalue}.qopt", fopt="results/datasets/{dataset}/analyses/ngsadmix/{dataset}.{ref}_{population}{dp}_{sites}-filts_K{kvalue}.fopt.gz", @@ -68,7 +72,11 @@ rule evalAdmix: Perform evalAdmix analysis to get residuals on best fitting replicate for a given K. """ input: - beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_pruned.beagle.gz", + beagle=expand( + "results/datasets/{{dataset}}/beagles/pruned/{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts.pruned_maxkbdist-{maxkb}_minr2-{r2}.beagle.gz", + maxkb=config["params"]["ngsld"]["max_kb_dist_pruning_dataset"], + r2=config["params"]["ngsld"]["pruning_min-weight_dataset"], + ), qopt="results/datasets/{dataset}/analyses/ngsadmix/{dataset}.{ref}_{population}{dp}_{sites}-filts_K{kvalue}.qopt", fopt="results/datasets/{dataset}/analyses/ngsadmix/{dataset}.{ref}_{population}{dp}_{sites}-filts_K{kvalue}.fopt.gz", output: diff --git a/workflow/rules/8.0_inbreeding.smk b/workflow/rules/8.0_inbreeding.smk index d40949f..b075f4e 100644 --- a/workflow/rules/8.0_inbreeding.smk +++ b/workflow/rules/8.0_inbreeding.smk @@ -7,7 +7,13 @@ rule ngsf_hmm: Estimate IBD tracts within individual genomes. """ input: - beagle="results/datasets/{dataset}/beagles/pruned/{dataset}.{ref}_{population}{dp}_{sites}-filts_pruned.beagle.gz", + beagle=expand( + "results/datasets/{{dataset}}/beagles/{folder}{{dataset}}.{{ref}}_{{population}}{{dp}}_{{sites}}-filts{pruning}.beagle.gz", + folder="pruned/" if config["params"]["ngsf-hmm"]["prune"] else "", + pruning=f".pruned_maxkbdist-{config['params']['ngsf-hmm']['max_kb_dist_pruning_pop']}_minr2-{config['params']['ngsf-hmm']['pruning_min-weight_pop']}" + if config["params"]["ngsf-hmm"]["prune"] + else "", + ), output: ibd="results/datasets/{dataset}/analyses/ngsF-HMM/{dataset}.{ref}_{population}{dp}_{sites}-filts.ibd", indF="results/datasets/{dataset}/analyses/ngsF-HMM/{dataset}.{ref}_{population}{dp}_{sites}-filts.indF", @@ -61,17 +67,33 @@ rule plot_froh: input: roh=expand( "results/datasets/{{dataset}}/analyses/ngsF-HMM/{{dataset}}.{{ref}}_{population}{{dp}}_{{sites}}-filts.roh", - population=pop_list, + population=pop_list + if config["params"]["ngsf-hmm"]["estimate_in_pops"] + else "all", ), inds="results/datasets/{dataset}/poplists/{dataset}_all.indiv.list", autos=get_auto_sum, output: - plot=report( - "results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all{dp}_{sites}-filts.froh.pdf", + barplot=report( + "results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all{dp}_{sites}-filts.froh_bins.svg", + category="Inbreeding", + labels=lambda w: { + "Filter": "{sites}", + **dp_report(w), + "Type": "Froh Bins Barplot", + }, + ), + scatter=report( + "results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all{dp}_{sites}-filts.cumroh_nroh.svg", category="Inbreeding", - labels=lambda w: {"Filter": "{sites}", **dp_report(w), "Type": "Barplot"}, + labels=lambda w: { + "Filter": "{sites}", + **dp_report(w), + "Type": "Nroh ~ Lroh Scatterplot", + }, ), - tsv="results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all{dp}_{sites}-filts.froh.tsv", + roh="results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all{dp}_{sites}-filts.all_roh.bed", + froh="results/datasets/{dataset}/plots/inbreeding/{dataset}.{ref}_all{dp}_{sites}-filts.ind_froh.tsv", log: "logs/{dataset}/ngsF-HMM/{dataset}.{ref}_all{dp}_{sites}-filts_plot.log", benchmark: @@ -79,7 +101,8 @@ rule plot_froh: conda: "../envs/r.yaml" params: - popnames=pop_list, - outpre=lambda w, output: output["plot"].removesuffix(".froh.pdf"), + bins=config["params"]["ngsf-hmm"]["roh_bins"], + minroh=config["params"]["ngsf-hmm"]["min_roh_length"], + outpre=lambda w, output: output["barplot"].removesuffix(".froh_bins.svg"), script: "../scripts/plot_Froh.R" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 2301da6..4daaf09 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -171,15 +171,6 @@ def get_raw_fastq(wildcards): } -## Get minimum overlap to collapse read pairs per sample -def get_min_overlap(wildcards): - s = wildcards.sample - if s in samples.index[samples.time == "historical"]: - return config["params"]["fastp"]["min_overlap_hist"] - elif s in samples.index[samples.time == "modern"]: - return config["params"]["fastp"]["min_overlap_mod"] - - # Reference diff --git a/workflow/scripts/plot_Froh.R b/workflow/scripts/plot_Froh.R index 12eb213..6cbf36a 100644 --- a/workflow/scripts/plot_Froh.R +++ b/workflow/scripts/plot_Froh.R @@ -1,70 +1,160 @@ sink(file(snakemake@log[[1]], open="wt"), type = "message") -aggregate_roh <- function(rohlist,poplist,samplelist,lenautos) { - samples <- as.data.frame(read.table(samplelist, header=TRUE)) - Froh <- data.frame() - - for (i in 1:length(poplist)) { - - for (l in c(100000,1000000,2500000)) { - if (file.size(rohlist[i]) == 0) { - roh <- data.frame(matrix(ncol = 5, nrow = 0)) - } else { - roh <- as.data.frame(read.table(rohlist[i], header = FALSE)) - } - for (s in samples$sample[samples$population==poplist[i]]) { - sampleroh <- (subset(roh, roh[,4] == s)) - nroh <- nrow(sampleroh[sampleroh[,5]>l,]) - sumroh <- sum(sampleroh[,5][sampleroh[,5] > l]) - Fsample <- sumroh/lenautos - row <- c(s,sumroh,Fsample,nroh,poplist[i],l) - Froh <- rbind(Froh, row) - } +library(dplyr) +library(ggplot2) - } +# Combine ngsF-HMM roh beds into single table +aggregate_roh <- function(rohlist) { + + df <- data.frame() + for (i in rohlist) { + if (file.size(i) == 0) { + roh <- data.frame(matrix(ncol = 5, nrow = 0)) + } else { + roh <- as.data.frame(read.table(i, header = FALSE)) + } + df <- rbind(df, roh) } + + colnames(df) <- c("chr","start","stop","sample","length") - colnames(Froh) <- c("sample","lenroh","Froh","Nroh","population","minroh") + return(df) + +} + +# Calculate Froh for separate user defined size bins. Describes the proportion +# of total Froh that comes from each size class +froh_bins <- function(samplelist, roh_df, minroh, bins, lenautos) { + samples <- as.data.frame(read.table(samplelist, header=TRUE)) + + norun <- c() + norun$chr <- rep(0, nrow(samples)) + norun$start <- rep(0, nrow(samples)) + norun$end <- rep(0, nrow(samples)) + norun$sample <- samples$sample + norun$length <- rep(0, nrow(samples)) + + bins <- c(minroh, bins, Inf) + frohs <- c() + + for (i in seq(length(bins))) { + runs <- roh_df[roh_df[,5] >= bins[i],] + if (i <= length(bins)-1){ + runs <- runs[runs[,5] < bins[i+1],] + } + runs <- rbind(runs, norun) + runs <- runs %>% group_by(sample) %>% + summarize(length = sum(length)) + runs <- merge(runs, samples, by = "sample") + runs$froh <- runs$length / as.numeric(lenautos) + runs$range <- paste0("[",bins[i],",",bins[i+1],")") + frohs <- rbind(frohs, runs) + } - Froh$sample <- as.factor(Froh$sample) - Froh$population <- as.factor(Froh$population) - Froh$lenroh <- as.integer(Froh$lenroh) - Froh$Froh <- as.numeric(Froh$Froh) - Froh$Nroh <- as.integer(Froh$Nroh) - Froh$minroh <- as.factor(Froh$minroh) + return(frohs) - return(Froh) } -plot_roh <- function(aggroh,plotpre) { - require(ggplot2) +# Calculate the number of rohs and the cumulative length per individual for +# rohs > the minimum roh size. This can help distinguish patterns of +# consanguinity from increased background relatedness +nroh_cumroh <- function(roh_df, minroh, samplelist) { + samples <- as.data.frame(read.table(samplelist, header=TRUE)) + df <- roh_df[roh_df$length >= minroh, ] + df <- df %>% group_by(sample) %>% + summarize(cumlen = sum(length), + nroh = n()) + df <- merge(df, samples, by = "sample") + return(df) +} - ggplot(aggroh, aes(x=population,y=Froh)) + - geom_boxplot(outlier.shape = NA) + - geom_jitter(height = 0, width = 0.2) + - scale_y_continuous(limits = c(0, NA)) + - facet_wrap(~ minroh) + - theme_classic() - - ggsave(paste0(plotpre,".froh.pdf")) +# Plot binned Froh per individual +plot_bins <- function(frohs, minroh, bins, plotpre) { + bins <- c(minroh, bins, Inf) + ranges <- c() + for (i in seq(length(bins))) { + ranges <- c(ranges, paste0("[",bins[i],",",bins[i+1],")")) + } + frohs$range <- as.factor(frohs$range) + frohs$range <- factor(frohs$range, levels = ranges, labels = ranges) + indfroh <- frohs %>% group_by(sample) %>% + summarize(froh = sum(froh)) - # ggplot(aggroh, aes(x=Nroh,y=lenroh)) + - # geom_point(aes(col=population)) + - # geom_smooth(method='lm') + - # theme_classic() + - # facet_grid(cols = vars(minroh)) + frohs$sample <- factor(frohs$sample, levels = indfroh$sample[order(indfroh$froh)]) - # ggsave(paste0(plotpre,".rohreg.pdf")) + ggplot(data = frohs, aes(x = sample, y = froh, fill = range)) + + geom_bar(position="stack", stat="identity") + + facet_grid(cols = vars(population), scales = "free_x") + + theme_bw() + + labs(y = expression(F[RoH]), fill = "RoH Length (bp)") + + scale_fill_grey() + + theme( + axis.title.x = element_blank(), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank() + ) + + ggsave(paste0(plotpre,".froh_bins.svg")) +} - write.table(aggroh, file = paste0(plotpre,".froh.tsv"), quote = FALSE, - sep = "\t", row.names = FALSE, col.names = FALSE) +# Plot Nroh ~ Cumulative roh length for individuals with rohs +plot_cumroh_nroh <- function(nroh_cumroh, plotpre) { + ggplot(data = nroh_cumroh, aes(x = cumlen, y = nroh, color = population)) + + geom_point() + + geom_smooth(method = "lm", color = "black", se = FALSE) + + ylab("Total number of runs of homozygosity") + + xlab("Total length of runs of homozygosity (bp)") + + theme_classic() + + labs(color = "Population") + ggsave(paste0(plotpre,".cumroh_nroh.svg")) } -aggroh <- aggregate_roh(snakemake@input[["roh"]], - snakemake@params[["popnames"]], + +# Actually run functions to produce outputs, saving tables for the bed file of +# all rohs for all individuals and the Froh calculated for all rohs > the +# minimum roh size for each individual + +aggroh <- aggregate_roh(snakemake@input[["roh"]]) + +write.table(aggroh, file = paste0(snakemake@params[["outpre"]],".all_roh.bed"), quote = FALSE, + sep = "\t", row.names = FALSE, col.names = FALSE) + +frohs <- froh_bins( snakemake@input[["inds"]], - read.table(snakemake@input[["autos"]], sep = "\t")[,2]) + aggroh, + snakemake@params[["minroh"]], + snakemake@params[["bins"]], + read.table(snakemake@input[["autos"]], sep = "\t")[,2] +) + +indfroh <- frohs %>% group_by(sample) %>% + summarize(Froh = sum(froh)) + +indfroh <- merge( + as.data.frame(read.table(snakemake@input[["inds"]], header=TRUE)), + indfroh, + by = "sample" +) + +write.table(indfroh, file = paste0(snakemake@params[["outpre"]],".ind_froh.tsv"), quote = FALSE, + sep = "\t", row.names = FALSE, col.names = TRUE) + +cumroh <- nroh_cumroh( + aggroh, + snakemake@params[["minroh"]], + snakemake@input[["inds"]] +) + +plot_bins( + frohs, + snakemake@params[["minroh"]], + snakemake@params[["bins"]], + snakemake@params[["outpre"]] +) -plot_roh(aggroh,snakemake@params[["outpre"]]) \ No newline at end of file +plot_cumroh_nroh( + cumroh, + snakemake@params[["outpre"]] +) \ No newline at end of file