Skip to content

Commit

Permalink
Improve pca (#267)
Browse files Browse the repository at this point in the history
* Output allele frequencies along with missingness (for filtering variants)

* Add afreq to output

* Add afreq to intersect_variants.nf

* Add afreq to intersect_thinned

* Intersect with the new pgscatalog-intersect application

* Rebase

* Make verbose

* Remove duplication

* Use new output of intersect_variants in filtering

* Use new output of intersect_variants in intersect_variants.nf: keeps the memory footprint very low (at the cost of more I/O to temporary files)

* Fix column index of PCA_ELIGIBLE (13)

* Fix awk statement that fails, possibly because of a stray carriage return

* Fix awk statement for True/False (not 0/1 as in previous version)

* Add in variant-based filters

---------

Co-authored-by: Benjamin Wingfield <bwingfield@ebi.ac.uk>
  • Loading branch information
smlmbrt and nebfield authored May 23, 2024
1 parent b2de82d commit fe5dcb1
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 12 deletions.
10 changes: 9 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,15 @@ process {
ext.args2 = "zs" // compress .sscore with zstd by default
}

// container configuration
// container configuration
// Environment and container settings for every process carrying the `pygscatalog` label.
// NOTE(review): ext.docker here is the same image path as in the `pgscatalog_utils`
// selector that follows — confirm the two labels are meant to share one docker image.
withLabel: pygscatalog {
// Conda environment file, located under the pipeline's project directory.
ext.conda = "$projectDir/environments/pygscatalog/environment.yml"
// Docker image path, without a tag.
ext.docker = 'dockerhub.ebi.ac.uk/gdp-public/pygscatalog/pgscatalog-utils'
// Singularity image reference using the oras:// scheme, without a tag.
ext.singularity = 'oras://dockerhub.ebi.ac.uk/gdp-public/pygscatalog/singularity/pgscatalog-utils'
// Presumably tag suffixes appended to the image paths above — TODO confirm where
// ext.docker_version / ext.singularity_version are consumed.
ext.docker_version = ':dev'
ext.singularity_version = ':dev'
}

withLabel: pgscatalog_utils {
ext.conda = "$projectDir/environments/pgscatalog_utils/environment.yml"
ext.docker = 'dockerhub.ebi.ac.uk/gdp-public/pygscatalog/pgscatalog-utils'
Expand Down
4 changes: 2 additions & 2 deletions modules/local/ancestry/filter_variants.nf
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ process FILTER_VARIANTS {
"""
# 1. Get QC'd variant set & unrelated samples from REFERENCE data for PCA --
# ((IS_MA_REF == FALSE) && (IS_MA_TARGET == FALSE)) && (((IS_INDEL == FALSE) && (STRANDAMB == FALSE)) || ((IS_INDEL == TRUE) && (SAME_REF == TRUE)))
awk ' \$13 ~ /True/ && (((\$6 == "False") && (\$9 == "False")) && (((\$4 == "False") && (\$5 == "False")) || ((\$4 == "True") && (\$10 == "True")))) {print \$2}' <(zcat $shared) | gzip -c > shared.txt.gz
# PCA_ELIGIBLE == "True"
awk '(\$13 ~ /^True/) {print \$2}' <(zcat $shared) | gzip -c > shared.txt.gz
plink2 \
--threads $task.cpus \
Expand Down
24 changes: 17 additions & 7 deletions modules/local/ancestry/intersect_variants.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,23 @@ process INTERSECT_VARIANTS {
id = meta.subMap('id', 'build', 'n_chrom', 'chrom')
output = "${meta.id}_${meta.chrom}_matched"
"""
pgscatalog-intersect --reference $ref_variants \
--target $variants \
--chrom $meta.chrom \
--outdir \$PWD \
--verbose
mv matched_variants.txt.gz ${output}.txt.gz
pgscatalog-intersect --ref $ref_variants \
--target $variants \
--chrom $meta.chrom \
--maf_target 0.1 \
--geno_miss 0.1 \
--outdir . \
-v
n_matched=\$(sed -n '3p' intersect_counts_${meta.chrom}.txt)
if [ \$n_matched == "0" ]
then
echo "ERROR: No variants in intersection"
exit 1
else
mv matched_variants.txt.gz ${output}.txt.gz
fi
cat <<-END_VERSIONS > versions.yml
${task.process.tokenize(':').last()}:
Expand Down
1 change: 1 addition & 0 deletions modules/local/plink2_relabelbim.nf
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ process PLINK2_RELABELBIM {
plink2 \\
--threads $task.cpus \\
--memory $mem_mb \\
--freq \\
--missing vcols=fmissdosage,fmiss \\
--freq \\
$args \\
Expand Down
4 changes: 3 additions & 1 deletion modules/local/plink2_relabelpvar.nf
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ process PLINK2_RELABELPVAR {
plink2 \\
--threads $task.cpus \\
--memory $mem_mb \\
--freq \\
--missing vcols=fmissdosage,fmiss \\
--freq \\
$args \\
Expand All @@ -59,7 +60,8 @@ process PLINK2_RELABELPVAR {
cp -a $geno ${output}.pgen || true
cp -a $pheno ${output}.psam || true
gzip ${output}.vmiss ${output}.afreq
gzip ${output}.vmiss
gzip ${output}.afreq
cat <<-END_VERSIONS > versions.yml
${task.process.tokenize(':').last()}:
Expand Down
4 changes: 3 additions & 1 deletion modules/local/plink2_vcf.nf
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ process PLINK2_VCF {
--memory $mem_mb \\
--set-all-var-ids '@:#:\$r:\$a' \\
$set_ma_missing \\
--freq \\
--missing vcols=fmissdosage,fmiss \\
--freq \\
$args \\
Expand All @@ -53,7 +54,8 @@ process PLINK2_VCF {
--make-pgen vzs \\
--out ${output}
gzip ${output}.vmiss ${output}.afreq
gzip ${output}.vmiss
gzip ${output}.afreq
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down

0 comments on commit fe5dcb1

Please sign in to comment.