From 0116bbfbc749e952d81dd508704b67a9a45ab4df Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 May 2022 21:52:34 -0700 Subject: [PATCH 01/44] start on simphenotype refactor --- haptools/sim_phenotypes.py | 107 +++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 29 deletions(-) diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 1701a1b0..99279433 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -1,40 +1,89 @@ -""" -Example test command: +from __future__ import annotations +from logging import getLogger, Logger -haptools simphenotype --vcf tests/data/simple.vcf.gz --hap tests/data/simple.hap.gz \ - --out test --simu-qt --simu-hsq 0.8 --simu-rep 2 -""" +from haplotype import HaptoolsHaplotype as Haplotype +from .data import GenotypesRefAlt, Phenotypes, Haplotypes -from cyvcf2 import VCF -from .haplotypes import * +class PhenoSimulator: + """ + Simulate phenotypes from genotypes -def simulate_pt(vcffile, hapfile, simu_rep, \ - simu_hsq, simu_k, simu_qt, simu_cc, outprefix): + Attributes + ---------- + gens: GenotypesRefAlt + A set of genotypes to use when simulating phenotypes + haps: Haplotypes + A set of haplotypes to use as causal variables in the simulation + haps_gts: GenotypesRefAlt + Genotypes for each haplotype + log: Logger + A logging instance for recording debug statements + """ - # Initialize readers - vcf_reader = VCF(vcffile, gts012=True) - hap_reader = HapReader(hapfile) + def __init__( + self, genotypes: GenotypesRefAlt, haplotypes: Haplotypes, log: Logger = None + ): + self.gens = genotypes + self.haps = haplotypes + self.log = log or getLogger(self.__class__.__name__) + self.haps_gts = self.haps.transform(self.gens) - # Initialize phenotype simulator (haptools simphenotypes) - pt_sim = PhenoSimulator(vcf_reader.samples) + def run( + self, + simu_rep: int, + simu_hsq: float, + simu_k: float, + simu_qt: bool = False, + simu_cc: bool = False, + ) -> Phenotypes: + effect = np.array([hap.beta for hap in haps.data.values()]) + data = self.haps_gts.data.sum(axis=2) + pt = np.sum(effect * (data - data.mean(axis=0)) / np.std(axis=0)) + phens = np.empty((len(self.genotypes.samples)), dtype=np.float64) - # Set up file to write summary info - outf_sum = open("%s.par"%outprefix, "w") - outf_sum.write("\t".join(["Haplotype", "Frequency", "Effect"])+"\n") + # Simulate and write + for i in range(simu_rep): + resid_component = np.random.normal(0, np.var(pt) * (1 / simu_hsq - 1)) + if simu_qt: + outinfo.append(pt + resid_component) + else: + raise NotImplementedError("Case control not implemented") - # Add effect for each haplotype - for hap in hap_reader: - sample_to_hap = hap.Transform(vcf_reader) - pt_sim.AddEffect(sample_to_hap, hap.hap_effect) - # Output summary info - hap_freq = hap.GetFrequency(vcf_reader) - outf_sum.write("\t".join([hap.hap_id, str(hap_freq), str(hap.hap_effect)])+"\n") +def simulate_pt( + genotypes: Path, + haplotypes: Path, + phenotypes: Path, + simu_rep: int, + simu_hsq: float, + simu_k: float, + simu_qt: bool = False, + simu_cc: bool = False, +): + # Initialize readers + gens = GenotypesRefAlt.load(genotypes) + haps = Haplotypes(haplotypes, haplotype=Haplotype) + haps.read() - # Output simulated phenotypes to a file (haptools simphenotypes) - pt_sim.WritePhenotypes(outprefix, simu_rep, simu_hsq, \ - simu_k, simu_qt, simu_cc) + # Initialize phenotype simulator (haptools simphenotypes) + pt_sim = PhenoSimulator(gens, haps) - # Done - outf_sum.close() \ No newline at end of file + # Set up file to write summary info + outf_sum = open("%s.par" % outprefix, "w") + outf_sum.write("\t".join(["Haplotype", "Frequency", "Effect"]) + "\n") + + # Add effect for each haplotype + for idx, hap in enumerate(haps.data.values()): + sample_to_hap = hap.transform(gens) + pt_sim.add_effect(sample_to_hap, hap.beta) + + # Output summary info + hap_freq = hap.GetFrequency(gens) + outf_sum.write("\t".join([hap.id, str(hap_freq), str(hap.beta)]) + "\n") + + # Output simulated phenotypes to a file (haptools simphenotypes) + pt_sim.run(outprefix, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc) + + # Done + outf_sum.close() From 4fbb2935de839482690df09bf47f2b1a30fed4d2 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 May 2022 22:08:28 -0700 Subject: [PATCH 02/44] remove original simphenotype code --- haptools/haplotypes.py | 199 ------------------------------------- haptools/sim_phenotypes.py | 5 +- 2 files changed, 3 insertions(+), 201 deletions(-) delete mode 100644 haptools/haplotypes.py diff --git a/haptools/haplotypes.py b/haptools/haplotypes.py deleted file mode 100644 index 3c9993e4..00000000 --- a/haptools/haplotypes.py +++ /dev/null @@ -1,199 +0,0 @@ -""" -Classes for reading/writing and other functions -on haplotypes - -TODO: -* transform - * deal with LA and STRs - -* writephenotypes -* documentation and type hinting -* tests - -* Compare simulation output to GCTA -* fail gracefully on assertions - we need ERROR utility - -""" -import re -import gzip -from pathlib import Path - -import pysam -import numpy as np - - -class Haplotype: - def __init__(self, hap_id, hap_chr, hap_start, hap_end, \ - hap_varids, hap_varalleles, \ - hap_LA, hap_effect): - self.hap_id = hap_id - self.hap_chr = hap_chr - self.hap_start = hap_start - self.hap_end = hap_end - self.hap_varids = hap_varids - self.hap_varalleles = hap_varalleles - self.hap_LA = hap_LA - self.hap_effect = hap_effect - - ##### Do some basic checks ##### - # hap_varids and hap_varalleles must be the same length - assert(len(self.hap_varids)==len(self.hap_varalleles)) - # Do not repeat variant IDs - assert(len(set(self.hap_varids))==len(self.hap_varids)) - # TODO - STR and LA checks - - def GetVariantID(self, record): - return "%s_%s_%s_%s"%(record.CHROM, record.POS, record.REF, ":".join(record.ALT)) - - def CheckHapMatch(self, vcf_reader): - sample_list = vcf_reader.samples - - # Keep track of whether each chrom - # of each sample matches or not - hap_matches = {} - for sample in sample_list: - hap_matches[sample] = [True, True] - - found_ids = 0 - records = vcf_reader("%s:%s-%s"%(self.hap_chr, self.hap_start, self.hap_end)) - for record in records: - # Check if the record is part of the haplotype - if (record.ID in self.hap_varids): - varind = self.hap_varids.index(record.ID) - elif (GetVariantID(record) in self.varids): - varind = self.hap_varids.index(self.GetVariantID(record)) - else: continue - var_allele = self.hap_varalleles[varind] - var_id = self.hap_varids[varind] - found_ids += 1 - - # Determine the index of the target allele - record_alleles = [record.REF]+record.ALT - assert(var_allele in record_alleles) # TODO fail more gracefully with a message - allele_ind = record_alleles.index(var_allele) - - # Check if each sample matches - # If not, set hap_matches to False - for i in range(len(sample_list)): - sample = sample_list[i] - call = record.genotypes[i] - assert(call[2]) # make sure it is phased - for j in range(2): - if call[j]!=allele_ind: hap_matches[sample][j] = False - - # Check we got through all the IDs - assert(found_ids == len(self.hap_varids)) # TODO fail more gracefully with a message - return hap_matches - - def Transform(self, vcf_reader): - hap_gts = {} - - ##### Case 1 - STR length ##### - pass # TODO - - ##### Case 2 - variant haplotype ##### - hap_matches = self.CheckHapMatch(vcf_reader) - - ##### Case 3 - LA (may be in combination with case 2) ##### - pass # TODO - - # Get haplotype counts - for sample in vcf_reader.samples: - hap_gts[sample] = sum(hap_matches[sample]) - return hap_gts - - def GetFrequency(self, vcf_reader): - hap_gts = self.Transform(vcf_reader) - return np.sum(list(hap_gts.values()))/(2*len(hap_gts.keys())) - - def __str__(self): - return self.hap_id - -class HapReader: - def __init__(self, hapfile: Path, region: str = None): - """ - Open a haplotype file with the given region - - Parameters - ---------- - hapfile : Path - The path to a .haps file containing haplotypes - region : str, optional - A region from which to query specific haplotypes; ex: "chr1:1000-2000" - """ - if region is None: - self.haps = gzip.open(hapfile) - else: - # split the region string so each portion is an element - region = re.split(':|-', region) - if len(region) > 1: - region[1:] = [int(pos) for pos in region[1:] if pos] - # fetch region - self.haps = pysam.TabixFile(hapfile).fetch(*region) - - def GetHaplotype(self, hapline): - hap_id = hapline[0] - hap_chr = hapline[1] - hap_start = int(hapline[2]) - hap_end = int(hapline[3]) - hap_varids = hapline[4].split(",") - hap_varalleles = hapline[5].split(",") - hap_LA = hapline[6].split(",") - hap_effect = float(hapline[7]) - return Haplotype(hap_id, hap_chr, hap_start, hap_end, \ - hap_varids, hap_varalleles, \ - hap_LA, hap_effect) - - def __iter__(self): - return self - - def __next__(self): - # Get the next haplotype - hapline = next(self.haps) - - # If reading from gzip file, convert to list - # so it is in the same format as from tabix - if type(hapline) == bytes: - hapline = hapline.decode('utf-8') - while hapline.startswith('#'): - hapline = next(self.haps).decode('utf-8') - hapline = hapline.strip().split() - - # Convert the line to a haplotype object - return self.GetHaplotype(hapline) - -class PhenoSimulator: - def __init__(self, sample_list): - self.pts = {} - for sample in sample_list: - self.pts[sample] = 0 - - def AddEffect(self, sample_to_hap, effect): - mean = np.mean(list(sample_to_hap.values())) - sd = np.sqrt(np.var(list(sample_to_hap.values()))) - for sample in self.pts.keys(): - assert(sample in sample_to_hap.keys()) - self.pts[sample] += effect*(sample_to_hap[sample]-mean)/sd - - def WritePhenotypes(self, outprefix, simu_rep, simu_hsq, \ - simu_k, simu_qt, simu_cc): - # First compute var(additive) - var_add = np.var(list(self.pts.values())) - - # Prepare output file - outf_sim = open("%s.phen"%outprefix, "w") - - # Simulate and write - for sample in self.pts.keys(): - outinfo = [sample, sample] - genetic_component = self.pts[sample] - for i in range(simu_rep): - resid_component = np.random.normal(0, var_add*(1/simu_hsq-1)) - if simu_qt: - outinfo.append(genetic_component+resid_component) - else: - raise NotImplementedError("Case control not implemented") - outf_sim.write("\t".join([str(item) for item in outinfo])+"\n") - - # Done - outf_sim.close() \ No newline at end of file diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 99279433..e39b4772 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -38,8 +38,8 @@ def run( simu_cc: bool = False, ) -> Phenotypes: effect = np.array([hap.beta for hap in haps.data.values()]) - data = self.haps_gts.data.sum(axis=2) - pt = np.sum(effect * (data - data.mean(axis=0)) / np.std(axis=0)) + gts = self.haps_gts.data.sum(axis=2) + pt = np.sum(effect * (gts - gts.mean(axis=0)) / gts.std(axis=0)) phens = np.empty((len(self.genotypes.samples)), dtype=np.float64) # Simulate and write @@ -68,6 +68,7 @@ def simulate_pt( # Initialize phenotype simulator (haptools simphenotypes) pt_sim = PhenoSimulator(gens, haps) + phens = pt_sim.run() # Set up file to write summary info outf_sum = open("%s.par" % outprefix, "w") From 30e594935f108187ce065d891b93695528b8e932 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 15 May 2022 16:42:44 -0700 Subject: [PATCH 03/44] continue adding support for other file formats --- haptools/data/genotypes.py | 181 ++++++++++++++++++++++++++---------- haptools/data/haplotypes.py | 10 +- 2 files changed, 133 insertions(+), 58 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 24e36fd9..d030db11 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -1,9 +1,11 @@ from __future__ import annotations +import re from csv import reader from pathlib import Path from typing import Iterator from collections import namedtuple from logging import getLogger, Logger +from fileinput import hook_compressed import numpy as np import numpy.typing as npt @@ -547,8 +549,78 @@ class GenotypesPLINK(GenotypesRefAlt): >>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen') """ + def read_samples(self, samples: list[str] = None): + """ + Read sample IDs from a PSAM file into a list stored in + :py:attr:`~.GenotypesPLINK.samples` + + One of either variants or max_variants MUST be specified! + + Parameters + ---------- + samples : list[str], optional + See documentation for :py:attr:`~.GenotypesRefAlt.read` + + Returns + ------- + npt.NDArray[np.uint32] + The indices of each of the samples within the PSAM file + """ + if samples is not None and not isinstance(samples, set): + samples = set(samples) + with hook_compressed(self.fname.with_suffix(".psam"), mode="rt") as psam: + psamples = reader(psam, delimiter="\t") + # find the line that declares the header + for header in psamples: + if header[0].startswith('#FID') or header[0].startswith('#IID'): + break + # we need the header to contain the IID column + if header[0][0] != "#": + raise ValueError("Your PSAM file is missing a header!") + col_idx = 1 + self.log.info( + "Your PVAR file lacks a proper header! Assuming first five columns" + " are [CHROM, POS, ID, REF, ALT] in that order..." + ) + # TODO: add header back in to psamples using itertools.chain? + else: + header[0] = header[0][1:] + try: + col_idx = header.index('IID') + except ValueError: + raise ValueError("Your PSAM file must have an IID column.") + self.samples = { + ct: samp[col_idx] for ct, samp in enumerate(psamples) + if (samples is None) or (samp[col_idx] in samples) + } + indices = np.array(list(self.samples.keys()), dtype=np.uint32) + self.samples = list(self.samples.values()) + return indices + + def _check_region( + self, pos: tuple, chrom: str, start: int = 0, end: int = float("inf") + ): + """ + Check that pos lies within the provided chrom, start, and end coordinates + + This is a helper function for :py:meth:`~.GenotypesPLINK.read_variants` + + Parameters + ---------- + pos : tuple[str, int] + A tuple of two elements: contig (str) and chromosomal position (int) + chrom: str + The contig of the region to check + start: int, optional + The start position of the region + end: int, optional + The end position of the region + """ + return (pos[0] == chrom) and (start <= pos[1]) and (end >= pos[1]) + def read_variants( self, + region: str = None, variants: set[str] = None, max_variants: int = None, ): @@ -578,53 +650,68 @@ def read_variants( max_variants = len(variants) if max_variants is None: raise ValueError("Provide either the variants or max_variants parameter!") - # TODO: try using pysam.tabix_iterator with the parser param specified for VCF - if region: - pvar = TabixFile(str(self.fname.with_suffix(".pvar"))) - pvariants = pvar.fetch(region) - else: - pvar = hook_compressed(self.fname.with_suffix(".pvar"), mode="rt") + # split the region string so each portion is an element + if region is not None: + region = re.split(":|-", region) + if len(region) > 1: + region[1:] = [int(pos) for pos in region[1:] if pos] + with hook_compressed(self.fname.with_suffix(".pvar"), mode="rt") as pvar: pvariants = reader(pvar, delimiter="\t") - # first, preallocate the array - self.variants = np.empty((max_variants,), dtype=self.variants.dtype) - header = next(pvariants) - # there should be at least five columns - if len(header) < 5: - raise ValueError("Your PVAR file should have at least five columns.") - if header[0][0] == "#": - header[0] = header[0][1:] - else: - raise ValueError("Your PVAR file is missing a header!") - header = ["CHROM", "POS", "ID", "REF", "ALT"] - self.log.info( - "Your PVAR file lacks a proper header! Assuming first five columns" - " are [CHROM, POS, ID, REF, ALT] in that order..." - ) - # TODO: add header back in to pvariants using itertools.chain? - cid = {item: header.index(item.upper()) for item in self.variants.dtype.names} - if variants is not None: + # first, preallocate the array + self.variants = np.empty((max_variants,), dtype=self.variants.dtype) + # find the line that declares the header + for header in pvariants: + if not header[0].startswith('##'): + break + # there should be at least five columns + if len(header) < 5: + raise ValueError("Your PVAR file should have at least five columns.") + if header[0][0] == "#": + header[0] = header[0][1:] + else: + raise ValueError("Your PVAR file is missing a header!") + header = ["CHROM", "POS", "ID", "REF", "ALT"] + self.log.info( + "Your PVAR file lacks a proper header! Assuming first five columns" + " are [CHROM, POS, ID, REF, ALT] in that order..." + ) + # TODO: add header back in to pvariants using itertools.chain? + cid = { + item: header.index(item.upper()) + for item in self.variants.dtype.names + if item is not "aaf" + } indices = np.empty((max_variants,), dtype=np.uint32) - num_seen = 0 - for ct, rec in enumerate(pvariants): - if variants is not None: - if rec[cid["id"]] not in variants: + num_seen = 0 + for ct, rec in enumerate(pvariants): + if region and not self._check_region( + (rec[cid["chrom"]], int(rec[cid["pos"]])), *region + ): continue + if variants is not None: + if rec[cid["id"]] not in variants: + continue indices[num_seen] = ct - self.variants[num_seen] = np.array( - ( - rec[cid["id"]], - rec[cid["chrom"]], - rec[cid["pos"]], - 0, - rec[cid["ref"]], - rec[cid["alt"]], - ), - dtype=self.variants.dtype, + self.variants[num_seen] = np.array( + ( + rec[cid["id"]], + rec[cid["chrom"]], + rec[cid["pos"]], + 0, + rec[cid["ref"]], + rec[cid["alt"]], + ), + dtype=self.variants.dtype, + ) + num_seen += 1 + if max_variants > num_seen: + self.log.info( + f"Removing {max_variants-num_seen} unneeded variant records that " + "were preallocated b/c max_variants was specified." ) - num_seen += 1 - pvar.close() - if variants is not None: - return indices + indices = indices[:num_seen] + self.variants = self.variants[:num_seen] + return indices def read( self, @@ -654,7 +741,8 @@ def read( # TODO: figure out how to install this package from pgenlib import PgenReader - pgen = PgenReader(bytes(self.fname, "utf8")) + sample_idxs = self.read_samples(samples) + pgen = PgenReader(bytes(self.fname, "utf8"), sample_subset=sample_idxs) if variants is not None: max_variants = len(variants) @@ -663,14 +751,9 @@ def read( max_variants = pgen.get_variant_ct() indices = self.read_variants(region, variants, max_variants) - variant_ct_start = 0 - variant_ct_end = max_variants - variant_ct = variant_ct_end - variant_ct_start - - sample_ct = pgen.get_raw_sample_ct() # the genotypes start out as a simple 2D array with twice the number of samples # so each column is a different chromosomal strand - self.data = np.empty((variant_ct, sample_ct * 2), dtype=np.int32) + self.data = np.empty((len(indices), len(sample_idxs) * 2), dtype=np.int32) pgen.read_alleles(indices, self.data) # extract the genotypes to a np matrix of size n x p x 2 # the last dimension has two items: diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index f15f5c9b..c46895bb 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -1,5 +1,4 @@ from __future__ import annotations -import re from pathlib import Path from logging import getLogger, Logger from fileinput import hook_compressed @@ -785,14 +784,7 @@ def transform( hap_gts.samples = genotypes.samples hap_gts.variants = np.array( [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()], - dtype=[ - ("id", "U50"), - ("chrom", "U10"), - ("pos", np.uint32), - ("aaf", np.float64), - ("ref", "U100"), - ("alt", "U100"), - ], + dtype=self.variants.dtype, ) self.log.info( f"Transforming a set of genotypes from {len(genotypes.variants)} total " From 06e380613f2a0cd66c7844f65b0b9c031071c7a5 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 15 May 2022 21:08:03 -0700 Subject: [PATCH 04/44] refmt with black --- haptools/data/genotypes.py | 89 ++++++++++++++++++++++++------------- haptools/data/haplotypes.py | 2 +- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index d030db11..2cd3fd64 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -95,8 +95,7 @@ def load( genotypes.read(region, samples, variants) genotypes.check_missing() genotypes.check_biallelic() - if not self._prephased: - genotypes.check_phase() + genotypes.check_phase() # genotypes.to_MAC() return genotypes @@ -566,13 +565,15 @@ def read_samples(self, samples: list[str] = None): npt.NDArray[np.uint32] The indices of each of the samples within the PSAM file """ + if len(self.samples) != 0: + self.log.warning("Sample data has already been loaded. Overriding.") if samples is not None and not isinstance(samples, set): samples = set(samples) with hook_compressed(self.fname.with_suffix(".psam"), mode="rt") as psam: psamples = reader(psam, delimiter="\t") # find the line that declares the header for header in psamples: - if header[0].startswith('#FID') or header[0].startswith('#IID'): + if header[0].startswith("#FID") or header[0].startswith("#IID"): break # we need the header to contain the IID column if header[0][0] != "#": @@ -586,11 +587,12 @@ def read_samples(self, samples: list[str] = None): else: header[0] = header[0][1:] try: - col_idx = header.index('IID') + col_idx = header.index("IID") except ValueError: raise ValueError("Your PSAM file must have an IID column.") self.samples = { - ct: samp[col_idx] for ct, samp in enumerate(psamples) + ct: samp[col_idx] + for ct, samp in enumerate(psamples) if (samples is None) or (samp[col_idx] in samples) } indices = np.array(list(self.samples.keys()), dtype=np.uint32) @@ -661,7 +663,7 @@ def read_variants( self.variants = np.empty((max_variants,), dtype=self.variants.dtype) # find the line that declares the header for header in pvariants: - if not header[0].startswith('##'): + if not header[0].startswith("##"): break # there should be at least five columns if len(header) < 5: @@ -679,7 +681,7 @@ def read_variants( cid = { item: header.index(item.upper()) for item in self.variants.dtype.names - if item is not "aaf" + if item != "aaf" } indices = np.empty((max_variants,), dtype=np.uint32) num_seen = 0 @@ -735,30 +737,57 @@ def read( max_variants : int, optional See documentation for :py:attr:`~.GenotypesRefAlt.read` """ - super().read() - # load the pgen-reader file - # note: very little is loaded into memory at this point + super(Genotypes, self).read() # TODO: figure out how to install this package from pgenlib import PgenReader sample_idxs = self.read_samples(samples) - pgen = PgenReader(bytes(self.fname, "utf8"), sample_subset=sample_idxs) - - if variants is not None: - max_variants = len(variants) - if max_variants is None: - # use the pgen file to figure out how many variants there are - max_variants = pgen.get_variant_ct() - indices = self.read_variants(region, variants, max_variants) - - # the genotypes start out as a simple 2D array with twice the number of samples - # so each column is a different chromosomal strand - self.data = np.empty((len(indices), len(sample_idxs) * 2), dtype=np.int32) - pgen.read_alleles(indices, self.data) - # extract the genotypes to a np matrix of size n x p x 2 - # the last dimension has two items: - # 1) presence of REF in strand one - # 2) presence of REF in strand two - self.data = np.dstack((self.data[:, ::2], self.data[:, 1::2])) - # transpose the GT matrix so that samples are rows and variants are columns - self.data = self.data.transpose((1, 0, 2)) + with PgenReader( + bytes(str(self.fname), "utf8"), sample_subset=sample_idxs + ) as pgen: + + if variants is not None: + max_variants = len(variants) + if max_variants is None: + # use the pgen file to figure out how many variants there are + max_variants = pgen.get_variant_ct() + indices = self.read_variants(region, variants, max_variants) + + # the genotypes start out as a simple 2D array with twice the number of samples + if not self._prephased: + # raise an error message b/c this is untested code that doesn't really + # seem to work properly + # for ex, the phasing info is not boolean for some reason? + raise ValueError("Not implemented yet!") + # ...so each column is a different chromosomal strand + self.data = np.empty( + (len(indices), len(sample_idxs) * 2), dtype=np.int32 + ) + phasing = np.empty((len(indices), len(sample_idxs)), dtype=np.uint8) + # The haplotype-major mode of read_alleles_and_phasepresent_list has + # not been implemented yet, so we need to read the genotypes in sample- + # major mode and then transpose them + pgen.read_alleles_and_phasepresent_list(indices, self.data, phasing) + # missing alleles will have a value of -9 + # let's make them be -1 to be consistent with cyvcf2 + self.data[self.data == -9] = -1 + self.data = np.dstack((self.data[:, ::2], self.data[:, 1::2])).astype( + np.uint8 + ) + self.data = np.concatenate( + (self.data, phasing[:, :, np.newaxis]), axis=2 + ) + # transpose the GT matrix so that samples are rows and variants are columns + self.data = self.data.transpose((1, 0, 2)) + else: + # ...so each row is a different chromosomal strand + self.data = np.empty( + (len(sample_idxs) * 2, len(indices)), dtype=np.int32 + ) + pgen.read_alleles_list(indices, self.data, hap_maj=True) + # missing alleles will have a value of -9 + # let's make them be -1 to be consistent with cyvcf2 + self.data[self.data == -9] = -1 + self.data = np.dstack((self.data[::2, :], self.data[1::2, :])).astype( + np.uint8 + ) diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index c46895bb..df8baa2d 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -784,7 +784,7 @@ def transform( hap_gts.samples = genotypes.samples hap_gts.variants = np.array( [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()], - dtype=self.variants.dtype, + dtype=hap_gts.variants.dtype, ) self.log.info( f"Transforming a set of genotypes from {len(genotypes.variants)} total " From 52d90a04d8c1b623e82bca196169d2a440db8721 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 13 Jun 2022 12:52:33 -0700 Subject: [PATCH 05/44] move transform subcommand to its own module --- haptools/__main__.py | 21 ++-------------- haptools/transform.py | 56 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 19 deletions(-) create mode 100644 haptools/transform.py diff --git a/haptools/__main__.py b/haptools/__main__.py index b89e4533..633d5b9b 100644 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -204,8 +204,7 @@ def transform( """ import logging - from haptools import data - from .haplotype import HaptoolsHaplotype + from .transform import transform_haps log = logging.getLogger("run") logging.basicConfig( @@ -226,23 +225,7 @@ def transform( else: samples = None - log.info("Loading haplotypes") - hp = data.Haplotypes(haplotypes) - hp.read(region=region) - log.info("Extracting variants from haplotypes") - variants = {var.id for hap in hp.data.values() for var in hap.variants} - log.info("Loading genotypes") - gt = data.GenotypesRefAlt(genotypes, log=log) - # gt._prephased = True - gt.read(region=region, samples=samples, variants=variants) - gt.check_missing(discard_also=True) - gt.check_biallelic(discard_also=True) - gt.check_phase() - log.info("Transforming genotypes via haplotypes") - hp_gt = data.GenotypesRefAlt(fname=output, log=log) - hp.transform(gt, hp_gt) - log.info("Writing haplotypes to VCF") - hp_gt.write() + transform_haps(genotypes, haplotypes, region, samples, output, log) if __name__ == "__main__": diff --git a/haptools/transform.py b/haptools/transform.py new file mode 100644 index 00000000..2b727e16 --- /dev/null +++ b/haptools/transform.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from haptools import data +from .haplotype import HaptoolsHaplotype + + +def transform_haps( + genotypes: Path, + haplotypes: Path, + region: str = None, + samples: list[str] = None, + output: Path = Path("-"), + log: Logger = None, +): + """ + Creates a VCF composed of haplotypes + + Parameters + ---------- + genotypes : Path + The path to the genotypes in VCF format + haplotypes : Path + The path to the haplotypes in a .hap file + region : str, optional + See documentation for :py:meth:`~.data.Genotypes.read` + samples : list[str], optional + See documentation for :py:meth:`~.data.Genotypes.read` + output : Path, optional + The location to which to write output + log : Logger, optional + A logging module to which to write messages about progress and any errors + """ + log.info("Loading haplotypes") + hp = data.Haplotypes(haplotypes) + hp.read(region=region) + + log.info("Extracting variants from haplotypes") + variants = {var.id for hap in hp.data.values() for var in hap.variants} + + log.info("Loading genotypes") + gt = data.GenotypesRefAlt(genotypes, log=log) + # gt._prephased = True + gt.read(region=region, samples=samples, variants=variants) + gt.check_missing(discard_also=True) + gt.check_biallelic(discard_also=True) + gt.check_phase() + + log.info("Transforming genotypes via haplotypes") + hp_gt = data.GenotypesRefAlt(fname=output, log=log) + hp.transform(gt, hp_gt) + + log.info("Writing haplotypes to VCF") + hp_gt.write() + + return hp_gt + \ No newline at end of file From e3c9df99b74a4c0c1fd8ad34a84bb37f8fc782d9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 13 Jun 2022 12:53:29 -0700 Subject: [PATCH 06/44] setup PhenoSimulator.__init__ method --- haptools/sim_phenotypes.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index e39b4772..16913b05 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -17,20 +17,56 @@ class PhenoSimulator: A set of haplotypes to use as causal variables in the simulation haps_gts: GenotypesRefAlt Genotypes for each haplotype + phens: Phenotypes + Phenotypes for each haplotype log: Logger A logging instance for recording debug statements + + Examples + -------- + >>> gens = Genotypes.load("tests/data/example.vcf.gz") + >>> haps = Haplotypes.load("tests/data/basic.hap") + >>> phenosim = PhenoSimulator(gens, haps, Path("basic_phens.tsv.gz")) + >>> phenosim.transform() + >>> phenotypes = phenosim.run() """ def __init__( - self, genotypes: GenotypesRefAlt, haplotypes: Haplotypes, log: Logger = None + self, + genotypes: GenotypesRefAlt, + haplotypes: Haplotypes, + phenotypes: Path, + log: Logger = None, ): self.gens = genotypes self.haps = haplotypes + self.haps_gts = None + self.phens = Phenotypes(phenotypes, self.log) + self.phens.samples = self.gens self.log = log or getLogger(self.__class__.__name__) + + def unset(self): + """ + Whether the data has been loaded into the object yet + + Returns + ------- + bool + True if :py:attr:`~.PhenoSimulator.haps_gts` is None else False + """ + return self.haps_gts is None + + def transform(self): + """ + Obtain the "genotypes" of the haplotypes + """ + if self.unset(): + self.log.warning("The haplotype 'genotypes' have already been loaded. Overriding.") self.haps_gts = self.haps.transform(self.gens) def run( self, + name: str, simu_rep: int, simu_hsq: float, simu_k: float, From 49341e664895f37ee92602aef427afdcdf550cea Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Jun 2022 10:34:31 -0700 Subject: [PATCH 07/44] add Genotypes.subset() method --- haptools/data/genotypes.py | 86 ++++++++++++++++++++++++++++++++++++++ tests/test_data.py | 31 ++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 2cd3fd64..7f100acb 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -38,6 +38,10 @@ class Genotypes(Data): _prephased : bool If True, assume that the genotypes are phased. Otherwise, extract their phase when reading from the VCF. + _samp_idx : dict[str, int] + Sample index; maps samples to indices in self.samples + _var_idx : dict[str, int] + Variant index; maps variant IDs to indices in self.variants Examples -------- @@ -61,6 +65,8 @@ def __init__(self, fname: Path, log: Logger = None): ], ) self._prephased = False + self._samp_idx = None + self._var_idx = None @classmethod def load( @@ -281,6 +287,86 @@ def __iter__( # see https://stackoverflow.com/a/36726497 return self._iterate(vcf, region, variants) + def index(self, samples: bool = True, variants: bool = True): + """ + Call this function once to improve the speed of future look-ups of samples and + variants by their ID. This is useful if you intend to later subset by a set + of samples or variant IDs. + The time complexity of this function should be roughly O(n+m) if both + parameters are True. Otherwise, it will be either O(n) or O(m). + + Parameters + ---------- + samples: bool, optional + Whether to index the samples for fast loop-up. Adds complexity O(n). + variants: bool, optional + Whether to index the variants for fast look-up. Adds complexity O(m). + """ + if samples and self._samp_idx is None: + self._samp_idx = dict(zip(self.samples, range(len(self.samples)))) + if variants and self._var_idx is None: + self._var_idx = dict(zip(self.variants["id"], range(len(self.variants)))) + + def subset( + self, + samples: tuple[str] = None, + variants: tuple[str] = None, + inplace: bool = False, + ): + """ + Subset these genotypes to a smaller set of samples or a smaller set of variants + + Parameters + ---------- + samples: tuple[str] + A subset of samples to keep + variants: tuple[str] + A subset of variant IDs to keep + inplace: bool, optional + If False, return a new Genotypes object; otherwise, alter the current one + + Returns + ------- + A new Genotypes object if inplace is set to False, else returns None + """ + # First, initialize variables + gts = self + if not inplace: + gts = self.__class__(self.fname, self.log) + gts.samples = self.samples + gts.variants = self.variants + gts.data = self.data + # Index the current set of samples and variants so we can have fast look-up + self.index(samples=(samples is not None), variants=(variants is not None)) + # Subset the samples + if samples is not None: + gts.samples = tuple(samp for samp in samples if samp in self._samp_idx) + if len(gts.samples) < len(samples): + diff = len(samples) - len(gts.samples) + self.log.warning( + f"Saw {diff} fewer samples than requested. Proceeding with " + f"f{len(gts.samples)} samples." + ) + samp_idx = tuple(self._samp_idx[samp] for samp in gts.samples) + if inplace: + self._samp_idx = None + gts.data = gts.data[samp_idx, :] + # Subset the variants + if variants is not None: + var_idx = [self._var_idx[var] for var in variants if var in self._var_idx] + if len(var_idx) < len(variants): + diff = len(variants) - len(var_idx) + self.log.warning( + f"Saw {diff} fewer variants than requested. Proceeding with " + f"f{len(var_idx)} variants." + ) + gts.variants = self.variants[var_idx] + if inplace: + self._var_idx = None + gts.data = gts.data[:, var_idx] + if not inplace: + return gts + def check_missing(self, discard_also=False): """ Check that each sample is properly genotyped diff --git a/tests/test_data.py b/tests/test_data.py index 190e3370..94c31190 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -208,6 +208,37 @@ def test_load_genotypes_subset(self): np.testing.assert_allclose(gts.data, expected) assert gts.samples == tuple(samples) + def test_subset_genotypes(self): + gts = self._get_fake_genotypes() + + # subset to just the samples we want + expected_data = gts.data[:3] + expected_variants = gts.variants + samples = ("HG00096", "HG00097", "HG00099") + gts_sub = gts.subset(samples=samples) + assert gts_sub.samples == samples + np.testing.assert_allclose(gts_sub.data, expected_data) + assert np.array_equal(gts_sub.variants, expected_variants) + + # subset to just the variants we want + expected_data = gts.data[:, [1, 2]] + expected_variants = gts.variants[[1, 2]] + variants = ('1:10116:A:G', '1:10117:C:A') + gts_sub = gts.subset(variants=variants) + assert gts_sub.samples == gts.samples + np.testing.assert_allclose(gts_sub.data, expected_data) + assert np.array_equal(gts_sub.variants, expected_variants) + + # subset both: samples and variants + expected_data = gts.data[[3, 4], :2] + expected_variants = gts.variants[:2] + samples = ("HG00100", "HG00101") + variants = ("1:10114:T:C", '1:10116:A:G') + gts_sub = gts.subset(samples=samples, variants=variants) + assert gts_sub.samples == samples + np.testing.assert_allclose(gts_sub.data, expected_data) + assert np.array_equal(gts_sub.variants, expected_variants) + def test_load_phenotypes(caplog): # create a phenotype vector with shape: num_samples x 1 From a0ae5e1075f47ae6aceaad130731e4704b791327 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Jun 2022 11:10:01 -0700 Subject: [PATCH 08/44] also allow indexing in GenotypesRefAlt class --- haptools/data/genotypes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 7f100acb..8421a37c 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -316,6 +316,9 @@ def subset( """ Subset these genotypes to a smaller set of samples or a smaller set of variants + The order of the samples and variants in the subsetted instance will match + the order in the provided tuple parameters. + Parameters ---------- samples: tuple[str] @@ -549,6 +552,8 @@ def __init__(self, fname: Path, log: Logger = None): ], ) self._prephased = False + self._samp_idx = None + self._var_idx = None def _variant_arr(self, record: Variant): """ From f2a30170d7fd7cb18390312aa91eaf82a1183142 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Jun 2022 11:17:53 -0700 Subject: [PATCH 09/44] simplify the transform method of the Haplotype and Haplotypes classes see https://github.com/gymrek-lab/haptools/issues/19#issuecomment-1126651795 --- haptools/data/haplotypes.py | 64 ++++++++++++++----------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index df8baa2d..a6fd254a 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -277,7 +277,7 @@ def _fmt(self): @property # TODO: use @cached_property in py3.8 def varIDs(self): - return {var.id for var in self.variants} + return tuple(var.id for var in self.variants) @classmethod def from_hap_spec( @@ -334,9 +334,7 @@ def extras_head(cls) -> tuple: """ return tuple(extra.to_hap_spec("H") for extra in cls._extras) - def transform( - self, genotypes: GenotypesRefAlt, samples: list[str] = None - ) -> npt.NDArray[bool]: + def transform(self, genotypes: GenotypesRefAlt) -> npt.NDArray[bool]: """ Transform a genotypes matrix via the current haplotype @@ -350,8 +348,6 @@ def transform( If the genotypes have not been loaded into the Genotypes object yet, this method will call Genotypes.read(), while loading only the needed variants - samples : list[str], optional - See documentation for :py:attr:`~.Genotypes.read` Returns ------- @@ -360,34 +356,27 @@ def transform( denotes the presence of the haplotype in one chromosome of a sample """ var_IDs = self.varIDs - # check: have the genotypes been loaded yet? - # if not, we can load just the variants we need - if genotypes.unset(): - start = min(var.start for var in self.variants) - end = max(var.end for var in self.variants) - region = f"{self.chrom}:{start}-{end}" - genotypes.read(region=region, samples=samples, variants=var_IDs) - genotypes.check_biallelic(discard_also=True) - genotypes.check_phase() - # create a dict where the variants are keyed by ID - var_dict = { - var["id"]: var["ref"] for var in genotypes.variants if var["id"] in var_IDs - } - var_idxs = [ - idx for idx, var in enumerate(genotypes.variants) if var["id"] in var_IDs - ] - missing_IDs = var_IDs - var_dict.keys() - if len(missing_IDs): + gts = genotypes.subset(variants=var_IDs) + # check: were any of the variants absent from the genotypes? + if len(gts.variants) < len(var_IDs): + missing_IDs = set(var_IDs) - set(gts.variants["id"]) raise ValueError( f"Variants {missing_IDs} are present in haplotype '{self.id}' but " "absent in the provided genotypes" ) # create a np array denoting the alleles that we want - alleles = [int(var.allele != var_dict[var.id]) for var in self.variants] - allele_arr = np.array([[[al] for al in alleles]]) # shape: (1, n, 1) + # note: the excessive use of square-brackets gives us shape (1, n, 1) + allele_arr = np.array( + [ + [ + [int(var.allele != gts.variants[i]["ref"])] + for i, var in enumerate(self.variants) + ] + ] + ) # look for the presence of each allele in each chromosomal strand # and then just AND them together - return np.all(allele_arr == genotypes.data[:, var_idxs], axis=1) + return np.all(allele_arr == gts.data, axis=1) class Haplotypes(Data): @@ -751,9 +740,7 @@ def write(self): def transform( self, genotypes: GenotypesRefAlt, - hap_gts: GenotypesRefAlt, - samples: list[str] = None, - low_memory: bool = False, + hap_gts: GenotypesRefAlt = None, ) -> GenotypesRefAlt: """ Transform a genotypes matrix via the current haplotype @@ -765,35 +752,32 @@ def transform( ---------- genotypes : GenotypesRefAlt The genotypes which to transform using the current haplotype - - If the genotypes have not been loaded into the Genotypes object yet, this - method will call Genotypes.read(), while loading only the needed variants hap_gts: GenotypesRefAlt An empty GenotypesRefAlt object into which the haplotype genotypes should be stored - samples : list[str], optional - See documentation for :py:attr:`~.Genotypes.read` - low_memory : bool, optional - If True, each haplotype's genotypes will be loaded one at a time. Returns ------- GenotypesRefAlt A Genotypes object composed of haplotypes instead of regular variants. """ + # Initialize GenotypesRefAlt return value + if hap_gts is None: + hap_gts = GenotypesRefAlt(fname=None, log=self.log) hap_gts.samples = genotypes.samples hap_gts.variants = np.array( [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()], dtype=hap_gts.variants.dtype, ) + # Obtain and merge the haplotype genotypes self.log.info( f"Transforming a set of genotypes from {len(genotypes.variants)} total " f"variants with a list of {len(self.data)} haplotypes" ) hap_gts.data = np.concatenate( tuple( - hap.transform(genotypes, samples)[:, np.newaxis] - for hap in self.data.values() + hap.transform(genotypes)[:, np.newaxis] for hap in self.data.values() ), axis=1, - ).astype(np.uint8) + ).astype(genotypes.data.dtype) + return hap_gts From a13f94abfb3fa9f13634e55a0e1d8d6debe00840 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Jun 2022 12:00:58 -0700 Subject: [PATCH 10/44] add region and samples params to simphenotype --- haptools/__main__.py | 128 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 15 deletions(-) diff --git a/haptools/__main__.py b/haptools/__main__.py index 633d5b9b..26a635fc 100644 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -85,34 +85,132 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): DEFAULT_SIMU_K = 0.1 ################################################## @main.command() -@click.option('--vcf', help='Phased VCF file', type=str, required=True) -@click.option('--hap', help='Haplotype file with effect sizes', \ - type=str, required=True) -@click.option('--out', help='Prefix for output files', \ - type=str, required=True) -@click.option('--simu-qt', help='Simulate a quantitative trait', \ - default=False, is_flag=True) -@click.option('--simu-cc', help='Simulate a case/control trait', \ - default=False, is_flag=True) +@click.argument("genotypes", type=click.Path(exists=True, path_type=Path)) +@click.argument("haplotypes", type=click.Path(exists=True, path_type=Path)) +@click.option( + "--region", + type=str, + default=None, + show_default="all haplotypes", + help=""" + The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n + For this to work, the VCF must be indexed and the seqname must match!""", +) +@click.option( + "-s", + "--sample", + "samples", + type=str, + multiple=True, + show_default="all samples", + help=( + "A list of the samples to subset from the genotypes file (ex: '-s sample1 -s" + " sample2')" + ), +) +@click.option( + "-S", + "--samples-file", + type=click.File("r"), + show_default="all samples", + help=( + "A single column txt file containing a list of the samples (one per line) to" + " subset from the genotypes file" + ), +) @click.option('--simu-rep', help='Number of rounds of simulation to perform', \ type=int, default=DEFAULT_SIMU_REP) @click.option('--simu-hsq', help='Trait heritability', \ type=float, default=DEFAULT_SIMU_HSQ) @click.option('--simu-k', help='Specify the disease prevalence', \ type=float, default=DEFAULT_SIMU_K) -def simphenotype(vcf, hap, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc, out): +@click.option( + "-o", + "--output", + type=click.Path(path_type=Path), + default=Path("-"), + show_default="stdout", + help="A TSV file containing simulated phenotypes", +) +@click.option( + "-v", + "--verbosity", + type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"]), + default="ERROR", + show_default="only errors", + help="The level of verbosity desired", +) +def simphenotype( + genotypes: Path, + haplotypes: Path, + region: str = None, + samples: tuple[str] = tuple(), + samples_file: Path = None, + simu_rep, simu_hsq, simu_k, + output: Path = Path("-"), + verbosity: str = 'ERROR', +): """ - Haplotype-aware phenotype simulation + Haplotype-aware phenotype simulation. Create a set of simulated phenotypes from a + set of haplotypes. + + GENOTYPES must be formatted as a VCF and HAPLOTYPES must be formatted according + to the .hap format spec + + \f + Examples + -------- + >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simu_phens.tsv + + Parameters + ---------- + genotypes : Path + The path to the genotypes in VCF format + haplotypes : Path + The path to the haplotypes in a .hap file + region : str, optional + See documentation for :py:meth:`~.data.Genotypes.read` + sample : Tuple[str], optional + See documentation for :py:meth:`~.data.Genotypes.read` + samples_file : Path, optional + A single column txt file containing a list of the samples (one per line) to + subset from the genotypes file + output : Path, optional + The location to which to write the simulated phenotypes + verbosity : str, optional + The level of verbosity desired in messages written to stderr """ + import logging + from .sim_phenotypes import simulate_pt + + log = logging.getLogger("run") + logging.basicConfig( + format="[%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)", + level=verbosity, + ) + # handle samples + if samples and samples_file: + raise click.UsageError( + "You may only use one of --sample or --samples-file but not both." + ) + if samples_file: + with samples_file as samps_file: + samples = samps_file.read().splitlines() + elif samples: + # needs to be converted from tuple to list + samples = list(samples) + else: + samples = None # Basic checks on input # TODO - check VCF zipped, check only one of simu-qt/simu-cc, # check values of other inputs # Only use simu-k for case/control # Run simulation - simulate_pt(vcf, hap, simu_rep, \ - simu_hsq, simu_k, simu_qt, simu_cc, out) + simulate_pt( + genotypes, haplotypes, simu_rep, simu_hsq, simu_k, region, samples, output, log + ) @main.command(short_help="Transform a genotypes matrix via a set of haplotypes") @click.argument("genotypes", type=click.Path(exists=True, path_type=Path)) @@ -121,9 +219,9 @@ def simphenotype(vcf, hap, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc, out): "--region", type=str, default=None, - show_default="all genotypes", + show_default="all haplotypes", help=""" - The region from which to extract genotypes; ex: 'chr1:1234-34566' or 'chr7'\n + The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n For this to work, the VCF must be indexed and the seqname must match!""", ) @click.option( From fa55ee8dc248fed5914612ae547e8618cf125cbe Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Jun 2022 12:01:48 -0700 Subject: [PATCH 11/44] raise error if any samples or variants are invalid in transform subcommand --- haptools/data/genotypes.py | 2 ++ haptools/transform.py | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 8421a37c..039446a9 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -396,6 +396,7 @@ def check_missing(self, discard_also=False): ) self.data = np.delete(self.data, samp_idx, axis=0) self.samples = tuple(np.delete(self.samples, samp_idx)) + self._samp_idx = None else: raise ValueError( "Genotype with ID {} at POS {}:{} is missing for sample {}".format( @@ -437,6 +438,7 @@ def check_biallelic(self, discard_also=False): self.log.info(f"Ignoring {len(variant_idx)} multiallelic variants") self.data = np.delete(self.data, variant_idx, axis=1) self.variants = np.delete(self.variants, variant_idx) + self._var_idx = None else: raise ValueError( "Variant with ID {} at POS {}:{} is multiallelic for sample {}" diff --git a/haptools/transform.py b/haptools/transform.py index 2b727e16..0745c80f 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -1,4 +1,5 @@ from __future__ import annotations +import logging from haptools import data from .haplotype import HaptoolsHaplotype @@ -30,27 +31,33 @@ def transform_haps( log : Logger, optional A logging module to which to write messages about progress and any errors """ + if log is None: + log = logging.getLogger("run") + logging.basicConfig( + format="[%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)", + level="ERROR", + ) + log.info("Loading haplotypes") - hp = data.Haplotypes(haplotypes) + hp = data.Haplotypes(haplotypes, log=log) hp.read(region=region) log.info("Extracting variants from haplotypes") variants = {var.id for hap in hp.data.values() for var in hap.variants} - + log.info("Loading genotypes") gt = data.GenotypesRefAlt(genotypes, log=log) # gt._prephased = True gt.read(region=region, samples=samples, variants=variants) - gt.check_missing(discard_also=True) - gt.check_biallelic(discard_also=True) + gt.check_missing() + gt.check_biallelic() gt.check_phase() - + log.info("Transforming genotypes via haplotypes") hp_gt = data.GenotypesRefAlt(fname=output, log=log) hp.transform(gt, hp_gt) - + log.info("Writing haplotypes to VCF") hp_gt.write() return hp_gt - \ No newline at end of file From 5542ccb17c24001a98bddb006709674d393c87f0 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 16 Jun 2022 12:03:17 -0700 Subject: [PATCH 12/44] copy transform subroutine to simphenotypes --- haptools/sim_phenotypes.py | 130 +++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 55 deletions(-) diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 16913b05..6472d008 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -1,6 +1,8 @@ from __future__ import annotations from logging import getLogger, Logger +import numpy.typing as npt + from haplotype import HaptoolsHaplotype as Haplotype from .data import GenotypesRefAlt, Phenotypes, Haplotypes @@ -11,14 +13,8 @@ class PhenoSimulator: Attributes ---------- - gens: GenotypesRefAlt - A set of genotypes to use when simulating phenotypes - haps: Haplotypes - A set of haplotypes to use as causal variables in the simulation - haps_gts: GenotypesRefAlt - Genotypes for each haplotype - phens: Phenotypes - Phenotypes for each haplotype + gens: Genotypes + Genotypes to simulate log: Logger A logging instance for recording debug statements @@ -26,57 +22,58 @@ class PhenoSimulator: -------- >>> gens = Genotypes.load("tests/data/example.vcf.gz") >>> haps = Haplotypes.load("tests/data/basic.hap") - >>> phenosim = PhenoSimulator(gens, haps, Path("basic_phens.tsv.gz")) - >>> phenosim.transform() + >>> haps_gts = GenotypesRefAlt(None) + >>> haps.transform(gens, haps_gts) + >>> phenosim = PhenoSimulator(haps_gts) >>> phenotypes = phenosim.run() """ def __init__( self, - genotypes: GenotypesRefAlt, - haplotypes: Haplotypes, - phenotypes: Path, + genotypes: Genotypes, log: Logger = None, ): - self.gens = genotypes - self.haps = haplotypes - self.haps_gts = None - self.phens = Phenotypes(phenotypes, self.log) - self.phens.samples = self.gens - self.log = log or getLogger(self.__class__.__name__) - - def unset(self): - """ - Whether the data has been loaded into the object yet - - Returns - ------- - bool - True if :py:attr:`~.PhenoSimulator.haps_gts` is None else False """ - return self.haps_gts is None - - def transform(self): - """ - Obtain the "genotypes" of the haplotypes + Initialize a PhenoSimulator object + + Parameters + ---------- + genotypes: Genotypes + Genotypes for each haplotype + log: Logger, optional + A logging instance for recording debug statements """ - if self.unset(): - self.log.warning("The haplotype 'genotypes' have already been loaded. Overriding.") - self.haps_gts = self.haps.transform(self.gens) + self.gens = genotypes + self.log = log or getLogger(self.__class__.__name__) def run( self, - name: str, - simu_rep: int, - simu_hsq: float, - simu_k: float, - simu_qt: bool = False, - simu_cc: bool = False, - ) -> Phenotypes: - effect = np.array([hap.beta for hap in haps.data.values()]) - gts = self.haps_gts.data.sum(axis=2) - pt = np.sum(effect * (gts - gts.mean(axis=0)) / gts.std(axis=0)) - phens = np.empty((len(self.genotypes.samples)), dtype=np.float64) + varID: str, + beta: float, + heritability: float = 1, + prevalence: float = None, + ) -> npt.NDArray: + """ + Simulate phenotypes for an entry in the Genotypes object + + Parameters + ---------- + varID: str + The ID of the variant in the Genotype object from which to simulate + phenotypes + beta: float + The simulated effect size of the variant; must be between -1 and 1 + heritability: float, optional + The simulated heritability of the trait + prevalence: float, optional + How common should the disease be within the population? + + If this value is specified, case/control phenotypes will be generated + instead of quantitative traits. + """ + gts = self.gens.subset(variants=[varID]).data[:, 0].sum(axis=1) + # error = + pt = effect * (gts - gts.mean(axis=0)) / gts.std(axis=0) # Simulate and write for i in range(simu_rep): @@ -90,20 +87,43 @@ def run( def simulate_pt( genotypes: Path, haplotypes: Path, - phenotypes: Path, simu_rep: int, simu_hsq: float, simu_k: float, - simu_qt: bool = False, - simu_cc: bool = False, + region: str = None, + samples: list[str] = None, + output: Path = Path("-"), + log: Logger = None, ): - # Initialize readers - gens = GenotypesRefAlt.load(genotypes) - haps = Haplotypes(haplotypes, haplotype=Haplotype) - haps.read() + if log is None: + log = logging.getLogger("run") + logging.basicConfig( + format="[%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)", + level="ERROR", + ) + + log.info("Loading haplotypes") + hp = data.Haplotypes(haplotypes, log=log) + hp.read(region=region) + + log.info("Extracting variants from haplotypes") + variants = {var.id for hap in hp.data.values() for var in hap.variants} + + log.info("Loading genotypes") + gt = data.GenotypesRefAlt(genotypes, log=log) + # gt._prephased = True + gt.read(region=region, samples=samples, variants=variants) + gt.check_missing() + gt.check_biallelic() + gt.check_phase() + + log.info("Transforming genotypes via haplotypes") + hp_gt = data.GenotypesRefAlt(fname=None, log=log) + hp.transform(gt, hp_gt) # Initialize phenotype simulator (haptools simphenotypes) - pt_sim = PhenoSimulator(gens, haps) + log.info("Simulating phenotypes") + pt_sim = PhenoSimulator(hp_gt) phens = pt_sim.run() # Set up file to write summary info From def4826608e6ce543c9d5ab2119e88e9326148f6 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 30 Jun 2022 22:43:35 -0700 Subject: [PATCH 13/44] rename covar and pheno files --- tests/data/{covars.tsv => simple.covar} | 2 +- tests/data/simple.pheno | 6 ++++++ tests/data/simple.tsv | 5 ----- 3 files changed, 7 insertions(+), 6 deletions(-) rename tests/data/{covars.tsv => simple.covar} (81%) create mode 100644 tests/data/simple.pheno delete mode 100644 tests/data/simple.tsv diff --git a/tests/data/covars.tsv b/tests/data/simple.covar similarity index 81% rename from tests/data/covars.tsv rename to tests/data/simple.covar index 9d4ec1d1..c2b96e30 100644 --- a/tests/data/covars.tsv +++ b/tests/data/simple.covar @@ -1,4 +1,4 @@ -sample sex age +#IID sex age HG00096 0 4 HG00097 1 20 HG00099 1 33 diff --git a/tests/data/simple.pheno b/tests/data/simple.pheno new file mode 100644 index 00000000..b567f784 --- /dev/null +++ b/tests/data/simple.pheno @@ -0,0 +1,6 @@ +#IID height bmi +HG00096 1 3 +HG00097 1 6 +HG00099 2 8 +HG00100 2 1 +HG00101 0 4 \ No newline at end of file diff --git a/tests/data/simple.tsv b/tests/data/simple.tsv deleted file mode 100644 index f36f9427..00000000 --- a/tests/data/simple.tsv +++ /dev/null @@ -1,5 +0,0 @@ -HG00096 1 -HG00097 1 -HG00099 2 -HG00100 2 -HG00101 0 From bffd27cf123753ca62b7411837755b422e5c4eeb Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 30 Jun 2022 22:52:45 -0700 Subject: [PATCH 14/44] describe phenotypes in a separate page of the docs --- docs/formats/genotypes.rst | 7 +++++++ docs/formats/haplotypes.rst | 4 ++-- docs/formats/inputs.rst | 31 ------------------------------- docs/formats/phenotypes.rst | 28 ++++++++++++++++++++++++++++ docs/index.rst | 3 ++- 5 files changed, 39 insertions(+), 34 deletions(-) create mode 100644 docs/formats/genotypes.rst delete mode 100644 docs/formats/inputs.rst create mode 100644 docs/formats/phenotypes.rst diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst new file mode 100644 index 00000000..3c204ff9 --- /dev/null +++ b/docs/formats/genotypes.rst @@ -0,0 +1,7 @@ +.. _formats-genotypes: + + +Genotypes +========= + +Genotype files must be specified as VCF or BCF files. diff --git a/docs/formats/haplotypes.rst b/docs/formats/haplotypes.rst index a6707d1e..73e4d747 100644 --- a/docs/formats/haplotypes.rst +++ b/docs/formats/haplotypes.rst @@ -1,8 +1,8 @@ .. _formats-haplotypes: -.hap -==== +Haplotypes +========== This document describes our custom file format specification for haplotypes: the ``.hap`` file. diff --git a/docs/formats/inputs.rst b/docs/formats/inputs.rst deleted file mode 100644 index 14f89d9b..00000000 --- a/docs/formats/inputs.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _formats-inputs: - - -Inputs -========= - -Genotype file format --------------------- -Genotype files must be specified as VCF or BCF files. - -Phenotype file format ---------------------- -Phenotypes are expected to be in a tab-separated format with two columns: - -1. sample ID, which must match those from the genotype file -2. the phenotype value - -There should be no header line in the file. - -See ``tests/data/simple.tsv`` for an example of a phenotype file. - -Covariate file format ---------------------- -Covariates are expected to be in a tab-separated format with the first column -corresponding to the sample ID. Subsequent columns should contain each of your -covariates. - -The first line of the file will be treated as a header. The column with sample IDs -should be named "sample" and subsequent columns can be labeled however you'd like. - -See ``tests/data/covars.tsv`` for an example of a covariate file. diff --git a/docs/formats/phenotypes.rst b/docs/formats/phenotypes.rst new file mode 100644 index 00000000..2d856ff2 --- /dev/null +++ b/docs/formats/phenotypes.rst @@ -0,0 +1,28 @@ +.. _formats-phenotypes: + + +Phenotypes and Covariates +========================= + +Phenotype file format +--------------------- +Phenotypes are expected to follow the PLINK2 ``.pheno`` file format. This is a +tab-separated format where the first column corresponds to the sample ID, and +subsequent columns contain each of your phenotypes. + +The first line of the file corresponds with the header and must begin with ``#IID``. +The names of each of your phenotypes belong in the subbsequent columns of the header. + +See `tests/data/simple.pheno `_ for an example of a phenotype file: + +.. include:: ../../tests/data/simple.pheno + :literal: + +Covariate file format +--------------------- +Covariates follow the same format as phenotypes. + +See `tests/data/simple.covar `_ for an example of a covariate file: + +.. include:: ../../tests/data/simple.covar + :literal: diff --git a/docs/index.rst b/docs/index.rst index 6b3266dc..b2a14574 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,8 +9,9 @@ :hidden: :maxdepth: 1 - formats/inputs.rst + formats/genotypes.rst formats/haplotypes.rst + formats/phenotypes.rst .. toctree:: :caption: Commands From afcdda580c6c0ab626bcf2fc9a4b841e30936a4b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 30 Jun 2022 22:54:14 -0700 Subject: [PATCH 15/44] support reading PLINK2-style pheno and covar files and make covariates class a subclass of phenotypes see #19 for more details --- haptools/data/covariates.py | 128 +--------------------- haptools/data/haplotypes.py | 2 +- haptools/data/phenotypes.py | 112 ++++++++++++------- tests/test_data.py | 211 +++++++++++++++++++++--------------- 4 files changed, 204 insertions(+), 249 deletions(-) diff --git a/haptools/data/covariates.py b/haptools/data/covariates.py index 45ad1f30..64270501 100644 --- a/haptools/data/covariates.py +++ b/haptools/data/covariates.py @@ -8,16 +8,17 @@ import numpy as np from .data import Data +from .phenotypes import Phenotypes -class Covariates(Data): +class Covariates(Phenotypes): """ A class for processing covariates from a file Attributes ---------- data : np.array - The covariates in an n (samples) x 1 (covariate value) array + The covariates in an n (samples) x m (covariates) array fname : Path The path to the read-only file containing the data samples : tuple[str] @@ -33,124 +34,5 @@ class Covariates(Data): """ def __init__(self, fname: Path, log: Logger = None): - super().__init__(fname, log) - self.samples = tuple() - self.names = tuple() - - @classmethod - def load(cls: Covariates, fname: Path, samples: list[str] = None) -> Covariates: - """ - Load covariates from a TSV file - - Read the file contents and standardize the covariates - - Parameters - ---------- - fname - See documentation for :py:attr:`~.Data.fname` - samples : list[str], optional - See documentation for :py:meth:`~.Data.Covariates.read` - - Returns - ------- - covariates - A Covariates object with the data loaded into its properties - """ - covariates = cls(fname) - covariates.read(samples) - return covariates - - def read(self, samples: list[str] = None): - """ - Read covariates from a TSV file into a numpy matrix stored in :py:attr:`~.Covariates.data` - - Parameters - ---------- - samples : list[str], optional - A subset of the samples from which to extract covariates - - Defaults to loading covariates from all samples - - Raises - ------ - AssertionError - If the provided file doesn't follow the expected format - """ - super().read() - # load all info into memory - # use hook_compressed to automatically handle gz files - with hook_compressed(self.fname, mode="rt") as covars: - covar_text = reader(covars, delimiter="\t") - header = next(covar_text) - # there should at least two columns - if len(header) < 2: - raise ValueError("The covariates TSV should have at least two columns.") - # the first column should be called "sample" - if header[0] != "sample": - self.log.warning( - "The first column of the covariates TSV should contain sample IDs" - " and should be named 'sample' in the header line" - ) - # convert to list and subset samples if need be - if samples: - samples = set(samples) - covar_text = [covar for covar in covar_text if covar[0] in samples] - else: - covar_text = list(covar_text) - # the second column should be castable to a float - try: - float(covar_text[0][1]) - except: - self.log.error( - "Every column in the covariates file (besides the sample column) must" - " be numeric." - ) - # fill out the samples and data properties - data = list(zip(*covar_text)) - self.samples = data[0] - self.names = tuple(header[1:]) - # coerce strings to floats - self.data = np.transpose(np.array(data[1:], dtype="float64")) - - def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]: - """ - Read covariates from a TSV line by line without storing anything - - Parameters - ---------- - samples : list[str], optional - A subset of the samples from which to extract covariates - - Defaults to loading covariates from all samples - - Yields - ------ - Iterator[namedtuple] - An iterator over each line in the file, where each line is encoded as a - namedtuple containing each of the class properties - """ - with hook_compressed(self.fname, mode="rt") as covars: - covar_text = reader(covars, delimiter="\t") - header = next(covar_text) - # there should at least two columns - if len(header) < 2: - raise ValueError("The covariates TSV should have at least two columns.") - # the first column should be called "sample" - if header[0] != "sample": - self.log.warning( - "The first column of the covariates TSV should contain sample IDs" - " and should be named 'sample' in the header line" - ) - header = tuple(header[1:]) - Record = namedtuple("Record", "data samples names") - for covar in covar_text: - if samples is None or covar[0] in samples: - try: - yield Record( - np.array(covar[1:], dtype="float64"), covar[0], header - ) - except: - self.log.error( - "Every column in the covariates file (besides the sample" - " column) must be numeric." - ) + super(Phenotypes, self).__init__(fname, log) + self._ext = 'covar' diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index a6fd254a..a8ccdb88 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -87,7 +87,7 @@ def fmt_str(self) -> str: """ Convert an Extra into a fmt string - Retruns + Returns ------- str A python format string (ex: "{beta:.3f}") diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index a27c2535..c2d55792 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -1,7 +1,9 @@ from __future__ import annotations from csv import reader from pathlib import Path +from io import TextIOBase from collections import namedtuple +from collections.abc import Iterable from logging import getLogger, Logger from fileinput import hook_compressed @@ -17,11 +19,13 @@ class Phenotypes(Data): Attributes ---------- data : np.array - The phenotypes in an n (samples) x 1 (phenotype value) array + The phenotypes in an n (samples) x m (phenotypes) array fname : Path The path to the read-only file containing the data samples : tuple The names of each of the n samples + names : tuple[str] + The names of the phenotypes log: Logger A logging instance for recording debug statements. @@ -33,6 +37,8 @@ class Phenotypes(Data): def __init__(self, fname: Path, log: Logger = None): super().__init__(fname, log) self.samples = tuple() + self.names = tuple() + self._ext = 'pheno' @classmethod def load(cls: Phenotypes, fname: Path, samples: list[str] = None) -> Phenotypes: @@ -75,36 +81,26 @@ def read(self, samples: list[str] = None): If the provided file doesn't follow the expected format """ super().read() - # load all info into memory - # use hook_compressed to automatically handle gz files - with hook_compressed(self.fname, mode="rt") as phens: - phen_text = reader(phens, delimiter="\t") - # convert to list and subset samples if need be - if samples: - samples = set(samples) - phen_text = [phen for phen in phen_text if phen[0] in samples] - else: - phen_text = list(phen_text) - # there should only be two columns - if len(phen_text[0]) != 2: - self.log.warning("The phenotype TSV should only have two columns.") - # the second column should be castable to a float - try: - float(phen_text[0][1]) - except: - self.log.error("The second column of the TSV file must numeric.") + # call self.__iter__() to obtain the data and samples + data, self.samples = zip(*self.__iter__(samples)) + self.log.info(f"Loaded {len(self.samples)} samples from .{self._ext} file") # fill out the samples and data properties - self.samples, self.data = zip(*phen_text) - # coerce strings to floats - self.data = np.array(self.data, dtype="float64") + # collect data in a np array + self.data = np.array(data) - def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]: + def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None): """ - Read phenotypes from a TSV line by line without storing anything + A generator over the lines of a TSV + + This is a helper function for :py:meth:`~.Phenotypes.__iter__` Parameters ---------- - samples : list[str], optional + phens: TextIOBase + The file handler for the stream + phen_text: Iterable + The csv.reader object containing the lines of text from the file as lists + samples : set[str], optional A subset of the samples from which to extract phenotypes Defaults to loading phenotypes from all samples @@ -115,17 +111,59 @@ def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]: An iterator over each line in the file, where each line is encoded as a namedtuple containing each of the class properties """ - with hook_compressed(self.fname, mode="rt") as phens: - phen_text = reader(phens, delimiter="\t") - Record = namedtuple("Record", "data samples") - for phen in phen_text: - if samples is None or phen[0] in samples: - try: - yield Record(float(phen[1]), phen[0]) - except: - self.log.error( - "The second column of the TSV file must numeric." - ) + self.log.info(f"Loading {len(self.names)} columns from .{self._ext} file") + Record = namedtuple("Record", "data samples") + for phen in phen_text: + if samples is None or phen[0] in samples: + try: + yield Record( + np.array(phen[1:], dtype="float64"), phen[0] + ) + except: + self.log.error( + f"Every column in the .{self._ext} file (besides the sample" + " column) must be numeric." + ) + phens.close() + + def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]: + """ + Read phenotypes from a TSV line by line without storing anything + + Parameters + ---------- + samples : list[str], optional + A subset of the samples from which to extract phenotypes + + Defaults to loading phenotypes from all samples + + Returns + ------ + Iterable[namedtuple] + See documentation for :py:meth:`~.Phenotypes._iterate` + """ + phens = hook_compressed(self.fname, mode="rt") + phen_text = reader(phens, delimiter="\t") + # ignore all of the comment lines + while True: + header = next(phen_text) + if not header[0].startswith('#') or header[0].startswith('#IID'): + break + + # there should be at least two columns + if len(header) < 2: + raise ValueError(f"The .{self._ext} file should have at least two columns.") + # the first column should be called "#IID" + if header[0] != "#IID": + self.log.warning( + f"The first column of the .{self._ext} file should contain sample IDs" + " and should be named '#IID' in the header line" + ) + self.names = tuple(header[1:]) + samples = set(samples) if samples else None + # call another function to force the lines above to be run immediately + # see https://stackoverflow.com/a/36726497 + return self._iterate(phens, phen_text, samples) def standardize(self): """ @@ -133,4 +171,4 @@ def standardize(self): This function modifies :py:attr:`~.Genotypes.data` in-place """ - self.data = (self.data - np.mean(self.data)) / np.std(self.data) + self.data = (self.data - np.mean(self.data, axis=0)) / np.std(self.data, axis=0) diff --git a/tests/test_data.py b/tests/test_data.py index 94c31190..db97049c 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -240,94 +240,129 @@ def test_subset_genotypes(self): assert np.array_equal(gts_sub.variants, expected_variants) -def test_load_phenotypes(caplog): - # create a phenotype vector with shape: num_samples x 1 - expected = np.array([1, 1, 2, 2, 0]) - - # can we load the data from the phenotype file? - phens = Phenotypes(DATADIR.joinpath("simple.tsv")) - phens.read() - np.testing.assert_allclose(phens.data, expected) - assert phens.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - - # try loading the data again - it should warn b/c we've already done it - phens.read() - assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING" - - expected = (expected - np.mean(expected)) / np.std(expected) - phens.standardize() - np.testing.assert_allclose(phens.data, expected) - - -def test_load_phenotypes_iterate(): - # create a phenotype vector with shape: num_samples x 1 - expected = np.array([1, 1, 2, 2, 0]) - samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - - # can we load the data from the phenotype file? - phens = Phenotypes(DATADIR.joinpath("simple.tsv")) - for idx, line in enumerate(phens): - np.testing.assert_allclose(line.data, expected[idx]) - assert line.samples == samples[idx] - - -def test_load_phenotypes_subset(): - # create a phenotype vector with shape: num_samples x 1 - expected = np.array([1, 1, 2, 2, 0]) - - # subset for just the samples we want - expected = expected[[1, 3]] - - # can we load the data from the phenotype file? - phens = Phenotypes(DATADIR.joinpath("simple.tsv")) - samples = ["HG00097", "HG00100"] - phens.read(samples=samples) - np.testing.assert_allclose(phens.data, expected) - assert phens.samples == tuple(samples) - - -def test_load_covariates(caplog): - # create a covariate vector with shape: num_samples x num_covars - expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)]) - - # can we load the data from the covariates file? - covars = Covariates(DATADIR.joinpath("covars.tsv")) - covars.read() - np.testing.assert_allclose(covars.data, expected) - assert covars.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - assert covars.names == ("sex", "age") - - # try loading the data again - it should warn b/c we've already done it - covars.read() - assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING" - - -def test_load_covariates_iterate(): - # create a covariate vector with shape: num_samples x num_covars - expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)]) - samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") - - # can we load the data from the covariates file? - covars = Covariates(DATADIR.joinpath("covars.tsv")) - for idx, line in enumerate(covars): - np.testing.assert_allclose(line.data, expected[idx]) - assert line.samples == samples[idx] - assert line.names == ("sex", "age") - - -def test_load_covariates_subset(): - # create a covariate vector with shape: num_samples x num_covars - expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)]) - - # subset for just the samples we want - expected = expected[[1, 3]] - - # can we load the data from the covariate file? - covars = Covariates(DATADIR.joinpath("covars.tsv")) - samples = ["HG00097", "HG00100"] - covars.read(samples=samples) - np.testing.assert_allclose(covars.data, expected) - assert covars.samples == tuple(samples) +class TestPhenotypes: + def _get_expected_phenotypes(self): + # create a phen matrix with shape: samples x phenotypes + expected = np.array([ + [1, 1, 2, 2, 0], + [3, 6, 8, 1, 4], + ], dtype="float64").T + return expected + + def _get_fake_phenotypes(self): + gts = Phenotypes(fname=None) + gts.data = self._get_expected_phenotypes() + gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + gts.names = ("height", "bmi") + return gts + + def test_load_phenotypes(self, caplog): + expected_phen = self._get_fake_phenotypes() + expected = expected_phen.data + + # can we load the data from the phenotype file? + phens = Phenotypes(DATADIR.joinpath("simple.pheno")) + phens.read() + np.testing.assert_allclose(phens.data, expected) + assert phens.samples == expected_phen.samples + + # try loading the data again - it should warn b/c we've already done it + caplog.clear() + phens.read() + assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" + + expected = (expected - np.mean(expected, axis=0)) / np.std(expected, axis=0) + phens.standardize() + np.testing.assert_allclose(phens.data, expected) + + + def test_load_phenotypes_iterate(self): + expected_phen = self._get_fake_phenotypes() + expected = expected_phen.data + samples = expected_phen.samples + + # can we load the data from the phenotype file? + phens = Phenotypes(DATADIR.joinpath("simple.pheno")) + for idx, line in enumerate(phens): + np.testing.assert_allclose(line.data, expected[idx]) + assert line.samples == samples[idx] + + + def test_load_phenotypes_subset(self): + expected_phen = self._get_fake_phenotypes() + expected = expected_phen.data + samples = ["HG00097", "HG00100"] + + # subset for just the samples we want + expected = expected[[1, 3]] + + # can we load the data from the phenotype file? + phens = Phenotypes(DATADIR.joinpath("simple.pheno")) + phens.read(samples=samples) + np.testing.assert_allclose(phens.data, expected) + assert phens.samples == tuple(samples) + + +class TestCovariates: + def _get_expected_covariates(self): + # create a covar matrix with shape: samples x covariates + expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)]) + return expected + + def _get_fake_covariates(self): + gts = Phenotypes(fname=None) + gts.data = self._get_expected_covariates() + gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + gts.names = ("sex", "age") + return gts + + def test_load_covariates(self, caplog): + # create a covariate vector with shape: num_samples x num_covars + expected_covar = self._get_fake_covariates() + expected = expected_covar.data + samples = expected_covar.samples + + # can we load the data from the covariates file? + covars = Covariates(DATADIR.joinpath("simple.covar")) + covars.read() + np.testing.assert_allclose(covars.data, expected) + assert covars.samples == samples + assert covars.names == ("sex", "age") + + # try loading the data again - it should warn b/c we've already done it + caplog.clear() + covars.read() + assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" + + + def test_load_covariates_iterate(self): + # create a covariate vector with shape: num_samples x num_covars + expected_covar = self._get_fake_covariates() + expected = expected_covar.data + samples = expected_covar.samples + + # can we load the data from the covariates file? + covars = Covariates(DATADIR.joinpath("simple.covar")) + for idx, line in enumerate(covars): + np.testing.assert_allclose(line.data, expected[idx]) + assert line.samples == samples[idx] + assert covars.names == ("sex", "age") + + + def test_load_covariates_subset(self): + # create a covariate vector with shape: num_samples x num_covars + expected_covar = self._get_fake_covariates() + expected = expected_covar.data + samples = ["HG00097", "HG00100"] + + # subset for just the samples we want + expected = expected[[1, 3]] + + # can we load the data from the covariate file? + covars = Covariates(DATADIR.joinpath("simple.covar")) + covars.read(samples=samples) + np.testing.assert_allclose(covars.data, expected) + assert covars.samples == tuple(samples) class TestHaplotypes: From 2849dcae9ef0aa1404a2a22f877baa15e0ba2f99 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 30 Jun 2022 22:59:46 -0700 Subject: [PATCH 16/44] clarify transform documentation --- README.md | 2 ++ haptools/__main__.py | 11 +++++++---- haptools/sim_phenotypes.py | 2 +- haptools/transform.py | 1 + 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 678dbeff..f9d64180 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ Haptools consists of multiple utilities listed below. Click on a utility to see * [`haptools karyogram`](docs/commands/karyogram.md): Visualize a "chromosome painting" of local ancestry labels based on breakpoints output by `haptools simgenome`. +* [`haptools transform`](https://haptools.readthedocs.io/en/latest/commands/transform.html): Transform a set of genotypes via a list of haplotypes. Create a new VCF containing haplotypes instead of variants. + Outputs produced by these utilities are compatible with each other. For example `haptools simgenome` outputs a VCF file with local ancestry information annotated for each variant. The output VCF file can be used as input to `haptools simphenotype` to simulate phenotype information. `haptools simgenome` also outputs a list of local ancestry breakpoints which can be visualized using `haptools karyogram`. diff --git a/haptools/__main__.py b/haptools/__main__.py index 26a635fc..d2b1fd30 100644 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -143,10 +143,10 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): def simphenotype( genotypes: Path, haplotypes: Path, + simu_rep, simu_hsq, simu_k, region: str = None, samples: tuple[str] = tuple(), samples_file: Path = None, - simu_rep, simu_hsq, simu_k, output: Path = Path("-"), verbosity: str = 'ERROR', ): @@ -220,9 +220,11 @@ def simphenotype( type=str, default=None, show_default="all haplotypes", - help=""" - The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n - For this to work, the VCF must be indexed and the seqname must match!""", + help=( + "The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'" + "For this to work, the VCF and .hap file must be indexed and the seqnames must" + "correspond with those in the files" + ) ) @click.option( "-s", @@ -290,6 +292,7 @@ def transform( The path to the haplotypes in a .hap file region : str, optional See documentation for :py:meth:`~.data.Genotypes.read` + and :py:meth:`~.data.Haplotypes.read` sample : Tuple[str], optional See documentation for :py:meth:`~.data.Genotypes.read` samples_file : Path, optional diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 6472d008..87cae660 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -121,7 +121,7 @@ def simulate_pt( hp_gt = data.GenotypesRefAlt(fname=None, log=log) hp.transform(gt, hp_gt) - # Initialize phenotype simulator (haptools simphenotypes) + # Initialize phenotype simulator (haptools simphenotype) log.info("Simulating phenotypes") pt_sim = PhenoSimulator(hp_gt) phens = pt_sim.run() diff --git a/haptools/transform.py b/haptools/transform.py index 0745c80f..cbcb1bf5 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -24,6 +24,7 @@ def transform_haps( The path to the haplotypes in a .hap file region : str, optional See documentation for :py:meth:`~.data.Genotypes.read` + and :py:meth:`~.data.Haplotypes.read` samples : list[str], optional See documentation for :py:meth:`~.data.Genotypes.read` output : Path, optional From a76e1a6158b2926a0ab7fdfe3c1fd12f01ed827d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 1 Jul 2022 11:28:32 -0700 Subject: [PATCH 17/44] add Phenotypes.write() method --- haptools/data/phenotypes.py | 20 ++++++++++++++++++++ tests/test_data.py | 21 +++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index c2d55792..3b64f7b3 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -165,6 +165,26 @@ def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]: # see https://stackoverflow.com/a/36726497 return self._iterate(phens, phen_text, samples) + def write(self): + """ + Write the phenotypes in this class to a file at :py:attr:`~.Phenotypes.fname` + + Examples + -------- + To write to a file, you must first initialize a Phenotypes object and then + fill out the names, data, and samples properties: + >>> phenotypes = Phenotypes('tests/data/simple.pheno') + >>> phenotypes.names = ('height',) + >>> phenotypes.data = np.array([1, 1, 2], dtype='float64') + >>> phenotypes.samples = ('HG00096', 'HG00097', 'HG00099') + >>> phenotypes.write() + """ + with hook_compressed(self.fname, mode="wt") as haps: + haps.write("#IID\t"+"\t".join(self.names)+"\n") + for samp, phens in zip(self.samples, self.data): + line = np.array2string(phens, separator="\t", formatter={'float_kind':lambda x: "%.2f" % x})[1:-1] + haps.write(f"{samp}\t" + line + "\n") + def standardize(self): """ Standardize phenotypes so they have a mean of 0 and a stdev of 1 diff --git a/tests/test_data.py b/tests/test_data.py index db97049c..d8bb3ba9 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -275,7 +275,6 @@ def test_load_phenotypes(self, caplog): phens.standardize() np.testing.assert_allclose(phens.data, expected) - def test_load_phenotypes_iterate(self): expected_phen = self._get_fake_phenotypes() expected = expected_phen.data @@ -287,7 +286,6 @@ def test_load_phenotypes_iterate(self): np.testing.assert_allclose(line.data, expected[idx]) assert line.samples == samples[idx] - def test_load_phenotypes_subset(self): expected_phen = self._get_fake_phenotypes() expected = expected_phen.data @@ -302,6 +300,23 @@ def test_load_phenotypes_subset(self): np.testing.assert_allclose(phens.data, expected) assert phens.samples == tuple(samples) + def test_write_phenotypes(self): + expected_phen = self._get_fake_phenotypes() + + # first, we write the data + expected_phen.fname = DATADIR.joinpath("test.pheno") + expected_phen.write() + + # now, let's load the data and check that it's what we wrote + result = Phenotypes(expected_phen.fname) + result.read() + np.testing.assert_allclose(expected_phen.data, result.data) + assert expected_phen.names == result.names + assert expected_phen.samples == result.samples + + # let's clean up after ourselves and delete the file + os.remove(str(expected_phen.fname)) + class TestCovariates: def _get_expected_covariates(self): @@ -334,7 +349,6 @@ def test_load_covariates(self, caplog): covars.read() assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING" - def test_load_covariates_iterate(self): # create a covariate vector with shape: num_samples x num_covars expected_covar = self._get_fake_covariates() @@ -348,7 +362,6 @@ def test_load_covariates_iterate(self): assert line.samples == samples[idx] assert covars.names == ("sex", "age") - def test_load_covariates_subset(self): # create a covariate vector with shape: num_samples x num_covars expected_covar = self._get_fake_covariates() From 80bf943a0be6c23f8421ecc75a62da71e203bd60 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 1 Jul 2022 11:31:55 -0700 Subject: [PATCH 18/44] refmt with black --- haptools/data/covariates.py | 2 +- haptools/data/haplotypes.py | 7 +------ haptools/data/phenotypes.py | 19 ++++++++++--------- tests/test_data.py | 15 +++++++++------ 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/haptools/data/covariates.py b/haptools/data/covariates.py index 64270501..04960e50 100644 --- a/haptools/data/covariates.py +++ b/haptools/data/covariates.py @@ -35,4 +35,4 @@ class Covariates(Phenotypes): def __init__(self, fname: Path, log: Logger = None): super(Phenotypes, self).__init__(fname, log) - self._ext = 'covar' + self._ext = "covar" diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index a8ccdb88..385833f5 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -718,12 +718,7 @@ def __repr__(self): def write(self): """ - Write the contents of this Haplotypes object to the file given by fname - - Parameters - ---------- - file: TextIO - A file-like object to which this Haplotypes object should be written. + Write the contents of this Haplotypes object to the file at :py:attr:`~.Haplotypes.fname` Examples -------- diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index 3b64f7b3..1a61ebaa 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -38,7 +38,7 @@ def __init__(self, fname: Path, log: Logger = None): super().__init__(fname, log) self.samples = tuple() self.names = tuple() - self._ext = 'pheno' + self._ext = "pheno" @classmethod def load(cls: Phenotypes, fname: Path, samples: list[str] = None) -> Phenotypes: @@ -88,7 +88,9 @@ def read(self, samples: list[str] = None): # collect data in a np array self.data = np.array(data) - def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None): + def _iterate( + self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None + ): """ A generator over the lines of a TSV @@ -116,9 +118,7 @@ def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = N for phen in phen_text: if samples is None or phen[0] in samples: try: - yield Record( - np.array(phen[1:], dtype="float64"), phen[0] - ) + yield Record(np.array(phen[1:], dtype="float64"), phen[0]) except: self.log.error( f"Every column in the .{self._ext} file (besides the sample" @@ -147,7 +147,7 @@ def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]: # ignore all of the comment lines while True: header = next(phen_text) - if not header[0].startswith('#') or header[0].startswith('#IID'): + if not header[0].startswith("#") or header[0].startswith("#IID"): break # there should be at least two columns @@ -180,9 +180,10 @@ def write(self): >>> phenotypes.write() """ with hook_compressed(self.fname, mode="wt") as haps: - haps.write("#IID\t"+"\t".join(self.names)+"\n") - for samp, phens in zip(self.samples, self.data): - line = np.array2string(phens, separator="\t", formatter={'float_kind':lambda x: "%.2f" % x})[1:-1] + haps.write("#IID\t" + "\t".join(self.names) + "\n") + formatter = {"float_kind": lambda x: "%.2f" % x} + for samp, phen in zip(self.samples, self.data): + line = np.array2string(phen, separator="\t", formatter=formatter)[1:-1] haps.write(f"{samp}\t" + line + "\n") def standardize(self): diff --git a/tests/test_data.py b/tests/test_data.py index d8bb3ba9..30c1e3a9 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -223,7 +223,7 @@ def test_subset_genotypes(self): # subset to just the variants we want expected_data = gts.data[:, [1, 2]] expected_variants = gts.variants[[1, 2]] - variants = ('1:10116:A:G', '1:10117:C:A') + variants = ("1:10116:A:G", "1:10117:C:A") gts_sub = gts.subset(variants=variants) assert gts_sub.samples == gts.samples np.testing.assert_allclose(gts_sub.data, expected_data) @@ -233,7 +233,7 @@ def test_subset_genotypes(self): expected_data = gts.data[[3, 4], :2] expected_variants = gts.variants[:2] samples = ("HG00100", "HG00101") - variants = ("1:10114:T:C", '1:10116:A:G') + variants = ("1:10114:T:C", "1:10116:A:G") gts_sub = gts.subset(samples=samples, variants=variants) assert gts_sub.samples == samples np.testing.assert_allclose(gts_sub.data, expected_data) @@ -243,10 +243,13 @@ def test_subset_genotypes(self): class TestPhenotypes: def _get_expected_phenotypes(self): # create a phen matrix with shape: samples x phenotypes - expected = np.array([ - [1, 1, 2, 2, 0], - [3, 6, 8, 1, 4], - ], dtype="float64").T + expected = np.array( + [ + [1, 1, 2, 2, 0], + [3, 6, 8, 1, 4], + ], + dtype="float64", + ).T return expected def _get_fake_phenotypes(self): From d24920611cecf813bfa2d68fd7bada13b363be6d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 1 Jul 2022 12:54:15 -0700 Subject: [PATCH 19/44] prelim finish PhenoSimulator.run --- haptools/sim_phenotypes.py | 73 ++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 87cae660..3399352b 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -15,6 +15,8 @@ class PhenoSimulator: ---------- gens: Genotypes Genotypes to simulate + phens: Phenotypes + Simulated phenotypes; filled by :py:meth:`~.PhenoSimular.run` log: Logger A logging instance for recording debug statements @@ -31,6 +33,7 @@ class PhenoSimulator: def __init__( self, genotypes: Genotypes, + output: Path = None, log: Logger = None, ): """ @@ -40,22 +43,30 @@ def __init__( ---------- genotypes: Genotypes Genotypes for each haplotype + output: Path + Path to a '.pheno' file to which the generated phenotypes could be written log: Logger, optional A logging instance for recording debug statements """ self.gens = genotypes + self.phens = Phenotypes(fname=output) + self.phens.names = tuple() + self.phens.data = None + self.phens.samples = self.gens.samples self.log = log or getLogger(self.__class__.__name__) def run( self, varID: str, beta: float, - heritability: float = 1, + heritability: float = None, prevalence: float = None, ) -> npt.NDArray: """ Simulate phenotypes for an entry in the Genotypes object + The generated phenotypes will also be added to :py:attr:`~.PhenoSimulator.fname` + Parameters ---------- varID: str @@ -65,23 +76,40 @@ def run( The simulated effect size of the variant; must be between -1 and 1 heritability: float, optional The simulated heritability of the trait + + If not provided, this will be estimated from the variability of the + genotypes prevalence: float, optional How common should the disease be within the population? If this value is specified, case/control phenotypes will be generated instead of quantitative traits. + + Returns + ------- """ gts = self.gens.subset(variants=[varID]).data[:, 0].sum(axis=1) - # error = pt = effect * (gts - gts.mean(axis=0)) / gts.std(axis=0) - - # Simulate and write - for i in range(simu_rep): - resid_component = np.random.normal(0, np.var(pt) * (1 / simu_hsq - 1)) - if simu_qt: - outinfo.append(pt + resid_component) - else: - raise NotImplementedError("Case control not implemented") + # add an error term + if heritability: + # as defined in GTCA + pt += np.random.normal(0, np.var(pt) * ((1/heritability) - 1) ) + else: + pt += np.random.normal(0, 1 - np.var(gts)**2) + if self.phens.data is None: + self.phens.data = pt + else: + self.phens.data = np.concatenate((self.phens.data, pt), axis=1) + self.phens.names = self.phens.names + (varID,) + # TODO: implement case/control phenotypes + return pt + + def write(self): + """ + Write the generated phenotypes to the file specified in + :py:meth:`~.PhenoSimular.__init__` + """ + self.phens.write() def simulate_pt( @@ -123,24 +151,7 @@ def simulate_pt( # Initialize phenotype simulator (haptools simphenotype) log.info("Simulating phenotypes") - pt_sim = PhenoSimulator(hp_gt) - phens = pt_sim.run() - - # Set up file to write summary info - outf_sum = open("%s.par" % outprefix, "w") - outf_sum.write("\t".join(["Haplotype", "Frequency", "Effect"]) + "\n") - - # Add effect for each haplotype - for idx, hap in enumerate(haps.data.values()): - sample_to_hap = hap.transform(gens) - pt_sim.add_effect(sample_to_hap, hap.beta) - - # Output summary info - hap_freq = hap.GetFrequency(gens) - outf_sum.write("\t".join([hap.id, str(hap_freq), str(hap.beta)]) + "\n") - - # Output simulated phenotypes to a file (haptools simphenotypes) - pt_sim.run(outprefix, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc) - - # Done - outf_sum.close() + pt_sim = PhenoSimulator(hp_gt, output=output, log=log) + for hap in hp.data.values(): + pt_sim.run(hap.id, hap.beta) + pt_sim.write() From e00ff2b33a4938176d908eb99d29e2fffa313c45 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:12:12 -0700 Subject: [PATCH 20/44] try simulating phenotypes from multiple haplotypes, additively --- haptools/data/haplotypes.py | 2 +- haptools/sim_phenotypes.py | 43 +++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index 385833f5..bd3f7da3 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -493,7 +493,7 @@ def check_header(self, lines: list[str], check_version=False): expected_lines.remove(line) except ValueError: # extract the name of the extra field - name = line.split("\t", maxsplit=1)[1] + name = line.split("\t", maxsplit=2)[1] raise ValueError( f"The extra field '{name}' is declared in the header of the" " .hap file but is not accepted by this tool." diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 3399352b..d854fab7 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -1,9 +1,11 @@ from __future__ import annotations +from pathlib import Path from logging import getLogger, Logger +import numpy as np import numpy.typing as npt -from haplotype import HaptoolsHaplotype as Haplotype +from .haplotype import HaptoolsHaplotype as Haplotype from .data import GenotypesRefAlt, Phenotypes, Haplotypes @@ -57,23 +59,20 @@ def __init__( def run( self, - varID: str, - beta: float, + effects: list[Haplotype], heritability: float = None, prevalence: float = None, ) -> npt.NDArray: """ Simulate phenotypes for an entry in the Genotypes object - The generated phenotypes will also be added to :py:attr:`~.PhenoSimulator.fname` + The generated phenotypes will also be added to + :py:attr:`~.PhenoSimulator.output` Parameters ---------- - varID: str - The ID of the variant in the Genotype object from which to simulate - phenotypes - beta: float - The simulated effect size of the variant; must be between -1 and 1 + effects: list[Haplotype] + A list of Haplotypes to use in an additive fashion within the simulations heritability: float, optional The simulated heritability of the trait @@ -87,20 +86,29 @@ def run( Returns ------- + npt.NDArray + The simulated phenotypes, as a np array of shape num_samples x 1 """ - gts = self.gens.subset(variants=[varID]).data[:, 0].sum(axis=1) - pt = effect * (gts - gts.mean(axis=0)) / gts.std(axis=0) + # extract the needed information from the Haplotype objects + ids = [hap.id for hap in effects] + betas = np.array([hap.beta for hap in effects]) + # extract the haplotype "genotypes" and compute the phenotypes + gts = self.gens.subset(variants=ids).data.sum(axis=2) + # standardize the genotypes + gts = (gts - gts.mean(axis=0)) / gts.std(axis=0) + # generate the phenotypes + pt = (betas * gts).sum(axis=1) # add an error term if heritability: # as defined in GTCA pt += np.random.normal(0, np.var(pt) * ((1/heritability) - 1) ) else: - pt += np.random.normal(0, 1 - np.var(gts)**2) + pt += np.random.normal(0, 1 - np.power(betas, 2).sum()) if self.phens.data is None: self.phens.data = pt else: self.phens.data = np.concatenate((self.phens.data, pt), axis=1) - self.phens.names = self.phens.names + (varID,) + self.phens.names = self.phens.names + ("-".join(ids),) # TODO: implement case/control phenotypes return pt @@ -131,14 +139,14 @@ def simulate_pt( ) log.info("Loading haplotypes") - hp = data.Haplotypes(haplotypes, log=log) + hp = Haplotypes(haplotypes, haplotype=Haplotype, log=log) hp.read(region=region) log.info("Extracting variants from haplotypes") variants = {var.id for hap in hp.data.values() for var in hap.variants} log.info("Loading genotypes") - gt = data.GenotypesRefAlt(genotypes, log=log) + gt = GenotypesRefAlt(genotypes, log=log) # gt._prephased = True gt.read(region=region, samples=samples, variants=variants) gt.check_missing() @@ -146,12 +154,11 @@ def simulate_pt( gt.check_phase() log.info("Transforming genotypes via haplotypes") - hp_gt = data.GenotypesRefAlt(fname=None, log=log) + hp_gt = GenotypesRefAlt(fname=None, log=log) hp.transform(gt, hp_gt) # Initialize phenotype simulator (haptools simphenotype) log.info("Simulating phenotypes") pt_sim = PhenoSimulator(hp_gt, output=output, log=log) - for hap in hp.data.values(): - pt_sim.run(hap.id, hap.beta) + pt_sim.run(hp.data.values()) pt_sim.write() From 05430eec94811decc151bea6ebb1564312bd23d2 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 2 Jul 2022 11:35:34 -0700 Subject: [PATCH 21/44] merge haplotype module with sim_phenotypes module --- haptools/haplotype.py | 26 -------------------------- haptools/sim_phenotypes.py | 25 +++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 28 deletions(-) delete mode 100644 haptools/haplotype.py diff --git a/haptools/haplotype.py b/haptools/haplotype.py deleted file mode 100644 index b6c0a1d8..00000000 --- a/haptools/haplotype.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import annotations -from dataclasses import dataclass, field - -import numpy as np - -from .data import Extra, Haplotype - - -@dataclass -class HaptoolsHaplotype(Haplotype): - """ - A haplotype with sufficient fields for simphenotype - - Properties and functions are shared with the base Haplotype object - """ - - ancestry: str - beta: float - _extras: tuple = field( - repr=False, - init=False, - default=( - Extra("ancestry", "s", "Local ancestry"), - Extra("beta", ".2f", "Effect size in linear model"), - ), - ) diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index d854fab7..8736834f 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -1,12 +1,33 @@ from __future__ import annotations from pathlib import Path from logging import getLogger, Logger +from dataclasses import dataclass, field import numpy as np import numpy.typing as npt -from .haplotype import HaptoolsHaplotype as Haplotype -from .data import GenotypesRefAlt, Phenotypes, Haplotypes +from .data import Haplotype as HaplotypeBase +from .data import GenotypesRefAlt, Phenotypes, Haplotypes, Extra + + +@dataclass +class Haplotype(HaplotypeBase): + """ + A haplotype with sufficient fields for simphenotype + + Properties and functions are shared with the base Haplotype object, "HaplotypeBase" + """ + + ancestry: str + beta: float + _extras: tuple = field( + repr=False, + init=False, + default=( + Extra("ancestry", "s", "Local ancestry"), + Extra("beta", ".2f", "Effect size in linear model"), + ), + ) class PhenoSimulator: From 0636294a3d652ef2cc8d2668fb1b855b20bb13ed Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 2 Jul 2022 13:00:21 -0700 Subject: [PATCH 22/44] compute heritability when not provided in simphenotype --- haptools/sim_phenotypes.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 8736834f..019be83f 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -1,5 +1,6 @@ from __future__ import annotations from pathlib import Path +from itertools import combinations from logging import getLogger, Logger from dataclasses import dataclass, field @@ -110,27 +111,38 @@ def run( npt.NDArray The simulated phenotypes, as a np array of shape num_samples x 1 """ - # extract the needed information from the Haplotype objects + # extract the ID and effect size information from the Haplotype objects ids = [hap.id for hap in effects] betas = np.array([hap.beta for hap in effects]) # extract the haplotype "genotypes" and compute the phenotypes gts = self.gens.subset(variants=ids).data.sum(axis=2) # standardize the genotypes gts = (gts - gts.mean(axis=0)) / gts.std(axis=0) - # generate the phenotypes + # generate the genetic component pt = (betas * gts).sum(axis=1) - # add an error term - if heritability: - # as defined in GTCA - pt += np.random.normal(0, np.var(pt) * ((1/heritability) - 1) ) - else: - pt += np.random.normal(0, 1 - np.power(betas, 2).sum()) + # compute the heritability + if heritability is None: + # if heritability is not defined, then we set it equal to the sum of the + # effect sizes + # assuming the genotypes are independent, this makes the variance of the + # noise term equal to 1 - sum(betas^2) + heritability = np.power(betas, 2).sum() + # # account for the fact that the genotypes are not independent by adding the + # # covariance between all of the variables + # for a_idx, b_idx in combinations(range(len(betas)), 2): + # heritability += 2 * betas[a_idx] * betas[b_idx] * \ + # np.cov(gts[:,a_idx], gts[:,b_idx])[0][1] + # compute the environmental effect + noise = np.var(pt) * (np.reciprocal(heritability) - 1) + # finally, add everything together to get the simulated phenotypes + pt += np.random.normal(0, noise, size=pt.shape) + # TODO: implement case/control phenotypes + # now, save the archived phenotypes for later if self.phens.data is None: self.phens.data = pt else: self.phens.data = np.concatenate((self.phens.data, pt), axis=1) self.phens.names = self.phens.names + ("-".join(ids),) - # TODO: implement case/control phenotypes return pt def write(self): @@ -170,6 +182,7 @@ def simulate_pt( gt = GenotypesRefAlt(genotypes, log=log) # gt._prephased = True gt.read(region=region, samples=samples, variants=variants) + log.info("QC-ing genotypes") gt.check_missing() gt.check_biallelic() gt.check_phase() @@ -182,4 +195,5 @@ def simulate_pt( log.info("Simulating phenotypes") pt_sim = PhenoSimulator(hp_gt, output=output, log=log) pt_sim.run(hp.data.values()) + log.info("Writing phenotypes") pt_sim.write() From 54f48de43c42f1af524633dd7aa4d7a76026e806 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 2 Jul 2022 15:01:10 -0700 Subject: [PATCH 23/44] implement case/control --- haptools/sim_phenotypes.py | 71 ++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py index 019be83f..2b1d3d35 100644 --- a/haptools/sim_phenotypes.py +++ b/haptools/sim_phenotypes.py @@ -111,17 +111,20 @@ def run( npt.NDArray The simulated phenotypes, as a np array of shape num_samples x 1 """ - # extract the ID and effect size information from the Haplotype objects + # extract the relevant haplotype info from the Haplotype objects ids = [hap.id for hap in effects] betas = np.array([hap.beta for hap in effects]) + self.log.info(f"Extracting haplotype genotypes for haps: {ids}") # extract the haplotype "genotypes" and compute the phenotypes gts = self.gens.subset(variants=ids).data.sum(axis=2) + self.log.info("Computing genetic component") # standardize the genotypes gts = (gts - gts.mean(axis=0)) / gts.std(axis=0) # generate the genetic component pt = (betas * gts).sum(axis=1) # compute the heritability if heritability is None: + self.log.info("Computing heritability") # if heritability is not defined, then we set it equal to the sum of the # effect sizes # assuming the genotypes are independent, this makes the variance of the @@ -132,11 +135,20 @@ def run( # for a_idx, b_idx in combinations(range(len(betas)), 2): # heritability += 2 * betas[a_idx] * betas[b_idx] * \ # np.cov(gts[:,a_idx], gts[:,b_idx])[0][1] + self.log.info(f"Adding environmental component for h^squared: {heritability}") # compute the environmental effect noise = np.var(pt) * (np.reciprocal(heritability) - 1) # finally, add everything together to get the simulated phenotypes pt += np.random.normal(0, noise, size=pt.shape) - # TODO: implement case/control phenotypes + if prevalence is not None: + self.log.info(f"Converting to case/control with prevalence {prevalence}") + # first, find the number of desired positives + k = int(prevalence * len(pt)) + # choose the top k values and label them positive + bool_pt = np.repeat(False, repeats=len(pt)) + bool_pt[np.argpartition(pt, k)[-k:]] = True + pt = bool_pt + pt = pt[:, np.newaxis] # now, save the archived phenotypes for later if self.phens.data is None: self.phens.data = pt @@ -156,14 +168,60 @@ def write(self): def simulate_pt( genotypes: Path, haplotypes: Path, - simu_rep: int, - simu_hsq: float, - simu_k: float, + num_replications: int = 1, + heritability: float = None, + prevalence: float = None, region: str = None, samples: list[str] = None, output: Path = Path("-"), log: Logger = None, ): + """ + Haplotype-aware phenotype simulation. Create a set of simulated phenotypes from a + set of haplotypes. + + GENOTYPES must be formatted as a VCF and HAPLOTYPES must be formatted according + to the .hap format spec + + \f + Examples + -------- + >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simu_phens.tsv + + Parameters + ---------- + genotypes : Path + The path to the genotypes in VCF format + haplotypes : Path + The path to the haplotypes in a .hap file + replications : int, optional + The number of rounds of simulation to perform + heritability : int, optional + The heritability of the simulated trait; must be a float between 0 and 1 + prevalence : int, optional + The prevalence of the disease if the trait should be simulated as case/control; + must be a float between 0 and 1 + + If not provided, a quantitative trait will be simulated, instead + region : str, optional + The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7' + + For this to work, the VCF and .hap file must be indexed and the seqname must + match! + + Defaults to loading all haplotypes + sample : tuple[str], optional + A subset of the samples from which to extract genotypes + + Defaults to loading genotypes from all samples + samples_file : Path, optional + A single column txt file containing a list of the samples (one per line) to + subset from the genotypes file + output : Path, optional + The location to which to write the simulated phenotypes + log : Logger, optional + The logging module for this task + """ if log is None: log = logging.getLogger("run") logging.basicConfig( @@ -194,6 +252,7 @@ def simulate_pt( # Initialize phenotype simulator (haptools simphenotype) log.info("Simulating phenotypes") pt_sim = PhenoSimulator(hp_gt, output=output, log=log) - pt_sim.run(hp.data.values()) + for i in range(num_replications): + pt_sim.run(hp.data.values(), heritability, prevalence) log.info("Writing phenotypes") pt_sim.write() From 7fba06c5fe77bd9ad5ad57c561fa8c974362d822 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 2 Jul 2022 15:01:52 -0700 Subject: [PATCH 24/44] improve commenting in __main__ --- haptools/__main__.py | 92 ++++++++++++++++++++++++++------------ haptools/data/genotypes.py | 6 +-- haptools/transform.py | 2 +- 3 files changed, 67 insertions(+), 33 deletions(-) diff --git a/haptools/__main__.py b/haptools/__main__.py index d2b1fd30..38f7a697 100644 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -80,21 +80,40 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): breakpoints = write_breakpoints(samples, breakpoints, out) ############ Haptools simphenotype ############### -DEFAULT_SIMU_REP = 1 -DEFAULT_SIMU_HSQ = 0.1 -DEFAULT_SIMU_K = 0.1 -################################################## @main.command() @click.argument("genotypes", type=click.Path(exists=True, path_type=Path)) @click.argument("haplotypes", type=click.Path(exists=True, path_type=Path)) +@click.option( + "-r", + "--replications", + type=click.IntRange(min=1), + default=1, + show_default=True, + help="Number of rounds of simulation to perform", +) +@click.option( + "-h", + "--heritability", + type=click.FloatRange(min=0, max=1), + help="Trait heritability", +) +@click.option( + "-p", + "--prevalence", + type=click.FloatRange(min=0, max=1, min_open=True, max_open=True), + show_default="quantitative trait", + help="Disease prevalence if simulating a case-control trait" +) @click.option( "--region", type=str, default=None, show_default="all haplotypes", - help=""" - The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n - For this to work, the VCF must be indexed and the seqname must match!""", + help=( + "The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'" + "For this to work, the VCF and .hap file must be indexed and the seqname " + "provided must correspond with one in the files" + ) ) @click.option( "-s", @@ -118,17 +137,11 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): " subset from the genotypes file" ), ) -@click.option('--simu-rep', help='Number of rounds of simulation to perform', \ - type=int, default=DEFAULT_SIMU_REP) -@click.option('--simu-hsq', help='Trait heritability', \ - type=float, default=DEFAULT_SIMU_HSQ) -@click.option('--simu-k', help='Specify the disease prevalence', \ - type=float, default=DEFAULT_SIMU_K) @click.option( "-o", "--output", type=click.Path(path_type=Path), - default=Path("-"), + default=Path("/dev/stdout"), show_default="stdout", help="A TSV file containing simulated phenotypes", ) @@ -143,7 +156,9 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): def simphenotype( genotypes: Path, haplotypes: Path, - simu_rep, simu_hsq, simu_k, + replications: int = 1, + heritability: float = None, + prevalence: float = None, region: str = None, samples: tuple[str] = tuple(), samples_file: Path = None, @@ -168,10 +183,26 @@ def simphenotype( The path to the genotypes in VCF format haplotypes : Path The path to the haplotypes in a .hap file + replications : int, optional + The number of rounds of simulation to perform + heritability : int, optional + The heritability of the simulated trait; must be a float between 0 and 1 + prevalence : int, optional + The prevalence of the disease if the trait should be simulated as case/control; + must be a float between 0 and 1 + + If not provided, a quantitative trait will be simulated, instead region : str, optional - See documentation for :py:meth:`~.data.Genotypes.read` - sample : Tuple[str], optional - See documentation for :py:meth:`~.data.Genotypes.read` + The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7' + + For this to work, the VCF and .hap file must be indexed and the seqname must + match! + + Defaults to loading all haplotypes + sample : tuple[str], optional + A subset of the samples from which to extract genotypes + + Defaults to loading genotypes from all samples samples_file : Path, optional A single column txt file containing a list of the samples (one per line) to subset from the genotypes file @@ -202,14 +233,11 @@ def simphenotype( samples = list(samples) else: samples = None - # Basic checks on input - # TODO - check VCF zipped, check only one of simu-qt/simu-cc, - # check values of other inputs - # Only use simu-k for case/control # Run simulation simulate_pt( - genotypes, haplotypes, simu_rep, simu_hsq, simu_k, region, samples, output, log + genotypes, haplotypes, replications, heritability, prevalence, region, samples, + output, log ) @main.command(short_help="Transform a genotypes matrix via a set of haplotypes") @@ -222,8 +250,8 @@ def simphenotype( show_default="all haplotypes", help=( "The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'" - "For this to work, the VCF and .hap file must be indexed and the seqnames must" - "correspond with those in the files" + "For this to work, the VCF and .hap file must be indexed and the seqname " + "provided must correspond with one in the files" ) ) @click.option( @@ -291,10 +319,16 @@ def transform( haplotypes : Path The path to the haplotypes in a .hap file region : str, optional - See documentation for :py:meth:`~.data.Genotypes.read` - and :py:meth:`~.data.Haplotypes.read` - sample : Tuple[str], optional - See documentation for :py:meth:`~.data.Genotypes.read` + The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7' + + For this to work, the VCF and .hap file must be indexed and the seqname must + match! + + Defaults to loading all haplotypes + sample : tuple[str], optional + A subset of the samples from which to extract genotypes + + Defaults to loading genotypes from all samples samples_file : Path, optional A single column txt file containing a list of the samples (one per line) to subset from the genotypes file diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 039446a9..ff128ee6 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -289,9 +289,9 @@ def __iter__( def index(self, samples: bool = True, variants: bool = True): """ - Call this function once to improve the speed of future look-ups of samples and - variants by their ID. This is useful if you intend to later subset by a set - of samples or variant IDs. + Call this function once to improve the amortized time-complexity of look-ups of + samples and variants by their ID. This is useful if you intend to later subset + by a set of samples or variant IDs. The time complexity of this function should be roughly O(n+m) if both parameters are True. Otherwise, it will be either O(n) or O(m). diff --git a/haptools/transform.py b/haptools/transform.py index cbcb1bf5..2de57a2f 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -1,8 +1,8 @@ from __future__ import annotations import logging +from pathlib import Path from haptools import data -from .haplotype import HaptoolsHaplotype def transform_haps( From 6c9d7ce29c117fd17acd89a1aa78ea7fa23be256 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 2 Jul 2022 15:06:14 -0700 Subject: [PATCH 25/44] rename sim_phenotypes module to sim_phenotype (singular) --- haptools/__main__.py | 2 +- haptools/{sim_phenotypes.py => sim_phenotype.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename haptools/{sim_phenotypes.py => sim_phenotype.py} (100%) diff --git a/haptools/__main__.py b/haptools/__main__.py index 38f7a697..999e9293 100644 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -213,7 +213,7 @@ def simphenotype( """ import logging - from .sim_phenotypes import simulate_pt + from .sim_phenotype import simulate_pt log = logging.getLogger("run") logging.basicConfig( diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotype.py similarity index 100% rename from haptools/sim_phenotypes.py rename to haptools/sim_phenotype.py From c6f1c5c086cdbf13748ce5892fb0937586debbad Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 3 Jul 2022 14:19:12 -0700 Subject: [PATCH 26/44] create test module for simphenotype --- haptools/sim_phenotype.py | 12 ++++++++---- tests/test_simphenotype.py | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 tests/test_simphenotype.py diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 2b1d3d35..a2c96cc6 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -51,13 +51,14 @@ class PhenoSimulator: >>> haps_gts = GenotypesRefAlt(None) >>> haps.transform(gens, haps_gts) >>> phenosim = PhenoSimulator(haps_gts) - >>> phenotypes = phenosim.run() + >>> phenosim.run(next(haps.data.values())) + >>> phenotypes = phenosim.phens """ def __init__( self, genotypes: Genotypes, - output: Path = None, + output: Path = Path("/dev/stdout"), log: Logger = None, ): """ @@ -67,8 +68,10 @@ def __init__( ---------- genotypes: Genotypes Genotypes for each haplotype - output: Path + output: Path, optional Path to a '.pheno' file to which the generated phenotypes could be written + + Defaults to stdout if not provided log: Logger, optional A logging instance for recording debug statements """ @@ -129,13 +132,14 @@ def run( # effect sizes # assuming the genotypes are independent, this makes the variance of the # noise term equal to 1 - sum(betas^2) + # TODO: figure out how to account for the fact that this can make noise > 1 heritability = np.power(betas, 2).sum() # # account for the fact that the genotypes are not independent by adding the # # covariance between all of the variables # for a_idx, b_idx in combinations(range(len(betas)), 2): # heritability += 2 * betas[a_idx] * betas[b_idx] * \ # np.cov(gts[:,a_idx], gts[:,b_idx])[0][1] - self.log.info(f"Adding environmental component for h^squared: {heritability}") + self.log.info(f"Adding environmental component for h^2: {heritability}") # compute the environmental effect noise = np.var(pt) * (np.reciprocal(heritability) - 1) # finally, add everything together to get the simulated phenotypes diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py new file mode 100644 index 00000000..97500955 --- /dev/null +++ b/tests/test_simphenotype.py @@ -0,0 +1,23 @@ +import os +from pathlib import Path + +import pytest +import numpy as np +import numpy.lib.recfunctions as rfn + +from haptools.sim_phenotype import Haplotype +from haptools.data import ( + Genotypes, + GenotypesRefAlt, + Phenotypes, + Haplotypes, + Variant, + Haplotype as HaplotypeBase, +) + + +DATADIR = Path(__file__).parent.joinpath("data") + +def test_simphenotype(self): + pass + From 1d9677527eebafb541c82de5c5f5c9abcf0aac6f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 5 Jul 2022 17:54:48 -0700 Subject: [PATCH 27/44] try to use new pysam add_samples() method to speed up vcf creation see https://github.com/pysam-developers/pysam/issues/1104 --- haptools/data/genotypes.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index ff128ee6..f9decf72 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -590,10 +590,15 @@ def write(self): ("Description", "Genotype"), ], ) - for sample in self.samples: - # TODO: figure out how to make this work for large datasets - # it gets stuck and takes a long time to exit this loop - vcf.header.add_sample(sample) + try: + vcf.header.add_samples(self.samples) + except AttributeError: + self.log.warning( + "Upgrade to pysam >=0.19.1 to reduce the time required to create " + "VCFs. See https://github.com/pysam-developers/pysam/issues/1104" + ) + for sample in self.samples: + vcf.header.add_sample(sample) self.log.info("Writing VCF records") for var_idx, var in enumerate(self.variants): rec = { From 23a067acf2c15030314b2e5a7b6c9cd5c179adf7 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 5 Jul 2022 17:58:54 -0700 Subject: [PATCH 28/44] use default heritability of 1 and add append method to Phenotypes class --- docs/commands/transform.rst | 2 +- haptools/__main__.py | 4 +++- haptools/data/phenotypes.py | 30 ++++++++++++++++++++++++++++++ haptools/sim_phenotype.py | 26 ++++---------------------- tests/test_data.py | 18 +++++++++++++++++- 5 files changed, 55 insertions(+), 25 deletions(-) diff --git a/docs/commands/transform.rst b/docs/commands/transform.rst index e6f82493..905fb9b7 100644 --- a/docs/commands/transform.rst +++ b/docs/commands/transform.rst @@ -24,7 +24,7 @@ Examples ~~~~~~~~ .. code-block:: bash - haptools transform tests/data/example.vcf.gz tests/data/example.hap.gz | less + haptools transform tests/data/example.vcf.gz tests/data/basic.hap.gz | less Detailed Usage ~~~~~~~~~~~~~~ diff --git a/haptools/__main__.py b/haptools/__main__.py index 999e9293..adeab099 100644 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -95,6 +95,8 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): "-h", "--heritability", type=click.FloatRange(min=0, max=1), + default=1, + show_default=True, help="Trait heritability", ) @click.option( @@ -157,7 +159,7 @@ def simphenotype( genotypes: Path, haplotypes: Path, replications: int = 1, - heritability: float = None, + heritability: float = 1, prevalence: float = None, region: str = None, samples: tuple[str] = tuple(), diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index 1a61ebaa..dbbb2626 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -193,3 +193,33 @@ def standardize(self): This function modifies :py:attr:`~.Genotypes.data` in-place """ self.data = (self.data - np.mean(self.data, axis=0)) / np.std(self.data, axis=0) + + def append(self, name: str, data: npt.NDArray): + """ + Append a new set of phenotypes to the current set + + Parameters + ---------- + name: str + The name of the new phenotype + data: npt.NDArray + A 1D np array of the same length as :py:attr:`~.Phenotypes.samples`, + containing the phenotype values for each sample. Must have the same dtype + as :py:attr:`~.Phenotypes.data.` + """ + if len(self.samples): + if len(self.samples) != len(data): + self.log.error( + "The data provided to the add() method is not of the appropriate" + "length" + ) + else: + self.log.warning( + "Set the samples property of the Phenotypes instance before calling " + "the add() method" + ) + if self.unset(): + self.data = data[:, np.newaxis] + else: + self.data = np.concatenate((self.data, data[:, np.newaxis]), axis=1) + self.names = self.names + (name,) diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index a2c96cc6..ca7a1c8b 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -85,7 +85,7 @@ def __init__( def run( self, effects: list[Haplotype], - heritability: float = None, + heritability: float = 1, prevalence: float = None, ) -> npt.NDArray: """ @@ -120,25 +120,12 @@ def run( self.log.info(f"Extracting haplotype genotypes for haps: {ids}") # extract the haplotype "genotypes" and compute the phenotypes gts = self.gens.subset(variants=ids).data.sum(axis=2) - self.log.info("Computing genetic component") + self.log.info(f"Computing genetic component w/ {gts.shape[0]} causal effects") # standardize the genotypes gts = (gts - gts.mean(axis=0)) / gts.std(axis=0) # generate the genetic component pt = (betas * gts).sum(axis=1) # compute the heritability - if heritability is None: - self.log.info("Computing heritability") - # if heritability is not defined, then we set it equal to the sum of the - # effect sizes - # assuming the genotypes are independent, this makes the variance of the - # noise term equal to 1 - sum(betas^2) - # TODO: figure out how to account for the fact that this can make noise > 1 - heritability = np.power(betas, 2).sum() - # # account for the fact that the genotypes are not independent by adding the - # # covariance between all of the variables - # for a_idx, b_idx in combinations(range(len(betas)), 2): - # heritability += 2 * betas[a_idx] * betas[b_idx] * \ - # np.cov(gts[:,a_idx], gts[:,b_idx])[0][1] self.log.info(f"Adding environmental component for h^2: {heritability}") # compute the environmental effect noise = np.var(pt) * (np.reciprocal(heritability) - 1) @@ -152,13 +139,8 @@ def run( bool_pt = np.repeat(False, repeats=len(pt)) bool_pt[np.argpartition(pt, k)[-k:]] = True pt = bool_pt - pt = pt[:, np.newaxis] # now, save the archived phenotypes for later - if self.phens.data is None: - self.phens.data = pt - else: - self.phens.data = np.concatenate((self.phens.data, pt), axis=1) - self.phens.names = self.phens.names + ("-".join(ids),) + self.phens.append(name="-".join(ids), data=pt) return pt def write(self): @@ -173,7 +155,7 @@ def simulate_pt( genotypes: Path, haplotypes: Path, num_replications: int = 1, - heritability: float = None, + heritability: float = 1, prevalence: float = None, region: str = None, samples: list[str] = None, diff --git a/tests/test_data.py b/tests/test_data.py index 30c1e3a9..001eac7a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -5,7 +5,7 @@ import numpy as np import numpy.lib.recfunctions as rfn -from haptools.haplotype import HaptoolsHaplotype +from haptools.sim_phenotype import Haplotype as HaptoolsHaplotype from haptools.data import ( Genotypes, GenotypesRefAlt, @@ -320,6 +320,22 @@ def test_write_phenotypes(self): # let's clean up after ourselves and delete the file os.remove(str(expected_phen.fname)) + def test_append_phenotype(self): + expected1 = self._get_fake_phenotypes() + expected2 = self._get_fake_phenotypes() + + # add a phenotype called "test" to the set of phenotypes + new_phen = np.array([5, 2, 8, 0, 3], dtype=expected2.data.dtype) + expected2.append("test", new_phen) + + # did it get added correctly? + assert expected1.data.shape[1] == expected2.data.shape[1] - 1 + assert len(expected1.names) == len(expected2.names) - 1 + assert expected2.names[2] == "test" + assert len(expected1.samples) == len(expected2.samples) + np.testing.assert_allclose(expected1.data, expected2.data[:, :2]) + np.testing.assert_allclose(expected2.data[:, 2], new_phen) + class TestCovariates: def _get_expected_covariates(self): From 30852bcd297d9ad43d9ece7885d3692e393ddf84 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 5 Jul 2022 19:43:25 -0700 Subject: [PATCH 29/44] clarify installation and contribution guidelines remove @mlamkin7's email so he doesn't get spam from bots that scrape repositories --- README.md | 10 ++++++---- haptools/data/phenotypes.py | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f9d64180..f8692eab 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,10 @@ Homepage: https://haptools.readthedocs.io/ ## Installation -UNDER CONSTRUCTION +We have not officially published `haptools` yet, but in the meantime, you can install it directly from our Github repository. +```bash +pip install git+https://github.com/gymrek-lab/haptools.git +``` ## Haptools utilities @@ -28,7 +31,6 @@ Outputs produced by these utilities are compatible with each other. For example ## Contributing -If you are interested in contributing to `haptools`, please get in touch by submitting a Github issue or contacting us at mlamkin@ucsd.edu. - - +We gladly welcome any contributions to `haptools`! +Please read [our contribution guidelines](https://haptools.readthedocs.io/en/latest/project_info/contributing.html) and then submit a [Github issue](https://github.com/gymrek-lab/haptools/issues). diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index dbbb2626..407ebde4 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -8,6 +8,7 @@ from fileinput import hook_compressed import numpy as np +import numpy.typing as npt from .data import Data From fe63e3263131b8a810586c3ce64f73c0ba7322d8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 7 Jul 2022 09:49:05 -0700 Subject: [PATCH 30/44] seed the simphenotype random number generator --- haptools/data/genotypes.py | 4 ++-- haptools/data/haplotypes.py | 4 ++-- haptools/sim_phenotype.py | 12 ++++++++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index f9decf72..98d4cd83 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -348,7 +348,7 @@ def subset( diff = len(samples) - len(gts.samples) self.log.warning( f"Saw {diff} fewer samples than requested. Proceeding with " - f"f{len(gts.samples)} samples." + f"{len(gts.samples)} samples." ) samp_idx = tuple(self._samp_idx[samp] for samp in gts.samples) if inplace: @@ -361,7 +361,7 @@ def subset( diff = len(variants) - len(var_idx) self.log.warning( f"Saw {diff} fewer variants than requested. Proceeding with " - f"f{len(var_idx)} variants." + f"{len(var_idx)} variants." ) gts.variants = self.variants[var_idx] if inplace: diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py index bd3f7da3..5adb031a 100644 --- a/haptools/data/haplotypes.py +++ b/haptools/data/haplotypes.py @@ -234,8 +234,8 @@ class Haplotype: The chromosomal end position of the haplotype id: str The haplotype's unique ID - variants: list[Variant] - A list of the variants in this haplotype + variants: tuple[Variant] + The variants in this haplotype _extras: tuple[Extra] Extra fields for the haplotype diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index ca7a1c8b..b0c5386d 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -41,6 +41,8 @@ class PhenoSimulator: Genotypes to simulate phens: Phenotypes Simulated phenotypes; filled by :py:meth:`~.PhenoSimular.run` + rng: np.random.Generator, optional + A numpy random number generator log: Logger A logging instance for recording debug statements @@ -59,6 +61,7 @@ def __init__( self, genotypes: Genotypes, output: Path = Path("/dev/stdout"), + seed: int = None, log: Logger = None, ): """ @@ -72,14 +75,19 @@ def __init__( Path to a '.pheno' file to which the generated phenotypes could be written Defaults to stdout if not provided + seed: int, optional + A seed to initialize the random number generator + + This is useful if you want the generated phenotypes to be the same across + multiple PhenoSimulator instances. If not provided, it will be random. log: Logger, optional A logging instance for recording debug statements """ self.gens = genotypes self.phens = Phenotypes(fname=output) - self.phens.names = tuple() self.phens.data = None self.phens.samples = self.gens.samples + self.rng = np.random.default_rng(seed) self.log = log or getLogger(self.__class__.__name__) def run( @@ -130,7 +138,7 @@ def run( # compute the environmental effect noise = np.var(pt) * (np.reciprocal(heritability) - 1) # finally, add everything together to get the simulated phenotypes - pt += np.random.normal(0, noise, size=pt.shape) + pt += self.rng.normal(0, noise, size=pt.shape) if prevalence is not None: self.log.info(f"Converting to case/control with prevalence {prevalence}") # first, find the number of desired positives From 24fef3923520cc54c070a8bee714b49235e462e3 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 8 Jul 2022 08:53:35 -0700 Subject: [PATCH 31/44] mark case/control in pheno output --- haptools/sim_phenotype.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index b0c5386d..71bacadd 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -139,6 +139,7 @@ def run( noise = np.var(pt) * (np.reciprocal(heritability) - 1) # finally, add everything together to get the simulated phenotypes pt += self.rng.normal(0, noise, size=pt.shape) + name_suffix = "" if prevalence is not None: self.log.info(f"Converting to case/control with prevalence {prevalence}") # first, find the number of desired positives @@ -147,8 +148,9 @@ def run( bool_pt = np.repeat(False, repeats=len(pt)) bool_pt[np.argpartition(pt, k)[-k:]] = True pt = bool_pt + name_suffix = "-cc" # now, save the archived phenotypes for later - self.phens.append(name="-".join(ids), data=pt) + self.phens.append(name="-".join(ids)+name_suffix, data=pt) return pt def write(self): From 9de40ee343a6a30d46a9f9e2355b51035a76bd21 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 8 Jul 2022 08:54:05 -0700 Subject: [PATCH 32/44] add tests for simphenotype still need to revise case/control test --- tests/test_simphenotype.py | 184 +++++++++++++++++++++++++++++++++++-- 1 file changed, 178 insertions(+), 6 deletions(-) diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py index 97500955..c53015f0 100644 --- a/tests/test_simphenotype.py +++ b/tests/test_simphenotype.py @@ -5,19 +5,191 @@ import numpy as np import numpy.lib.recfunctions as rfn -from haptools.sim_phenotype import Haplotype +from haptools.sim_phenotype import Haplotype, PhenoSimulator from haptools.data import ( Genotypes, - GenotypesRefAlt, Phenotypes, Haplotypes, - Variant, - Haplotype as HaplotypeBase, ) DATADIR = Path(__file__).parent.joinpath("data") -def test_simphenotype(self): - pass +class TestSimPhenotype: + def _get_fake_gens(self): + gts = Genotypes(fname=None) + gts.data = np.array([ + [[1, 0], [0, 1]], + [[0, 1], [0, 1]], + [[0, 0], [1, 1]], + [[1, 1], [1, 1]], + [[0, 0], [0, 0]], + ], dtype=np.uint8) + gts.variants = np.array( + [ + ("1:10114:T:C", "1", 10114, 0.7), + ("1:10116:A:G", "1", 10116, 0.6), + ], + dtype=[ + ("id", "U50"), + ("chrom", "U10"), + ("pos", np.uint32), + ("aaf", np.float64), + ], + ) + gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + return gts + def _get_fake_haps(self): + return [ + Haplotype("1", 10114, 10115, "1:10114:T:C", "CEU", 0.25), + Haplotype("1", 10116, 10117, "1:10116:A:G", "YRI", 0.75), + ] + + def _get_expected_phens(self): + pts = Phenotypes(fname=None) + pts.data = np.array([ + [-0.13363062, False], + [-0.13363062, False], + [ 0.53452248, True], + [ 1.20267559, True], + [-1.46993683, False], + ], dtype=np.float64) + pts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") + pts.names = ("1:10114:T:C", "1:10116:A:G") + return pts + + def test_one_phen_zero_noise(self): + gts = self._get_fake_gens() + hps = self._get_fake_haps() + expected = self._get_expected_phens() + + pt_sim = PhenoSimulator(gts, seed=42) + data = pt_sim.run([hps[0]], heritability=1) + data = data[:, np.newaxis] + phens = pt_sim.phens + + # check the data and the generated phenotype object + assert phens.data.shape == (5, 1) + np.testing.assert_allclose(phens.data, data) + assert phens.data[0, 0] == phens.data[1, 0] + assert phens.data[2, 0] == phens.data[4, 0] + assert phens.data[3, 0] > phens.data[0, 0] + assert phens.data[2, 0] < phens.data[0, 0] + assert phens.samples == expected.samples + assert phens.names[0] == expected.names[0] + + def test_one_phen_zero_noise_neg_beta(self): + """ + the same test as test_one_phen_zero_noise but with a negative beta this time + """ + gts = self._get_fake_gens() + hps = self._get_fake_haps() + # make the beta value negative + hps[0].beta = -hps[0].beta + expected = self._get_expected_phens() + + pt_sim = PhenoSimulator(gts, seed=42) + data = pt_sim.run([hps[0]], heritability=1) + data = data[:, np.newaxis] + phens = pt_sim.phens + + # check the data and the generated phenotype object + assert phens.data.shape == (5, 1) + np.testing.assert_allclose(phens.data, data) + assert phens.data[0, 0] == phens.data[1, 0] + assert phens.data[2, 0] == phens.data[4, 0] + assert phens.data[3, 0] < phens.data[0, 0] + assert phens.data[2, 0] > phens.data[0, 0] + assert phens.samples == expected.samples + assert phens.names[0] == expected.names[0] + + def test_two_phens_zero_noise(self): + gts = self._get_fake_gens() + hps = self._get_fake_haps() + expected = self._get_expected_phens() + + pt_sim = PhenoSimulator(gts, seed=42) + pt_sim.run([hps[0]], heritability=1) + pt_sim.run([hps[1]], heritability=1) + phens = pt_sim.phens + + # check the data and the generated phenotype object + assert phens.data.shape == (5, 2) + assert phens.data[0, 0] == phens.data[1, 0] + assert phens.data[2, 0] == phens.data[4, 0] + assert phens.data[3, 0] > phens.data[0, 0] + assert phens.data[2, 0] < phens.data[0, 0] + + assert phens.data[0, 1] == phens.data[1, 1] + assert phens.data[2, 1] == phens.data[3, 1] + assert phens.data[3, 1] > phens.data[0, 1] + assert phens.data[4, 1] < phens.data[0, 1] + + assert phens.samples == expected.samples + assert phens.names == expected.names + + def test_combined_phen_zero_noise(self): + gts = self._get_fake_gens() + hps = self._get_fake_haps() + expected = self._get_expected_phens() + + pt_sim = PhenoSimulator(gts, seed=42) + pt_sim.run(hps, heritability=1) + phens = pt_sim.phens + + # check the data and the generated phenotype object + assert phens.data.shape == (5, 1) + np.testing.assert_allclose(phens.data[:, 0], expected.data[:, 0]) + + assert phens.samples == expected.samples + assert phens.names == ("-".join(expected.names),) + + def test_noise(self): + gts = self._get_fake_gens() + hps = self._get_fake_haps() + expected = self._get_expected_phens() + + pt_sim = PhenoSimulator(gts, seed=42) + pt_sim.run(hps, heritability=1) + pt_sim.run(hps, heritability=0.96) + pt_sim.run(hps, heritability=0.5) + pt_sim.run(hps, heritability=0.2) + phens = pt_sim.phens + + # check the data and the generated phenotype object + assert phens.data.shape == (5, 4) + np.testing.assert_allclose(phens.data[:, 0], expected.data[:, 0]) + diff1 = np.abs(phens.data[:, 1] - phens.data[:, 0]).sum() + assert diff1 > 0 + diff2 = np.abs(phens.data[:, 2] - phens.data[:, 0]).sum() + assert diff2 > diff1 + diff3 = np.abs(phens.data[:, 3] - phens.data[:, 0]).sum() + assert diff3 > diff2 + + def test_case_control(self): + gts = self._get_fake_gens() + hps = self._get_fake_haps() + expected = self._get_expected_phens() + all_true = np.empty(expected.data.shape, dtype=np.bool_) + some_true = expected.data[:, 1].astype(np.bool_) + all_false = ~np.empty(expected.data.shape, dtype=np.bool_) + + pt_sim = PhenoSimulator(gts, seed=42) + pt_sim.run(hps, heritability=1, prevalence=0.8) + pt_sim.run(hps, heritability=1, prevalence=1) + pt_sim.run(hps, heritability=1, prevalence=0) + pt_sim.run(hps, heritability=0.96, prevalence=0.8) + pt_sim.run(hps, heritability=0.5, prevalence=0.8) + pt_sim.run(hps, heritability=0.2, prevalence=0.8) + phens = pt_sim.phens + assert phens.data.shape == (5, 6) + np.testing.assert_allclose(phens.data[:, 1], some_true) + np.testing.assert_allclose(phens.data[:, 0], all_true[:, 0]) + np.testing.assert_allclose(phens.data[:, 2], all_false[:, 0]) + diff1 = (phens.data[:, 3] == phens.data[:, 0]).sum() + assert diff1 > 0 + diff2 = (phens.data[:, 4] == phens.data[:, 0]).sum() + assert diff2 > diff1 + diff3 = (phens.data[:, 5] == phens.data[:, 0]).sum() + assert diff3 > diff2 From 9a616fd8a01fe07414743673c158353105bee5b0 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 8 Jul 2022 09:46:01 -0700 Subject: [PATCH 33/44] finish testing case/control simphenotype --- haptools/sim_phenotype.py | 11 ++++++++--- tests/test_simphenotype.py | 24 +++++++++--------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 71bacadd..cbaea5f8 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -139,18 +139,23 @@ def run( noise = np.var(pt) * (np.reciprocal(heritability) - 1) # finally, add everything together to get the simulated phenotypes pt += self.rng.normal(0, noise, size=pt.shape) + # now, handle case/control name_suffix = "" if prevalence is not None: self.log.info(f"Converting to case/control with prevalence {prevalence}") # first, find the number of desired positives k = int(prevalence * len(pt)) # choose the top k values and label them positive - bool_pt = np.repeat(False, repeats=len(pt)) - bool_pt[np.argpartition(pt, k)[-k:]] = True + bool_pt = np.zeros(len(pt), dtype=np.bool_) + if k == len(pt): + bool_pt = ~bool_pt + else: + max_indices = np.argpartition(-pt, k)[:k] + bool_pt[max_indices] = True pt = bool_pt name_suffix = "-cc" # now, save the archived phenotypes for later - self.phens.append(name="-".join(ids)+name_suffix, data=pt) + self.phens.append(name="-".join(ids)+name_suffix, data=pt.astype(np.float64)) return pt def write(self): diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py index c53015f0..6c937212 100644 --- a/tests/test_simphenotype.py +++ b/tests/test_simphenotype.py @@ -171,25 +171,19 @@ def test_case_control(self): gts = self._get_fake_gens() hps = self._get_fake_haps() expected = self._get_expected_phens() - all_true = np.empty(expected.data.shape, dtype=np.bool_) - some_true = expected.data[:, 1].astype(np.bool_) - all_false = ~np.empty(expected.data.shape, dtype=np.bool_) + all_false = np.zeros(expected.data.shape, dtype=np.float64) + some_true = expected.data[:, 1] + all_true = (~(all_false.astype(np.bool_))).astype(np.float64) pt_sim = PhenoSimulator(gts, seed=42) - pt_sim.run(hps, heritability=1, prevalence=0.8) + pt_sim.run(hps, heritability=1, prevalence=0.4) pt_sim.run(hps, heritability=1, prevalence=1) pt_sim.run(hps, heritability=1, prevalence=0) - pt_sim.run(hps, heritability=0.96, prevalence=0.8) - pt_sim.run(hps, heritability=0.5, prevalence=0.8) - pt_sim.run(hps, heritability=0.2, prevalence=0.8) + pt_sim.run(hps, heritability=0.5, prevalence=0.4) phens = pt_sim.phens - assert phens.data.shape == (5, 6) - np.testing.assert_allclose(phens.data[:, 1], some_true) - np.testing.assert_allclose(phens.data[:, 0], all_true[:, 0]) - np.testing.assert_allclose(phens.data[:, 2], all_false[:, 0]) + assert phens.data.shape == (5, 4) + np.testing.assert_allclose(phens.data[:, 0], some_true) + np.testing.assert_allclose(phens.data[:, 1], all_true[:, 1]) + np.testing.assert_allclose(phens.data[:, 2], all_false[:, 1]) diff1 = (phens.data[:, 3] == phens.data[:, 0]).sum() assert diff1 > 0 - diff2 = (phens.data[:, 4] == phens.data[:, 0]).sum() - assert diff2 > diff1 - diff3 = (phens.data[:, 5] == phens.data[:, 0]).sum() - assert diff3 > diff2 From b7e0dc08e412db8c4682676a71dd20a64b51de9c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 8 Jul 2022 09:46:42 -0700 Subject: [PATCH 34/44] ensure written pheno file has unique names --- haptools/__main__.py | 2 +- haptools/data/phenotypes.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) mode change 100644 => 100755 haptools/__main__.py diff --git a/haptools/__main__.py b/haptools/__main__.py old mode 100644 new mode 100755 index adeab099..ee1980f5 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -102,7 +102,7 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): @click.option( "-p", "--prevalence", - type=click.FloatRange(min=0, max=1, min_open=True, max_open=True), + type=click.FloatRange(min=0, max=1, min_open=False, max_open=True), show_default="quantitative trait", help="Disease prevalence if simulating a case-control trait" ) diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index 407ebde4..a55cb961 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -2,10 +2,10 @@ from csv import reader from pathlib import Path from io import TextIOBase -from collections import namedtuple from collections.abc import Iterable from logging import getLogger, Logger from fileinput import hook_compressed +from collections import namedtuple, Counter import numpy as np import numpy.typing as npt @@ -180,8 +180,15 @@ def write(self): >>> phenotypes.samples = ('HG00096', 'HG00097', 'HG00099') >>> phenotypes.write() """ + # make sure the names are unique + uniq_names = Counter() + names = [] + for name in self.names: + names.append(name+(f"-{uniq_names[name]}" if uniq_names[name] else "")) + uniq_names[name] += 1 + # now we can finally write the file with hook_compressed(self.fname, mode="wt") as haps: - haps.write("#IID\t" + "\t".join(self.names) + "\n") + haps.write("#IID\t" + "\t".join(names) + "\n") formatter = {"float_kind": lambda x: "%.2f" % x} for samp, phen in zip(self.samples, self.data): line = np.array2string(phen, separator="\t", formatter=formatter)[1:-1] From f02cb939f1fe18adf3ba12ef4babef781e2302ea Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 8 Jul 2022 09:52:02 -0700 Subject: [PATCH 35/44] refmt with black --- haptools/data/phenotypes.py | 9 ++++++--- haptools/sim_phenotype.py | 2 +- tests/test_simphenotype.py | 35 +++++++++++++++++++++-------------- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py index a55cb961..d43222c4 100644 --- a/haptools/data/phenotypes.py +++ b/haptools/data/phenotypes.py @@ -182,9 +182,12 @@ def write(self): """ # make sure the names are unique uniq_names = Counter() - names = [] - for name in self.names: - names.append(name+(f"-{uniq_names[name]}" if uniq_names[name] else "")) + names = [None] * len(self.names) + for idx, name in enumerate(self.names): + suffix = "" + if uniq_names[name]: + suffix = f"-{uniq_names[name]}" + names[idx] = name + suffix uniq_names[name] += 1 # now we can finally write the file with hook_compressed(self.fname, mode="wt") as haps: diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index cbaea5f8..8829b04e 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -155,7 +155,7 @@ def run( pt = bool_pt name_suffix = "-cc" # now, save the archived phenotypes for later - self.phens.append(name="-".join(ids)+name_suffix, data=pt.astype(np.float64)) + self.phens.append(name="-".join(ids) + name_suffix, data=pt.astype(np.float64)) return pt def write(self): diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py index 6c937212..a47d761d 100644 --- a/tests/test_simphenotype.py +++ b/tests/test_simphenotype.py @@ -15,16 +15,20 @@ DATADIR = Path(__file__).parent.joinpath("data") + class TestSimPhenotype: def _get_fake_gens(self): gts = Genotypes(fname=None) - gts.data = np.array([ - [[1, 0], [0, 1]], - [[0, 1], [0, 1]], - [[0, 0], [1, 1]], - [[1, 1], [1, 1]], - [[0, 0], [0, 0]], - ], dtype=np.uint8) + gts.data = np.array( + [ + [[1, 0], [0, 1]], + [[0, 1], [0, 1]], + [[0, 0], [1, 1]], + [[1, 1], [1, 1]], + [[0, 0], [0, 0]], + ], + dtype=np.uint8, + ) gts.variants = np.array( [ ("1:10114:T:C", "1", 10114, 0.7), @@ -48,13 +52,16 @@ def _get_fake_haps(self): def _get_expected_phens(self): pts = Phenotypes(fname=None) - pts.data = np.array([ - [-0.13363062, False], - [-0.13363062, False], - [ 0.53452248, True], - [ 1.20267559, True], - [-1.46993683, False], - ], dtype=np.float64) + pts.data = np.array( + [ + [-0.13363062, False], + [-0.13363062, False], + [0.53452248, True], + [1.20267559, True], + [-1.46993683, False], + ], + dtype=np.float64, + ) pts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101") pts.names = ("1:10114:T:C", "1:10116:A:G") return pts From d2f144ca380c9df73d744f0fb2a00f45ca140030 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 8 Jul 2022 12:01:23 -0700 Subject: [PATCH 36/44] add docs for phenosimulator to API section --- docs/api/haptools.rst | 8 ++++++++ haptools/sim_phenotype.py | 5 ++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/api/haptools.rst b/docs/api/haptools.rst index d37b5ab7..00c8115b 100644 --- a/docs/api/haptools.rst +++ b/docs/api/haptools.rst @@ -51,3 +51,11 @@ haptools.data.haplotypes module :members: :undoc-members: :show-inheritance: + +haptools.sim_phenotype module +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. automodule:: haptools.sim_phenotype + :members: + :undoc-members: + :show-inheritance: diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 8829b04e..68c81d08 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -50,10 +50,9 @@ class PhenoSimulator: -------- >>> gens = Genotypes.load("tests/data/example.vcf.gz") >>> haps = Haplotypes.load("tests/data/basic.hap") - >>> haps_gts = GenotypesRefAlt(None) - >>> haps.transform(gens, haps_gts) + >>> haps_gts = haps.transform(gens) >>> phenosim = PhenoSimulator(haps_gts) - >>> phenosim.run(next(haps.data.values())) + >>> phenosim.run(haps.data.values()) >>> phenotypes = phenosim.phens """ From 729138076758454c1717670cb9d60781fdb9c0d6 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 8 Jul 2022 13:37:00 -0700 Subject: [PATCH 37/44] rename phenotype simulation tests dunno why I named them like that originally - must not have been thinking --- tests/test_simphenotype.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py index a47d761d..00339656 100644 --- a/tests/test_simphenotype.py +++ b/tests/test_simphenotype.py @@ -66,7 +66,7 @@ def _get_expected_phens(self): pts.names = ("1:10114:T:C", "1:10116:A:G") return pts - def test_one_phen_zero_noise(self): + def test_one_hap_zero_noise(self): gts = self._get_fake_gens() hps = self._get_fake_haps() expected = self._get_expected_phens() @@ -86,7 +86,7 @@ def test_one_phen_zero_noise(self): assert phens.samples == expected.samples assert phens.names[0] == expected.names[0] - def test_one_phen_zero_noise_neg_beta(self): + def test_one_hap_zero_noise_neg_beta(self): """ the same test as test_one_phen_zero_noise but with a negative beta this time """ @@ -111,7 +111,7 @@ def test_one_phen_zero_noise_neg_beta(self): assert phens.samples == expected.samples assert phens.names[0] == expected.names[0] - def test_two_phens_zero_noise(self): + def test_two_haps_zero_noise(self): gts = self._get_fake_gens() hps = self._get_fake_haps() expected = self._get_expected_phens() @@ -136,7 +136,7 @@ def test_two_phens_zero_noise(self): assert phens.samples == expected.samples assert phens.names == expected.names - def test_combined_phen_zero_noise(self): + def test_combined_haps_zero_noise(self): gts = self._get_fake_gens() hps = self._get_fake_haps() expected = self._get_expected_phens() From 81e48a3472dcf7bf1f2908379d1ba92f247d58bb Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:00:21 -0700 Subject: [PATCH 38/44] revise simphenotype command docs after changes to CLI --- docs/commands/karyogram.md | 2 +- docs/commands/simphenotype.md | 76 ---------------------------------- docs/commands/simphenotype.rst | 75 ++++++++++++++++++++++++++++++++- docs/formats/phenotypes.rst | 2 +- docs/index.rst | 1 + haptools/__main__.py | 2 +- 6 files changed, 77 insertions(+), 81 deletions(-) delete mode 100644 docs/commands/simphenotype.md diff --git a/docs/commands/karyogram.md b/docs/commands/karyogram.md index 706f2419..9b898e40 100644 --- a/docs/commands/karyogram.md +++ b/docs/commands/karyogram.md @@ -1,4 +1,4 @@ -# Haptools karyogram +# karyogram `haptools karyogram` takes as input a breakpoints file (e.g. as output by `haptools simgenotype`) and a sample name, and plots a karyogram depicting local ancestry tracks. diff --git a/docs/commands/simphenotype.md b/docs/commands/simphenotype.md deleted file mode 100644 index 629a95be..00000000 --- a/docs/commands/simphenotype.md +++ /dev/null @@ -1,76 +0,0 @@ -# simphenotype - -Haptools simphenotype simulates a complex trait, taking into account haplotype- or local-ancestry- specific effects as well as traditional variant-level effects. It takes causal effects and genotypes as input and outputs simulated phenotypes. - -Usage is modeled based on the [GCTA GWAS Simulation](https://yanglab.westlake.edu.cn/software/gcta/#GWASSimulation) utility. - -## Usage - -Below is a basic `haptools simphenotype` command: - -``` -haptools simphenotype \ - --vcf \ - --hap \ - --out \ - < --simu-qt | --simu-cc > \ - [simulation options] -``` - -Required parameters: - -* `--vcf `: A bgzipped, tabix-indexed, phased VCF file. If you are simulating local-ancestry effects, the VCF file must contain the `FORMAT/LA` tag included in output of `haptools simgenotype`. See [haptools file formats](../../docs/formats/inputs.rst) for more details. - -* `--hap `: A bgzipped, tabix-indexed HAP file, which specifies causal effects. This is a custom format described in more detail in [haptools file formats](../../docs/formats/haplotypes.rst). The HAP format enables flexible specification of a range of effect types including traditional variant-level effects, haplotype-level effects, associations with repeat lengths at short tandem repeats, and interaction of these effects with local ancestry labels. See [Examples](#examples) below for detailed examples of how to specify effects. - -* `--out `: Prefix to name output files. - -* `--simu-qt` or `simu-cc` indicate whether to simulate a quantitative or case control trait. One of these two options must be specified. - -Additional parameters: - -* `--simu-rep `: Number of phenotypes to simulate. Default: 1. - -* `--simu-hsq `: Trait heritability. Default: 0.1. - -* `--simu-k `: Disease prevalence (for case-control). Default: 0.1 - -## Output files - -`haptools simphenotypes` outputs the following files: - -* `.phen`: Based on the phenotype files accepted by [plink](https://www.cog-genomics.org/plink/1.9/input#pheno). Tab-delimited file with one row per sample. The first and second columns give the sample ID. The third column gives the simulated phenotype. If `--simu-rep` was set to greater than 1, additional columns are output for each simulated trait. Example file: - -``` -HG00096 HG00096 0.058008375906919506 -HG00097 HG00097 0.03472768471423458 -HG00099 HG00099 -0.20850127859705808 -HG00100 HG00100 -0.21206803352471154 -HG00101 HG00101 0.3157913763428323 -``` - -* `.par`: summarizes the frequencies and effects of simulated haplotypes. The columns are: haplotype ID (from the HAP file), haplotype frequency, and effect. Example file: - -``` -Haplotype Frequency Effect -H-001 0.6 -0.2 -``` - - -## Examples - -#### Simulate a single haplotype-effect based on a 2 SNP haplotype: - -``` -haptools simphenotype \ - --vcf tests/data/simple.vcf.gz \ - --hap tests/data/simple.hap.gz \ - --out test \ - --simu-qt --simu-hsq 0.8 --simu-rep 1 -``` - -based on this HAP file (available in `tests/data`) - -``` -H-001 1 10114 10116 1:10114:T:C,1:10116:A:G T,G * -0.2 -``` \ No newline at end of file diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst index 5c2afcfe..a0808872 100644 --- a/docs/commands/simphenotype.rst +++ b/docs/commands/simphenotype.rst @@ -1,4 +1,75 @@ .. _commands-simphenotype: -.. include:: simphenotype.md - :parser: myst_parser.sphinx_ \ No newline at end of file + +simphenotype +============ + +Simulates a complex trait, taking into account haplotype- or local-ancestry- specific effects as well as traditional variant-level effects. The user denotes causal variables to use within the simulation by specifying them in a ``.hap`` file. + +The implementation is based on the `GCTA GWAS Simulation `_ utility. + +Usage +~~~~~ +.. code-block:: bash + + haptools transform \ + --replications INT \ + --heritability FLOAT \ + --prevalence FLOAT \ + --region TEXT \ + --sample SAMPLE \ + --samples-file FILENAME \ + --output PATH \ + --verbosity [CRITICAL|ERROR|WARNING|INFO|DEBUG|NOTSET] \ + GENOTYPES HAPLOTYPES + +Model +~~~~~ +Each haplotype is encoded as an independent causal variable in the linear model. + +.. math:: + + \vec{y} = \sum_j \beta_j \vec{Z_j} + \vec \epsilon + +where + +.. math:: + + \epsilon_i \sim N(0, \sigma^2) + +.. math:: + + \sigma^2 = Var[\sum_j \beta_j \vec{Z_j}] * (\frac 1 {h^2} - 1) + +The heritability :math:`h^2` is user-specified but defaults to 1 when not provided. + +If a prevalence for the disease is specified, the final :math:`\vec{y}` value will be thresholded to produce a binary case/control trait with the desired fraction of diseased individuals. + +Output +~~~~~~ +Phenotypes are output in the PLINK2-style ``.pheno`` file format. If ``--replications`` was set to greater than 1, additional columns are output for each simulated trait. + +Examples +~~~~~~~~ +.. code-block:: bash + + haptools simphenotype -o simulated.pheno tests/data/example.vcf.gz tests/data/example.hap.gz + +Simulate two replicates of a case/control trait that occurs in 60% of your samples with a heritability of 0.8. Encode all of the haplotypes in ``tests/data/example.hap.gz`` as independent causal variables. + +.. code-block:: bash + + haptools simphenotype \ + --replications 2 \ + --heritability 0.8 \ + --prevalence 0.6 \ + --output simulated.pheno \ + tests/data/example.vcf.gz tests/data/example.hap.gz + +Detailed Usage +~~~~~~~~~~~~~~ + +.. click:: haptools.__main__:main + :prog: haptools + :show-nested: + :commands: simphenotype \ No newline at end of file diff --git a/docs/formats/phenotypes.rst b/docs/formats/phenotypes.rst index 2d856ff2..f300076f 100644 --- a/docs/formats/phenotypes.rst +++ b/docs/formats/phenotypes.rst @@ -6,7 +6,7 @@ Phenotypes and Covariates Phenotype file format --------------------- -Phenotypes are expected to follow the PLINK2 ``.pheno`` file format. This is a +Phenotypes are expected to follow `the PLINK2 .pheno file format `_. This is a tab-separated format where the first column corresponds to the sample ID, and subsequent columns contain each of your phenotypes. diff --git a/docs/index.rst b/docs/index.rst index b2a14574..bda46b4c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,6 +21,7 @@ commands/simgenotype.rst commands/simphenotype.rst + commands/karyogram.rst commands/transform.rst .. toctree:: diff --git a/haptools/__main__.py b/haptools/__main__.py index ee1980f5..7518d043 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -177,7 +177,7 @@ def simphenotype( \f Examples -------- - >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simu_phens.tsv + >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simulated.pheno Parameters ---------- From 8b5de5c7aa17f46fdfd8a2567fe4173f6038ddf4 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:06:42 -0700 Subject: [PATCH 39/44] oops - fix typos --- docs/commands/simphenotype.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst index a0808872..5ee89d33 100644 --- a/docs/commands/simphenotype.rst +++ b/docs/commands/simphenotype.rst @@ -12,7 +12,7 @@ Usage ~~~~~ .. code-block:: bash - haptools transform \ + haptools simphenotype \ --replications INT \ --heritability FLOAT \ --prevalence FLOAT \ @@ -25,7 +25,7 @@ Usage Model ~~~~~ -Each haplotype is encoded as an independent causal variable in the linear model. +Each normalized haplotype :math:`\vec{Z_j}` is encoded as an independent causal variable in a linear model: .. math:: @@ -72,4 +72,4 @@ Detailed Usage .. click:: haptools.__main__:main :prog: haptools :show-nested: - :commands: simphenotype \ No newline at end of file + :commands: simphenotype From b237872b7d4875f5b6324c85b0b6a9c75465e64c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 18 Jul 2022 23:35:59 -0700 Subject: [PATCH 40/44] add non-gcta model to simphenotype --- docs/commands/simphenotype.rst | 6 +++++- haptools/__main__.py | 6 ++++-- haptools/sim_phenotype.py | 22 +++++++++++++++------- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst index 5ee89d33..790c9882 100644 --- a/docs/commands/simphenotype.rst +++ b/docs/commands/simphenotype.rst @@ -41,7 +41,11 @@ where \sigma^2 = Var[\sum_j \beta_j \vec{Z_j}] * (\frac 1 {h^2} - 1) -The heritability :math:`h^2` is user-specified but defaults to 1 when not provided. +The heritability :math:`h^2` is user-specified, but if it is not provided, then :math:`\sigma^2` will be computed purely from the effect sizes, instead: + +.. math:: + + \sigma^2 = \Biggl \lbrace {1 - \sum \beta_j^2 \quad \quad {\sum \beta_j^2 \le 1} \atop 0 \quad \quad \quad \quad \quad \text{ otherwise }} If a prevalence for the disease is specified, the final :math:`\vec{y}` value will be thresholded to produce a binary case/control trait with the desired fraction of diseased individuals. diff --git a/haptools/__main__.py b/haptools/__main__.py index 7518d043..80035d03 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -95,7 +95,7 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms): "-h", "--heritability", type=click.FloatRange(min=0, max=1), - default=1, + default=None, show_default=True, help="Trait heritability", ) @@ -159,7 +159,7 @@ def simphenotype( genotypes: Path, haplotypes: Path, replications: int = 1, - heritability: float = 1, + heritability: float = None, prevalence: float = None, region: str = None, samples: tuple[str] = tuple(), @@ -189,6 +189,8 @@ def simphenotype( The number of rounds of simulation to perform heritability : int, optional The heritability of the simulated trait; must be a float between 0 and 1 + + If not provided, it will be computed from the sum of the squared effect sizes prevalence : int, optional The prevalence of the disease if the trait should be simulated as case/control; must be a float between 0 and 1 diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 68c81d08..a1ece544 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -92,7 +92,7 @@ def __init__( def run( self, effects: list[Haplotype], - heritability: float = 1, + heritability: float = None, prevalence: float = None, ) -> npt.NDArray: """ @@ -108,8 +108,7 @@ def run( heritability: float, optional The simulated heritability of the trait - If not provided, this will be estimated from the variability of the - genotypes + If not provided, this will default to the sum of the squared effect sizes prevalence: float, optional How common should the disease be within the population? @@ -133,9 +132,16 @@ def run( # generate the genetic component pt = (betas * gts).sum(axis=1) # compute the heritability - self.log.info(f"Adding environmental component for h^2: {heritability}") - # compute the environmental effect - noise = np.var(pt) * (np.reciprocal(heritability) - 1) + if heritability is None: + # compute the environmental effect + noise = np.var(pt) * (np.reciprocal(heritability) - 1) + else: + self.log.debug("Computing heritability as the sum of the squared betas") + heritability = np.power(betas, 2).sum() + if heritability > 1: + heritability = 1 + noise = 1 - heritability + self.log.info(f"Adding environmental component {noise} for h^2 {heritability}") # finally, add everything together to get the simulated phenotypes pt += self.rng.normal(0, noise, size=pt.shape) # now, handle case/control @@ -169,7 +175,7 @@ def simulate_pt( genotypes: Path, haplotypes: Path, num_replications: int = 1, - heritability: float = 1, + heritability: float = None, prevalence: float = None, region: str = None, samples: list[str] = None, @@ -198,6 +204,8 @@ def simulate_pt( The number of rounds of simulation to perform heritability : int, optional The heritability of the simulated trait; must be a float between 0 and 1 + + If not provided, it will be computed from the sum of the squared effect sizes prevalence : int, optional The prevalence of the disease if the trait should be simulated as case/control; must be a float between 0 and 1 From 9f9d0ea09065bbc837bf1ee68d98d942b449ad98 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 18 Jul 2022 23:42:12 -0700 Subject: [PATCH 41/44] switch if else in simphenotype to fix bug --- haptools/sim_phenotype.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index a1ece544..308877c2 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -133,14 +133,15 @@ def run( pt = (betas * gts).sum(axis=1) # compute the heritability if heritability is None: - # compute the environmental effect - noise = np.var(pt) * (np.reciprocal(heritability) - 1) - else: self.log.debug("Computing heritability as the sum of the squared betas") heritability = np.power(betas, 2).sum() if heritability > 1: heritability = 1 + # compute the environmental effect noise = 1 - heritability + else: + # compute the environmental effect + noise = np.var(pt) * (np.reciprocal(heritability) - 1) self.log.info(f"Adding environmental component {noise} for h^2 {heritability}") # finally, add everything together to get the simulated phenotypes pt += self.rng.normal(0, noise, size=pt.shape) From 6aefc33ea0a62ae06273dc202d2aa4a8901ca022 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jul 2022 09:10:44 -0700 Subject: [PATCH 42/44] adjust logging of info in PhenoSimulator.run() --- haptools/__main__.py | 2 +- haptools/sim_phenotype.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/haptools/__main__.py b/haptools/__main__.py index 80035d03..276adef7 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -190,7 +190,7 @@ def simphenotype( heritability : int, optional The heritability of the simulated trait; must be a float between 0 and 1 - If not provided, it will be computed from the sum of the squared effect sizes + If not provided, it will be computed from the sum of the squared effect sizes. prevalence : int, optional The prevalence of the disease if the trait should be simulated as case/control; must be a float between 0 and 1 diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 308877c2..5626ba98 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -123,7 +123,8 @@ def run( # extract the relevant haplotype info from the Haplotype objects ids = [hap.id for hap in effects] betas = np.array([hap.beta for hap in effects]) - self.log.info(f"Extracting haplotype genotypes for haps: {ids}") + self.log.debug(f"Extracting haplotype genotypes for haps: {ids}") + self.log.debug(f"Beta values are {betas}") # extract the haplotype "genotypes" and compute the phenotypes gts = self.gens.subset(variants=ids).data.sum(axis=2) self.log.info(f"Computing genetic component w/ {gts.shape[0]} causal effects") From 38931d781544656800236eea4731ebd25dcbdf58 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jul 2022 10:40:07 -0700 Subject: [PATCH 43/44] warn about case/control encoding in simphenotype docs --- docs/commands/simphenotype.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst index 790c9882..e8ba06fc 100644 --- a/docs/commands/simphenotype.rst +++ b/docs/commands/simphenotype.rst @@ -53,6 +53,8 @@ Output ~~~~~~ Phenotypes are output in the PLINK2-style ``.pheno`` file format. If ``--replications`` was set to greater than 1, additional columns are output for each simulated trait. +Note that case/control phenotypes are encoded as 0 (control) + 1 (case) **not** 1 (control) + 2 (case). The latter is used by PLINK2 unless the ``--1`` flag is used (see `the PLIN2 docs `_). Therefore, you must use ``--1`` when providing our ``.pheno`` files to PLINK. + Examples ~~~~~~~~ .. code-block:: bash From 3f9af3a064272ba9ed010504eb6108058c478908 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jul 2022 11:36:53 -0700 Subject: [PATCH 44/44] log expected heritability and expand docs --- docs/commands/simphenotype.rst | 2 +- haptools/sim_phenotype.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst index e8ba06fc..216d70fa 100644 --- a/docs/commands/simphenotype.rst +++ b/docs/commands/simphenotype.rst @@ -59,7 +59,7 @@ Examples ~~~~~~~~ .. code-block:: bash - haptools simphenotype -o simulated.pheno tests/data/example.vcf.gz tests/data/example.hap.gz + haptools simphenotype -o simulated.pheno tests/data/example.vcf.gz tests/data/simphenotype.hap Simulate two replicates of a case/control trait that occurs in 60% of your samples with a heritability of 0.8. Encode all of the haplotypes in ``tests/data/example.hap.gz`` as independent causal variables. diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py index 5626ba98..df1c417c 100644 --- a/haptools/sim_phenotype.py +++ b/haptools/sim_phenotype.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path from itertools import combinations -from logging import getLogger, Logger +from logging import getLogger, Logger, DEBUG from dataclasses import dataclass, field import numpy as np @@ -145,7 +145,13 @@ def run( noise = np.var(pt) * (np.reciprocal(heritability) - 1) self.log.info(f"Adding environmental component {noise} for h^2 {heritability}") # finally, add everything together to get the simulated phenotypes - pt += self.rng.normal(0, noise, size=pt.shape) + pt_noise = self.rng.normal(0, noise, size=pt.shape) + if self.log.getEffectiveLevel() == DEBUG: + # if we're in debug mode, compute the pearson correlation and report it + # but don't do this otherwise to keep things fast + corr = np.corrcoef(pt, pt + pt_noise)[1, 0] + self.log.debug(f"Estimated heritability is {corr}") + pt += pt_noise # now, handle case/control name_suffix = "" if prevalence is not None: