From 0116bbfbc749e952d81dd508704b67a9a45ab4df Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 13 May 2022 21:52:34 -0700
Subject: [PATCH 01/44] start on simphenotype refactor

---
 haptools/sim_phenotypes.py | 107 +++++++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 29 deletions(-)

diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 1701a1b0..99279433 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -1,40 +1,89 @@
-"""
-Example test command:
+from __future__ import annotations
+from logging import getLogger, Logger
 
-haptools simphenotype --vcf tests/data/simple.vcf.gz --hap tests/data/simple.hap.gz \
-	--out test --simu-qt  --simu-hsq 0.8 --simu-rep 2
-"""
+from haplotype import HaptoolsHaplotype as Haplotype
+from .data import GenotypesRefAlt, Phenotypes, Haplotypes
 
-from cyvcf2 import VCF
 
-from .haplotypes import *
+class PhenoSimulator:
+    """
+    Simulate phenotypes from genotypes
 
-def simulate_pt(vcffile, hapfile, simu_rep, \
-        simu_hsq, simu_k, simu_qt, simu_cc, outprefix):
+    Attributes
+    ----------
+    gens: GenotypesRefAlt
+        A set of genotypes to use when simulating phenotypes
+    haps: Haplotypes
+        A set of haplotypes to use as causal variables in the simulation
+    haps_gts: GenotypesRefAlt
+        Genotypes for each haplotype
+    log: Logger
+        A logging instance for recording debug statements
+    """
 
-	# Initialize readers
-	vcf_reader = VCF(vcffile, gts012=True)
-	hap_reader = HapReader(hapfile)
+    def __init__(
+        self, genotypes: GenotypesRefAlt, haplotypes: Haplotypes, log: Logger = None
+    ):
+        self.gens = genotypes
+        self.haps = haplotypes
+        self.log = log or getLogger(self.__class__.__name__)
+        self.haps_gts = self.haps.transform(self.gens)
 
-	# Initialize phenotype simulator (haptools simphenotypes)
-	pt_sim = PhenoSimulator(vcf_reader.samples)
+    def run(
+        self,
+        simu_rep: int,
+        simu_hsq: float,
+        simu_k: float,
+        simu_qt: bool = False,
+        simu_cc: bool = False,
+    ) -> Phenotypes:
+        effect = np.array([hap.beta for hap in haps.data.values()])
+        data = self.haps_gts.data.sum(axis=2)
+        pt = np.sum(effect * (data - data.mean(axis=0)) / np.std(axis=0))
+        phens = np.empty((len(self.genotypes.samples)), dtype=np.float64)
 
-	# Set up file to write summary info
-	outf_sum = open("%s.par"%outprefix, "w")
-	outf_sum.write("\t".join(["Haplotype", "Frequency", "Effect"])+"\n")
+        # Simulate and write
+        for i in range(simu_rep):
+            resid_component = np.random.normal(0, np.var(pt) * (1 / simu_hsq - 1))
+            if simu_qt:
+                outinfo.append(pt + resid_component)
+            else:
+                raise NotImplementedError("Case control not implemented")
 
-	# Add effect for each haplotype
-	for hap in hap_reader:
-		sample_to_hap = hap.Transform(vcf_reader)
-		pt_sim.AddEffect(sample_to_hap, hap.hap_effect)
 
-		# Output summary info
-		hap_freq = hap.GetFrequency(vcf_reader)
-		outf_sum.write("\t".join([hap.hap_id, str(hap_freq), str(hap.hap_effect)])+"\n")
+def simulate_pt(
+    genotypes: Path,
+    haplotypes: Path,
+    phenotypes: Path,
+    simu_rep: int,
+    simu_hsq: float,
+    simu_k: float,
+    simu_qt: bool = False,
+    simu_cc: bool = False,
+):
+    # Initialize readers
+    gens = GenotypesRefAlt.load(genotypes)
+    haps = Haplotypes(haplotypes, haplotype=Haplotype)
+    haps.read()
 
-	# Output simulated phenotypes to a file (haptools simphenotypes)
-	pt_sim.WritePhenotypes(outprefix, simu_rep, simu_hsq, \
-		simu_k, simu_qt, simu_cc)
+    # Initialize phenotype simulator (haptools simphenotypes)
+    pt_sim = PhenoSimulator(gens, haps)
 
-	# Done
-	outf_sum.close()
\ No newline at end of file
+    # Set up file to write summary info
+    outf_sum = open("%s.par" % outprefix, "w")
+    outf_sum.write("\t".join(["Haplotype", "Frequency", "Effect"]) + "\n")
+
+    # Add effect for each haplotype
+    for idx, hap in enumerate(haps.data.values()):
+        sample_to_hap = hap.transform(gens)
+        pt_sim.add_effect(sample_to_hap, hap.beta)
+
+        # Output summary info
+        hap_freq = hap.GetFrequency(gens)
+        outf_sum.write("\t".join([hap.id, str(hap_freq), str(hap.beta)]) + "\n")
+
+    # Output simulated phenotypes to a file (haptools simphenotypes)
+    pt_sim.run(outprefix, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc)
+
+    # Done
+    outf_sum.close()

From 4fbb2935de839482690df09bf47f2b1a30fed4d2 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 13 May 2022 22:08:28 -0700
Subject: [PATCH 02/44] remove original simphenotype code

---
 haptools/haplotypes.py     | 199 -------------------------------------
 haptools/sim_phenotypes.py |   5 +-
 2 files changed, 3 insertions(+), 201 deletions(-)
 delete mode 100644 haptools/haplotypes.py

diff --git a/haptools/haplotypes.py b/haptools/haplotypes.py
deleted file mode 100644
index 3c9993e4..00000000
--- a/haptools/haplotypes.py
+++ /dev/null
@@ -1,199 +0,0 @@
-"""
-Classes for reading/writing and other functions
-on haplotypes
-
-TODO:
-* transform
-  * deal with LA and STRs
-
-* writephenotypes
-* documentation and type hinting
-* tests
-
-* Compare simulation output to GCTA
-* fail gracefully on assertions - we need ERROR utility
-
-"""
-import re
-import gzip
-from pathlib import Path
-
-import pysam
-import numpy as np
-
-
-class Haplotype:
-	def __init__(self, hap_id, hap_chr, hap_start, hap_end, \
-					hap_varids, hap_varalleles, \
-					hap_LA, hap_effect):
-		self.hap_id = hap_id
-		self.hap_chr = hap_chr
-		self.hap_start = hap_start
-		self.hap_end = hap_end
-		self.hap_varids = hap_varids
-		self.hap_varalleles = hap_varalleles
-		self.hap_LA = hap_LA
-		self.hap_effect = hap_effect
-
-		##### Do some basic checks #####
-		# hap_varids and hap_varalleles must be the same length
-		assert(len(self.hap_varids)==len(self.hap_varalleles))
-		# Do not repeat variant IDs
-		assert(len(set(self.hap_varids))==len(self.hap_varids))
-		# TODO - STR and LA checks
-
-	def GetVariantID(self, record):
-		return "%s_%s_%s_%s"%(record.CHROM, record.POS, record.REF, ":".join(record.ALT))
-
-	def CheckHapMatch(self, vcf_reader):
-		sample_list = vcf_reader.samples
-
-		# Keep track of whether each chrom
-		# of each sample matches or not
-		hap_matches = {}
-		for sample in sample_list:
-			hap_matches[sample] = [True, True]
-
-		found_ids = 0
-		records = vcf_reader("%s:%s-%s"%(self.hap_chr, self.hap_start, self.hap_end))
-		for record in records:
-			# Check if the record is part of the haplotype
-			if (record.ID in self.hap_varids):
-				varind = self.hap_varids.index(record.ID)
-			elif (GetVariantID(record) in self.varids):
-				varind = self.hap_varids.index(self.GetVariantID(record))
-			else: continue
-			var_allele = self.hap_varalleles[varind]
-			var_id = self.hap_varids[varind]
-			found_ids += 1
-
-			# Determine the index of the target allele
-			record_alleles = [record.REF]+record.ALT
-			assert(var_allele in record_alleles) # TODO fail more gracefully with a message
-			allele_ind = record_alleles.index(var_allele)
-
-			# Check if each sample matches
-			# If not, set hap_matches to False
-			for i in range(len(sample_list)):
-				sample = sample_list[i]
-				call = record.genotypes[i]
-				assert(call[2]) # make sure it is phased
-				for j in range(2):
-					if call[j]!=allele_ind: hap_matches[sample][j] = False
-
-		# Check we got through all the IDs
-		assert(found_ids == len(self.hap_varids)) # TODO fail more gracefully with a message
-		return hap_matches
-
-	def Transform(self, vcf_reader):
-		hap_gts = {}
-
-		##### Case 1 - STR length #####
-		pass # TODO
-
-		##### Case 2 - variant haplotype #####
-		hap_matches = self.CheckHapMatch(vcf_reader)
-
-		##### Case 3 - LA (may be in combination with case 2) #####
-		pass # TODO
-
-		# Get haplotype counts
-		for sample in vcf_reader.samples:
-			hap_gts[sample] = sum(hap_matches[sample])
-		return hap_gts
-
-	def GetFrequency(self, vcf_reader):
-		hap_gts = self.Transform(vcf_reader)
-		return np.sum(list(hap_gts.values()))/(2*len(hap_gts.keys()))
-
-	def __str__(self):
-		return self.hap_id
-
-class HapReader:
-	def __init__(self, hapfile: Path, region: str = None):
-		"""
-		Open a haplotype file with the given region
-
-		Parameters
-		----------
-		hapfile : Path
-		  The path to a .haps file containing haplotypes
-		region : str, optional
-		  A region from which to query specific haplotypes; ex: "chr1:1000-2000"
-		"""
-		if region is None:
-			self.haps = gzip.open(hapfile)
-		else:
-			# split the region string so each portion is an element
-			region = re.split(':|-', region)
-			if len(region) > 1:
-				region[1:] = [int(pos) for pos in region[1:] if pos]
-			# fetch region
-			self.haps = pysam.TabixFile(hapfile).fetch(*region)
-
-	def GetHaplotype(self, hapline):
-		hap_id = hapline[0]
-		hap_chr = hapline[1]
-		hap_start = int(hapline[2])
-		hap_end = int(hapline[3])
-		hap_varids = hapline[4].split(",")
-		hap_varalleles = hapline[5].split(",")
-		hap_LA = hapline[6].split(",")
-		hap_effect = float(hapline[7])
-		return Haplotype(hap_id, hap_chr, hap_start, hap_end, \
-							hap_varids, hap_varalleles, \
-							hap_LA, hap_effect)
-
-	def __iter__(self):
-		return self
-
-	def __next__(self):
-		# Get the next haplotype
-		hapline = next(self.haps)
-
-		# If reading from gzip file, convert to list
-		# so it is in the same format as from tabix
-		if type(hapline) == bytes:
-			hapline = hapline.decode('utf-8')
-			while hapline.startswith('#'):
-				hapline = next(self.haps).decode('utf-8')
-			hapline = hapline.strip().split()
-
-		# Convert the line to a haplotype object
-		return self.GetHaplotype(hapline)
-
-class PhenoSimulator:
-	def __init__(self, sample_list):
-		self.pts = {}
-		for sample in sample_list:
-			self.pts[sample] = 0
-
-	def AddEffect(self, sample_to_hap, effect):
-		mean = np.mean(list(sample_to_hap.values()))
-		sd = np.sqrt(np.var(list(sample_to_hap.values())))
-		for sample in self.pts.keys():
-			assert(sample in sample_to_hap.keys())
-			self.pts[sample] += effect*(sample_to_hap[sample]-mean)/sd
-
-	def WritePhenotypes(self, outprefix, simu_rep, simu_hsq, \
-						simu_k, simu_qt, simu_cc):
-		# First compute var(additive)
-		var_add = np.var(list(self.pts.values()))
-
-		# Prepare output file
-		outf_sim = open("%s.phen"%outprefix, "w")
-
-		# Simulate and write
-		for sample in self.pts.keys():
-			outinfo = [sample, sample]
-			genetic_component = self.pts[sample]
-			for i in range(simu_rep):
-				resid_component = np.random.normal(0, var_add*(1/simu_hsq-1))
-				if simu_qt:
-					outinfo.append(genetic_component+resid_component)
-				else:
-					raise NotImplementedError("Case control not implemented")
-			outf_sim.write("\t".join([str(item) for item in outinfo])+"\n")
-
-		# Done
-		outf_sim.close()
\ No newline at end of file
diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 99279433..e39b4772 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -38,8 +38,8 @@ def run(
         simu_cc: bool = False,
     ) -> Phenotypes:
         effect = np.array([hap.beta for hap in haps.data.values()])
-        data = self.haps_gts.data.sum(axis=2)
-        pt = np.sum(effect * (data - data.mean(axis=0)) / np.std(axis=0))
+        gts = self.haps_gts.data.sum(axis=2)
+        pt = np.sum(effect * (gts - gts.mean(axis=0)) / gts.std(axis=0))
         phens = np.empty((len(self.genotypes.samples)), dtype=np.float64)
 
         # Simulate and write
@@ -68,6 +68,7 @@ def simulate_pt(
 
     # Initialize phenotype simulator (haptools simphenotypes)
     pt_sim = PhenoSimulator(gens, haps)
+    phens = pt_sim.run()
 
     # Set up file to write summary info
     outf_sum = open("%s.par" % outprefix, "w")

From 30e594935f108187ce065d891b93695528b8e932 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sun, 15 May 2022 16:42:44 -0700
Subject: [PATCH 03/44] continue adding support for other file formats

---
 haptools/data/genotypes.py  | 181 ++++++++++++++++++++++++++----------
 haptools/data/haplotypes.py |  10 +-
 2 files changed, 133 insertions(+), 58 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 24e36fd9..d030db11 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
+import re
 from csv import reader
 from pathlib import Path
 from typing import Iterator
 from collections import namedtuple
 from logging import getLogger, Logger
+from fileinput import hook_compressed
 
 import numpy as np
 import numpy.typing as npt
@@ -547,8 +549,78 @@ class GenotypesPLINK(GenotypesRefAlt):
     >>> genotypes = GenotypesPLINK.load('tests/data/simple.pgen')
     """
 
+    def read_samples(self, samples: list[str] = None):
+        """
+        Read sample IDs from a PSAM file into a list stored in
+        :py:attr:`~.GenotypesPLINK.samples`
+
+        One of either variants or max_variants MUST be specified!
+
+        Parameters
+        ----------
+        samples : list[str], optional
+            See documentation for :py:attr:`~.GenotypesRefAlt.read`
+
+        Returns
+        -------
+        npt.NDArray[np.uint32]
+            The indices of each of the samples within the PSAM file
+        """
+        if samples is not None and not isinstance(samples, set):
+            samples = set(samples)
+        with hook_compressed(self.fname.with_suffix(".psam"), mode="rt") as psam:
+            psamples = reader(psam, delimiter="\t")
+            # find the line that declares the header
+            for header in psamples:
+                if header[0].startswith('#FID') or header[0].startswith('#IID'):
+                    break
+            # we need the header to contain the IID column
+            if header[0][0] != "#":
+                raise ValueError("Your PSAM file is missing a header!")
+                col_idx = 1
+                self.log.info(
+                    "Your PVAR file lacks a proper header! Assuming first five columns"
+                    " are [CHROM, POS, ID, REF, ALT] in that order..."
+                )
+                # TODO: add header back in to psamples using itertools.chain?
+            else:
+                header[0] = header[0][1:]
+                try:
+                    col_idx = header.index('IID')
+                except ValueError:
+                    raise ValueError("Your PSAM file must have an IID column.")
+            self.samples = {
+                ct: samp[col_idx] for ct, samp in enumerate(psamples)
+                if (samples is None) or (samp[col_idx] in samples)
+            }
+            indices = np.array(list(self.samples.keys()), dtype=np.uint32)
+            self.samples = list(self.samples.values())
+            return indices
+
+    def _check_region(
+        self, pos: tuple, chrom: str, start: int = 0, end: int = float("inf")
+    ):
+        """
+        Check that pos lies within the provided chrom, start, and end coordinates
+
+        This is a helper function for :py:meth:`~.GenotypesPLINK.read_variants`
+
+        Parameters
+        ----------
+        pos : tuple[str, int]
+            A tuple of two elements: contig (str) and chromosomal position (int)
+        chrom: str
+            The contig of the region to check
+        start: int, optional
+            The start position of the region
+        end: int, optional
+            The end position of the region
+        """
+        return (pos[0] == chrom) and (start <= pos[1]) and (end >= pos[1])
+
     def read_variants(
         self,
+        region: str = None,
         variants: set[str] = None,
         max_variants: int = None,
     ):
@@ -578,53 +650,68 @@ def read_variants(
             max_variants = len(variants)
         if max_variants is None:
             raise ValueError("Provide either the variants or max_variants parameter!")
-        # TODO: try using pysam.tabix_iterator with the parser param specified for VCF
-        if region:
-            pvar = TabixFile(str(self.fname.with_suffix(".pvar")))
-            pvariants = pvar.fetch(region)
-        else:
-            pvar = hook_compressed(self.fname.with_suffix(".pvar"), mode="rt")
+        # split the region string so each portion is an element
+        if region is not None:
+            region = re.split(":|-", region)
+            if len(region) > 1:
+                region[1:] = [int(pos) for pos in region[1:] if pos]
+        with hook_compressed(self.fname.with_suffix(".pvar"), mode="rt") as pvar:
             pvariants = reader(pvar, delimiter="\t")
-        # first, preallocate the array
-        self.variants = np.empty((max_variants,), dtype=self.variants.dtype)
-        header = next(pvariants)
-        # there should be at least five columns
-        if len(header) < 5:
-            raise ValueError("Your PVAR file should have at least five columns.")
-        if header[0][0] == "#":
-            header[0] = header[0][1:]
-        else:
-            raise ValueError("Your PVAR file is missing a header!")
-            header = ["CHROM", "POS", "ID", "REF", "ALT"]
-            self.log.info(
-                "Your PVAR file lacks a proper header! Assuming first five columns"
-                " are [CHROM, POS, ID, REF, ALT] in that order..."
-            )
-            # TODO: add header back in to pvariants using itertools.chain?
-        cid = {item: header.index(item.upper()) for item in self.variants.dtype.names}
-        if variants is not None:
+            # first, preallocate the array
+            self.variants = np.empty((max_variants,), dtype=self.variants.dtype)
+            # find the line that declares the header
+            for header in pvariants:
+                if not header[0].startswith('##'):
+                    break
+            # there should be at least five columns
+            if len(header) < 5:
+                raise ValueError("Your PVAR file should have at least five columns.")
+            if header[0][0] == "#":
+                header[0] = header[0][1:]
+            else:
+                raise ValueError("Your PVAR file is missing a header!")
+                header = ["CHROM", "POS", "ID", "REF", "ALT"]
+                self.log.info(
+                    "Your PVAR file lacks a proper header! Assuming first five columns"
+                    " are [CHROM, POS, ID, REF, ALT] in that order..."
+                )
+                # TODO: add header back in to pvariants using itertools.chain?
+            cid = {
+                item: header.index(item.upper())
+                for item in self.variants.dtype.names
+                if item is not "aaf"
+            }
             indices = np.empty((max_variants,), dtype=np.uint32)
-        num_seen = 0
-        for ct, rec in enumerate(pvariants):
-            if variants is not None:
-                if rec[cid["id"]] not in variants:
+            num_seen = 0
+            for ct, rec in enumerate(pvariants):
+                if region and not self._check_region(
+                    (rec[cid["chrom"]], int(rec[cid["pos"]])), *region
+                ):
                     continue
+                if variants is not None:
+                    if rec[cid["id"]] not in variants:
+                        continue
                 indices[num_seen] = ct
-            self.variants[num_seen] = np.array(
-                (
-                    rec[cid["id"]],
-                    rec[cid["chrom"]],
-                    rec[cid["pos"]],
-                    0,
-                    rec[cid["ref"]],
-                    rec[cid["alt"]],
-                ),
-                dtype=self.variants.dtype,
+                self.variants[num_seen] = np.array(
+                    (
+                        rec[cid["id"]],
+                        rec[cid["chrom"]],
+                        rec[cid["pos"]],
+                        0,
+                        rec[cid["ref"]],
+                        rec[cid["alt"]],
+                    ),
+                    dtype=self.variants.dtype,
+                )
+                num_seen += 1
+        if max_variants > num_seen:
+            self.log.info(
+                f"Removing {max_variants-num_seen} unneeded variant records that "
+                "were preallocated b/c max_variants was specified."
             )
-            num_seen += 1
-        pvar.close()
-        if variants is not None:
-            return indices
+            indices = indices[:num_seen]
+            self.variants = self.variants[:num_seen]
+        return indices
 
     def read(
         self,
@@ -654,7 +741,8 @@ def read(
         # TODO: figure out how to install this package
         from pgenlib import PgenReader
 
-        pgen = PgenReader(bytes(self.fname, "utf8"))
+        sample_idxs = self.read_samples(samples)
+        pgen = PgenReader(bytes(self.fname, "utf8"), sample_subset=sample_idxs)
 
         if variants is not None:
             max_variants = len(variants)
@@ -663,14 +751,9 @@ def read(
             max_variants = pgen.get_variant_ct()
         indices = self.read_variants(region, variants, max_variants)
 
-        variant_ct_start = 0
-        variant_ct_end = max_variants
-        variant_ct = variant_ct_end - variant_ct_start
-
-        sample_ct = pgen.get_raw_sample_ct()
         # the genotypes start out as a simple 2D array with twice the number of samples
         # so each column is a different chromosomal strand
-        self.data = np.empty((variant_ct, sample_ct * 2), dtype=np.int32)
+        self.data = np.empty((len(indices), len(sample_idxs) * 2), dtype=np.int32)
         pgen.read_alleles(indices, self.data)
         # extract the genotypes to a np matrix of size n x p x 2
         # the last dimension has two items:
diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index f15f5c9b..c46895bb 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-import re
 from pathlib import Path
 from logging import getLogger, Logger
 from fileinput import hook_compressed
@@ -785,14 +784,7 @@ def transform(
         hap_gts.samples = genotypes.samples
         hap_gts.variants = np.array(
             [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
-            dtype=[
-                ("id", "U50"),
-                ("chrom", "U10"),
-                ("pos", np.uint32),
-                ("aaf", np.float64),
-                ("ref", "U100"),
-                ("alt", "U100"),
-            ],
+            dtype=self.variants.dtype,
         )
         self.log.info(
             f"Transforming a set of genotypes from {len(genotypes.variants)} total "

From 06e380613f2a0cd66c7844f65b0b9c031071c7a5 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sun, 15 May 2022 21:08:03 -0700
Subject: [PATCH 04/44] refmt with black

---
 haptools/data/genotypes.py  | 89 ++++++++++++++++++++++++-------------
 haptools/data/haplotypes.py |  2 +-
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index d030db11..2cd3fd64 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -95,8 +95,7 @@ def load(
         genotypes.read(region, samples, variants)
         genotypes.check_missing()
         genotypes.check_biallelic()
-        if not self._prephased:
-            genotypes.check_phase()
+        genotypes.check_phase()
         # genotypes.to_MAC()
         return genotypes
 
@@ -566,13 +565,15 @@ def read_samples(self, samples: list[str] = None):
         npt.NDArray[np.uint32]
             The indices of each of the samples within the PSAM file
         """
+        if len(self.samples) != 0:
+            self.log.warning("Sample data has already been loaded. Overriding.")
         if samples is not None and not isinstance(samples, set):
             samples = set(samples)
         with hook_compressed(self.fname.with_suffix(".psam"), mode="rt") as psam:
             psamples = reader(psam, delimiter="\t")
             # find the line that declares the header
             for header in psamples:
-                if header[0].startswith('#FID') or header[0].startswith('#IID'):
+                if header[0].startswith("#FID") or header[0].startswith("#IID"):
                     break
             # we need the header to contain the IID column
             if header[0][0] != "#":
@@ -586,11 +587,12 @@ def read_samples(self, samples: list[str] = None):
             else:
                 header[0] = header[0][1:]
                 try:
-                    col_idx = header.index('IID')
+                    col_idx = header.index("IID")
                 except ValueError:
                     raise ValueError("Your PSAM file must have an IID column.")
             self.samples = {
-                ct: samp[col_idx] for ct, samp in enumerate(psamples)
+                ct: samp[col_idx]
+                for ct, samp in enumerate(psamples)
                 if (samples is None) or (samp[col_idx] in samples)
             }
             indices = np.array(list(self.samples.keys()), dtype=np.uint32)
@@ -661,7 +663,7 @@ def read_variants(
             self.variants = np.empty((max_variants,), dtype=self.variants.dtype)
             # find the line that declares the header
             for header in pvariants:
-                if not header[0].startswith('##'):
+                if not header[0].startswith("##"):
                     break
             # there should be at least five columns
             if len(header) < 5:
@@ -679,7 +681,7 @@ def read_variants(
             cid = {
                 item: header.index(item.upper())
                 for item in self.variants.dtype.names
-                if item is not "aaf"
+                if item != "aaf"
             }
             indices = np.empty((max_variants,), dtype=np.uint32)
             num_seen = 0
@@ -735,30 +737,57 @@ def read(
         max_variants : int, optional
             See documentation for :py:attr:`~.GenotypesRefAlt.read`
         """
-        super().read()
-        # load the pgen-reader file
-        # note: very little is loaded into memory at this point
+        super(Genotypes, self).read()
         # TODO: figure out how to install this package
         from pgenlib import PgenReader
 
         sample_idxs = self.read_samples(samples)
-        pgen = PgenReader(bytes(self.fname, "utf8"), sample_subset=sample_idxs)
-
-        if variants is not None:
-            max_variants = len(variants)
-        if max_variants is None:
-            # use the pgen file to figure out how many variants there are
-            max_variants = pgen.get_variant_ct()
-        indices = self.read_variants(region, variants, max_variants)
-
-        # the genotypes start out as a simple 2D array with twice the number of samples
-        # so each column is a different chromosomal strand
-        self.data = np.empty((len(indices), len(sample_idxs) * 2), dtype=np.int32)
-        pgen.read_alleles(indices, self.data)
-        # extract the genotypes to a np matrix of size n x p x 2
-        # the last dimension has two items:
-        # 1) presence of REF in strand one
-        # 2) presence of REF in strand two
-        self.data = np.dstack((self.data[:, ::2], self.data[:, 1::2]))
-        # transpose the GT matrix so that samples are rows and variants are columns
-        self.data = self.data.transpose((1, 0, 2))
+        with PgenReader(
+            bytes(str(self.fname), "utf8"), sample_subset=sample_idxs
+        ) as pgen:
+
+            if variants is not None:
+                max_variants = len(variants)
+            if max_variants is None:
+                # use the pgen file to figure out how many variants there are
+                max_variants = pgen.get_variant_ct()
+            indices = self.read_variants(region, variants, max_variants)
+
+            # the genotypes start out as a simple 2D array with twice the number of samples
+            if not self._prephased:
+                # raise an error message b/c this is untested code that doesn't really
+                # seem to work properly
+                # for ex, the phasing info is not boolean for some reason?
+                raise ValueError("Not implemented yet!")
+                # ...so each column is a different chromosomal strand
+                self.data = np.empty(
+                    (len(indices), len(sample_idxs) * 2), dtype=np.int32
+                )
+                phasing = np.empty((len(indices), len(sample_idxs)), dtype=np.uint8)
+                # The haplotype-major mode of read_alleles_and_phasepresent_list has
+                # not been implemented yet, so we need to read the genotypes in sample-
+                # major mode and then transpose them
+                pgen.read_alleles_and_phasepresent_list(indices, self.data, phasing)
+                # missing alleles will have a value of -9
+                # let's make them be -1 to be consistent with cyvcf2
+                self.data[self.data == -9] = -1
+                self.data = np.dstack((self.data[:, ::2], self.data[:, 1::2])).astype(
+                    np.uint8
+                )
+                self.data = np.concatenate(
+                    (self.data, phasing[:, :, np.newaxis]), axis=2
+                )
+                # transpose the GT matrix so that samples are rows and variants are columns
+                self.data = self.data.transpose((1, 0, 2))
+            else:
+                # ...so each row is a different chromosomal strand
+                self.data = np.empty(
+                    (len(sample_idxs) * 2, len(indices)), dtype=np.int32
+                )
+                pgen.read_alleles_list(indices, self.data, hap_maj=True)
+                # missing alleles will have a value of -9
+                # let's make them be -1 to be consistent with cyvcf2
+                self.data[self.data == -9] = -1
+                self.data = np.dstack((self.data[::2, :], self.data[1::2, :])).astype(
+                    np.uint8
+                )
diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index c46895bb..df8baa2d 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -784,7 +784,7 @@ def transform(
         hap_gts.samples = genotypes.samples
         hap_gts.variants = np.array(
             [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
-            dtype=self.variants.dtype,
+            dtype=hap_gts.variants.dtype,
         )
         self.log.info(
             f"Transforming a set of genotypes from {len(genotypes.variants)} total "

From 52d90a04d8c1b623e82bca196169d2a440db8721 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 13 Jun 2022 12:52:33 -0700
Subject: [PATCH 05/44] move transform subcommand to its own module

---
 haptools/__main__.py  | 21 ++--------------
 haptools/transform.py | 56 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 19 deletions(-)
 create mode 100644 haptools/transform.py

diff --git a/haptools/__main__.py b/haptools/__main__.py
index b89e4533..633d5b9b 100644
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -204,8 +204,7 @@ def transform(
     """
     import logging
 
-    from haptools import data
-    from .haplotype import HaptoolsHaplotype
+    from .transform import transform_haps
 
     log = logging.getLogger("run")
     logging.basicConfig(
@@ -226,23 +225,7 @@ def transform(
     else:
         samples = None
 
-    log.info("Loading haplotypes")
-    hp = data.Haplotypes(haplotypes)
-    hp.read(region=region)
-    log.info("Extracting variants from haplotypes")
-    variants = {var.id for hap in hp.data.values() for var in hap.variants}
-    log.info("Loading genotypes")
-    gt = data.GenotypesRefAlt(genotypes, log=log)
-    # gt._prephased = True
-    gt.read(region=region, samples=samples, variants=variants)
-    gt.check_missing(discard_also=True)
-    gt.check_biallelic(discard_also=True)
-    gt.check_phase()
-    log.info("Transforming genotypes via haplotypes")
-    hp_gt = data.GenotypesRefAlt(fname=output, log=log)
-    hp.transform(gt, hp_gt)
-    log.info("Writing haplotypes to VCF")
-    hp_gt.write()
+    transform_haps(genotypes, haplotypes, region, samples, output, log)
 
 
 if __name__ == "__main__":
diff --git a/haptools/transform.py b/haptools/transform.py
new file mode 100644
index 00000000..2b727e16
--- /dev/null
+++ b/haptools/transform.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from haptools import data
+from .haplotype import HaptoolsHaplotype
+
+
+def transform_haps(
+    genotypes: Path,
+    haplotypes: Path,
+    region: str = None,
+    samples: list[str] = None,
+    output: Path = Path("-"),
+    log: Logger = None,
+):
+    """
+    Creates a VCF composed of haplotypes
+
+    Parameters
+    ----------
+    genotypes : Path
+        The path to the genotypes in VCF format
+    haplotypes : Path
+        The path to the haplotypes in a .hap file
+    region : str, optional
+        See documentation for :py:meth:`~.data.Genotypes.read`
+    samples : list[str], optional
+        See documentation for :py:meth:`~.data.Genotypes.read`
+    output : Path, optional
+        The location to which to write output
+    log : Logger, optional
+        A logging module to which to write messages about progress and any errors
+    """
+    log.info("Loading haplotypes")
+    hp = data.Haplotypes(haplotypes)
+    hp.read(region=region)
+
+    log.info("Extracting variants from haplotypes")
+    variants = {var.id for hap in hp.data.values() for var in hap.variants}
+    
+    log.info("Loading genotypes")
+    gt = data.GenotypesRefAlt(genotypes, log=log)
+    # gt._prephased = True
+    gt.read(region=region, samples=samples, variants=variants)
+    gt.check_missing(discard_also=True)
+    gt.check_biallelic(discard_also=True)
+    gt.check_phase()
+    
+    log.info("Transforming genotypes via haplotypes")
+    hp_gt = data.GenotypesRefAlt(fname=output, log=log)
+    hp.transform(gt, hp_gt)
+    
+    log.info("Writing haplotypes to VCF")
+    hp_gt.write()
+
+    return hp_gt
+    
\ No newline at end of file

From e3c9df99b74a4c0c1fd8ad34a84bb37f8fc782d9 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 13 Jun 2022 12:53:29 -0700
Subject: [PATCH 06/44] setup PhenoSimulator.__init__ method

---
 haptools/sim_phenotypes.py | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index e39b4772..16913b05 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -17,20 +17,56 @@ class PhenoSimulator:
         A set of haplotypes to use as causal variables in the simulation
     haps_gts: GenotypesRefAlt
         Genotypes for each haplotype
+    phens: Phenotypes
+        Phenotypes for each haplotype
     log: Logger
         A logging instance for recording debug statements
+
+    Examples
+    --------
+    >>> gens = Genotypes.load("tests/data/example.vcf.gz")
+    >>> haps = Haplotypes.load("tests/data/basic.hap")
+    >>> phenosim = PhenoSimulator(gens, haps, Path("basic_phens.tsv.gz"))
+    >>> phenosim.transform()
+    >>> phenotypes = phenosim.run()
     """
 
     def __init__(
-        self, genotypes: GenotypesRefAlt, haplotypes: Haplotypes, log: Logger = None
+        self,
+        genotypes: GenotypesRefAlt,
+        haplotypes: Haplotypes,
+        phenotypes: Path,
+        log: Logger = None,
     ):
         self.gens = genotypes
         self.haps = haplotypes
+        self.haps_gts = None
+        self.phens = Phenotypes(phenotypes, self.log)
+        self.phens.samples = self.gens
         self.log = log or getLogger(self.__class__.__name__)
+
+    def unset(self):
+        """
+        Whether the data has been loaded into the object yet
+
+        Returns
+        -------
+        bool
+            True if :py:attr:`~.PhenoSimulator.haps_gts` is None else False
+        """
+        return self.haps_gts is None
+
+    def transform(self):
+        """
+        Obtain the "genotypes" of the haplotypes
+        """
+        if self.unset():
+            self.log.warning("The haplotype 'genotypes' have already been loaded. Overriding.")
         self.haps_gts = self.haps.transform(self.gens)
 
     def run(
         self,
+        name: str,
         simu_rep: int,
         simu_hsq: float,
         simu_k: float,

From 49341e664895f37ee92602aef427afdcdf550cea Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Jun 2022 10:34:31 -0700
Subject: [PATCH 07/44] add Genotypes.subset() method

---
 haptools/data/genotypes.py | 86 ++++++++++++++++++++++++++++++++++++++
 tests/test_data.py         | 31 ++++++++++++++
 2 files changed, 117 insertions(+)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 2cd3fd64..7f100acb 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -38,6 +38,10 @@ class Genotypes(Data):
     _prephased : bool
         If True, assume that the genotypes are phased. Otherwise, extract their phase
         when reading from the VCF.
+    _samp_idx : dict[str, int]
+        Sample index; maps samples to indices in self.samples
+    _var_idx : dict[str, int]
+        Variant index; maps variant IDs to indices in self.variants
 
     Examples
     --------
@@ -61,6 +65,8 @@ def __init__(self, fname: Path, log: Logger = None):
             ],
         )
         self._prephased = False
+        self._samp_idx = None
+        self._var_idx = None
 
     @classmethod
     def load(
@@ -281,6 +287,86 @@ def __iter__(
         # see https://stackoverflow.com/a/36726497
         return self._iterate(vcf, region, variants)
 
+    def index(self, samples: bool = True, variants: bool = True):
+        """
+        Call this function once to improve the speed of future look-ups of samples and
+        variants by their ID. This is useful if you intend to later subset by a set
+        of samples or variant IDs.
+        The time complexity of this function should be roughly O(n+m) if both
+        parameters are True. Otherwise, it will be either O(n) or O(m).
+
+        Parameters
+        ----------
+        samples: bool, optional
+            Whether to index the samples for fast loop-up. Adds complexity O(n).
+        variants: bool, optional
+            Whether to index the variants for fast look-up. Adds complexity O(m).
+        """
+        if samples and self._samp_idx is None:
+            self._samp_idx = dict(zip(self.samples, range(len(self.samples))))
+        if variants and self._var_idx is None:
+            self._var_idx = dict(zip(self.variants["id"], range(len(self.variants))))
+
+    def subset(
+        self,
+        samples: tuple[str] = None,
+        variants: tuple[str] = None,
+        inplace: bool = False,
+    ):
+        """
+        Subset these genotypes to a smaller set of samples or a smaller set of variants
+
+        Parameters
+        ----------
+        samples: tuple[str]
+            A subset of samples to keep
+        variants: tuple[str]
+            A subset of variant IDs to keep
+        inplace: bool, optional
+            If False, return a new Genotypes object; otherwise, alter the current one
+
+        Returns
+        -------
+            A new Genotypes object if inplace is set to False, else returns None
+        """
+        # First, initialize variables
+        gts = self
+        if not inplace:
+            gts = self.__class__(self.fname, self.log)
+        gts.samples = self.samples
+        gts.variants = self.variants
+        gts.data = self.data
+        # Index the current set of samples and variants so we can have fast look-up
+        self.index(samples=(samples is not None), variants=(variants is not None))
+        # Subset the samples
+        if samples is not None:
+            gts.samples = tuple(samp for samp in samples if samp in self._samp_idx)
+            if len(gts.samples) < len(samples):
+                diff = len(samples) - len(gts.samples)
+                self.log.warning(
+                    f"Saw {diff} fewer samples than requested. Proceeding with "
+                    f"f{len(gts.samples)} samples."
+                )
+            samp_idx = tuple(self._samp_idx[samp] for samp in gts.samples)
+            if inplace:
+                self._samp_idx = None
+            gts.data = gts.data[samp_idx, :]
+        # Subset the variants
+        if variants is not None:
+            var_idx = [self._var_idx[var] for var in variants if var in self._var_idx]
+            if len(var_idx) < len(variants):
+                diff = len(variants) - len(var_idx)
+                self.log.warning(
+                    f"Saw {diff} fewer variants than requested. Proceeding with "
+                    f"f{len(var_idx)} variants."
+                )
+            gts.variants = self.variants[var_idx]
+            if inplace:
+                self._var_idx = None
+            gts.data = gts.data[:, var_idx]
+        if not inplace:
+            return gts
+
     def check_missing(self, discard_also=False):
         """
         Check that each sample is properly genotyped
diff --git a/tests/test_data.py b/tests/test_data.py
index 190e3370..94c31190 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -208,6 +208,37 @@ def test_load_genotypes_subset(self):
         np.testing.assert_allclose(gts.data, expected)
         assert gts.samples == tuple(samples)
 
+    def test_subset_genotypes(self):
+        gts = self._get_fake_genotypes()
+
+        # subset to just the samples we want
+        expected_data = gts.data[:3]
+        expected_variants = gts.variants
+        samples = ("HG00096", "HG00097", "HG00099")
+        gts_sub = gts.subset(samples=samples)
+        assert gts_sub.samples == samples
+        np.testing.assert_allclose(gts_sub.data, expected_data)
+        assert np.array_equal(gts_sub.variants, expected_variants)
+
+        # subset to just the variants we want
+        expected_data = gts.data[:, [1, 2]]
+        expected_variants = gts.variants[[1, 2]]
+        variants = ('1:10116:A:G', '1:10117:C:A')
+        gts_sub = gts.subset(variants=variants)
+        assert gts_sub.samples == gts.samples
+        np.testing.assert_allclose(gts_sub.data, expected_data)
+        assert np.array_equal(gts_sub.variants, expected_variants)
+
+        # subset both: samples and variants
+        expected_data = gts.data[[3, 4], :2]
+        expected_variants = gts.variants[:2]
+        samples = ("HG00100", "HG00101")
+        variants = ("1:10114:T:C", '1:10116:A:G')
+        gts_sub = gts.subset(samples=samples, variants=variants)
+        assert gts_sub.samples == samples
+        np.testing.assert_allclose(gts_sub.data, expected_data)
+        assert np.array_equal(gts_sub.variants, expected_variants)
+
 
 def test_load_phenotypes(caplog):
     # create a phenotype vector with shape: num_samples x 1

From a0ae5e1075f47ae6aceaad130731e4704b791327 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Jun 2022 11:10:01 -0700
Subject: [PATCH 08/44] also allow indexing in GenotypesRefAlt class

---
 haptools/data/genotypes.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 7f100acb..8421a37c 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -316,6 +316,9 @@ def subset(
         """
         Subset these genotypes to a smaller set of samples or a smaller set of variants
 
+        The order of the samples and variants in the subsetted instance will match
+        the order in the provided tuple parameters.
+
         Parameters
         ----------
         samples: tuple[str]
@@ -549,6 +552,8 @@ def __init__(self, fname: Path, log: Logger = None):
             ],
         )
         self._prephased = False
+        self._samp_idx = None
+        self._var_idx = None
 
     def _variant_arr(self, record: Variant):
         """

From f2a30170d7fd7cb18390312aa91eaf82a1183142 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Jun 2022 11:17:53 -0700
Subject: [PATCH 09/44] simplify the transform method of the Haplotype and
 Haplotypes classes see
 https://github.com/gymrek-lab/haptools/issues/19#issuecomment-1126651795

---
 haptools/data/haplotypes.py | 64 ++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 40 deletions(-)

diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index df8baa2d..a6fd254a 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -277,7 +277,7 @@ def _fmt(self):
     @property
     # TODO: use @cached_property in py3.8
     def varIDs(self):
-        return {var.id for var in self.variants}
+        return tuple(var.id for var in self.variants)
 
     @classmethod
     def from_hap_spec(
@@ -334,9 +334,7 @@ def extras_head(cls) -> tuple:
         """
         return tuple(extra.to_hap_spec("H") for extra in cls._extras)
 
-    def transform(
-        self, genotypes: GenotypesRefAlt, samples: list[str] = None
-    ) -> npt.NDArray[bool]:
+    def transform(self, genotypes: GenotypesRefAlt) -> npt.NDArray[bool]:
         """
         Transform a genotypes matrix via the current haplotype
 
@@ -350,8 +348,6 @@ def transform(
 
             If the genotypes have not been loaded into the Genotypes object yet, this
             method will call Genotypes.read(), while loading only the needed variants
-        samples : list[str], optional
-            See documentation for :py:attr:`~.Genotypes.read`
 
         Returns
         -------
@@ -360,34 +356,27 @@ def transform(
             denotes the presence of the haplotype in one chromosome of a sample
         """
         var_IDs = self.varIDs
-        # check: have the genotypes been loaded yet?
-        # if not, we can load just the variants we need
-        if genotypes.unset():
-            start = min(var.start for var in self.variants)
-            end = max(var.end for var in self.variants)
-            region = f"{self.chrom}:{start}-{end}"
-            genotypes.read(region=region, samples=samples, variants=var_IDs)
-            genotypes.check_biallelic(discard_also=True)
-            genotypes.check_phase()
-        # create a dict where the variants are keyed by ID
-        var_dict = {
-            var["id"]: var["ref"] for var in genotypes.variants if var["id"] in var_IDs
-        }
-        var_idxs = [
-            idx for idx, var in enumerate(genotypes.variants) if var["id"] in var_IDs
-        ]
-        missing_IDs = var_IDs - var_dict.keys()
-        if len(missing_IDs):
+        gts = genotypes.subset(variants=var_IDs)
+        # check: were any of the variants absent from the genotypes?
+        if len(gts.variants) < len(var_IDs):
+            missing_IDs = set(var_IDs) - set(gts.variants["id"])
             raise ValueError(
                 f"Variants {missing_IDs} are present in haplotype '{self.id}' but "
                 "absent in the provided genotypes"
             )
         # create a np array denoting the alleles that we want
-        alleles = [int(var.allele != var_dict[var.id]) for var in self.variants]
-        allele_arr = np.array([[[al] for al in alleles]])  # shape: (1, n, 1)
+        # note: the excessive use of square-brackets gives us shape (1, n, 1)
+        allele_arr = np.array(
+            [
+                [
+                    [int(var.allele != gts.variants[i]["ref"])]
+                    for i, var in enumerate(self.variants)
+                ]
+            ]
+        )
         # look for the presence of each allele in each chromosomal strand
         # and then just AND them together
-        return np.all(allele_arr == genotypes.data[:, var_idxs], axis=1)
+        return np.all(allele_arr == gts.data, axis=1)
 
 
 class Haplotypes(Data):
@@ -751,9 +740,7 @@ def write(self):
     def transform(
         self,
         genotypes: GenotypesRefAlt,
-        hap_gts: GenotypesRefAlt,
-        samples: list[str] = None,
-        low_memory: bool = False,
+        hap_gts: GenotypesRefAlt = None,
     ) -> GenotypesRefAlt:
         """
         Transform a genotypes matrix via the current haplotype
@@ -765,35 +752,32 @@ def transform(
         ----------
         genotypes : GenotypesRefAlt
             The genotypes which to transform using the current haplotype
-
-            If the genotypes have not been loaded into the Genotypes object yet, this
-            method will call Genotypes.read(), while loading only the needed variants
         hap_gts: GenotypesRefAlt
             An empty GenotypesRefAlt object into which the haplotype genotypes should
             be stored
-        samples : list[str], optional
-            See documentation for :py:attr:`~.Genotypes.read`
-        low_memory : bool, optional
-            If True, each haplotype's genotypes will be loaded one at a time.
 
         Returns
         -------
         GenotypesRefAlt
             A Genotypes object composed of haplotypes instead of regular variants.
         """
+        # Initialize GenotypesRefAlt return value
+        if hap_gts is None:
+            hap_gts = GenotypesRefAlt(fname=None, log=self.log)
         hap_gts.samples = genotypes.samples
         hap_gts.variants = np.array(
             [(hap.id, hap.chrom, hap.start, 0, "A", "T") for hap in self.data.values()],
             dtype=hap_gts.variants.dtype,
         )
+        # Obtain and merge the haplotype genotypes
         self.log.info(
             f"Transforming a set of genotypes from {len(genotypes.variants)} total "
             f"variants with a list of {len(self.data)} haplotypes"
         )
         hap_gts.data = np.concatenate(
             tuple(
-                hap.transform(genotypes, samples)[:, np.newaxis]
-                for hap in self.data.values()
+                hap.transform(genotypes)[:, np.newaxis] for hap in self.data.values()
             ),
             axis=1,
-        ).astype(np.uint8)
+        ).astype(genotypes.data.dtype)
+        return hap_gts

From a13f94abfb3fa9f13634e55a0e1d8d6debe00840 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Jun 2022 12:00:58 -0700
Subject: [PATCH 10/44] add region and samples params to simphenotype

---
 haptools/__main__.py | 128 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 113 insertions(+), 15 deletions(-)

diff --git a/haptools/__main__.py b/haptools/__main__.py
index 633d5b9b..26a635fc 100644
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -85,34 +85,132 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
 DEFAULT_SIMU_K = 0.1
 ##################################################
 @main.command()
-@click.option('--vcf', help='Phased VCF file', type=str, required=True)
-@click.option('--hap', help='Haplotype file with effect sizes', \
-        type=str, required=True)
-@click.option('--out', help='Prefix for output files', \
-        type=str, required=True)
-@click.option('--simu-qt', help='Simulate a quantitative trait', \
-        default=False, is_flag=True)
-@click.option('--simu-cc', help='Simulate a case/control trait', \
-        default=False, is_flag=True)
+@click.argument("genotypes", type=click.Path(exists=True, path_type=Path))
+@click.argument("haplotypes", type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "--region",
+    type=str,
+    default=None,
+    show_default="all haplotypes",
+    help="""
+    The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n
+    For this to work, the VCF must be indexed and the seqname must match!""",
+)
+@click.option(
+    "-s",
+    "--sample",
+    "samples",
+    type=str,
+    multiple=True,
+    show_default="all samples",
+    help=(
+        "A list of the samples to subset from the genotypes file (ex: '-s sample1 -s"
+        " sample2')"
+    ),
+)
+@click.option(
+    "-S",
+    "--samples-file",
+    type=click.File("r"),
+    show_default="all samples",
+    help=(
+        "A single column txt file containing a list of the samples (one per line) to"
+        " subset from the genotypes file"
+    ),
+)
 @click.option('--simu-rep', help='Number of rounds of simulation to perform', \
         type=int, default=DEFAULT_SIMU_REP)
 @click.option('--simu-hsq', help='Trait heritability', \
         type=float, default=DEFAULT_SIMU_HSQ)
 @click.option('--simu-k', help='Specify the disease prevalence', \
         type=float, default=DEFAULT_SIMU_K)
-def simphenotype(vcf, hap, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc, out):
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(path_type=Path),
+    default=Path("-"),
+    show_default="stdout",
+    help="A TSV file containing simulated phenotypes",
+)
+@click.option(
+    "-v",
+    "--verbosity",
+    type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"]),
+    default="ERROR",
+    show_default="only errors",
+    help="The level of verbosity desired",
+)
+def simphenotype(
+    genotypes: Path,
+    haplotypes: Path,
+    region: str = None,
+    samples: tuple[str] = tuple(),
+    samples_file: Path = None,
+    simu_rep, simu_hsq, simu_k,
+    output: Path = Path("-"),
+    verbosity: str = 'ERROR',
+):
     """
-    Haplotype-aware phenotype simulation
+    Haplotype-aware phenotype simulation. Create a set of simulated phenotypes from a
+    set of haplotypes.
+
+    GENOTYPES must be formatted as a VCF and HAPLOTYPES must be formatted according
+    to the .hap format spec
+
+    \f
+    Examples
+    --------
+    >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simu_phens.tsv
+
+    Parameters
+    ----------
+    genotypes : Path
+        The path to the genotypes in VCF format
+    haplotypes : Path
+        The path to the haplotypes in a .hap file
+    region : str, optional
+        See documentation for :py:meth:`~.data.Genotypes.read`
+    sample : Tuple[str], optional
+        See documentation for :py:meth:`~.data.Genotypes.read`
+    samples_file : Path, optional
+        A single column txt file containing a list of the samples (one per line) to
+        subset from the genotypes file
+    output : Path, optional
+        The location to which to write the simulated phenotypes
+    verbosity : str, optional
+        The level of verbosity desired in messages written to stderr
     """
+    import logging
+
     from .sim_phenotypes import simulate_pt
+
+    log = logging.getLogger("run")
+    logging.basicConfig(
+        format="[%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)",
+        level=verbosity,
+    )
+    # handle samples
+    if samples and samples_file:
+        raise click.UsageError(
+            "You may only use one of --sample or --samples-file but not both."
+        )
+    if samples_file:
+        with samples_file as samps_file:
+            samples = samps_file.read().splitlines()
+    elif samples:
+        # needs to be converted from tuple to list
+        samples = list(samples)
+    else:
+        samples = None
     # Basic checks on input
     # TODO - check VCF zipped, check only one of simu-qt/simu-cc,
     # check values of other inputs
     # Only use simu-k for case/control
 
     # Run simulation
-    simulate_pt(vcf, hap, simu_rep, \
-        simu_hsq, simu_k, simu_qt, simu_cc, out)
+    simulate_pt(
+        genotypes, haplotypes, simu_rep, simu_hsq, simu_k, region, samples, output, log
+    )
 
 @main.command(short_help="Transform a genotypes matrix via a set of haplotypes")
 @click.argument("genotypes", type=click.Path(exists=True, path_type=Path))
@@ -121,9 +219,9 @@ def simphenotype(vcf, hap, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc, out):
     "--region",
     type=str,
     default=None,
-    show_default="all genotypes",
+    show_default="all haplotypes",
     help="""
-    The region from which to extract genotypes; ex: 'chr1:1234-34566' or 'chr7'\n
+    The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n
     For this to work, the VCF must be indexed and the seqname must match!""",
 )
 @click.option(

From fa55ee8dc248fed5914612ae547e8618cf125cbe Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Jun 2022 12:01:48 -0700
Subject: [PATCH 11/44] raise error if any samples or variants are invalid in
 transform subcommand

---
 haptools/data/genotypes.py |  2 ++
 haptools/transform.py      | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 8421a37c..039446a9 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -396,6 +396,7 @@ def check_missing(self, discard_also=False):
                 )
                 self.data = np.delete(self.data, samp_idx, axis=0)
                 self.samples = tuple(np.delete(self.samples, samp_idx))
+                self._samp_idx = None
             else:
                 raise ValueError(
                     "Genotype with ID {} at POS {}:{} is missing for sample {}".format(
@@ -437,6 +438,7 @@ def check_biallelic(self, discard_also=False):
                 self.log.info(f"Ignoring {len(variant_idx)} multiallelic variants")
                 self.data = np.delete(self.data, variant_idx, axis=1)
                 self.variants = np.delete(self.variants, variant_idx)
+                self._var_idx = None
             else:
                 raise ValueError(
                     "Variant with ID {} at POS {}:{} is multiallelic for sample {}"
diff --git a/haptools/transform.py b/haptools/transform.py
index 2b727e16..0745c80f 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import logging
 
 from haptools import data
 from .haplotype import HaptoolsHaplotype
@@ -30,27 +31,33 @@ def transform_haps(
     log : Logger, optional
         A logging module to which to write messages about progress and any errors
     """
+    if log is None:
+        log = logging.getLogger("run")
+        logging.basicConfig(
+            format="[%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)",
+            level="ERROR",
+        )
+
     log.info("Loading haplotypes")
-    hp = data.Haplotypes(haplotypes)
+    hp = data.Haplotypes(haplotypes, log=log)
     hp.read(region=region)
 
     log.info("Extracting variants from haplotypes")
     variants = {var.id for hap in hp.data.values() for var in hap.variants}
-    
+
     log.info("Loading genotypes")
     gt = data.GenotypesRefAlt(genotypes, log=log)
     # gt._prephased = True
     gt.read(region=region, samples=samples, variants=variants)
-    gt.check_missing(discard_also=True)
-    gt.check_biallelic(discard_also=True)
+    gt.check_missing()
+    gt.check_biallelic()
     gt.check_phase()
-    
+
     log.info("Transforming genotypes via haplotypes")
     hp_gt = data.GenotypesRefAlt(fname=output, log=log)
     hp.transform(gt, hp_gt)
-    
+
     log.info("Writing haplotypes to VCF")
     hp_gt.write()
 
     return hp_gt
-    
\ No newline at end of file

From 5542ccb17c24001a98bddb006709674d393c87f0 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 16 Jun 2022 12:03:17 -0700
Subject: [PATCH 12/44] copy transform subroutine to simphenotypes

---
 haptools/sim_phenotypes.py | 130 +++++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 55 deletions(-)

diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 16913b05..6472d008 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 from logging import getLogger, Logger
 
+import numpy.typing as npt
+
 from haplotype import HaptoolsHaplotype as Haplotype
 from .data import GenotypesRefAlt, Phenotypes, Haplotypes
 
@@ -11,14 +13,8 @@ class PhenoSimulator:
 
     Attributes
     ----------
-    gens: GenotypesRefAlt
-        A set of genotypes to use when simulating phenotypes
-    haps: Haplotypes
-        A set of haplotypes to use as causal variables in the simulation
-    haps_gts: GenotypesRefAlt
-        Genotypes for each haplotype
-    phens: Phenotypes
-        Phenotypes for each haplotype
+    gens: Genotypes
+        Genotypes to simulate
     log: Logger
         A logging instance for recording debug statements
 
@@ -26,57 +22,58 @@ class PhenoSimulator:
     --------
     >>> gens = Genotypes.load("tests/data/example.vcf.gz")
     >>> haps = Haplotypes.load("tests/data/basic.hap")
-    >>> phenosim = PhenoSimulator(gens, haps, Path("basic_phens.tsv.gz"))
-    >>> phenosim.transform()
+    >>> haps_gts = GenotypesRefAlt(None)
+    >>> haps.transform(gens, haps_gts)
+    >>> phenosim = PhenoSimulator(haps_gts)
     >>> phenotypes = phenosim.run()
     """
 
     def __init__(
         self,
-        genotypes: GenotypesRefAlt,
-        haplotypes: Haplotypes,
-        phenotypes: Path,
+        genotypes: Genotypes,
         log: Logger = None,
     ):
-        self.gens = genotypes
-        self.haps = haplotypes
-        self.haps_gts = None
-        self.phens = Phenotypes(phenotypes, self.log)
-        self.phens.samples = self.gens
-        self.log = log or getLogger(self.__class__.__name__)
-
-    def unset(self):
-        """
-        Whether the data has been loaded into the object yet
-
-        Returns
-        -------
-        bool
-            True if :py:attr:`~.PhenoSimulator.haps_gts` is None else False
         """
-        return self.haps_gts is None
-
-    def transform(self):
-        """
-        Obtain the "genotypes" of the haplotypes
+        Initialize a PhenoSimulator object
+
+        Parameters
+        ----------
+        genotypes: Genotypes
+            Genotypes for each haplotype
+        log: Logger, optional
+            A logging instance for recording debug statements
         """
-        if self.unset():
-            self.log.warning("The haplotype 'genotypes' have already been loaded. Overriding.")
-        self.haps_gts = self.haps.transform(self.gens)
+        self.gens = genotypes
+        self.log = log or getLogger(self.__class__.__name__)
 
     def run(
         self,
-        name: str,
-        simu_rep: int,
-        simu_hsq: float,
-        simu_k: float,
-        simu_qt: bool = False,
-        simu_cc: bool = False,
-    ) -> Phenotypes:
-        effect = np.array([hap.beta for hap in haps.data.values()])
-        gts = self.haps_gts.data.sum(axis=2)
-        pt = np.sum(effect * (gts - gts.mean(axis=0)) / gts.std(axis=0))
-        phens = np.empty((len(self.genotypes.samples)), dtype=np.float64)
+        varID: str,
+        beta: float,
+        heritability: float = 1,
+        prevalence: float = None,
+    ) -> npt.NDArray:
+        """
+        Simulate phenotypes for an entry in the Genotypes object
+
+        Parameters
+        ----------
+        varID: str
+            The ID of the variant in the Genotype object from which to simulate
+            phenotypes
+        beta: float
+            The simulated effect size of the variant; must be between -1 and 1
+        heritability: float, optional
+            The simulated heritability of the trait
+        prevalence: float, optional
+            How common should the disease be within the population?
+
+            If this value is specified, case/control phenotypes will be generated
+            instead of quantitative traits.
+        """
+        gts = self.gens.subset(variants=[varID]).data[:, 0].sum(axis=1)
+        # error =
+        pt = effect * (gts - gts.mean(axis=0)) / gts.std(axis=0)
 
         # Simulate and write
         for i in range(simu_rep):
@@ -90,20 +87,43 @@ def run(
 def simulate_pt(
     genotypes: Path,
     haplotypes: Path,
-    phenotypes: Path,
     simu_rep: int,
     simu_hsq: float,
     simu_k: float,
-    simu_qt: bool = False,
-    simu_cc: bool = False,
+    region: str = None,
+    samples: list[str] = None,
+    output: Path = Path("-"),
+    log: Logger = None,
 ):
-    # Initialize readers
-    gens = GenotypesRefAlt.load(genotypes)
-    haps = Haplotypes(haplotypes, haplotype=Haplotype)
-    haps.read()
+    if log is None:
+        log = logging.getLogger("run")
+        logging.basicConfig(
+            format="[%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)",
+            level="ERROR",
+        )
+
+    log.info("Loading haplotypes")
+    hp = data.Haplotypes(haplotypes, log=log)
+    hp.read(region=region)
+
+    log.info("Extracting variants from haplotypes")
+    variants = {var.id for hap in hp.data.values() for var in hap.variants}
+
+    log.info("Loading genotypes")
+    gt = data.GenotypesRefAlt(genotypes, log=log)
+    # gt._prephased = True
+    gt.read(region=region, samples=samples, variants=variants)
+    gt.check_missing()
+    gt.check_biallelic()
+    gt.check_phase()
+
+    log.info("Transforming genotypes via haplotypes")
+    hp_gt = data.GenotypesRefAlt(fname=None, log=log)
+    hp.transform(gt, hp_gt)
 
     # Initialize phenotype simulator (haptools simphenotypes)
-    pt_sim = PhenoSimulator(gens, haps)
+    log.info("Simulating phenotypes")
+    pt_sim = PhenoSimulator(hp_gt)
     phens = pt_sim.run()
 
     # Set up file to write summary info

From def4826608e6ce543c9d5ab2119e88e9326148f6 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 30 Jun 2022 22:43:35 -0700
Subject: [PATCH 13/44] rename covar and pheno files

---
 tests/data/{covars.tsv => simple.covar} | 2 +-
 tests/data/simple.pheno                 | 6 ++++++
 tests/data/simple.tsv                   | 5 -----
 3 files changed, 7 insertions(+), 6 deletions(-)
 rename tests/data/{covars.tsv => simple.covar} (81%)
 create mode 100644 tests/data/simple.pheno
 delete mode 100644 tests/data/simple.tsv

diff --git a/tests/data/covars.tsv b/tests/data/simple.covar
similarity index 81%
rename from tests/data/covars.tsv
rename to tests/data/simple.covar
index 9d4ec1d1..c2b96e30 100644
--- a/tests/data/covars.tsv
+++ b/tests/data/simple.covar
@@ -1,4 +1,4 @@
-sample	sex	age
+#IID	sex	age
 HG00096	0	4
 HG00097	1	20
 HG00099	1	33
diff --git a/tests/data/simple.pheno b/tests/data/simple.pheno
new file mode 100644
index 00000000..b567f784
--- /dev/null
+++ b/tests/data/simple.pheno
@@ -0,0 +1,6 @@
+#IID	height	bmi
+HG00096	1	3
+HG00097	1	6
+HG00099	2	8
+HG00100	2	1
+HG00101	0	4
\ No newline at end of file
diff --git a/tests/data/simple.tsv b/tests/data/simple.tsv
deleted file mode 100644
index f36f9427..00000000
--- a/tests/data/simple.tsv
+++ /dev/null
@@ -1,5 +0,0 @@
-HG00096	1
-HG00097	1
-HG00099	2
-HG00100	2
-HG00101	0

From bffd27cf123753ca62b7411837755b422e5c4eeb Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 30 Jun 2022 22:52:45 -0700
Subject: [PATCH 14/44] describe phenotypes in a separate page of the docs

---
 docs/formats/genotypes.rst  |  7 +++++++
 docs/formats/haplotypes.rst |  4 ++--
 docs/formats/inputs.rst     | 31 -------------------------------
 docs/formats/phenotypes.rst | 28 ++++++++++++++++++++++++++++
 docs/index.rst              |  3 ++-
 5 files changed, 39 insertions(+), 34 deletions(-)
 create mode 100644 docs/formats/genotypes.rst
 delete mode 100644 docs/formats/inputs.rst
 create mode 100644 docs/formats/phenotypes.rst

diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst
new file mode 100644
index 00000000..3c204ff9
--- /dev/null
+++ b/docs/formats/genotypes.rst
@@ -0,0 +1,7 @@
+.. _formats-genotypes:
+
+
+Genotypes
+=========
+
+Genotype files must be specified as VCF or BCF files.
diff --git a/docs/formats/haplotypes.rst b/docs/formats/haplotypes.rst
index a6707d1e..73e4d747 100644
--- a/docs/formats/haplotypes.rst
+++ b/docs/formats/haplotypes.rst
@@ -1,8 +1,8 @@
 .. _formats-haplotypes:
 
 
-.hap
-====
+Haplotypes
+==========
 
 This document describes our custom file format specification for haplotypes: the ``.hap`` file.
 
diff --git a/docs/formats/inputs.rst b/docs/formats/inputs.rst
deleted file mode 100644
index 14f89d9b..00000000
--- a/docs/formats/inputs.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. _formats-inputs:
-
-
-Inputs
-=========
-
-Genotype file format
---------------------
-Genotype files must be specified as VCF or BCF files.
-
-Phenotype file format
----------------------
-Phenotypes are expected to be in a tab-separated format with two columns:
-
-1. sample ID, which must match those from the genotype file
-2. the phenotype value
-
-There should be no header line in the file.
-
-See ``tests/data/simple.tsv`` for an example of a phenotype file.
-
-Covariate file format
----------------------
-Covariates are expected to be in a tab-separated format with the first column
-corresponding to the sample ID. Subsequent columns should contain each of your
-covariates.
-
-The first line of the file will be treated as a header. The column with sample IDs
-should be named "sample" and subsequent columns can be labeled however you'd like.
-
-See ``tests/data/covars.tsv`` for an example of a covariate file.
diff --git a/docs/formats/phenotypes.rst b/docs/formats/phenotypes.rst
new file mode 100644
index 00000000..2d856ff2
--- /dev/null
+++ b/docs/formats/phenotypes.rst
@@ -0,0 +1,28 @@
+.. _formats-phenotypes:
+
+
+Phenotypes and Covariates
+=========================
+
+Phenotype file format
+---------------------
+Phenotypes are expected to follow the PLINK2 ``.pheno`` file format. This is a
+tab-separated format where the first column corresponds to the sample ID, and
+subsequent columns contain each of your phenotypes.
+
+The first line of the file corresponds with the header and must begin with ``#IID``.
+The names of each of your phenotypes belong in the subbsequent columns of the header.
+
+See `tests/data/simple.pheno <https://github.com/gymrek-lab/haptools/blob/main/tests/data/simple.pheno>`_ for an example of a phenotype file:
+
+.. include:: ../../tests/data/simple.pheno
+   :literal:
+
+Covariate file format
+---------------------
+Covariates follow the same format as phenotypes.
+
+See `tests/data/simple.covar <https://github.com/gymrek-lab/haptools/blob/main/tests/data/simple.covar>`_ for an example of a covariate file:
+
+.. include:: ../../tests/data/simple.covar
+   :literal:
diff --git a/docs/index.rst b/docs/index.rst
index 6b3266dc..b2a14574 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -9,8 +9,9 @@
    :hidden:
    :maxdepth: 1
 
-   formats/inputs.rst
+   formats/genotypes.rst
    formats/haplotypes.rst
+   formats/phenotypes.rst
 
 .. toctree::
    :caption: Commands

From afcdda580c6c0ab626bcf2fc9a4b841e30936a4b Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 30 Jun 2022 22:54:14 -0700
Subject: [PATCH 15/44] support reading PLINK2-style pheno and covar files and
 make covariates class a subclass of phenotypes see #19 for more details

---
 haptools/data/covariates.py | 128 +---------------------
 haptools/data/haplotypes.py |   2 +-
 haptools/data/phenotypes.py | 112 ++++++++++++-------
 tests/test_data.py          | 211 +++++++++++++++++++++---------------
 4 files changed, 204 insertions(+), 249 deletions(-)

diff --git a/haptools/data/covariates.py b/haptools/data/covariates.py
index 45ad1f30..64270501 100644
--- a/haptools/data/covariates.py
+++ b/haptools/data/covariates.py
@@ -8,16 +8,17 @@
 import numpy as np
 
 from .data import Data
+from .phenotypes import Phenotypes
 
 
-class Covariates(Data):
+class Covariates(Phenotypes):
     """
     A class for processing covariates from a file
 
     Attributes
     ----------
     data : np.array
-        The covariates in an n (samples) x 1 (covariate value) array
+        The covariates in an n (samples) x m (covariates) array
     fname : Path
         The path to the read-only file containing the data
     samples : tuple[str]
@@ -33,124 +34,5 @@ class Covariates(Data):
     """
 
     def __init__(self, fname: Path, log: Logger = None):
-        super().__init__(fname, log)
-        self.samples = tuple()
-        self.names = tuple()
-
-    @classmethod
-    def load(cls: Covariates, fname: Path, samples: list[str] = None) -> Covariates:
-        """
-        Load covariates from a TSV file
-
-        Read the file contents and standardize the covariates
-
-        Parameters
-        ----------
-        fname
-            See documentation for :py:attr:`~.Data.fname`
-        samples : list[str], optional
-            See documentation for :py:meth:`~.Data.Covariates.read`
-
-        Returns
-        -------
-        covariates
-            A Covariates object with the data loaded into its properties
-        """
-        covariates = cls(fname)
-        covariates.read(samples)
-        return covariates
-
-    def read(self, samples: list[str] = None):
-        """
-        Read covariates from a TSV file into a numpy matrix stored in :py:attr:`~.Covariates.data`
-
-        Parameters
-        ----------
-        samples : list[str], optional
-            A subset of the samples from which to extract covariates
-
-            Defaults to loading covariates from all samples
-
-        Raises
-        ------
-        AssertionError
-            If the provided file doesn't follow the expected format
-        """
-        super().read()
-        # load all info into memory
-        # use hook_compressed to automatically handle gz files
-        with hook_compressed(self.fname, mode="rt") as covars:
-            covar_text = reader(covars, delimiter="\t")
-            header = next(covar_text)
-            # there should at least two columns
-            if len(header) < 2:
-                raise ValueError("The covariates TSV should have at least two columns.")
-            # the first column should be called "sample"
-            if header[0] != "sample":
-                self.log.warning(
-                    "The first column of the covariates TSV should contain sample IDs"
-                    " and should be named 'sample' in the header line"
-                )
-            # convert to list and subset samples if need be
-            if samples:
-                samples = set(samples)
-                covar_text = [covar for covar in covar_text if covar[0] in samples]
-            else:
-                covar_text = list(covar_text)
-        # the second column should be castable to a float
-        try:
-            float(covar_text[0][1])
-        except:
-            self.log.error(
-                "Every column in the covariates file (besides the sample column) must"
-                " be numeric."
-            )
-        # fill out the samples and data properties
-        data = list(zip(*covar_text))
-        self.samples = data[0]
-        self.names = tuple(header[1:])
-        # coerce strings to floats
-        self.data = np.transpose(np.array(data[1:], dtype="float64"))
-
-    def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
-        """
-        Read covariates from a TSV line by line without storing anything
-
-        Parameters
-        ----------
-        samples : list[str], optional
-            A subset of the samples from which to extract covariates
-
-            Defaults to loading covariates from all samples
-
-        Yields
-        ------
-        Iterator[namedtuple]
-            An iterator over each line in the file, where each line is encoded as a
-            namedtuple containing each of the class properties
-        """
-        with hook_compressed(self.fname, mode="rt") as covars:
-            covar_text = reader(covars, delimiter="\t")
-            header = next(covar_text)
-            # there should at least two columns
-            if len(header) < 2:
-                raise ValueError("The covariates TSV should have at least two columns.")
-            # the first column should be called "sample"
-            if header[0] != "sample":
-                self.log.warning(
-                    "The first column of the covariates TSV should contain sample IDs"
-                    " and should be named 'sample' in the header line"
-                )
-            header = tuple(header[1:])
-            Record = namedtuple("Record", "data samples names")
-            for covar in covar_text:
-                if samples is None or covar[0] in samples:
-                    try:
-                        yield Record(
-                            np.array(covar[1:], dtype="float64"), covar[0], header
-                        )
-                    except:
-                        self.log.error(
-                            "Every column in the covariates file (besides the sample"
-                            " column) must be numeric."
-                        )
+        super(Phenotypes, self).__init__(fname, log)
+        self._ext = 'covar'
diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index a6fd254a..a8ccdb88 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -87,7 +87,7 @@ def fmt_str(self) -> str:
         """
         Convert an Extra into a fmt string
 
-        Retruns
+        Returns
         -------
         str
             A python format string (ex: "{beta:.3f}")
diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index a27c2535..c2d55792 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 from csv import reader
 from pathlib import Path
+from io import TextIOBase
 from collections import namedtuple
+from collections.abc import Iterable
 from logging import getLogger, Logger
 from fileinput import hook_compressed
 
@@ -17,11 +19,13 @@ class Phenotypes(Data):
     Attributes
     ----------
     data : np.array
-        The phenotypes in an n (samples) x 1 (phenotype value) array
+        The phenotypes in an n (samples) x m (phenotypes) array
     fname : Path
         The path to the read-only file containing the data
     samples : tuple
         The names of each of the n samples
+    names : tuple[str]
+        The names of the phenotypes
     log: Logger
         A logging instance for recording debug statements.
 
@@ -33,6 +37,8 @@ class Phenotypes(Data):
     def __init__(self, fname: Path, log: Logger = None):
         super().__init__(fname, log)
         self.samples = tuple()
+        self.names = tuple()
+        self._ext = 'pheno'
 
     @classmethod
     def load(cls: Phenotypes, fname: Path, samples: list[str] = None) -> Phenotypes:
@@ -75,36 +81,26 @@ def read(self, samples: list[str] = None):
             If the provided file doesn't follow the expected format
         """
         super().read()
-        # load all info into memory
-        # use hook_compressed to automatically handle gz files
-        with hook_compressed(self.fname, mode="rt") as phens:
-            phen_text = reader(phens, delimiter="\t")
-            # convert to list and subset samples if need be
-            if samples:
-                samples = set(samples)
-                phen_text = [phen for phen in phen_text if phen[0] in samples]
-            else:
-                phen_text = list(phen_text)
-        # there should only be two columns
-        if len(phen_text[0]) != 2:
-            self.log.warning("The phenotype TSV should only have two columns.")
-        # the second column should be castable to a float
-        try:
-            float(phen_text[0][1])
-        except:
-            self.log.error("The second column of the TSV file must numeric.")
+        # call self.__iter__() to obtain the data and samples
+        data, self.samples = zip(*self.__iter__(samples))
+        self.log.info(f"Loaded {len(self.samples)} samples from .{self._ext} file")
         # fill out the samples and data properties
-        self.samples, self.data = zip(*phen_text)
-        # coerce strings to floats
-        self.data = np.array(self.data, dtype="float64")
+        # collect data in a np array
+        self.data = np.array(data)
 
-    def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
+    def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None):
         """
-        Read phenotypes from a TSV line by line without storing anything
+        A generator over the lines of a TSV
+
+        This is a helper function for :py:meth:`~.Phenotypes.__iter__`
 
         Parameters
         ----------
-        samples : list[str], optional
+        phens: TextIOBase
+            The file handler for the stream
+        phen_text: Iterable
+            The csv.reader object containing the lines of text from the file as lists
+        samples : set[str], optional
             A subset of the samples from which to extract phenotypes
 
             Defaults to loading phenotypes from all samples
@@ -115,17 +111,59 @@ def __iter__(self, samples: list[str] = None) -> Iterator[namedtuple]:
             An iterator over each line in the file, where each line is encoded as a
             namedtuple containing each of the class properties
         """
-        with hook_compressed(self.fname, mode="rt") as phens:
-            phen_text = reader(phens, delimiter="\t")
-            Record = namedtuple("Record", "data samples")
-            for phen in phen_text:
-                if samples is None or phen[0] in samples:
-                    try:
-                        yield Record(float(phen[1]), phen[0])
-                    except:
-                        self.log.error(
-                            "The second column of the TSV file must numeric."
-                        )
+        self.log.info(f"Loading {len(self.names)} columns from .{self._ext} file")
+        Record = namedtuple("Record", "data samples")
+        for phen in phen_text:
+            if samples is None or phen[0] in samples:
+                try:
+                    yield Record(
+                        np.array(phen[1:], dtype="float64"), phen[0]
+                    )
+                except:
+                    self.log.error(
+                        f"Every column in the .{self._ext} file (besides the sample"
+                        " column) must be numeric."
+                    )
+        phens.close()
+
+    def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]:
+        """
+        Read phenotypes from a TSV line by line without storing anything
+
+        Parameters
+        ----------
+        samples : list[str], optional
+            A subset of the samples from which to extract phenotypes
+
+            Defaults to loading phenotypes from all samples
+
+        Returns
+        ------
+        Iterable[namedtuple]
+            See documentation for :py:meth:`~.Phenotypes._iterate`
+        """
+        phens = hook_compressed(self.fname, mode="rt")
+        phen_text = reader(phens, delimiter="\t")
+        # ignore all of the comment lines
+        while True:
+            header = next(phen_text)
+            if not header[0].startswith('#') or header[0].startswith('#IID'):
+                break
+
+        # there should be at least two columns
+        if len(header) < 2:
+            raise ValueError(f"The .{self._ext} file should have at least two columns.")
+        # the first column should be called "#IID"
+        if header[0] != "#IID":
+            self.log.warning(
+                f"The first column of the .{self._ext} file should contain sample IDs"
+                " and should be named '#IID' in the header line"
+            )
+        self.names = tuple(header[1:])
+        samples = set(samples) if samples else None
+        # call another function to force the lines above to be run immediately
+        # see https://stackoverflow.com/a/36726497
+        return self._iterate(phens, phen_text, samples)
 
     def standardize(self):
         """
@@ -133,4 +171,4 @@ def standardize(self):
 
         This function modifies :py:attr:`~.Genotypes.data` in-place
         """
-        self.data = (self.data - np.mean(self.data)) / np.std(self.data)
+        self.data = (self.data - np.mean(self.data, axis=0)) / np.std(self.data, axis=0)
diff --git a/tests/test_data.py b/tests/test_data.py
index 94c31190..db97049c 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -240,94 +240,129 @@ def test_subset_genotypes(self):
         assert np.array_equal(gts_sub.variants, expected_variants)
 
 
-def test_load_phenotypes(caplog):
-    # create a phenotype vector with shape: num_samples x 1
-    expected = np.array([1, 1, 2, 2, 0])
-
-    # can we load the data from the phenotype file?
-    phens = Phenotypes(DATADIR.joinpath("simple.tsv"))
-    phens.read()
-    np.testing.assert_allclose(phens.data, expected)
-    assert phens.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
-
-    # try loading the data again - it should warn b/c we've already done it
-    phens.read()
-    assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"
-
-    expected = (expected - np.mean(expected)) / np.std(expected)
-    phens.standardize()
-    np.testing.assert_allclose(phens.data, expected)
-
-
-def test_load_phenotypes_iterate():
-    # create a phenotype vector with shape: num_samples x 1
-    expected = np.array([1, 1, 2, 2, 0])
-    samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
-
-    # can we load the data from the phenotype file?
-    phens = Phenotypes(DATADIR.joinpath("simple.tsv"))
-    for idx, line in enumerate(phens):
-        np.testing.assert_allclose(line.data, expected[idx])
-        assert line.samples == samples[idx]
-
-
-def test_load_phenotypes_subset():
-    # create a phenotype vector with shape: num_samples x 1
-    expected = np.array([1, 1, 2, 2, 0])
-
-    # subset for just the samples we want
-    expected = expected[[1, 3]]
-
-    # can we load the data from the phenotype file?
-    phens = Phenotypes(DATADIR.joinpath("simple.tsv"))
-    samples = ["HG00097", "HG00100"]
-    phens.read(samples=samples)
-    np.testing.assert_allclose(phens.data, expected)
-    assert phens.samples == tuple(samples)
-
-
-def test_load_covariates(caplog):
-    # create a covariate vector with shape: num_samples x num_covars
-    expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)])
-
-    # can we load the data from the covariates file?
-    covars = Covariates(DATADIR.joinpath("covars.tsv"))
-    covars.read()
-    np.testing.assert_allclose(covars.data, expected)
-    assert covars.samples == ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
-    assert covars.names == ("sex", "age")
-
-    # try loading the data again - it should warn b/c we've already done it
-    covars.read()
-    assert len(caplog.records) == 1 and caplog.records[0].levelname == "WARNING"
-
-
-def test_load_covariates_iterate():
-    # create a covariate vector with shape: num_samples x num_covars
-    expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)])
-    samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
-
-    # can we load the data from the covariates file?
-    covars = Covariates(DATADIR.joinpath("covars.tsv"))
-    for idx, line in enumerate(covars):
-        np.testing.assert_allclose(line.data, expected[idx])
-        assert line.samples == samples[idx]
-        assert line.names == ("sex", "age")
-
-
-def test_load_covariates_subset():
-    # create a covariate vector with shape: num_samples x num_covars
-    expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)])
-
-    # subset for just the samples we want
-    expected = expected[[1, 3]]
-
-    # can we load the data from the covariate file?
-    covars = Covariates(DATADIR.joinpath("covars.tsv"))
-    samples = ["HG00097", "HG00100"]
-    covars.read(samples=samples)
-    np.testing.assert_allclose(covars.data, expected)
-    assert covars.samples == tuple(samples)
+class TestPhenotypes:
+    def _get_expected_phenotypes(self):
+        # create a phen matrix with shape: samples x phenotypes
+        expected = np.array([
+            [1, 1, 2, 2, 0],
+            [3, 6, 8, 1, 4],
+        ], dtype="float64").T
+        return expected
+
+    def _get_fake_phenotypes(self):
+        gts = Phenotypes(fname=None)
+        gts.data = self._get_expected_phenotypes()
+        gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
+        gts.names = ("height", "bmi")
+        return gts
+
+    def test_load_phenotypes(self, caplog):
+        expected_phen = self._get_fake_phenotypes()
+        expected = expected_phen.data
+
+        # can we load the data from the phenotype file?
+        phens = Phenotypes(DATADIR.joinpath("simple.pheno"))
+        phens.read()
+        np.testing.assert_allclose(phens.data, expected)
+        assert phens.samples == expected_phen.samples
+
+        # try loading the data again - it should warn b/c we've already done it
+        caplog.clear()
+        phens.read()
+        assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"
+
+        expected = (expected - np.mean(expected, axis=0)) / np.std(expected, axis=0)
+        phens.standardize()
+        np.testing.assert_allclose(phens.data, expected)
+
+
+    def test_load_phenotypes_iterate(self):
+        expected_phen = self._get_fake_phenotypes()
+        expected = expected_phen.data
+        samples = expected_phen.samples
+
+        # can we load the data from the phenotype file?
+        phens = Phenotypes(DATADIR.joinpath("simple.pheno"))
+        for idx, line in enumerate(phens):
+            np.testing.assert_allclose(line.data, expected[idx])
+            assert line.samples == samples[idx]
+
+
+    def test_load_phenotypes_subset(self):
+        expected_phen = self._get_fake_phenotypes()
+        expected = expected_phen.data
+        samples = ["HG00097", "HG00100"]
+
+        # subset for just the samples we want
+        expected = expected[[1, 3]]
+
+        # can we load the data from the phenotype file?
+        phens = Phenotypes(DATADIR.joinpath("simple.pheno"))
+        phens.read(samples=samples)
+        np.testing.assert_allclose(phens.data, expected)
+        assert phens.samples == tuple(samples)
+
+
+class TestCovariates:
+    def _get_expected_covariates(self):
+        # create a covar matrix with shape: samples x covariates
+        expected = np.array([(0, 4), (1, 20), (1, 33), (0, 15), (0, 78)])
+        return expected
+
+    def _get_fake_covariates(self):
+        gts = Phenotypes(fname=None)
+        gts.data = self._get_expected_covariates()
+        gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
+        gts.names = ("sex", "age")
+        return gts
+
+    def test_load_covariates(self, caplog):
+        # create a covariate vector with shape: num_samples x num_covars
+        expected_covar = self._get_fake_covariates()
+        expected = expected_covar.data
+        samples = expected_covar.samples
+
+        # can we load the data from the covariates file?
+        covars = Covariates(DATADIR.joinpath("simple.covar"))
+        covars.read()
+        np.testing.assert_allclose(covars.data, expected)
+        assert covars.samples == samples
+        assert covars.names == ("sex", "age")
+
+        # try loading the data again - it should warn b/c we've already done it
+        caplog.clear()
+        covars.read()
+        assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"
+
+
+    def test_load_covariates_iterate(self):
+        # create a covariate vector with shape: num_samples x num_covars
+        expected_covar = self._get_fake_covariates()
+        expected = expected_covar.data
+        samples = expected_covar.samples
+
+        # can we load the data from the covariates file?
+        covars = Covariates(DATADIR.joinpath("simple.covar"))
+        for idx, line in enumerate(covars):
+            np.testing.assert_allclose(line.data, expected[idx])
+            assert line.samples == samples[idx]
+        assert covars.names == ("sex", "age")
+
+
+    def test_load_covariates_subset(self):
+        # create a covariate vector with shape: num_samples x num_covars
+        expected_covar = self._get_fake_covariates()
+        expected = expected_covar.data
+        samples = ["HG00097", "HG00100"]
+
+        # subset for just the samples we want
+        expected = expected[[1, 3]]
+
+        # can we load the data from the covariate file?
+        covars = Covariates(DATADIR.joinpath("simple.covar"))
+        covars.read(samples=samples)
+        np.testing.assert_allclose(covars.data, expected)
+        assert covars.samples == tuple(samples)
 
 
 class TestHaplotypes:

From 2849dcae9ef0aa1404a2a22f877baa15e0ba2f99 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 30 Jun 2022 22:59:46 -0700
Subject: [PATCH 16/44] clarify transform documentation

---
 README.md                  |  2 ++
 haptools/__main__.py       | 11 +++++++----
 haptools/sim_phenotypes.py |  2 +-
 haptools/transform.py      |  1 +
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 678dbeff..f9d64180 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,8 @@ Haptools consists of multiple utilities listed below. Click on a utility to see
 
 * [`haptools karyogram`](docs/commands/karyogram.md): Visualize a "chromosome painting" of local ancestry labels based on breakpoints output by `haptools simgenome`.
 
+* [`haptools transform`](https://haptools.readthedocs.io/en/latest/commands/transform.html): Transform a set of genotypes via a list of haplotypes. Create a new VCF containing haplotypes instead of variants.
+
 Outputs produced by these utilities are compatible with each other. For example
 `haptools simgenome` outputs a VCF file with local ancestry information annotated for each variant. The output VCF file can be used as input to `haptools simphenotype` to simulate phenotype information. `haptools simgenome` also outputs a list of local ancestry breakpoints which can be visualized using `haptools karyogram`. 
 
diff --git a/haptools/__main__.py b/haptools/__main__.py
index 26a635fc..d2b1fd30 100644
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -143,10 +143,10 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
 def simphenotype(
     genotypes: Path,
     haplotypes: Path,
+    simu_rep, simu_hsq, simu_k,
     region: str = None,
     samples: tuple[str] = tuple(),
     samples_file: Path = None,
-    simu_rep, simu_hsq, simu_k,
     output: Path = Path("-"),
     verbosity: str = 'ERROR',
 ):
@@ -220,9 +220,11 @@ def simphenotype(
     type=str,
     default=None,
     show_default="all haplotypes",
-    help="""
-    The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n
-    For this to work, the VCF must be indexed and the seqname must match!""",
+    help=(
+        "The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'"
+        "For this to work, the VCF and .hap file must be indexed and the seqnames must"
+        "correspond with those in the files"
+    )
 )
 @click.option(
     "-s",
@@ -290,6 +292,7 @@ def transform(
         The path to the haplotypes in a .hap file
     region : str, optional
         See documentation for :py:meth:`~.data.Genotypes.read`
+        and :py:meth:`~.data.Haplotypes.read`
     sample : Tuple[str], optional
         See documentation for :py:meth:`~.data.Genotypes.read`
     samples_file : Path, optional
diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 6472d008..87cae660 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -121,7 +121,7 @@ def simulate_pt(
     hp_gt = data.GenotypesRefAlt(fname=None, log=log)
     hp.transform(gt, hp_gt)
 
-    # Initialize phenotype simulator (haptools simphenotypes)
+    # Initialize phenotype simulator (haptools simphenotype)
     log.info("Simulating phenotypes")
     pt_sim = PhenoSimulator(hp_gt)
     phens = pt_sim.run()
diff --git a/haptools/transform.py b/haptools/transform.py
index 0745c80f..cbcb1bf5 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -24,6 +24,7 @@ def transform_haps(
         The path to the haplotypes in a .hap file
     region : str, optional
         See documentation for :py:meth:`~.data.Genotypes.read`
+        and :py:meth:`~.data.Haplotypes.read`
     samples : list[str], optional
         See documentation for :py:meth:`~.data.Genotypes.read`
     output : Path, optional

From a76e1a6158b2926a0ab7fdfe3c1fd12f01ed827d Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 1 Jul 2022 11:28:32 -0700
Subject: [PATCH 17/44] add Phenotypes.write() method

---
 haptools/data/phenotypes.py | 20 ++++++++++++++++++++
 tests/test_data.py          | 21 +++++++++++++++++----
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index c2d55792..3b64f7b3 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -165,6 +165,26 @@ def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]:
         # see https://stackoverflow.com/a/36726497
         return self._iterate(phens, phen_text, samples)
 
+    def write(self):
+        """
+        Write the phenotypes in this class to a file at :py:attr:`~.Phenotypes.fname`
+
+        Examples
+        --------
+        To write to a file, you must first initialize a Phenotypes object and then
+        fill out the names, data, and samples properties:
+        >>> phenotypes = Phenotypes('tests/data/simple.pheno')
+        >>> phenotypes.names = ('height',)
+        >>> phenotypes.data = np.array([1, 1, 2], dtype='float64')
+        >>> phenotypes.samples = ('HG00096', 'HG00097', 'HG00099')
+        >>> phenotypes.write()
+        """
+        with hook_compressed(self.fname, mode="wt") as haps:
+            haps.write("#IID\t"+"\t".join(self.names)+"\n")
+            for samp, phens in zip(self.samples, self.data):
+                line = np.array2string(phens, separator="\t", formatter={'float_kind':lambda x: "%.2f" % x})[1:-1]
+                haps.write(f"{samp}\t" + line + "\n")
+
     def standardize(self):
         """
         Standardize phenotypes so they have a mean of 0 and a stdev of 1
diff --git a/tests/test_data.py b/tests/test_data.py
index db97049c..d8bb3ba9 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -275,7 +275,6 @@ def test_load_phenotypes(self, caplog):
         phens.standardize()
         np.testing.assert_allclose(phens.data, expected)
 
-
     def test_load_phenotypes_iterate(self):
         expected_phen = self._get_fake_phenotypes()
         expected = expected_phen.data
@@ -287,7 +286,6 @@ def test_load_phenotypes_iterate(self):
             np.testing.assert_allclose(line.data, expected[idx])
             assert line.samples == samples[idx]
 
-
     def test_load_phenotypes_subset(self):
         expected_phen = self._get_fake_phenotypes()
         expected = expected_phen.data
@@ -302,6 +300,23 @@ def test_load_phenotypes_subset(self):
         np.testing.assert_allclose(phens.data, expected)
         assert phens.samples == tuple(samples)
 
+    def test_write_phenotypes(self):
+        expected_phen = self._get_fake_phenotypes()
+
+        # first, we write the data
+        expected_phen.fname = DATADIR.joinpath("test.pheno")
+        expected_phen.write()
+
+        # now, let's load the data and check that it's what we wrote
+        result = Phenotypes(expected_phen.fname)
+        result.read()
+        np.testing.assert_allclose(expected_phen.data, result.data)
+        assert expected_phen.names == result.names
+        assert expected_phen.samples == result.samples
+
+        # let's clean up after ourselves and delete the file
+        os.remove(str(expected_phen.fname))
+
 
 class TestCovariates:
     def _get_expected_covariates(self):
@@ -334,7 +349,6 @@ def test_load_covariates(self, caplog):
         covars.read()
         assert len(caplog.records) > 0 and caplog.records[0].levelname == "WARNING"
 
-
     def test_load_covariates_iterate(self):
         # create a covariate vector with shape: num_samples x num_covars
         expected_covar = self._get_fake_covariates()
@@ -348,7 +362,6 @@ def test_load_covariates_iterate(self):
             assert line.samples == samples[idx]
         assert covars.names == ("sex", "age")
 
-
     def test_load_covariates_subset(self):
         # create a covariate vector with shape: num_samples x num_covars
         expected_covar = self._get_fake_covariates()

From 80bf943a0be6c23f8421ecc75a62da71e203bd60 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 1 Jul 2022 11:31:55 -0700
Subject: [PATCH 18/44] refmt with black

---
 haptools/data/covariates.py |  2 +-
 haptools/data/haplotypes.py |  7 +------
 haptools/data/phenotypes.py | 19 ++++++++++---------
 tests/test_data.py          | 15 +++++++++------
 4 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/haptools/data/covariates.py b/haptools/data/covariates.py
index 64270501..04960e50 100644
--- a/haptools/data/covariates.py
+++ b/haptools/data/covariates.py
@@ -35,4 +35,4 @@ class Covariates(Phenotypes):
 
     def __init__(self, fname: Path, log: Logger = None):
         super(Phenotypes, self).__init__(fname, log)
-        self._ext = 'covar'
+        self._ext = "covar"
diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index a8ccdb88..385833f5 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -718,12 +718,7 @@ def __repr__(self):
 
     def write(self):
         """
-        Write the contents of this Haplotypes object to the file given by fname
-
-        Parameters
-        ----------
-        file: TextIO
-            A file-like object to which this Haplotypes object should be written.
+        Write the contents of this Haplotypes object to the file at :py:attr:`~.Haplotypes.fname`
 
         Examples
         --------
diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index 3b64f7b3..1a61ebaa 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -38,7 +38,7 @@ def __init__(self, fname: Path, log: Logger = None):
         super().__init__(fname, log)
         self.samples = tuple()
         self.names = tuple()
-        self._ext = 'pheno'
+        self._ext = "pheno"
 
     @classmethod
     def load(cls: Phenotypes, fname: Path, samples: list[str] = None) -> Phenotypes:
@@ -88,7 +88,9 @@ def read(self, samples: list[str] = None):
         # collect data in a np array
         self.data = np.array(data)
 
-    def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None):
+    def _iterate(
+        self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = None
+    ):
         """
         A generator over the lines of a TSV
 
@@ -116,9 +118,7 @@ def _iterate(self, phens: TextIOBase, phen_text: Iterable, samples: set[str] = N
         for phen in phen_text:
             if samples is None or phen[0] in samples:
                 try:
-                    yield Record(
-                        np.array(phen[1:], dtype="float64"), phen[0]
-                    )
+                    yield Record(np.array(phen[1:], dtype="float64"), phen[0])
                 except:
                     self.log.error(
                         f"Every column in the .{self._ext} file (besides the sample"
@@ -147,7 +147,7 @@ def __iter__(self, samples: list[str] = None) -> Iterable[namedtuple]:
         # ignore all of the comment lines
         while True:
             header = next(phen_text)
-            if not header[0].startswith('#') or header[0].startswith('#IID'):
+            if not header[0].startswith("#") or header[0].startswith("#IID"):
                 break
 
         # there should be at least two columns
@@ -180,9 +180,10 @@ def write(self):
         >>> phenotypes.write()
         """
         with hook_compressed(self.fname, mode="wt") as haps:
-            haps.write("#IID\t"+"\t".join(self.names)+"\n")
-            for samp, phens in zip(self.samples, self.data):
-                line = np.array2string(phens, separator="\t", formatter={'float_kind':lambda x: "%.2f" % x})[1:-1]
+            haps.write("#IID\t" + "\t".join(self.names) + "\n")
+            formatter = {"float_kind": lambda x: "%.2f" % x}
+            for samp, phen in zip(self.samples, self.data):
+                line = np.array2string(phen, separator="\t", formatter=formatter)[1:-1]
                 haps.write(f"{samp}\t" + line + "\n")
 
     def standardize(self):
diff --git a/tests/test_data.py b/tests/test_data.py
index d8bb3ba9..30c1e3a9 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -223,7 +223,7 @@ def test_subset_genotypes(self):
         # subset to just the variants we want
         expected_data = gts.data[:, [1, 2]]
         expected_variants = gts.variants[[1, 2]]
-        variants = ('1:10116:A:G', '1:10117:C:A')
+        variants = ("1:10116:A:G", "1:10117:C:A")
         gts_sub = gts.subset(variants=variants)
         assert gts_sub.samples == gts.samples
         np.testing.assert_allclose(gts_sub.data, expected_data)
@@ -233,7 +233,7 @@ def test_subset_genotypes(self):
         expected_data = gts.data[[3, 4], :2]
         expected_variants = gts.variants[:2]
         samples = ("HG00100", "HG00101")
-        variants = ("1:10114:T:C", '1:10116:A:G')
+        variants = ("1:10114:T:C", "1:10116:A:G")
         gts_sub = gts.subset(samples=samples, variants=variants)
         assert gts_sub.samples == samples
         np.testing.assert_allclose(gts_sub.data, expected_data)
@@ -243,10 +243,13 @@ def test_subset_genotypes(self):
 class TestPhenotypes:
     def _get_expected_phenotypes(self):
         # create a phen matrix with shape: samples x phenotypes
-        expected = np.array([
-            [1, 1, 2, 2, 0],
-            [3, 6, 8, 1, 4],
-        ], dtype="float64").T
+        expected = np.array(
+            [
+                [1, 1, 2, 2, 0],
+                [3, 6, 8, 1, 4],
+            ],
+            dtype="float64",
+        ).T
         return expected
 
     def _get_fake_phenotypes(self):

From d24920611cecf813bfa2d68fd7bada13b363be6d Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 1 Jul 2022 12:54:15 -0700
Subject: [PATCH 19/44] prelim finish PhenoSimulator.run

---
 haptools/sim_phenotypes.py | 73 ++++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 87cae660..3399352b 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -15,6 +15,8 @@ class PhenoSimulator:
     ----------
     gens: Genotypes
         Genotypes to simulate
+    phens: Phenotypes
+        Simulated phenotypes; filled by :py:meth:`~.PhenoSimular.run`
     log: Logger
         A logging instance for recording debug statements
 
@@ -31,6 +33,7 @@ class PhenoSimulator:
     def __init__(
         self,
         genotypes: Genotypes,
+        output: Path = None,
         log: Logger = None,
     ):
         """
@@ -40,22 +43,30 @@ def __init__(
         ----------
         genotypes: Genotypes
             Genotypes for each haplotype
+        output: Path
+            Path to a '.pheno' file to which the generated phenotypes could be written
         log: Logger, optional
             A logging instance for recording debug statements
         """
         self.gens = genotypes
+        self.phens = Phenotypes(fname=output)
+        self.phens.names = tuple()
+        self.phens.data = None
+        self.phens.samples = self.gens.samples
         self.log = log or getLogger(self.__class__.__name__)
 
     def run(
         self,
         varID: str,
         beta: float,
-        heritability: float = 1,
+        heritability: float = None,
         prevalence: float = None,
     ) -> npt.NDArray:
         """
         Simulate phenotypes for an entry in the Genotypes object
 
+        The generated phenotypes will also be added to :py:attr:`~.PhenoSimulator.fname`
+
         Parameters
         ----------
         varID: str
@@ -65,23 +76,40 @@ def run(
             The simulated effect size of the variant; must be between -1 and 1
         heritability: float, optional
             The simulated heritability of the trait
+
+            If not provided, this will be estimated from the variability of the
+            genotypes
         prevalence: float, optional
             How common should the disease be within the population?
 
             If this value is specified, case/control phenotypes will be generated
             instead of quantitative traits.
+
+        Returns
+        -------
         """
         gts = self.gens.subset(variants=[varID]).data[:, 0].sum(axis=1)
-        # error =
         pt = effect * (gts - gts.mean(axis=0)) / gts.std(axis=0)
-
-        # Simulate and write
-        for i in range(simu_rep):
-            resid_component = np.random.normal(0, np.var(pt) * (1 / simu_hsq - 1))
-            if simu_qt:
-                outinfo.append(pt + resid_component)
-            else:
-                raise NotImplementedError("Case control not implemented")
+        # add an error term
+        if heritability:
+            # as defined in GTCA
+            pt += np.random.normal(0, np.var(pt) * ((1/heritability) - 1) )
+        else:
+            pt += np.random.normal(0, 1 - np.var(gts)**2)
+        if self.phens.data is None:
+            self.phens.data = pt
+        else:
+            self.phens.data = np.concatenate((self.phens.data, pt), axis=1)
+        self.phens.names = self.phens.names + (varID,)
+        # TODO: implement case/control phenotypes
+        return pt
+
+    def write(self):
+        """
+        Write the generated phenotypes to the file specified in
+        :py:meth:`~.PhenoSimular.__init__`
+        """
+        self.phens.write()
 
 
 def simulate_pt(
@@ -123,24 +151,7 @@ def simulate_pt(
 
     # Initialize phenotype simulator (haptools simphenotype)
     log.info("Simulating phenotypes")
-    pt_sim = PhenoSimulator(hp_gt)
-    phens = pt_sim.run()
-
-    # Set up file to write summary info
-    outf_sum = open("%s.par" % outprefix, "w")
-    outf_sum.write("\t".join(["Haplotype", "Frequency", "Effect"]) + "\n")
-
-    # Add effect for each haplotype
-    for idx, hap in enumerate(haps.data.values()):
-        sample_to_hap = hap.transform(gens)
-        pt_sim.add_effect(sample_to_hap, hap.beta)
-
-        # Output summary info
-        hap_freq = hap.GetFrequency(gens)
-        outf_sum.write("\t".join([hap.id, str(hap_freq), str(hap.beta)]) + "\n")
-
-    # Output simulated phenotypes to a file (haptools simphenotypes)
-    pt_sim.run(outprefix, simu_rep, simu_hsq, simu_k, simu_qt, simu_cc)
-
-    # Done
-    outf_sum.close()
+    pt_sim = PhenoSimulator(hp_gt, output=output, log=log)
+    for hap in hp.data.values():
+        pt_sim.run(hap.id, hap.beta)
+    pt_sim.write()

From e00ff2b33a4938176d908eb99d29e2fffa313c45 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 1 Jul 2022 18:12:12 -0700
Subject: [PATCH 20/44] try simulating phenotypes from multiple haplotypes,
 additively

---
 haptools/data/haplotypes.py |  2 +-
 haptools/sim_phenotypes.py  | 43 +++++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index 385833f5..bd3f7da3 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -493,7 +493,7 @@ def check_header(self, lines: list[str], check_version=False):
                     expected_lines.remove(line)
                 except ValueError:
                     # extract the name of the extra field
-                    name = line.split("\t", maxsplit=1)[1]
+                    name = line.split("\t", maxsplit=2)[1]
                     raise ValueError(
                         f"The extra field '{name}' is declared in the header of the"
                         " .hap file but is not accepted by this tool."
diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 3399352b..d854fab7 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -1,9 +1,11 @@
 from __future__ import annotations
+from pathlib import Path
 from logging import getLogger, Logger
 
+import numpy as np
 import numpy.typing as npt
 
-from haplotype import HaptoolsHaplotype as Haplotype
+from .haplotype import HaptoolsHaplotype as Haplotype
 from .data import GenotypesRefAlt, Phenotypes, Haplotypes
 
 
@@ -57,23 +59,20 @@ def __init__(
 
     def run(
         self,
-        varID: str,
-        beta: float,
+        effects: list[Haplotype],
         heritability: float = None,
         prevalence: float = None,
     ) -> npt.NDArray:
         """
         Simulate phenotypes for an entry in the Genotypes object
 
-        The generated phenotypes will also be added to :py:attr:`~.PhenoSimulator.fname`
+        The generated phenotypes will also be added to
+        :py:attr:`~.PhenoSimulator.output`
 
         Parameters
         ----------
-        varID: str
-            The ID of the variant in the Genotype object from which to simulate
-            phenotypes
-        beta: float
-            The simulated effect size of the variant; must be between -1 and 1
+        effects: list[Haplotype]
+            A list of Haplotypes to use in an additive fashion within the simulations
         heritability: float, optional
             The simulated heritability of the trait
 
@@ -87,20 +86,29 @@ def run(
 
         Returns
         -------
+        npt.NDArray
+            The simulated phenotypes, as a np array of shape num_samples x 1
         """
-        gts = self.gens.subset(variants=[varID]).data[:, 0].sum(axis=1)
-        pt = effect * (gts - gts.mean(axis=0)) / gts.std(axis=0)
+        # extract the needed information from the Haplotype objects
+        ids = [hap.id for hap in effects]
+        betas = np.array([hap.beta for hap in effects])
+        # extract the haplotype "genotypes" and compute the phenotypes
+        gts = self.gens.subset(variants=ids).data.sum(axis=2)
+        # standardize the genotypes
+        gts = (gts - gts.mean(axis=0)) / gts.std(axis=0)
+        # generate the phenotypes
+        pt = (betas * gts).sum(axis=1)
         # add an error term
         if heritability:
             # as defined in GTCA
             pt += np.random.normal(0, np.var(pt) * ((1/heritability) - 1) )
         else:
-            pt += np.random.normal(0, 1 - np.var(gts)**2)
+            pt += np.random.normal(0, 1 - np.power(betas, 2).sum())
         if self.phens.data is None:
             self.phens.data = pt
         else:
             self.phens.data = np.concatenate((self.phens.data, pt), axis=1)
-        self.phens.names = self.phens.names + (varID,)
+        self.phens.names = self.phens.names + ("-".join(ids),)
         # TODO: implement case/control phenotypes
         return pt
 
@@ -131,14 +139,14 @@ def simulate_pt(
         )
 
     log.info("Loading haplotypes")
-    hp = data.Haplotypes(haplotypes, log=log)
+    hp = Haplotypes(haplotypes, haplotype=Haplotype, log=log)
     hp.read(region=region)
 
     log.info("Extracting variants from haplotypes")
     variants = {var.id for hap in hp.data.values() for var in hap.variants}
 
     log.info("Loading genotypes")
-    gt = data.GenotypesRefAlt(genotypes, log=log)
+    gt = GenotypesRefAlt(genotypes, log=log)
     # gt._prephased = True
     gt.read(region=region, samples=samples, variants=variants)
     gt.check_missing()
@@ -146,12 +154,11 @@ def simulate_pt(
     gt.check_phase()
 
     log.info("Transforming genotypes via haplotypes")
-    hp_gt = data.GenotypesRefAlt(fname=None, log=log)
+    hp_gt = GenotypesRefAlt(fname=None, log=log)
     hp.transform(gt, hp_gt)
 
     # Initialize phenotype simulator (haptools simphenotype)
     log.info("Simulating phenotypes")
     pt_sim = PhenoSimulator(hp_gt, output=output, log=log)
-    for hap in hp.data.values():
-        pt_sim.run(hap.id, hap.beta)
+    pt_sim.run(hp.data.values())
     pt_sim.write()

From 05430eec94811decc151bea6ebb1564312bd23d2 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sat, 2 Jul 2022 11:35:34 -0700
Subject: [PATCH 21/44] merge haplotype module with sim_phenotypes module

---
 haptools/haplotype.py      | 26 --------------------------
 haptools/sim_phenotypes.py | 25 +++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 28 deletions(-)
 delete mode 100644 haptools/haplotype.py

diff --git a/haptools/haplotype.py b/haptools/haplotype.py
deleted file mode 100644
index b6c0a1d8..00000000
--- a/haptools/haplotype.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from __future__ import annotations
-from dataclasses import dataclass, field
-
-import numpy as np
-
-from .data import Extra, Haplotype
-
-
-@dataclass
-class HaptoolsHaplotype(Haplotype):
-    """
-    A haplotype with sufficient fields for simphenotype
-
-    Properties and functions are shared with the base Haplotype object
-    """
-
-    ancestry: str
-    beta: float
-    _extras: tuple = field(
-        repr=False,
-        init=False,
-        default=(
-            Extra("ancestry", "s", "Local ancestry"),
-            Extra("beta", ".2f", "Effect size in linear model"),
-        ),
-    )
diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index d854fab7..8736834f 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -1,12 +1,33 @@
 from __future__ import annotations
 from pathlib import Path
 from logging import getLogger, Logger
+from dataclasses import dataclass, field
 
 import numpy as np
 import numpy.typing as npt
 
-from .haplotype import HaptoolsHaplotype as Haplotype
-from .data import GenotypesRefAlt, Phenotypes, Haplotypes
+from .data import Haplotype as HaplotypeBase
+from .data import GenotypesRefAlt, Phenotypes, Haplotypes, Extra
+
+
+@dataclass
+class Haplotype(HaplotypeBase):
+    """
+    A haplotype with sufficient fields for simphenotype
+
+    Properties and functions are shared with the base Haplotype object, "HaplotypeBase"
+    """
+
+    ancestry: str
+    beta: float
+    _extras: tuple = field(
+        repr=False,
+        init=False,
+        default=(
+            Extra("ancestry", "s", "Local ancestry"),
+            Extra("beta", ".2f", "Effect size in linear model"),
+        ),
+    )
 
 
 class PhenoSimulator:

From 0636294a3d652ef2cc8d2668fb1b855b20bb13ed Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sat, 2 Jul 2022 13:00:21 -0700
Subject: [PATCH 22/44] compute heritability when not provided in simphenotype

---
 haptools/sim_phenotypes.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 8736834f..019be83f 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 from pathlib import Path
+from itertools import combinations
 from logging import getLogger, Logger
 from dataclasses import dataclass, field
 
@@ -110,27 +111,38 @@ def run(
         npt.NDArray
             The simulated phenotypes, as a np array of shape num_samples x 1
         """
-        # extract the needed information from the Haplotype objects
+        # extract the ID and effect size information from the Haplotype objects
         ids = [hap.id for hap in effects]
         betas = np.array([hap.beta for hap in effects])
         # extract the haplotype "genotypes" and compute the phenotypes
         gts = self.gens.subset(variants=ids).data.sum(axis=2)
         # standardize the genotypes
         gts = (gts - gts.mean(axis=0)) / gts.std(axis=0)
-        # generate the phenotypes
+        # generate the genetic component
         pt = (betas * gts).sum(axis=1)
-        # add an error term
-        if heritability:
-            # as defined in GTCA
-            pt += np.random.normal(0, np.var(pt) * ((1/heritability) - 1) )
-        else:
-            pt += np.random.normal(0, 1 - np.power(betas, 2).sum())
+        # compute the heritability
+        if heritability is None:
+            # if heritability is not defined, then we set it equal to the sum of the
+            # effect sizes
+            # assuming the genotypes are independent, this makes the variance of the
+            # noise term equal to 1 - sum(betas^2)
+            heritability = np.power(betas, 2).sum()
+            # # account for the fact that the genotypes are not independent by adding the
+            # # covariance between all of the variables
+            # for a_idx, b_idx in combinations(range(len(betas)), 2):
+            #     heritability += 2 * betas[a_idx] * betas[b_idx] * \
+            #         np.cov(gts[:,a_idx], gts[:,b_idx])[0][1]
+        # compute the environmental effect
+        noise = np.var(pt) * (np.reciprocal(heritability) - 1)
+        # finally, add everything together to get the simulated phenotypes
+        pt += np.random.normal(0, noise, size=pt.shape)
+        # TODO: implement case/control phenotypes
+        # now, save the archived phenotypes for later
         if self.phens.data is None:
             self.phens.data = pt
         else:
             self.phens.data = np.concatenate((self.phens.data, pt), axis=1)
         self.phens.names = self.phens.names + ("-".join(ids),)
-        # TODO: implement case/control phenotypes
         return pt
 
     def write(self):
@@ -170,6 +182,7 @@ def simulate_pt(
     gt = GenotypesRefAlt(genotypes, log=log)
     # gt._prephased = True
     gt.read(region=region, samples=samples, variants=variants)
+    log.info("QC-ing genotypes")
     gt.check_missing()
     gt.check_biallelic()
     gt.check_phase()
@@ -182,4 +195,5 @@ def simulate_pt(
     log.info("Simulating phenotypes")
     pt_sim = PhenoSimulator(hp_gt, output=output, log=log)
     pt_sim.run(hp.data.values())
+    log.info("Writing phenotypes")
     pt_sim.write()

From 54f48de43c42f1af524633dd7aa4d7a76026e806 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:01:10 -0700
Subject: [PATCH 23/44] implement case/control

---
 haptools/sim_phenotypes.py | 71 ++++++++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotypes.py
index 019be83f..2b1d3d35 100644
--- a/haptools/sim_phenotypes.py
+++ b/haptools/sim_phenotypes.py
@@ -111,17 +111,20 @@ def run(
         npt.NDArray
             The simulated phenotypes, as a np array of shape num_samples x 1
         """
-        # extract the ID and effect size information from the Haplotype objects
+        # extract the relevant haplotype info from the Haplotype objects
         ids = [hap.id for hap in effects]
         betas = np.array([hap.beta for hap in effects])
+        self.log.info(f"Extracting haplotype genotypes for haps: {ids}")
         # extract the haplotype "genotypes" and compute the phenotypes
         gts = self.gens.subset(variants=ids).data.sum(axis=2)
+        self.log.info("Computing genetic component")
         # standardize the genotypes
         gts = (gts - gts.mean(axis=0)) / gts.std(axis=0)
         # generate the genetic component
         pt = (betas * gts).sum(axis=1)
         # compute the heritability
         if heritability is None:
+            self.log.info("Computing heritability")
             # if heritability is not defined, then we set it equal to the sum of the
             # effect sizes
             # assuming the genotypes are independent, this makes the variance of the
@@ -132,11 +135,20 @@ def run(
             # for a_idx, b_idx in combinations(range(len(betas)), 2):
             #     heritability += 2 * betas[a_idx] * betas[b_idx] * \
             #         np.cov(gts[:,a_idx], gts[:,b_idx])[0][1]
+        self.log.info(f"Adding environmental component for h^squared: {heritability}")
         # compute the environmental effect
         noise = np.var(pt) * (np.reciprocal(heritability) - 1)
         # finally, add everything together to get the simulated phenotypes
         pt += np.random.normal(0, noise, size=pt.shape)
-        # TODO: implement case/control phenotypes
+        if prevalence is not None:
+            self.log.info(f"Converting to case/control with prevalence {prevalence}")
+            # first, find the number of desired positives
+            k = int(prevalence * len(pt))
+            # choose the top k values and label them positive
+            bool_pt = np.repeat(False, repeats=len(pt))
+            bool_pt[np.argpartition(pt, k)[-k:]] = True
+            pt = bool_pt
+        pt = pt[:, np.newaxis]
         # now, save the archived phenotypes for later
         if self.phens.data is None:
             self.phens.data = pt
@@ -156,14 +168,60 @@ def write(self):
 def simulate_pt(
     genotypes: Path,
     haplotypes: Path,
-    simu_rep: int,
-    simu_hsq: float,
-    simu_k: float,
+    num_replications: int = 1,
+    heritability: float = None,
+    prevalence: float = None,
     region: str = None,
     samples: list[str] = None,
     output: Path = Path("-"),
     log: Logger = None,
 ):
+    """
+    Haplotype-aware phenotype simulation. Create a set of simulated phenotypes from a
+    set of haplotypes.
+
+    GENOTYPES must be formatted as a VCF and HAPLOTYPES must be formatted according
+    to the .hap format spec
+
+    \f
+    Examples
+    --------
+    >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simu_phens.tsv
+
+    Parameters
+    ----------
+    genotypes : Path
+        The path to the genotypes in VCF format
+    haplotypes : Path
+        The path to the haplotypes in a .hap file
+    replications : int, optional
+        The number of rounds of simulation to perform
+    heritability : int, optional
+        The heritability of the simulated trait; must be a float between 0 and 1
+    prevalence : int, optional
+        The prevalence of the disease if the trait should be simulated as case/control;
+        must be a float between 0 and 1
+
+        If not provided, a quantitative trait will be simulated, instead
+    region : str, optional
+        The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'
+
+        For this to work, the VCF and .hap file must be indexed and the seqname must
+        match!
+
+        Defaults to loading all haplotypes
+    sample : tuple[str], optional
+        A subset of the samples from which to extract genotypes
+
+        Defaults to loading genotypes from all samples
+    samples_file : Path, optional
+        A single column txt file containing a list of the samples (one per line) to
+        subset from the genotypes file
+    output : Path, optional
+        The location to which to write the simulated phenotypes
+    log : Logger, optional
+        The logging module for this task
+    """
     if log is None:
         log = logging.getLogger("run")
         logging.basicConfig(
@@ -194,6 +252,7 @@ def simulate_pt(
     # Initialize phenotype simulator (haptools simphenotype)
     log.info("Simulating phenotypes")
     pt_sim = PhenoSimulator(hp_gt, output=output, log=log)
-    pt_sim.run(hp.data.values())
+    for i in range(num_replications):
+        pt_sim.run(hp.data.values(), heritability, prevalence)
     log.info("Writing phenotypes")
     pt_sim.write()

From 7fba06c5fe77bd9ad5ad57c561fa8c974362d822 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:01:52 -0700
Subject: [PATCH 24/44] improve commenting in __main__

---
 haptools/__main__.py       | 92 ++++++++++++++++++++++++++------------
 haptools/data/genotypes.py |  6 +--
 haptools/transform.py      |  2 +-
 3 files changed, 67 insertions(+), 33 deletions(-)

diff --git a/haptools/__main__.py b/haptools/__main__.py
index d2b1fd30..38f7a697 100644
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -80,21 +80,40 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
     breakpoints = write_breakpoints(samples, breakpoints, out)
 
 ############ Haptools simphenotype ###############
-DEFAULT_SIMU_REP = 1
-DEFAULT_SIMU_HSQ = 0.1
-DEFAULT_SIMU_K = 0.1
-##################################################
 @main.command()
 @click.argument("genotypes", type=click.Path(exists=True, path_type=Path))
 @click.argument("haplotypes", type=click.Path(exists=True, path_type=Path))
+@click.option(
+    "-r",
+    "--replications",
+    type=click.IntRange(min=1),
+    default=1,
+    show_default=True,
+    help="Number of rounds of simulation to perform",
+)
+@click.option(
+    "-h",
+    "--heritability",
+    type=click.FloatRange(min=0, max=1),
+    help="Trait heritability",
+)
+@click.option(
+    "-p",
+    "--prevalence",
+    type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
+    show_default="quantitative trait",
+    help="Disease prevalence if simulating a case-control trait"
+)
 @click.option(
     "--region",
     type=str,
     default=None,
     show_default="all haplotypes",
-    help="""
-    The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'\n
-    For this to work, the VCF must be indexed and the seqname must match!""",
+    help=(
+        "The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'"
+        "For this to work, the VCF and .hap file must be indexed and the seqname "
+        "provided must correspond with one in the files"
+    )
 )
 @click.option(
     "-s",
@@ -118,17 +137,11 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
         " subset from the genotypes file"
     ),
 )
-@click.option('--simu-rep', help='Number of rounds of simulation to perform', \
-        type=int, default=DEFAULT_SIMU_REP)
-@click.option('--simu-hsq', help='Trait heritability', \
-        type=float, default=DEFAULT_SIMU_HSQ)
-@click.option('--simu-k', help='Specify the disease prevalence', \
-        type=float, default=DEFAULT_SIMU_K)
 @click.option(
     "-o",
     "--output",
     type=click.Path(path_type=Path),
-    default=Path("-"),
+    default=Path("/dev/stdout"),
     show_default="stdout",
     help="A TSV file containing simulated phenotypes",
 )
@@ -143,7 +156,9 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
 def simphenotype(
     genotypes: Path,
     haplotypes: Path,
-    simu_rep, simu_hsq, simu_k,
+    replications: int = 1,
+    heritability: float = None,
+    prevalence: float = None,
     region: str = None,
     samples: tuple[str] = tuple(),
     samples_file: Path = None,
@@ -168,10 +183,26 @@ def simphenotype(
         The path to the genotypes in VCF format
     haplotypes : Path
         The path to the haplotypes in a .hap file
+    replications : int, optional
+        The number of rounds of simulation to perform
+    heritability : int, optional
+        The heritability of the simulated trait; must be a float between 0 and 1
+    prevalence : int, optional
+        The prevalence of the disease if the trait should be simulated as case/control;
+        must be a float between 0 and 1
+
+        If not provided, a quantitative trait will be simulated, instead
     region : str, optional
-        See documentation for :py:meth:`~.data.Genotypes.read`
-    sample : Tuple[str], optional
-        See documentation for :py:meth:`~.data.Genotypes.read`
+        The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'
+
+        For this to work, the VCF and .hap file must be indexed and the seqname must
+        match!
+
+        Defaults to loading all haplotypes
+    sample : tuple[str], optional
+        A subset of the samples from which to extract genotypes
+
+        Defaults to loading genotypes from all samples
     samples_file : Path, optional
         A single column txt file containing a list of the samples (one per line) to
         subset from the genotypes file
@@ -202,14 +233,11 @@ def simphenotype(
         samples = list(samples)
     else:
         samples = None
-    # Basic checks on input
-    # TODO - check VCF zipped, check only one of simu-qt/simu-cc,
-    # check values of other inputs
-    # Only use simu-k for case/control
 
     # Run simulation
     simulate_pt(
-        genotypes, haplotypes, simu_rep, simu_hsq, simu_k, region, samples, output, log
+        genotypes, haplotypes, replications, heritability, prevalence, region, samples,
+        output, log
     )
 
 @main.command(short_help="Transform a genotypes matrix via a set of haplotypes")
@@ -222,8 +250,8 @@ def simphenotype(
     show_default="all haplotypes",
     help=(
         "The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'"
-        "For this to work, the VCF and .hap file must be indexed and the seqnames must"
-        "correspond with those in the files"
+        "For this to work, the VCF and .hap file must be indexed and the seqname "
+        "provided must correspond with one in the files"
     )
 )
 @click.option(
@@ -291,10 +319,16 @@ def transform(
     haplotypes : Path
         The path to the haplotypes in a .hap file
     region : str, optional
-        See documentation for :py:meth:`~.data.Genotypes.read`
-        and :py:meth:`~.data.Haplotypes.read`
-    sample : Tuple[str], optional
-        See documentation for :py:meth:`~.data.Genotypes.read`
+        The region from which to extract haplotypes; ex: 'chr1:1234-34566' or 'chr7'
+
+        For this to work, the VCF and .hap file must be indexed and the seqname must
+        match!
+
+        Defaults to loading all haplotypes
+    sample : tuple[str], optional
+        A subset of the samples from which to extract genotypes
+
+        Defaults to loading genotypes from all samples
     samples_file : Path, optional
         A single column txt file containing a list of the samples (one per line) to
         subset from the genotypes file
diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index 039446a9..ff128ee6 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -289,9 +289,9 @@ def __iter__(
 
     def index(self, samples: bool = True, variants: bool = True):
         """
-        Call this function once to improve the speed of future look-ups of samples and
-        variants by their ID. This is useful if you intend to later subset by a set
-        of samples or variant IDs.
+        Call this function once to improve the amortized time-complexity of look-ups of
+        samples and variants by their ID. This is useful if you intend to later subset
+        by a set of samples or variant IDs.
         The time complexity of this function should be roughly O(n+m) if both
         parameters are True. Otherwise, it will be either O(n) or O(m).
 
diff --git a/haptools/transform.py b/haptools/transform.py
index cbcb1bf5..2de57a2f 100644
--- a/haptools/transform.py
+++ b/haptools/transform.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 import logging
+from pathlib import Path
 
 from haptools import data
-from .haplotype import HaptoolsHaplotype
 
 
 def transform_haps(

From 6c9d7ce29c117fd17acd89a1aa78ea7fa23be256 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:06:14 -0700
Subject: [PATCH 25/44] rename sim_phenotypes module to sim_phenotype
 (singular)

---
 haptools/__main__.py                             | 2 +-
 haptools/{sim_phenotypes.py => sim_phenotype.py} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename haptools/{sim_phenotypes.py => sim_phenotype.py} (100%)

diff --git a/haptools/__main__.py b/haptools/__main__.py
index 38f7a697..999e9293 100644
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -213,7 +213,7 @@ def simphenotype(
     """
     import logging
 
-    from .sim_phenotypes import simulate_pt
+    from .sim_phenotype import simulate_pt
 
     log = logging.getLogger("run")
     logging.basicConfig(
diff --git a/haptools/sim_phenotypes.py b/haptools/sim_phenotype.py
similarity index 100%
rename from haptools/sim_phenotypes.py
rename to haptools/sim_phenotype.py

From c6f1c5c086cdbf13748ce5892fb0937586debbad Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Sun, 3 Jul 2022 14:19:12 -0700
Subject: [PATCH 26/44] create test module for simphenotype

---
 haptools/sim_phenotype.py  | 12 ++++++++----
 tests/test_simphenotype.py | 23 +++++++++++++++++++++++
 2 files changed, 31 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_simphenotype.py

diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index 2b1d3d35..a2c96cc6 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -51,13 +51,14 @@ class PhenoSimulator:
     >>> haps_gts = GenotypesRefAlt(None)
     >>> haps.transform(gens, haps_gts)
     >>> phenosim = PhenoSimulator(haps_gts)
-    >>> phenotypes = phenosim.run()
+    >>> phenosim.run(next(haps.data.values()))
+    >>> phenotypes = phenosim.phens
     """
 
     def __init__(
         self,
         genotypes: Genotypes,
-        output: Path = None,
+        output: Path = Path("/dev/stdout"),
         log: Logger = None,
     ):
         """
@@ -67,8 +68,10 @@ def __init__(
         ----------
         genotypes: Genotypes
             Genotypes for each haplotype
-        output: Path
+        output: Path, optional
             Path to a '.pheno' file to which the generated phenotypes could be written
+
+            Defaults to stdout if not provided
         log: Logger, optional
             A logging instance for recording debug statements
         """
@@ -129,13 +132,14 @@ def run(
             # effect sizes
             # assuming the genotypes are independent, this makes the variance of the
             # noise term equal to 1 - sum(betas^2)
+            # TODO: figure out how to account for the fact that this can make noise > 1
             heritability = np.power(betas, 2).sum()
             # # account for the fact that the genotypes are not independent by adding the
             # # covariance between all of the variables
             # for a_idx, b_idx in combinations(range(len(betas)), 2):
             #     heritability += 2 * betas[a_idx] * betas[b_idx] * \
             #         np.cov(gts[:,a_idx], gts[:,b_idx])[0][1]
-        self.log.info(f"Adding environmental component for h^squared: {heritability}")
+        self.log.info(f"Adding environmental component for h^2: {heritability}")
         # compute the environmental effect
         noise = np.var(pt) * (np.reciprocal(heritability) - 1)
         # finally, add everything together to get the simulated phenotypes
diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py
new file mode 100644
index 00000000..97500955
--- /dev/null
+++ b/tests/test_simphenotype.py
@@ -0,0 +1,23 @@
+import os
+from pathlib import Path
+
+import pytest
+import numpy as np
+import numpy.lib.recfunctions as rfn
+
+from haptools.sim_phenotype import Haplotype
+from haptools.data import (
+    Genotypes,
+    GenotypesRefAlt,
+    Phenotypes,
+    Haplotypes,
+    Variant,
+    Haplotype as HaplotypeBase,
+)
+
+
+DATADIR = Path(__file__).parent.joinpath("data")
+
+def test_simphenotype(self):
+    pass
+

From 1d9677527eebafb541c82de5c5f5c9abcf0aac6f Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 5 Jul 2022 17:54:48 -0700
Subject: [PATCH 27/44] try to use new pysam add_samples() method to speed up
 vcf creation see https://github.com/pysam-developers/pysam/issues/1104

---
 haptools/data/genotypes.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index ff128ee6..f9decf72 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -590,10 +590,15 @@ def write(self):
                 ("Description", "Genotype"),
             ],
         )
-        for sample in self.samples:
-            # TODO: figure out how to make this work for large datasets
-            # it gets stuck and takes a long time to exit this loop
-            vcf.header.add_sample(sample)
+        try:
+            vcf.header.add_samples(self.samples)
+        except AttributeError:
+            self.log.warning(
+                "Upgrade to pysam >=0.19.1 to reduce the time required to create "
+                "VCFs. See https://github.com/pysam-developers/pysam/issues/1104"
+            )
+            for sample in self.samples:
+                vcf.header.add_sample(sample)
         self.log.info("Writing VCF records")
         for var_idx, var in enumerate(self.variants):
             rec = {

From 23a067acf2c15030314b2e5a7b6c9cd5c179adf7 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 5 Jul 2022 17:58:54 -0700
Subject: [PATCH 28/44] use default heritability of 1 and add append method to
 Phenotypes class

---
 docs/commands/transform.rst |  2 +-
 haptools/__main__.py        |  4 +++-
 haptools/data/phenotypes.py | 30 ++++++++++++++++++++++++++++++
 haptools/sim_phenotype.py   | 26 ++++----------------------
 tests/test_data.py          | 18 +++++++++++++++++-
 5 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/docs/commands/transform.rst b/docs/commands/transform.rst
index e6f82493..905fb9b7 100644
--- a/docs/commands/transform.rst
+++ b/docs/commands/transform.rst
@@ -24,7 +24,7 @@ Examples
 ~~~~~~~~
 .. code-block:: bash
 
-	haptools transform tests/data/example.vcf.gz tests/data/example.hap.gz | less
+	haptools transform tests/data/example.vcf.gz tests/data/basic.hap.gz | less
 
 Detailed Usage
 ~~~~~~~~~~~~~~
diff --git a/haptools/__main__.py b/haptools/__main__.py
index 999e9293..adeab099 100644
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -95,6 +95,8 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
     "-h",
     "--heritability",
     type=click.FloatRange(min=0, max=1),
+    default=1,
+    show_default=True,
     help="Trait heritability",
 )
 @click.option(
@@ -157,7 +159,7 @@ def simphenotype(
     genotypes: Path,
     haplotypes: Path,
     replications: int = 1,
-    heritability: float = None,
+    heritability: float = 1,
     prevalence: float = None,
     region: str = None,
     samples: tuple[str] = tuple(),
diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index 1a61ebaa..dbbb2626 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -193,3 +193,33 @@ def standardize(self):
         This function modifies :py:attr:`~.Genotypes.data` in-place
         """
         self.data = (self.data - np.mean(self.data, axis=0)) / np.std(self.data, axis=0)
+
+    def append(self, name: str, data: npt.NDArray):
+        """
+        Append a new set of phenotypes to the current set
+
+        Parameters
+        ----------
+        name: str
+            The name of the new phenotype
+        data: npt.NDArray
+            A 1D np array of the same length as :py:attr:`~.Phenotypes.samples`,
+            containing the phenotype values for each sample. Must have the same dtype
+            as :py:attr:`~.Phenotypes.data.`
+        """
+        if len(self.samples):
+            if len(self.samples) != len(data):
+                self.log.error(
+                    "The data provided to the add() method is not of the appropriate"
+                    "length"
+                )
+        else:
+            self.log.warning(
+                "Set the samples property of the Phenotypes instance before calling "
+                "the add() method"
+            )
+        if self.unset():
+            self.data = data[:, np.newaxis]
+        else:
+            self.data = np.concatenate((self.data, data[:, np.newaxis]), axis=1)
+        self.names = self.names + (name,)
diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index a2c96cc6..ca7a1c8b 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -85,7 +85,7 @@ def __init__(
     def run(
         self,
         effects: list[Haplotype],
-        heritability: float = None,
+        heritability: float = 1,
         prevalence: float = None,
     ) -> npt.NDArray:
         """
@@ -120,25 +120,12 @@ def run(
         self.log.info(f"Extracting haplotype genotypes for haps: {ids}")
         # extract the haplotype "genotypes" and compute the phenotypes
         gts = self.gens.subset(variants=ids).data.sum(axis=2)
-        self.log.info("Computing genetic component")
+        self.log.info(f"Computing genetic component w/ {gts.shape[0]} causal effects")
         # standardize the genotypes
         gts = (gts - gts.mean(axis=0)) / gts.std(axis=0)
         # generate the genetic component
         pt = (betas * gts).sum(axis=1)
         # compute the heritability
-        if heritability is None:
-            self.log.info("Computing heritability")
-            # if heritability is not defined, then we set it equal to the sum of the
-            # effect sizes
-            # assuming the genotypes are independent, this makes the variance of the
-            # noise term equal to 1 - sum(betas^2)
-            # TODO: figure out how to account for the fact that this can make noise > 1
-            heritability = np.power(betas, 2).sum()
-            # # account for the fact that the genotypes are not independent by adding the
-            # # covariance between all of the variables
-            # for a_idx, b_idx in combinations(range(len(betas)), 2):
-            #     heritability += 2 * betas[a_idx] * betas[b_idx] * \
-            #         np.cov(gts[:,a_idx], gts[:,b_idx])[0][1]
         self.log.info(f"Adding environmental component for h^2: {heritability}")
         # compute the environmental effect
         noise = np.var(pt) * (np.reciprocal(heritability) - 1)
@@ -152,13 +139,8 @@ def run(
             bool_pt = np.repeat(False, repeats=len(pt))
             bool_pt[np.argpartition(pt, k)[-k:]] = True
             pt = bool_pt
-        pt = pt[:, np.newaxis]
         # now, save the archived phenotypes for later
-        if self.phens.data is None:
-            self.phens.data = pt
-        else:
-            self.phens.data = np.concatenate((self.phens.data, pt), axis=1)
-        self.phens.names = self.phens.names + ("-".join(ids),)
+        self.phens.append(name="-".join(ids), data=pt)
         return pt
 
     def write(self):
@@ -173,7 +155,7 @@ def simulate_pt(
     genotypes: Path,
     haplotypes: Path,
     num_replications: int = 1,
-    heritability: float = None,
+    heritability: float = 1,
     prevalence: float = None,
     region: str = None,
     samples: list[str] = None,
diff --git a/tests/test_data.py b/tests/test_data.py
index 30c1e3a9..001eac7a 100644
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -5,7 +5,7 @@
 import numpy as np
 import numpy.lib.recfunctions as rfn
 
-from haptools.haplotype import HaptoolsHaplotype
+from haptools.sim_phenotype import Haplotype as HaptoolsHaplotype
 from haptools.data import (
     Genotypes,
     GenotypesRefAlt,
@@ -320,6 +320,22 @@ def test_write_phenotypes(self):
         # let's clean up after ourselves and delete the file
         os.remove(str(expected_phen.fname))
 
+    def test_append_phenotype(self):
+        expected1 = self._get_fake_phenotypes()
+        expected2 = self._get_fake_phenotypes()
+
+        # add a phenotype called "test" to the set of phenotypes
+        new_phen = np.array([5, 2, 8, 0, 3], dtype=expected2.data.dtype)
+        expected2.append("test", new_phen)
+
+        # did it get added correctly?
+        assert expected1.data.shape[1] == expected2.data.shape[1] - 1
+        assert len(expected1.names) == len(expected2.names) - 1
+        assert expected2.names[2] == "test"
+        assert len(expected1.samples) == len(expected2.samples)
+        np.testing.assert_allclose(expected1.data, expected2.data[:, :2])
+        np.testing.assert_allclose(expected2.data[:, 2], new_phen)
+
 
 class TestCovariates:
     def _get_expected_covariates(self):

From 30852bcd297d9ad43d9ece7885d3692e393ddf84 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 5 Jul 2022 19:43:25 -0700
Subject: [PATCH 29/44] clarify installation and contribution guidelines remove
 @mlamkin7's email so he doesn't get spam from bots that scrape repositories

---
 README.md                   | 10 ++++++----
 haptools/data/phenotypes.py |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f9d64180..f8692eab 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,10 @@ Homepage: https://haptools.readthedocs.io/
 
 ## Installation
 
-UNDER CONSTRUCTION
+We have not officially published `haptools` yet, but in the meantime, you can install it directly from our Github repository.
+```bash
+pip install git+https://github.com/gymrek-lab/haptools.git
+```
 
 ## Haptools utilities
 
@@ -28,7 +31,6 @@ Outputs produced by these utilities are compatible with each other. For example
 
 ## Contributing
 
-If you are interested in contributing to `haptools`, please get in touch by submitting a Github issue or contacting us at mlamkin@ucsd.edu.
-
-
+We gladly welcome any contributions to `haptools`!
 
+Please read [our contribution guidelines](https://haptools.readthedocs.io/en/latest/project_info/contributing.html) and then submit a [Github issue](https://github.com/gymrek-lab/haptools/issues).
diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index dbbb2626..407ebde4 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -8,6 +8,7 @@
 from fileinput import hook_compressed
 
 import numpy as np
+import numpy.typing as npt
 
 from .data import Data
 

From fe63e3263131b8a810586c3ce64f73c0ba7322d8 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 7 Jul 2022 09:49:05 -0700
Subject: [PATCH 30/44] seed the simphenotype random number generator

---
 haptools/data/genotypes.py  |  4 ++--
 haptools/data/haplotypes.py |  4 ++--
 haptools/sim_phenotype.py   | 12 ++++++++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py
index f9decf72..98d4cd83 100644
--- a/haptools/data/genotypes.py
+++ b/haptools/data/genotypes.py
@@ -348,7 +348,7 @@ def subset(
                 diff = len(samples) - len(gts.samples)
                 self.log.warning(
                     f"Saw {diff} fewer samples than requested. Proceeding with "
-                    f"f{len(gts.samples)} samples."
+                    f"{len(gts.samples)} samples."
                 )
             samp_idx = tuple(self._samp_idx[samp] for samp in gts.samples)
             if inplace:
@@ -361,7 +361,7 @@ def subset(
                 diff = len(variants) - len(var_idx)
                 self.log.warning(
                     f"Saw {diff} fewer variants than requested. Proceeding with "
-                    f"f{len(var_idx)} variants."
+                    f"{len(var_idx)} variants."
                 )
             gts.variants = self.variants[var_idx]
             if inplace:
diff --git a/haptools/data/haplotypes.py b/haptools/data/haplotypes.py
index bd3f7da3..5adb031a 100644
--- a/haptools/data/haplotypes.py
+++ b/haptools/data/haplotypes.py
@@ -234,8 +234,8 @@ class Haplotype:
         The chromosomal end position of the haplotype
     id: str
         The haplotype's unique ID
-    variants: list[Variant]
-        A list of the variants in this haplotype
+    variants: tuple[Variant]
+        The variants in this haplotype
     _extras: tuple[Extra]
         Extra fields for the haplotype
 
diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index ca7a1c8b..b0c5386d 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -41,6 +41,8 @@ class PhenoSimulator:
         Genotypes to simulate
     phens: Phenotypes
         Simulated phenotypes; filled by :py:meth:`~.PhenoSimular.run`
+    rng: np.random.Generator, optional
+        A numpy random number generator
     log: Logger
         A logging instance for recording debug statements
 
@@ -59,6 +61,7 @@ def __init__(
         self,
         genotypes: Genotypes,
         output: Path = Path("/dev/stdout"),
+        seed: int = None,
         log: Logger = None,
     ):
         """
@@ -72,14 +75,19 @@ def __init__(
             Path to a '.pheno' file to which the generated phenotypes could be written
 
             Defaults to stdout if not provided
+        seed: int, optional
+            A seed to initialize the random number generator
+
+            This is useful if you want the generated phenotypes to be the same across
+            multiple PhenoSimulator instances. If not provided, it will be random.
         log: Logger, optional
             A logging instance for recording debug statements
         """
         self.gens = genotypes
         self.phens = Phenotypes(fname=output)
-        self.phens.names = tuple()
         self.phens.data = None
         self.phens.samples = self.gens.samples
+        self.rng = np.random.default_rng(seed)
         self.log = log or getLogger(self.__class__.__name__)
 
     def run(
@@ -130,7 +138,7 @@ def run(
         # compute the environmental effect
         noise = np.var(pt) * (np.reciprocal(heritability) - 1)
         # finally, add everything together to get the simulated phenotypes
-        pt += np.random.normal(0, noise, size=pt.shape)
+        pt += self.rng.normal(0, noise, size=pt.shape)
         if prevalence is not None:
             self.log.info(f"Converting to case/control with prevalence {prevalence}")
             # first, find the number of desired positives

From 24fef3923520cc54c070a8bee714b49235e462e3 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 8 Jul 2022 08:53:35 -0700
Subject: [PATCH 31/44] mark case/control in pheno output

---
 haptools/sim_phenotype.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index b0c5386d..71bacadd 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -139,6 +139,7 @@ def run(
         noise = np.var(pt) * (np.reciprocal(heritability) - 1)
         # finally, add everything together to get the simulated phenotypes
         pt += self.rng.normal(0, noise, size=pt.shape)
+        name_suffix = ""
         if prevalence is not None:
             self.log.info(f"Converting to case/control with prevalence {prevalence}")
             # first, find the number of desired positives
@@ -147,8 +148,9 @@ def run(
             bool_pt = np.repeat(False, repeats=len(pt))
             bool_pt[np.argpartition(pt, k)[-k:]] = True
             pt = bool_pt
+            name_suffix = "-cc"
         # now, save the archived phenotypes for later
-        self.phens.append(name="-".join(ids), data=pt)
+        self.phens.append(name="-".join(ids)+name_suffix, data=pt)
         return pt
 
     def write(self):

From 9de40ee343a6a30d46a9f9e2355b51035a76bd21 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 8 Jul 2022 08:54:05 -0700
Subject: [PATCH 32/44] add tests for simphenotype still need to revise
 case/control test

---
 tests/test_simphenotype.py | 184 +++++++++++++++++++++++++++++++++++--
 1 file changed, 178 insertions(+), 6 deletions(-)

diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py
index 97500955..c53015f0 100644
--- a/tests/test_simphenotype.py
+++ b/tests/test_simphenotype.py
@@ -5,19 +5,191 @@
 import numpy as np
 import numpy.lib.recfunctions as rfn
 
-from haptools.sim_phenotype import Haplotype
+from haptools.sim_phenotype import Haplotype, PhenoSimulator
 from haptools.data import (
     Genotypes,
-    GenotypesRefAlt,
     Phenotypes,
     Haplotypes,
-    Variant,
-    Haplotype as HaplotypeBase,
 )
 
 
 DATADIR = Path(__file__).parent.joinpath("data")
 
-def test_simphenotype(self):
-    pass
+class TestSimPhenotype:
+    def _get_fake_gens(self):
+        gts = Genotypes(fname=None)
+        gts.data = np.array([
+            [[1, 0], [0, 1]],
+            [[0, 1], [0, 1]],
+            [[0, 0], [1, 1]],
+            [[1, 1], [1, 1]],
+            [[0, 0], [0, 0]],
+        ], dtype=np.uint8)
+        gts.variants = np.array(
+            [
+                ("1:10114:T:C", "1", 10114, 0.7),
+                ("1:10116:A:G", "1", 10116, 0.6),
+            ],
+            dtype=[
+                ("id", "U50"),
+                ("chrom", "U10"),
+                ("pos", np.uint32),
+                ("aaf", np.float64),
+            ],
+        )
+        gts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
+        return gts
 
+    def _get_fake_haps(self):
+        return [
+            Haplotype("1", 10114, 10115, "1:10114:T:C", "CEU", 0.25),
+            Haplotype("1", 10116, 10117, "1:10116:A:G", "YRI", 0.75),
+        ]
+
+    def _get_expected_phens(self):
+        pts = Phenotypes(fname=None)
+        pts.data = np.array([
+            [-0.13363062, False],
+            [-0.13363062, False],
+            [ 0.53452248, True],
+            [ 1.20267559, True],
+            [-1.46993683, False],
+        ], dtype=np.float64)
+        pts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
+        pts.names = ("1:10114:T:C", "1:10116:A:G")
+        return pts
+
+    def test_one_phen_zero_noise(self):
+        gts = self._get_fake_gens()
+        hps = self._get_fake_haps()
+        expected = self._get_expected_phens()
+
+        pt_sim = PhenoSimulator(gts, seed=42)
+        data = pt_sim.run([hps[0]], heritability=1)
+        data = data[:, np.newaxis]
+        phens = pt_sim.phens
+
+        # check the data and the generated phenotype object
+        assert phens.data.shape == (5, 1)
+        np.testing.assert_allclose(phens.data, data)
+        assert phens.data[0, 0] == phens.data[1, 0]
+        assert phens.data[2, 0] == phens.data[4, 0]
+        assert phens.data[3, 0] > phens.data[0, 0]
+        assert phens.data[2, 0] < phens.data[0, 0]
+        assert phens.samples == expected.samples
+        assert phens.names[0] == expected.names[0]
+
+    def test_one_phen_zero_noise_neg_beta(self):
+        """
+        the same test as test_one_phen_zero_noise but with a negative beta this time
+        """
+        gts = self._get_fake_gens()
+        hps = self._get_fake_haps()
+        # make the beta value negative
+        hps[0].beta = -hps[0].beta
+        expected = self._get_expected_phens()
+
+        pt_sim = PhenoSimulator(gts, seed=42)
+        data = pt_sim.run([hps[0]], heritability=1)
+        data = data[:, np.newaxis]
+        phens = pt_sim.phens
+
+        # check the data and the generated phenotype object
+        assert phens.data.shape == (5, 1)
+        np.testing.assert_allclose(phens.data, data)
+        assert phens.data[0, 0] == phens.data[1, 0]
+        assert phens.data[2, 0] == phens.data[4, 0]
+        assert phens.data[3, 0] < phens.data[0, 0]
+        assert phens.data[2, 0] > phens.data[0, 0]
+        assert phens.samples == expected.samples
+        assert phens.names[0] == expected.names[0]
+
+    def test_two_phens_zero_noise(self):
+        gts = self._get_fake_gens()
+        hps = self._get_fake_haps()
+        expected = self._get_expected_phens()
+
+        pt_sim = PhenoSimulator(gts, seed=42)
+        pt_sim.run([hps[0]], heritability=1)
+        pt_sim.run([hps[1]], heritability=1)
+        phens = pt_sim.phens
+
+        # check the data and the generated phenotype object
+        assert phens.data.shape == (5, 2)
+        assert phens.data[0, 0] == phens.data[1, 0]
+        assert phens.data[2, 0] == phens.data[4, 0]
+        assert phens.data[3, 0] > phens.data[0, 0]
+        assert phens.data[2, 0] < phens.data[0, 0]
+
+        assert phens.data[0, 1] == phens.data[1, 1]
+        assert phens.data[2, 1] == phens.data[3, 1]
+        assert phens.data[3, 1] > phens.data[0, 1]
+        assert phens.data[4, 1] < phens.data[0, 1]
+
+        assert phens.samples == expected.samples
+        assert phens.names == expected.names
+
+    def test_combined_phen_zero_noise(self):
+        gts = self._get_fake_gens()
+        hps = self._get_fake_haps()
+        expected = self._get_expected_phens()
+
+        pt_sim = PhenoSimulator(gts, seed=42)
+        pt_sim.run(hps, heritability=1)
+        phens = pt_sim.phens
+
+        # check the data and the generated phenotype object
+        assert phens.data.shape == (5, 1)
+        np.testing.assert_allclose(phens.data[:, 0], expected.data[:, 0])
+
+        assert phens.samples == expected.samples
+        assert phens.names == ("-".join(expected.names),)
+
+    def test_noise(self):
+        gts = self._get_fake_gens()
+        hps = self._get_fake_haps()
+        expected = self._get_expected_phens()
+
+        pt_sim = PhenoSimulator(gts, seed=42)
+        pt_sim.run(hps, heritability=1)
+        pt_sim.run(hps, heritability=0.96)
+        pt_sim.run(hps, heritability=0.5)
+        pt_sim.run(hps, heritability=0.2)
+        phens = pt_sim.phens
+
+        # check the data and the generated phenotype object
+        assert phens.data.shape == (5, 4)
+        np.testing.assert_allclose(phens.data[:, 0], expected.data[:, 0])
+        diff1 = np.abs(phens.data[:, 1] - phens.data[:, 0]).sum()
+        assert diff1 > 0
+        diff2 = np.abs(phens.data[:, 2] - phens.data[:, 0]).sum()
+        assert diff2 > diff1
+        diff3 = np.abs(phens.data[:, 3] - phens.data[:, 0]).sum()
+        assert diff3 > diff2
+
+    def test_case_control(self):
+        gts = self._get_fake_gens()
+        hps = self._get_fake_haps()
+        expected = self._get_expected_phens()
+        all_true = np.empty(expected.data.shape, dtype=np.bool_)
+        some_true = expected.data[:, 1].astype(np.bool_)
+        all_false = ~np.empty(expected.data.shape, dtype=np.bool_)
+
+        pt_sim = PhenoSimulator(gts, seed=42)
+        pt_sim.run(hps, heritability=1, prevalence=0.8)
+        pt_sim.run(hps, heritability=1, prevalence=1)
+        pt_sim.run(hps, heritability=1, prevalence=0)
+        pt_sim.run(hps, heritability=0.96, prevalence=0.8)
+        pt_sim.run(hps, heritability=0.5, prevalence=0.8)
+        pt_sim.run(hps, heritability=0.2, prevalence=0.8)
+        phens = pt_sim.phens
+        assert phens.data.shape == (5, 6)
+        np.testing.assert_allclose(phens.data[:, 1], some_true)
+        np.testing.assert_allclose(phens.data[:, 0], all_true[:, 0])
+        np.testing.assert_allclose(phens.data[:, 2], all_false[:, 0])
+        diff1 = (phens.data[:, 3] == phens.data[:, 0]).sum()
+        assert diff1 > 0
+        diff2 = (phens.data[:, 4] == phens.data[:, 0]).sum()
+        assert diff2 > diff1
+        diff3 = (phens.data[:, 5] == phens.data[:, 0]).sum()
+        assert diff3 > diff2

From 9a616fd8a01fe07414743673c158353105bee5b0 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 8 Jul 2022 09:46:01 -0700
Subject: [PATCH 33/44] finish testing case/control simphenotype

---
 haptools/sim_phenotype.py  | 11 ++++++++---
 tests/test_simphenotype.py | 24 +++++++++---------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index 71bacadd..cbaea5f8 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -139,18 +139,23 @@ def run(
         noise = np.var(pt) * (np.reciprocal(heritability) - 1)
         # finally, add everything together to get the simulated phenotypes
         pt += self.rng.normal(0, noise, size=pt.shape)
+        # now, handle case/control
         name_suffix = ""
         if prevalence is not None:
             self.log.info(f"Converting to case/control with prevalence {prevalence}")
             # first, find the number of desired positives
             k = int(prevalence * len(pt))
             # choose the top k values and label them positive
-            bool_pt = np.repeat(False, repeats=len(pt))
-            bool_pt[np.argpartition(pt, k)[-k:]] = True
+            bool_pt = np.zeros(len(pt), dtype=np.bool_)
+            if k == len(pt):
+                bool_pt = ~bool_pt
+            else:
+                max_indices = np.argpartition(-pt, k)[:k]
+                bool_pt[max_indices] = True
             pt = bool_pt
             name_suffix = "-cc"
         # now, save the archived phenotypes for later
-        self.phens.append(name="-".join(ids)+name_suffix, data=pt)
+        self.phens.append(name="-".join(ids)+name_suffix, data=pt.astype(np.float64))
         return pt
 
     def write(self):
diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py
index c53015f0..6c937212 100644
--- a/tests/test_simphenotype.py
+++ b/tests/test_simphenotype.py
@@ -171,25 +171,19 @@ def test_case_control(self):
         gts = self._get_fake_gens()
         hps = self._get_fake_haps()
         expected = self._get_expected_phens()
-        all_true = np.empty(expected.data.shape, dtype=np.bool_)
-        some_true = expected.data[:, 1].astype(np.bool_)
-        all_false = ~np.empty(expected.data.shape, dtype=np.bool_)
+        all_false = np.zeros(expected.data.shape, dtype=np.float64)
+        some_true = expected.data[:, 1]
+        all_true = (~(all_false.astype(np.bool_))).astype(np.float64)
 
         pt_sim = PhenoSimulator(gts, seed=42)
-        pt_sim.run(hps, heritability=1, prevalence=0.8)
+        pt_sim.run(hps, heritability=1, prevalence=0.4)
         pt_sim.run(hps, heritability=1, prevalence=1)
         pt_sim.run(hps, heritability=1, prevalence=0)
-        pt_sim.run(hps, heritability=0.96, prevalence=0.8)
-        pt_sim.run(hps, heritability=0.5, prevalence=0.8)
-        pt_sim.run(hps, heritability=0.2, prevalence=0.8)
+        pt_sim.run(hps, heritability=0.5, prevalence=0.4)
         phens = pt_sim.phens
-        assert phens.data.shape == (5, 6)
-        np.testing.assert_allclose(phens.data[:, 1], some_true)
-        np.testing.assert_allclose(phens.data[:, 0], all_true[:, 0])
-        np.testing.assert_allclose(phens.data[:, 2], all_false[:, 0])
+        assert phens.data.shape == (5, 4)
+        np.testing.assert_allclose(phens.data[:, 0], some_true)
+        np.testing.assert_allclose(phens.data[:, 1], all_true[:, 1])
+        np.testing.assert_allclose(phens.data[:, 2], all_false[:, 1])
         diff1 = (phens.data[:, 3] == phens.data[:, 0]).sum()
         assert diff1 > 0
-        diff2 = (phens.data[:, 4] == phens.data[:, 0]).sum()
-        assert diff2 > diff1
-        diff3 = (phens.data[:, 5] == phens.data[:, 0]).sum()
-        assert diff3 > diff2

From b7e0dc08e412db8c4682676a71dd20a64b51de9c Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 8 Jul 2022 09:46:42 -0700
Subject: [PATCH 34/44] ensure written pheno file has unique names

---
 haptools/__main__.py        |  2 +-
 haptools/data/phenotypes.py | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 haptools/__main__.py

diff --git a/haptools/__main__.py b/haptools/__main__.py
old mode 100644
new mode 100755
index adeab099..ee1980f5
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -102,7 +102,7 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
 @click.option(
     "-p",
     "--prevalence",
-    type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
+    type=click.FloatRange(min=0, max=1, min_open=False, max_open=True),
     show_default="quantitative trait",
     help="Disease prevalence if simulating a case-control trait"
 )
diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index 407ebde4..a55cb961 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -2,10 +2,10 @@
 from csv import reader
 from pathlib import Path
 from io import TextIOBase
-from collections import namedtuple
 from collections.abc import Iterable
 from logging import getLogger, Logger
 from fileinput import hook_compressed
+from collections import namedtuple, Counter
 
 import numpy as np
 import numpy.typing as npt
@@ -180,8 +180,15 @@ def write(self):
         >>> phenotypes.samples = ('HG00096', 'HG00097', 'HG00099')
         >>> phenotypes.write()
         """
+        # make sure the names are unique
+        uniq_names = Counter()
+        names = []
+        for name in self.names:
+            names.append(name+(f"-{uniq_names[name]}" if uniq_names[name] else ""))
+            uniq_names[name] += 1
+        # now we can finally write the file
         with hook_compressed(self.fname, mode="wt") as haps:
-            haps.write("#IID\t" + "\t".join(self.names) + "\n")
+            haps.write("#IID\t" + "\t".join(names) + "\n")
             formatter = {"float_kind": lambda x: "%.2f" % x}
             for samp, phen in zip(self.samples, self.data):
                 line = np.array2string(phen, separator="\t", formatter=formatter)[1:-1]

From f02cb939f1fe18adf3ba12ef4babef781e2302ea Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 8 Jul 2022 09:52:02 -0700
Subject: [PATCH 35/44] refmt with black

---
 haptools/data/phenotypes.py |  9 ++++++---
 haptools/sim_phenotype.py   |  2 +-
 tests/test_simphenotype.py  | 35 +++++++++++++++++++++--------------
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/haptools/data/phenotypes.py b/haptools/data/phenotypes.py
index a55cb961..d43222c4 100644
--- a/haptools/data/phenotypes.py
+++ b/haptools/data/phenotypes.py
@@ -182,9 +182,12 @@ def write(self):
         """
         # make sure the names are unique
         uniq_names = Counter()
-        names = []
-        for name in self.names:
-            names.append(name+(f"-{uniq_names[name]}" if uniq_names[name] else ""))
+        names = [None] * len(self.names)
+        for idx, name in enumerate(self.names):
+            suffix = ""
+            if uniq_names[name]:
+                suffix = f"-{uniq_names[name]}"
+            names[idx] = name + suffix
             uniq_names[name] += 1
         # now we can finally write the file
         with hook_compressed(self.fname, mode="wt") as haps:
diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index cbaea5f8..8829b04e 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -155,7 +155,7 @@ def run(
             pt = bool_pt
             name_suffix = "-cc"
         # now, save the archived phenotypes for later
-        self.phens.append(name="-".join(ids)+name_suffix, data=pt.astype(np.float64))
+        self.phens.append(name="-".join(ids) + name_suffix, data=pt.astype(np.float64))
         return pt
 
     def write(self):
diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py
index 6c937212..a47d761d 100644
--- a/tests/test_simphenotype.py
+++ b/tests/test_simphenotype.py
@@ -15,16 +15,20 @@
 
 DATADIR = Path(__file__).parent.joinpath("data")
 
+
 class TestSimPhenotype:
     def _get_fake_gens(self):
         gts = Genotypes(fname=None)
-        gts.data = np.array([
-            [[1, 0], [0, 1]],
-            [[0, 1], [0, 1]],
-            [[0, 0], [1, 1]],
-            [[1, 1], [1, 1]],
-            [[0, 0], [0, 0]],
-        ], dtype=np.uint8)
+        gts.data = np.array(
+            [
+                [[1, 0], [0, 1]],
+                [[0, 1], [0, 1]],
+                [[0, 0], [1, 1]],
+                [[1, 1], [1, 1]],
+                [[0, 0], [0, 0]],
+            ],
+            dtype=np.uint8,
+        )
         gts.variants = np.array(
             [
                 ("1:10114:T:C", "1", 10114, 0.7),
@@ -48,13 +52,16 @@ def _get_fake_haps(self):
 
     def _get_expected_phens(self):
         pts = Phenotypes(fname=None)
-        pts.data = np.array([
-            [-0.13363062, False],
-            [-0.13363062, False],
-            [ 0.53452248, True],
-            [ 1.20267559, True],
-            [-1.46993683, False],
-        ], dtype=np.float64)
+        pts.data = np.array(
+            [
+                [-0.13363062, False],
+                [-0.13363062, False],
+                [0.53452248, True],
+                [1.20267559, True],
+                [-1.46993683, False],
+            ],
+            dtype=np.float64,
+        )
         pts.samples = ("HG00096", "HG00097", "HG00099", "HG00100", "HG00101")
         pts.names = ("1:10114:T:C", "1:10116:A:G")
         return pts

From d2f144ca380c9df73d744f0fb2a00f45ca140030 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 8 Jul 2022 12:01:23 -0700
Subject: [PATCH 36/44] add docs for phenosimulator to API section

---
 docs/api/haptools.rst     | 8 ++++++++
 haptools/sim_phenotype.py | 5 ++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/api/haptools.rst b/docs/api/haptools.rst
index d37b5ab7..00c8115b 100644
--- a/docs/api/haptools.rst
+++ b/docs/api/haptools.rst
@@ -51,3 +51,11 @@ haptools.data.haplotypes module
    :members:
    :undoc-members:
    :show-inheritance:
+
+haptools.sim_phenotype module
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: haptools.sim_phenotype
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index 8829b04e..68c81d08 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -50,10 +50,9 @@ class PhenoSimulator:
     --------
     >>> gens = Genotypes.load("tests/data/example.vcf.gz")
     >>> haps = Haplotypes.load("tests/data/basic.hap")
-    >>> haps_gts = GenotypesRefAlt(None)
-    >>> haps.transform(gens, haps_gts)
+    >>> haps_gts = haps.transform(gens)
     >>> phenosim = PhenoSimulator(haps_gts)
-    >>> phenosim.run(next(haps.data.values()))
+    >>> phenosim.run(haps.data.values())
     >>> phenotypes = phenosim.phens
     """
 

From 729138076758454c1717670cb9d60781fdb9c0d6 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Fri, 8 Jul 2022 13:37:00 -0700
Subject: [PATCH 37/44] rename phenotype simulation tests

dunno why I named them like that originally - must not have been thinking
---
 tests/test_simphenotype.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_simphenotype.py b/tests/test_simphenotype.py
index a47d761d..00339656 100644
--- a/tests/test_simphenotype.py
+++ b/tests/test_simphenotype.py
@@ -66,7 +66,7 @@ def _get_expected_phens(self):
         pts.names = ("1:10114:T:C", "1:10116:A:G")
         return pts
 
-    def test_one_phen_zero_noise(self):
+    def test_one_hap_zero_noise(self):
         gts = self._get_fake_gens()
         hps = self._get_fake_haps()
         expected = self._get_expected_phens()
@@ -86,7 +86,7 @@ def test_one_phen_zero_noise(self):
         assert phens.samples == expected.samples
         assert phens.names[0] == expected.names[0]
 
-    def test_one_phen_zero_noise_neg_beta(self):
+    def test_one_hap_zero_noise_neg_beta(self):
         """
         the same test as test_one_phen_zero_noise but with a negative beta this time
         """
@@ -111,7 +111,7 @@ def test_one_phen_zero_noise_neg_beta(self):
         assert phens.samples == expected.samples
         assert phens.names[0] == expected.names[0]
 
-    def test_two_phens_zero_noise(self):
+    def test_two_haps_zero_noise(self):
         gts = self._get_fake_gens()
         hps = self._get_fake_haps()
         expected = self._get_expected_phens()
@@ -136,7 +136,7 @@ def test_two_phens_zero_noise(self):
         assert phens.samples == expected.samples
         assert phens.names == expected.names
 
-    def test_combined_phen_zero_noise(self):
+    def test_combined_haps_zero_noise(self):
         gts = self._get_fake_gens()
         hps = self._get_fake_haps()
         expected = self._get_expected_phens()

From 81e48a3472dcf7bf1f2908379d1ba92f247d58bb Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 14 Jul 2022 09:00:21 -0700
Subject: [PATCH 38/44] revise simphenotype command docs after changes to CLI

---
 docs/commands/karyogram.md     |  2 +-
 docs/commands/simphenotype.md  | 76 ----------------------------------
 docs/commands/simphenotype.rst | 75 ++++++++++++++++++++++++++++++++-
 docs/formats/phenotypes.rst    |  2 +-
 docs/index.rst                 |  1 +
 haptools/__main__.py           |  2 +-
 6 files changed, 77 insertions(+), 81 deletions(-)
 delete mode 100644 docs/commands/simphenotype.md

diff --git a/docs/commands/karyogram.md b/docs/commands/karyogram.md
index 706f2419..9b898e40 100644
--- a/docs/commands/karyogram.md
+++ b/docs/commands/karyogram.md
@@ -1,4 +1,4 @@
-# Haptools karyogram
+# karyogram
 
 `haptools karyogram` takes as input a breakpoints file (e.g. as output by `haptools simgenotype`) and a sample name, and plots a karyogram depicting local ancestry tracks.
 
diff --git a/docs/commands/simphenotype.md b/docs/commands/simphenotype.md
deleted file mode 100644
index 629a95be..00000000
--- a/docs/commands/simphenotype.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# simphenotype
-
-Haptools simphenotype simulates a complex trait, taking into account haplotype- or local-ancestry- specific effects as well as traditional variant-level effects. It takes causal effects and genotypes as input and outputs simulated phenotypes.
-
-Usage is modeled based on the [GCTA GWAS Simulation](https://yanglab.westlake.edu.cn/software/gcta/#GWASSimulation) utility.
-
-## Usage
-
-Below is a basic `haptools simphenotype` command:
-
-```
-haptools simphenotype \
-  --vcf <gzipped vcf file> \
-  --hap <gzipped hap file> \
-  --out <outprefix> \
-  < --simu-qt | --simu-cc > \
-  [simulation options]
-```
-
-Required parameters:
-
-* `--vcf <string>`: A bgzipped, tabix-indexed, phased VCF file. If you are simulating local-ancestry effects, the VCF file must contain the `FORMAT/LA` tag included in output of `haptools simgenotype`. See [haptools file formats](../../docs/formats/inputs.rst) for more details.
-
-* `--hap <string>`: A bgzipped, tabix-indexed HAP file, which specifies causal effects. This is a custom format described in more detail in [haptools file formats](../../docs/formats/haplotypes.rst). The HAP format enables flexible specification of a range of effect types including traditional variant-level effects, haplotype-level effects, associations with repeat lengths at short tandem repeats, and interaction of these effects with local ancestry labels. See [Examples](#examples) below for detailed examples of how to specify effects.
-
-* `--out <string>`: Prefix to name output files.
-
-* `--simu-qt` or `simu-cc` indicate whether to simulate a quantitative or case control trait. One of these two options must be specified.
-
-Additional parameters:
-
-* `--simu-rep <int>`: Number of phenotypes to simulate. Default: 1.
-
-* `--simu-hsq <float>`: Trait heritability. Default: 0.1.
-
-* `--simu-k <float>`: Disease prevalence (for case-control). Default: 0.1
-
-## Output files
-
-`haptools simphenotypes` outputs the following files:
-
-* `<outprefix>.phen`: Based on the phenotype files accepted by [plink](https://www.cog-genomics.org/plink/1.9/input#pheno). Tab-delimited file with one row per sample. The first and second columns give the sample ID. The third column gives the simulated phenotype. If `--simu-rep` was set to greater than 1, additional columns are output for each simulated trait. Example file:
-
-```
-HG00096	HG00096	0.058008375906919506
-HG00097	HG00097	0.03472768471423458
-HG00099	HG00099	-0.20850127859705808
-HG00100	HG00100	-0.21206803352471154
-HG00101	HG00101	0.3157913763428323
-```
-
-* `<outprefix>.par`: summarizes the frequencies and effects of simulated haplotypes. The columns are: haplotype ID (from the HAP file), haplotype frequency, and effect. Example file:
-
-```
-Haplotype	Frequency	Effect
-H-001	0.6	-0.2
-```
-
-<a name="examples"></a>
-## Examples
-
-#### Simulate a single haplotype-effect based on a 2 SNP haplotype:
-
-```
-haptools simphenotype \
-  --vcf tests/data/simple.vcf.gz \
-  --hap tests/data/simple.hap.gz \
-  --out test \
-  --simu-qt --simu-hsq 0.8 --simu-rep 1
-```
-
-based on this HAP file (available in `tests/data`)
-
-```
-H-001	1	10114	10116	1:10114:T:C,1:10116:A:G	T,G	*	-0.2
-```
\ No newline at end of file
diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst
index 5c2afcfe..a0808872 100644
--- a/docs/commands/simphenotype.rst
+++ b/docs/commands/simphenotype.rst
@@ -1,4 +1,75 @@
 .. _commands-simphenotype:
 
-.. include:: simphenotype.md
-   :parser: myst_parser.sphinx_
\ No newline at end of file
+
+simphenotype
+============
+
+Simulates a complex trait, taking into account haplotype- or local-ancestry- specific effects as well as traditional variant-level effects. The user denotes causal variables to use within the simulation by specifying them in a ``.hap`` file.
+
+The implementation is based on the `GCTA GWAS Simulation <https://yanglab.westlake.edu.cn/software/gcta/#GWASSimulation>`_ utility.
+
+Usage
+~~~~~
+.. code-block:: bash
+
+   haptools transform \
+   --replications INT \
+   --heritability FLOAT \
+   --prevalence FLOAT \
+   --region TEXT \
+   --sample SAMPLE \
+   --samples-file FILENAME \
+   --output PATH \
+   --verbosity [CRITICAL|ERROR|WARNING|INFO|DEBUG|NOTSET] \
+   GENOTYPES HAPLOTYPES
+
+Model
+~~~~~
+Each haplotype is encoded as an independent causal variable in the linear model.
+
+.. math::
+
+   \vec{y} = \sum_j \beta_j \vec{Z_j} + \vec \epsilon
+
+where
+
+.. math::
+
+   \epsilon_i \sim N(0, \sigma^2)
+
+.. math::
+
+   \sigma^2 = Var[\sum_j \beta_j \vec{Z_j}] * (\frac 1 {h^2} - 1)
+
+The heritability :math:`h^2` is user-specified but defaults to 1 when not provided.
+
+If a prevalence for the disease is specified, the final :math:`\vec{y}` value will be thresholded to produce a binary case/control trait with the desired fraction of diseased individuals.
+
+Output
+~~~~~~
+Phenotypes are output in the PLINK2-style ``.pheno`` file format. If ``--replications`` was set to greater than 1, additional columns are output for each simulated trait.
+
+Examples
+~~~~~~~~
+.. code-block:: bash
+
+   haptools simphenotype -o simulated.pheno tests/data/example.vcf.gz tests/data/example.hap.gz
+
+Simulate two replicates of a case/control trait that occurs in 60% of your samples with a heritability of 0.8. Encode all of the haplotypes in ``tests/data/example.hap.gz`` as independent causal variables.
+
+.. code-block:: bash
+
+   haptools simphenotype \
+   --replications 2 \
+   --heritability 0.8 \
+   --prevalence 0.6 \
+   --output simulated.pheno \
+   tests/data/example.vcf.gz tests/data/example.hap.gz
+
+Detailed Usage
+~~~~~~~~~~~~~~
+
+.. click:: haptools.__main__:main
+   :prog: haptools
+   :show-nested:
+   :commands: simphenotype
\ No newline at end of file
diff --git a/docs/formats/phenotypes.rst b/docs/formats/phenotypes.rst
index 2d856ff2..f300076f 100644
--- a/docs/formats/phenotypes.rst
+++ b/docs/formats/phenotypes.rst
@@ -6,7 +6,7 @@ Phenotypes and Covariates
 
 Phenotype file format
 ---------------------
-Phenotypes are expected to follow the PLINK2 ``.pheno`` file format. This is a
+Phenotypes are expected to follow `the PLINK2 .pheno file format <https://www.cog-genomics.org/plink/2.0/input#pheno>`_. This is a
 tab-separated format where the first column corresponds to the sample ID, and
 subsequent columns contain each of your phenotypes.
 
diff --git a/docs/index.rst b/docs/index.rst
index b2a14574..bda46b4c 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -21,6 +21,7 @@
 
    commands/simgenotype.rst
    commands/simphenotype.rst
+   commands/karyogram.rst
    commands/transform.rst
 
 .. toctree::
diff --git a/haptools/__main__.py b/haptools/__main__.py
index ee1980f5..7518d043 100755
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -177,7 +177,7 @@ def simphenotype(
     \f
     Examples
     --------
-    >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simu_phens.tsv
+    >>> haptools simphenotype tests/data/example.vcf.gz tests/data/example.hap.gz > simulated.pheno
 
     Parameters
     ----------

From 8b5de5c7aa17f46fdfd8a2567fe4173f6038ddf4 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Thu, 14 Jul 2022 09:06:42 -0700
Subject: [PATCH 39/44] oops - fix typos

---
 docs/commands/simphenotype.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst
index a0808872..5ee89d33 100644
--- a/docs/commands/simphenotype.rst
+++ b/docs/commands/simphenotype.rst
@@ -12,7 +12,7 @@ Usage
 ~~~~~
 .. code-block:: bash
 
-   haptools transform \
+   haptools simphenotype \
    --replications INT \
    --heritability FLOAT \
    --prevalence FLOAT \
@@ -25,7 +25,7 @@ Usage
 
 Model
 ~~~~~
-Each haplotype is encoded as an independent causal variable in the linear model.
+Each normalized haplotype :math:`\vec{Z_j}` is encoded as an independent causal variable in a linear model:
 
 .. math::
 
@@ -72,4 +72,4 @@ Detailed Usage
 .. click:: haptools.__main__:main
    :prog: haptools
    :show-nested:
-   :commands: simphenotype
\ No newline at end of file
+   :commands: simphenotype

From b237872b7d4875f5b6324c85b0b6a9c75465e64c Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 18 Jul 2022 23:35:59 -0700
Subject: [PATCH 40/44] add non-gcta model to simphenotype

---
 docs/commands/simphenotype.rst |  6 +++++-
 haptools/__main__.py           |  6 ++++--
 haptools/sim_phenotype.py      | 22 +++++++++++++++-------
 3 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst
index 5ee89d33..790c9882 100644
--- a/docs/commands/simphenotype.rst
+++ b/docs/commands/simphenotype.rst
@@ -41,7 +41,11 @@ where
 
    \sigma^2 = Var[\sum_j \beta_j \vec{Z_j}] * (\frac 1 {h^2} - 1)
 
-The heritability :math:`h^2` is user-specified but defaults to 1 when not provided.
+The heritability :math:`h^2` is user-specified, but if it is not provided, then :math:`\sigma^2` will be computed purely from the effect sizes, instead:
+
+.. math::
+
+   \sigma^2 = \Biggl \lbrace {1 - \sum \beta_j^2 \quad \quad {\sum \beta_j^2 \le 1} \atop 0 \quad \quad \quad \quad \quad \text{ otherwise }}
 
 If a prevalence for the disease is specified, the final :math:`\vec{y}` value will be thresholded to produce a binary case/control trait with the desired fraction of diseased individuals.
 
diff --git a/haptools/__main__.py b/haptools/__main__.py
index 7518d043..80035d03 100755
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -95,7 +95,7 @@ def simgenotype(invcf, sample_info, model, mapdir, out, popsize, seed, chroms):
     "-h",
     "--heritability",
     type=click.FloatRange(min=0, max=1),
-    default=1,
+    default=None,
     show_default=True,
     help="Trait heritability",
 )
@@ -159,7 +159,7 @@ def simphenotype(
     genotypes: Path,
     haplotypes: Path,
     replications: int = 1,
-    heritability: float = 1,
+    heritability: float = None,
     prevalence: float = None,
     region: str = None,
     samples: tuple[str] = tuple(),
@@ -189,6 +189,8 @@ def simphenotype(
         The number of rounds of simulation to perform
     heritability : int, optional
         The heritability of the simulated trait; must be a float between 0 and 1
+
+        If not provided, it will be computed from the sum of the squared effect sizes
     prevalence : int, optional
         The prevalence of the disease if the trait should be simulated as case/control;
         must be a float between 0 and 1
diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index 68c81d08..a1ece544 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -92,7 +92,7 @@ def __init__(
     def run(
         self,
         effects: list[Haplotype],
-        heritability: float = 1,
+        heritability: float = None,
         prevalence: float = None,
     ) -> npt.NDArray:
         """
@@ -108,8 +108,7 @@ def run(
         heritability: float, optional
             The simulated heritability of the trait
 
-            If not provided, this will be estimated from the variability of the
-            genotypes
+            If not provided, this will default to the sum of the squared effect sizes
         prevalence: float, optional
             How common should the disease be within the population?
 
@@ -133,9 +132,16 @@ def run(
         # generate the genetic component
         pt = (betas * gts).sum(axis=1)
         # compute the heritability
-        self.log.info(f"Adding environmental component for h^2: {heritability}")
-        # compute the environmental effect
-        noise = np.var(pt) * (np.reciprocal(heritability) - 1)
+        if heritability is None:
+            # compute the environmental effect
+            noise = np.var(pt) * (np.reciprocal(heritability) - 1)
+        else:
+            self.log.debug("Computing heritability as the sum of the squared betas")
+            heritability = np.power(betas, 2).sum()
+            if heritability > 1:
+                heritability = 1
+            noise = 1 - heritability
+        self.log.info(f"Adding environmental component {noise} for h^2 {heritability}")
         # finally, add everything together to get the simulated phenotypes
         pt += self.rng.normal(0, noise, size=pt.shape)
         # now, handle case/control
@@ -169,7 +175,7 @@ def simulate_pt(
     genotypes: Path,
     haplotypes: Path,
     num_replications: int = 1,
-    heritability: float = 1,
+    heritability: float = None,
     prevalence: float = None,
     region: str = None,
     samples: list[str] = None,
@@ -198,6 +204,8 @@ def simulate_pt(
         The number of rounds of simulation to perform
     heritability : int, optional
         The heritability of the simulated trait; must be a float between 0 and 1
+
+        If not provided, it will be computed from the sum of the squared effect sizes
     prevalence : int, optional
         The prevalence of the disease if the trait should be simulated as case/control;
         must be a float between 0 and 1

From 9f9d0ea09065bbc837bf1ee68d98d942b449ad98 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Mon, 18 Jul 2022 23:42:12 -0700
Subject: [PATCH 41/44] switch if else in simphenotype to fix bug

---
 haptools/sim_phenotype.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index a1ece544..308877c2 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -133,14 +133,15 @@ def run(
         pt = (betas * gts).sum(axis=1)
         # compute the heritability
         if heritability is None:
-            # compute the environmental effect
-            noise = np.var(pt) * (np.reciprocal(heritability) - 1)
-        else:
             self.log.debug("Computing heritability as the sum of the squared betas")
             heritability = np.power(betas, 2).sum()
             if heritability > 1:
                 heritability = 1
+            # compute the environmental effect
             noise = 1 - heritability
+        else:
+            # compute the environmental effect
+            noise = np.var(pt) * (np.reciprocal(heritability) - 1)
         self.log.info(f"Adding environmental component {noise} for h^2 {heritability}")
         # finally, add everything together to get the simulated phenotypes
         pt += self.rng.normal(0, noise, size=pt.shape)

From 6aefc33ea0a62ae06273dc202d2aa4a8901ca022 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 19 Jul 2022 09:10:44 -0700
Subject: [PATCH 42/44] adjust logging of info in PhenoSimulator.run()

---
 haptools/__main__.py      | 2 +-
 haptools/sim_phenotype.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/haptools/__main__.py b/haptools/__main__.py
index 80035d03..276adef7 100755
--- a/haptools/__main__.py
+++ b/haptools/__main__.py
@@ -190,7 +190,7 @@ def simphenotype(
     heritability : int, optional
         The heritability of the simulated trait; must be a float between 0 and 1
 
-        If not provided, it will be computed from the sum of the squared effect sizes
+        If not provided, it will be computed from the sum of the squared effect sizes.
     prevalence : int, optional
         The prevalence of the disease if the trait should be simulated as case/control;
         must be a float between 0 and 1
diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index 308877c2..5626ba98 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -123,7 +123,8 @@ def run(
         # extract the relevant haplotype info from the Haplotype objects
         ids = [hap.id for hap in effects]
         betas = np.array([hap.beta for hap in effects])
-        self.log.info(f"Extracting haplotype genotypes for haps: {ids}")
+        self.log.debug(f"Extracting haplotype genotypes for haps: {ids}")
+        self.log.debug(f"Beta values are {betas}")
         # extract the haplotype "genotypes" and compute the phenotypes
         gts = self.gens.subset(variants=ids).data.sum(axis=2)
         self.log.info(f"Computing genetic component w/ {gts.shape[0]} causal effects")

From 38931d781544656800236eea4731ebd25dcbdf58 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 19 Jul 2022 10:40:07 -0700
Subject: [PATCH 43/44] warn about case/control encoding in simphenotype docs

---
 docs/commands/simphenotype.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst
index 790c9882..e8ba06fc 100644
--- a/docs/commands/simphenotype.rst
+++ b/docs/commands/simphenotype.rst
@@ -53,6 +53,8 @@ Output
 ~~~~~~
 Phenotypes are output in the PLINK2-style ``.pheno`` file format. If ``--replications`` was set to greater than 1, additional columns are output for each simulated trait.
 
+Note that case/control phenotypes are encoded as 0 (control) + 1 (case) **not** 1 (control) + 2 (case). The latter is used by PLINK2 unless the ``--1`` flag is used (see `the PLIN2 docs <https://www.cog-genomics.org/plink/2.0/input#input_missing_phenotype>`_). Therefore, you must use ``--1`` when providing our ``.pheno`` files to PLINK.
+
 Examples
 ~~~~~~~~
 .. code-block:: bash

From 3f9af3a064272ba9ed010504eb6108058c478908 Mon Sep 17 00:00:00 2001
From: Arya Massarat <23412689+aryarm@users.noreply.github.com>
Date: Tue, 19 Jul 2022 11:36:53 -0700
Subject: [PATCH 44/44] log expected heritability and expand docs

---
 docs/commands/simphenotype.rst |  2 +-
 haptools/sim_phenotype.py      | 10 ++++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/docs/commands/simphenotype.rst b/docs/commands/simphenotype.rst
index e8ba06fc..216d70fa 100644
--- a/docs/commands/simphenotype.rst
+++ b/docs/commands/simphenotype.rst
@@ -59,7 +59,7 @@ Examples
 ~~~~~~~~
 .. code-block:: bash
 
-   haptools simphenotype -o simulated.pheno tests/data/example.vcf.gz tests/data/example.hap.gz
+   haptools simphenotype -o simulated.pheno tests/data/example.vcf.gz tests/data/simphenotype.hap
 
 Simulate two replicates of a case/control trait that occurs in 60% of your samples with a heritability of 0.8. Encode all of the haplotypes in ``tests/data/example.hap.gz`` as independent causal variables.
 
diff --git a/haptools/sim_phenotype.py b/haptools/sim_phenotype.py
index 5626ba98..df1c417c 100644
--- a/haptools/sim_phenotype.py
+++ b/haptools/sim_phenotype.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from pathlib import Path
 from itertools import combinations
-from logging import getLogger, Logger
+from logging import getLogger, Logger, DEBUG
 from dataclasses import dataclass, field
 
 import numpy as np
@@ -145,7 +145,13 @@ def run(
             noise = np.var(pt) * (np.reciprocal(heritability) - 1)
         self.log.info(f"Adding environmental component {noise} for h^2 {heritability}")
         # finally, add everything together to get the simulated phenotypes
-        pt += self.rng.normal(0, noise, size=pt.shape)
+        pt_noise = self.rng.normal(0, noise, size=pt.shape)
+        if self.log.getEffectiveLevel() == DEBUG:
+            # if we're in debug mode, compute the pearson correlation and report it
+            # but don't do this otherwise to keep things fast
+            corr = np.corrcoef(pt, pt + pt_noise)[1, 0]
+            self.log.debug(f"Estimated heritability is {corr}")
+        pt += pt_noise
         # now, handle case/control
         name_suffix = ""
         if prevalence is not None: