From dc7cad040fe7f3f9c35b597a8fb94cce5f9c5da0 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 1 Apr 2022 19:45:45 -0700 Subject: [PATCH] warn user if the data module loaded zero variants --- haptools/data/genotypes.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/haptools/data/genotypes.py b/haptools/data/genotypes.py index 678d00b7..c9fff622 100644 --- a/haptools/data/genotypes.py +++ b/haptools/data/genotypes.py @@ -70,6 +70,11 @@ def read(self, region: str = None, samples: list[str] = None): """ Read genotypes from a VCF into a numpy matrix stored in :py:attr:`~.Genotypes.data` + Raises + ------ + ValueError + If the genotypes array is empty + Parameters ---------- region : str, optional @@ -110,6 +115,11 @@ def read(self, region: str = None, samples: list[str] = None): self.data = np.array( [variant.genotypes for variant in variants], dtype=np.uint8 ) + if self.data.shape == (0, 0, 0): + raise ValueError( + "Failed to load genotypes. If you specified a region, check that the" + " contig name matches! For example, double-check the 'chr' prefix." + ) # transpose the GT matrix so that samples are rows and variants are columns self.data = self.data.transpose((1, 0, 2)) @@ -144,8 +154,10 @@ def check_biallelic(self, discard_also=False): self.variants = np.delete(self.variants, variant_idx) else: raise ValueError( - "Variant with ID {} at POS {}:{} is multiallelic for sample {}".format( - *tuple(self.variants[variant_idx[0]])[:3], self.samples[samp_idx[0]] + "Variant with ID {} at POS {}:{} is multiallelic for sample {}" + .format( + *tuple(self.variants[variant_idx[0]])[:3], + self.samples[samp_idx[0]], ) ) self.data = self.data.astype(np.bool_)