diff --git a/util.py b/util.py index 0d8e9d5..34148d5 100644 --- a/util.py +++ b/util.py @@ -337,6 +337,15 @@ def process_vcf(self, inputfile): sg_dict = indexGroups(self.args.samplefile, self.args.groupvar) samples = sorted(list(set(sg_dict.values()))) + + # get boolean vector of samples that are in sample file + samples_keep_match = np.isin(all_samples, list(sg_dict.keys())) + + # get indices of matching samples + samples_keep_idx = np.where(samples_keep_match) + + # get list of individual sample ids to keep + samples_keep = sorted(list(set(sg_dict.keys()))) util_log.debug("%s samples will be pooled into %s groups: %s", len(all_samples), len(samples), ",".join(samples)) @@ -411,19 +420,40 @@ def process_vcf(self, inputfile): # currently only works with singletons-- if (self.args.samplefile and self.args.groupvar): + + gt_new = record.gt_types - if record.gt_types.sum() == 0: - numsites_skip += 1 - continue + if (self.args.impute and 3 in gt_new): + gt_complete = gt_new[gt_new != 3] + freq = sum(gt_complete) / len(gt_complete) + gt_new[gt_new == 3] = freq - carrier = all_samples[record.gt_types.tolist().index(1)] - if carrier not in sg_dict: + else: + gt_new[gt_new == 3] = 0 + + # if not any("/" in b for b in record.gt_bases): + if self.args.haploid: + gt_new = np.divide(gt_new, 2.) + + # get array of genotypes only for samples in samplefile + gt_sub = gt_new[samples_keep_idx] + + if gt_sub.sum() == 0: numsites_skip += 1 continue - - sample_gp = sg_dict[carrier] - ind = samples.index(sample_gp) - M[ind, st] += 1 + + # initialize dict of group allele counts = 0 + sg_counts = {k: 0 for k in sorted(list(set(sg_dict.values())))} + + # initialize dict of allele counts per sample + d2 = dict(zip(samples_keep, gt_sub)) + + # iterate per-sample counts and update per-group counts + for key, value in d2.items(): + sg_counts[sg_dict[key]] += value + + # add to matrix + M[:, st] = M[:, st] + list(sg_counts.values()) numsites_keep += 1 else: