Skip to content

Commit

Permalink
wip: fixing sync points, genotype errors, phase sets
Browse files Browse the repository at this point in the history
  • Loading branch information
TimD1 committed Nov 16, 2024
1 parent dd67202 commit 237d0e7
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 34 deletions.
49 changes: 30 additions & 19 deletions src/dist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,20 +275,20 @@ void calc_prec_recall(
/* (graph->ttypes[curr.tni] == TYPE_REF) ? "ref" : "var", */
/* (graph->tbegs[curr.tni] + curr.ti == graph->qbegs[curr.qni] + curr.qi) ? "same" : "diff"); */

// if we move into a new query variant, mark it as included
// if we move out of a new query variant, mark it as included
if (prev.qni != curr.qni && // new query node
graph->qtypes[prev.qni] != TYPE_REF) { // node is variant
graph->qtypes[curr.qni] != TYPE_REF) { // node is variant
if (print) printf("new query variant\n");
int qvar_idx = graph->qidxs[prev.qni];
int qvar_idx = graph->qidxs[curr.qni];
sync_qvars.push_back(qvar_idx);
}
// if we move into a query variant, include it in sync group and ref dist calc
// if we move out of a query variant, include it in sync group and ref dist calc
if (prev.tni != curr.tni && // new truth node
graph->ttypes[prev.tni] != TYPE_REF) { // node is variant
graph->ttypes[curr.tni] != TYPE_REF) { // node is variant
if (print) printf("new truth variant\n");
int tvar_idx = graph->tidxs[prev.tni];
int tvar_idx = graph->tidxs[curr.tni];
sync_tvars.push_back(tvar_idx);
switch (graph->ttypes[prev.tni]) {
switch (graph->ttypes[curr.tni]) {
case TYPE_INS:
ref_dist += tvars->alts[tvar_idx].size();
break;
Expand All @@ -300,7 +300,7 @@ void calc_prec_recall(
break;
default:
ERROR("Unexpected truth var type '%s' in calc_prec_recall() at %s:%d",
type_strs[graph->ttypes[prev.tni]].data(),
type_strs[graph->ttypes[curr.tni]].data(),
tvars->ctg.data(), tvars->poss[tvar_idx]);
break;
}
Expand All @@ -313,18 +313,20 @@ void calc_prec_recall(
if (print) printf("non-match step\n");
query_dist += 1;
}

// check if this movement is a sync point (ref main diag mvmt or between var main diag mvmt)
if ((prev.qni == curr.qni && prev.tni == curr.tni && // same submatrix
prev.qi+1 == curr.qi && prev.ti+1 == curr.ti && // diagonal movement
graph->ttypes[curr.tni] == TYPE_REF && // not in truth var
graph->qtypes[curr.qni] == TYPE_REF && // not in query var
graph->tbegs[curr.tni] + curr.ti == graph->qbegs[curr.qni] + curr.qi) || // main diag
(prev.qni != curr.qni && prev.tni != curr.tni && // between vars
curr.qi == 0 && curr.ti == 0 &&
graph->ttypes[curr.tni] == TYPE_REF && // not in truth var
graph->qtypes[curr.qni] == TYPE_REF && // not in query var
graph->tbegs[curr.tni] == graph->qbegs[curr.qni]) // on main diag
) {
bool on_main_diag = graph->tbegs[curr.tni] + curr.ti == graph->qbegs[curr.qni] + curr.qi;
bool ref_query_move = graph->qtypes[curr.qni] == TYPE_REF
&& prev.qni == curr.qni
&& prev.qi+1 == curr.qi;
bool ref_truth_move = graph->ttypes[curr.tni] == TYPE_REF
&& prev.tni == curr.tni
&& prev.ti+1 == curr.ti;
bool query_var_end = curr.qi == 0 && prev.qni != curr.qni;
bool truth_var_end = curr.ti == 0 && prev.tni != curr.tni;
bool sync_point = on_main_diag
&& ((ref_query_move && ref_truth_move) || truth_var_end || query_var_end);
if (sync_point) {
if (print) printf("potential sync point\n");

// add sync point
Expand Down Expand Up @@ -797,6 +799,7 @@ void precision_recall_wrapper(
superclusterData* clusterdata_ptr,
const std::vector< std::vector< std::vector<int> > > & sc_groups,
int thread_step, int start, int stop, bool thread2, bool print) {
print = true;

// parse sc_idx from grouped superclusters
if (stop == start) return;
Expand Down Expand Up @@ -875,6 +878,14 @@ void fix_genotype_allele_counts(std::shared_ptr<ctgSuperclusters> sc, int sc_idx
vars->ac_errtype[vi] = AC_ERR_1_TO_2;
}
vars->calc_gts[vi] = vars->orig_gts[vi];
// if we're in a PHASE_SWAP phase block, we should swap data here since otherwise
// there's no way based on calc_gt 1|1 to know to look at data from other hap
std::swap(vars->errtypes[HAP1][vi], vars->errtypes[HAP2][vi]);
std::swap(vars->sync_group[HAP1][vi], vars->sync_group[HAP2][vi]);
std::swap(vars->callq[HAP1][vi], vars->callq[HAP2][vi]);
std::swap(vars->ref_ed[HAP1][vi], vars->ref_ed[HAP2][vi]);
std::swap(vars->query_ed[HAP1][vi], vars->query_ed[HAP2][vi]);
std::swap(vars->credit[HAP1][vi], vars->credit[HAP2][vi]);
}
// force 0|1 and 1|0 query variants to be evaluated as such
else if (vars->orig_gts[vi] == GT_REF_ALT1 || vars->orig_gts[vi] == GT_ALT1_REF) {
Expand Down
21 changes: 11 additions & 10 deletions src/phase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
fprintf(out_vcf, "##FORMAT=<ID=SG,Number=1,Type=Integer,Description=\"Sync Group (index in supercluster, for credit assignment)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set identifier (input, per-variant)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=PB,Number=1,Type=Integer,Description=\"Phase Block (output, per-supercluster, index in contig)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=BP,Number=1,Type=Integer,Description=\"Block Phase: 0 = PHASE_KEEP, 1 = PHASE_SWAP)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=BS,Number=1,Type=Integer,Description=\"Block Phase: 0 = PHASE_KEEP, 1 = PHASE_SWAP)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=VP,Number=1,Type=Integer,Description=\"Variant Phase: 0 = PHASE_ORIG, 1 = PHASE_SWAP, . = PHASE_NONE)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=FE,Number=1,Type=Integer,Description=\"Flip Error (a per-supercluster error)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=GE,Number=1,Type=String,Description=\"Genotype Error ('+' if 0/1 -> 1/1, '-' if 1/1 -> 0/1, '.' otherwise)\">\n");
fprintf(out_vcf, "##FORMAT=<ID=GE,Number=1,Type=String,Description=\"Genotype Error ('+' if 0/1 truth -> 1/1 query, '-' if 1/1 truth -> 0/1 query, '.' otherwise)\">\n");
fprintf(out_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTRUTH\tQUERY\n");

// write variants
Expand Down Expand Up @@ -119,6 +119,15 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
if (ptrs[QUERY] >= ctg_pbs->phase_blocks[phase_block+1])
phase_block++;
}
/* if (next[QUERY]) { */
/* fprintf(out_vcf, "orig_gt: %s\tcalc_gt: %s\n", */
/* gt_strs[vars[QUERY]->orig_gts[ptrs[QUERY]]].data(), */
/* gt_strs[vars[QUERY]->calc_gts[ptrs[QUERY]]].data()); */
/* } */
/* if (next[TRUTH]) { */
/* fprintf(out_vcf, "orig_gt: %s\n", */
/* gt_strs[vars[TRUTH]->orig_gts[ptrs[TRUTH]]].data()); */
/* } */

if (next[QUERY]) {
if (next[TRUTH]) {
Expand Down Expand Up @@ -270,14 +279,6 @@ void phaseblockData::phase()
} else {
qvars->phases[i] = PHASE_NONE;
}

// only TP variants are actually phased
if (qvars->calc_gts[i] == GT_ALT1_REF && qvars->errtypes[HAP1][i] != ERRTYPE_TP) {
qvars->phases[i] = PHASE_NONE;
}
if (qvars->calc_gts[i] == GT_REF_ALT1 && qvars->errtypes[HAP2][i] != ERRTYPE_TP) {
qvars->phases[i] = PHASE_NONE;
}
}

// forward pass
Expand Down
13 changes: 8 additions & 5 deletions src/variant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ void variantData::print_phase_info(int callset) {
phase_set_sizes.push_back(this->lengths[ctg_idx]);
total_phase_sets++;
continue;
} else {
} else { // set phase set up until first PS
for (int hi = 0; hi < HAPS; hi++) {
std::shared_ptr<ctgVariants> vars = this->variants[hi][ctg];
for (int vi = 0; vi < vars->n; vi++) {
Expand All @@ -98,6 +98,7 @@ void variantData::print_phase_info(int callset) {
}
}

// TODO: why is hi needed here? when are variants merged? shouldn't this iterate over one list?
std::vector<int> vi(HAPS, 0);
int ps_beg = 0; int ps_end = 0;
int phase_set = 0;
Expand All @@ -115,7 +116,7 @@ void variantData::print_phase_info(int callset) {

// TODO: for now, only start new phase set if new PS tag found
std::shared_ptr<ctgVariants> vars = this->variants[hi][ctg];
if (vars->phase_sets[vi[hi]]) { // ignore unphased variants
if (vars->phase_sets[vi[hi]] != 0) { // variant is phased
if (vars->phase_sets[vi[hi]] != phase_set) { // new phase set, save old
if (phase_set) phase_set_sizes.push_back(ps_end - ps_beg);
phase_set = vars->phase_sets[vi[hi]];
Expand All @@ -125,6 +126,9 @@ void variantData::print_phase_info(int callset) {
} else { // same
ps_end = std::max(ps_end, vars->poss[vi[hi]] + vars->rlens[vi[hi]]);
}
} else { // set unphased variant phase set to current phase set
// TODO: only set phase sets for 1|1 variants (for when we add unphased eval)
vars->phase_sets[vi[hi]] = phase_set;
}

vi[hi]++;
Expand Down Expand Up @@ -432,14 +436,14 @@ void ctgVariants::print_var_info(FILE* out_fp, std::shared_ptr<fastaData> ref,
char ref_base;
switch (this->types[idx]) {
case TYPE_SUB:
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BP:VP:FE:GE",
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BS:VP:FE:GE",
ctg.data(), this->poss[idx]+1, this->refs[idx].data(),
this->alts[idx].data());
break;
case TYPE_INS:
case TYPE_DEL:
ref_base = ref->fasta.at(ctg)[this->poss[idx]-1];
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BP:VP:FE:GE", ctg.data(),
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BS:VP:FE:GE", ctg.data(),
this->poss[idx], (ref_base + this->refs[idx]).data(),
(ref_base + this->alts[idx]).data());
break;
Expand Down Expand Up @@ -921,7 +925,6 @@ variantData::variantData(std::string vcf_fn,
PS_missing_total++;
phase_set = 0;
}
// else, leave phase_set of 1|1 vars with missing PS tags the same as previous

} else if (nPS <= 0) { // other error
ERROR("Failed to read %s PS at %s:%lld",
Expand Down

0 comments on commit 237d0e7

Please sign in to comment.