Skip to content

Commit

Permalink
Handle trivial segmentations
Browse files Browse the repository at this point in the history
– The added algorithm selects the most common input sequences as the founder sequences.
  • Loading branch information
tsnorri committed Jan 7, 2024
1 parent 52fa545 commit 6e4c2d5
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 1 deletion.
3 changes: 3 additions & 0 deletions libvcf2multialign/find_cut_positions.cc
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,9 @@ namespace vcf2multialign {
it = std::lower_bound(cut_positions.cbegin(), it, prev_edge, cut_position_cmp{});
}

if (0 != out_cut_positions.back())
out_cut_positions.push_back(0);

std::reverse(out_cut_positions.begin(), out_cut_positions.end());

// Handle the (common) case where the sink node does not have any ALT-in-edges.
Expand Down
48 changes: 47 additions & 1 deletion libvcf2multialign/founder_sequence_greedy_output.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ namespace {
libbio_assert_neq(rhs_rep, PLOIDY_MAX);
}

explicit joined_path_eq_class(ploidy_type const rhs_rep_):
lhs_rep(PLOIDY_MAX),
rhs_rep(rhs_rep_)
{
}

bool operator<(joined_path_eq_class const &other) const { return size < other.size; }
};

Expand Down Expand Up @@ -186,7 +192,8 @@ namespace vcf2multialign {
pbwt_context_type pbwt_ctx(graph.total_chromosome_copies());

// Handle the rest.
for (variant_graph::position_type cut_pos_idx{}; walker.advance();)
variant_graph::position_type cut_pos_idx{};
while (walker.advance())
{
libbio_assert_neq(cut_pos_it, m_cut_positions.cut_positions.end());

Expand All @@ -196,6 +203,8 @@ namespace vcf2multialign {
// Check if we are at a cut position.
if (node == *cut_pos_it)
{
bool const is_final_node(graph.node_count() == 1 + node);

{
using std::swap;
swap(lhs_eq_classes, rhs_eq_classes);
Expand Down Expand Up @@ -457,6 +466,43 @@ namespace vcf2multialign {
m_delegate->handled_node(node);
}

// Handle the trivial case.
if (1 == cut_pos_idx)
{
ploidy_type rep{PLOIDY_MAX};
joined_path_eq_classes.clear();
for (auto const [aa, dd] : rsv::zip(pbwt_ctx.permutation, pbwt_ctx.divergence))
{
// Check if the current entry begins a new equivalence class.
if (0 < dd)
{
rep = aa;
++rhs_distinct_eq_classes;
joined_path_eq_classes.emplace_back(rep);
}

// Store for the next cut position.
rhs_eq_classes[aa] = rep;

libbio_assert(!joined_path_eq_classes.empty());
++joined_path_eq_classes.back().size;
}

// Sort by the size. (The smallest will be the first.)
std::sort(joined_path_eq_classes.begin(), joined_path_eq_classes.end());

// Remove the REF edges if needed. This is easier here than in the divergence value handling loop above.
if (!m_should_keep_ref_edges && rhs_first_path_is_ref)
{
std::erase_if(joined_path_eq_classes, [rhs_first_path_eq_class](auto const &eq_class){
return rhs_first_path_eq_class == eq_class.rhs_rep;
});
}

for (auto const &[founder_idx, eq_class] : rsv::reverse(joined_path_eq_classes) | rsv::take(founder_count) | rsv::enumerate)
m_assigned_samples(0, founder_idx) = eq_class.rhs_rep;
}

return true;
}

Expand Down

0 comments on commit 6e4c2d5

Please sign in to comment.