From d7eeb0fc74b3e2b8651e1ecec1a90037f051d656 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Fri, 6 Dec 2024 17:38:47 +0100
Subject: [PATCH] fix: reconstruction of ambiguous nucs in ancestral parsimony
 mode

Followup of: https://github.com/neherlab/treetime/pull/292

The transition from character sets to bit sets introduced some confusion in handling of ambiguous characters. Ambiguous characters correspond to multiple canonical characters, so they needed explicit disambiguation and disambiguation when using sets. With profile maps (arrays of floats) and with bitsets (~arrays of bits) this happens transparently: multiple "enabled" bits denote canonical components of an ambiguous character.

Here I introduce precalculated `char_to_set` and `set_to_char` maps, which, allow to resolve particular combinations of bits from and to characters. This is similar to how profile maps operate, but with bits (0s and 1s) rather than floating point values.

This allows the remaining test for parsimony reconstruction to pass.
---
 packages/treetime/src/alphabet/alphabet.rs    | 58 ++++++++++---------
 .../treetime/src/commands/ancestral/fitch.rs  |  3 +-
 .../src/representation/graph_sparse.rs        |  2 +-
 3 files changed, 33 insertions(+), 30 deletions(-)
diff --git a/packages/treetime/src/alphabet/alphabet.rs b/packages/treetime/src/alphabet/alphabet.rs
index c47052db..e0f10bc9 100644
--- a/packages/treetime/src/alphabet/alphabet.rs
+++ b/packages/treetime/src/alphabet/alphabet.rs
@@ -29,6 +29,9 @@ pub enum AlphabetName {
 }
 
 pub type ProfileMap = IndexMap<char, Array1<f64>>;
+pub type StateSetMap = IndexMap<char, StateSet>;
+pub type CharToSet = IndexMap<char, StateSet>;
+pub type SetToChar = IndexMap<StateSet, char>;
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Alphabet {
@@ -43,6 +46,11 @@ pub struct Alphabet {
   treat_gap_as_unknown: bool,
   profile_map: ProfileMap,
 
+  #[serde(skip)]
+  char_to_set: IndexMap<char, StateSet>,
+  #[serde(skip)]
+  set_to_char: IndexMap<StateSet, char>,
+
   #[serde(skip)]
   char_to_index: Vec<Option<usize>>,
   #[serde(skip)]
@@ -124,6 +132,18 @@ impl Alphabet {
       index_to_char.push(c);
     }
 
+    let char_to_set = {
+      let mut char_to_set: CharToSet = canonical.iter().map(|c| (c, StateSet::from_char(c))).collect();
+      ambiguous.iter().for_each(|(key, chars)| {
+        char_to_set.insert(*key, StateSet::from_iter(chars));
+      });
+      char_to_set.insert(*gap, StateSet::from_char(*gap));
+      char_to_set.insert(*unknown, StateSet::from_char(*unknown));
+      char_to_set
+    };
+
+    let set_to_char: SetToChar = char_to_set.iter().map(|(&c, &s)| (s, c)).collect();
+
     Ok(Self {
       all,
       char_to_index,
@@ -137,25 +157,11 @@ impl Alphabet {
       gap: *gap,
       treat_gap_as_unknown: *treat_gap_as_unknown,
       profile_map,
+      char_to_set,
+      set_to_char,
     })
   }
 
-  /// Resolve possible ambiguity of the given character to the set of canonical chars
-  pub fn disambiguate(&self, c: char) -> StateSet {
-    // If unknown then could be any canonical (e.g. N => { A, C, G, T })
-    if self.is_unknown(c) {
-      self.canonical().collect()
-    }
-    // If ambiguous (e.g. R => { A, G })
-    else if let Some(resolutions) = self.ambiguous.get(&c) {
-      resolutions.iter().copied().collect()
-    }
-    // Otherwise it's not ambiguous and it's the char itself (incl. gap)
-    else {
-      once(c).collect()
-    }
-  }
-
   #[inline]
   pub fn get_profile(&self, c: char) -> &Array1<f64> {
     self
@@ -178,7 +184,7 @@ impl Alphabet {
   {
     let mut profile = Array1::<f64>::zeros(self.n_canonical());
     for c in chars {
-      let chars = self.disambiguate(*c.borrow());
+      let chars = self.char_to_set(*c.borrow());
       for c in chars.iter() {
         let index = self.index(c);
         profile[index] = 1.0;
@@ -206,6 +212,14 @@ impl Alphabet {
     Ok(prof)
   }
 
+  pub fn set_to_char(&self, c: StateSet) -> char {
+    self.set_to_char[&c]
+  }
+
+  pub fn char_to_set(&self, c: char) -> StateSet {
+    self.char_to_set[&c]
+  }
+
   /// All existing characters (including 'unknown' and 'gap')
   pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
     self.all.iter()
@@ -466,16 +480,6 @@ mod tests {
   use indoc::indoc;
   use pretty_assertions::assert_eq;
 
-  #[test]
-  fn test_disambiguate() -> Result<(), Report> {
-    let alphabet = Alphabet::new(AlphabetName::Nuc, false)?;
-    assert_eq!(stateset! {'A', 'G'}, alphabet.disambiguate('R'));
-    assert_eq!(stateset! {'A', 'C', 'G', 'T'}, alphabet.disambiguate('N'));
-    assert_eq!(stateset! {'C'}, alphabet.disambiguate('C'));
-    assert_eq!(stateset! {alphabet.gap()}, alphabet.disambiguate(alphabet.gap()));
-    Ok(())
-  }
-
   #[test]
   fn test_alphabet_nuc() -> Result<(), Report> {
     let alphabet = Alphabet::new(AlphabetName::Nuc, false)?;
diff --git a/packages/treetime/src/commands/ancestral/fitch.rs b/packages/treetime/src/commands/ancestral/fitch.rs
index eaba7e29..f15c9ed6 100644
--- a/packages/treetime/src/commands/ancestral/fitch.rs
+++ b/packages/treetime/src/commands/ancestral/fitch.rs
@@ -509,8 +509,7 @@ pub fn ancestral_reconstruction_fitch(
         }
 
         for (&pos, &states) in &node.fitch.variable {
-          seq[pos] = states.first().unwrap();
-          // seq[pos] = alphabet.ambiguate(states).first().unwrap();
+          seq[pos] = alphabet.set_to_char(states);
         }
 
         node.sequence = seq.clone();
diff --git a/packages/treetime/src/representation/graph_sparse.rs b/packages/treetime/src/representation/graph_sparse.rs
index f6a4d15a..114d2c61 100644
--- a/packages/treetime/src/representation/graph_sparse.rs
+++ b/packages/treetime/src/representation/graph_sparse.rs
@@ -94,7 +94,7 @@ impl SparseSeqNode {
       .iter()
       .enumerate()
       .filter(|(_, &c)| alphabet.is_ambiguous(c))
-      .map(|(pos, &c)| (pos, alphabet.disambiguate(c)))
+      .map(|(pos, &c)| (pos, alphabet.char_to_set(c)))
       .collect();
 
     let seq_dis = ParsimonySeqDis {