diff --git a/text-utils-prefix/benches/benchmark.rs b/text-utils-prefix/benches/benchmark.rs index 1ed76f8..717bf28 100644 --- a/text-utils-prefix/benches/benchmark.rs +++ b/text-utils-prefix/benches/benchmark.rs @@ -5,9 +5,11 @@ use criterion::{criterion_group, criterion_main, Criterion}; use rand::seq::SliceRandom; use rand::SeedableRng; use rand_chacha::ChaCha8Rng; -use text_utils_prefix::vec::PrefixVecContinuations; -use text_utils_prefix::{optimized_prefix_order, ContinuationSearch, PrefixSearch}; -use text_utils_prefix::{AdaptiveRadixTrie, AdaptiveRadixTrie}; +use text_utils_prefix::vec::ContinuationsVec; +use text_utils_prefix::{ + utils::optimized_prefix_order, ContinuationSearch, ContinuationTrie, PrefixSearch, +}; +use text_utils_prefix::{AdaptiveRadixTrie, PatriciaTrie}; use art_tree::{Art, ByteString}; use patricia_tree::PatriciaMap; @@ -32,7 +34,6 @@ fn bench_prefix(c: &mut Criterion) { .collect(); let prefix = "Albert".as_bytes(); let prefixes: Vec<_> = (0..64).map(|_| prefix.to_vec()).collect(); - let (permutation, skips) = optimized_prefix_order(&continuations); group.bench_with_input("optimized_prefix_order", &continuations, |b, input| { b.iter(|| optimized_prefix_order(input)); @@ -63,7 +64,7 @@ fn bench_prefix(c: &mut Criterion) { }); // benchmark patricia trie - let mut trie: AdaptiveRadixTrie<_> = words.iter().zip(0..words.len()).collect(); + let mut trie: PatriciaTrie<_> = words.iter().zip(0..words.len()).collect(); group.bench_with_input("patricia_trie_insert", word, |b, input| { b.iter(|| trie.insert(input, 1)); }); @@ -73,60 +74,22 @@ fn bench_prefix(c: &mut Criterion) { group.bench_with_input("patricia_trie_contains", word, |b, input| { b.iter(|| trie.contains_prefix(&input[..input.len().saturating_sub(3)])); }); - group.bench_with_input( - "patricia_trie_continuations", - &(prefix, &continuations), - |b, input| { - let (word, continuations) = input; - b.iter(|| trie.contains_continuations(word, continuations)); - }, - ); - group.bench_with_input( - "patricia_trie_continuations_optimized", - &(prefix, &continuations), - |b, input| { - let (word, continuations) = input; - b.iter(|| { - trie.contains_continuations_optimized(word, continuations, &permutation, &skips) - }); - }, - ); + let trie: ContinuationTrie<_> = ContinuationTrie::new(trie, &continuations); + group.bench_with_input("patricia_trie_continuations", prefix, |b, input| { + b.iter(|| trie.contains_continuations(input)); + }); group.bench_with_input( "patricia_trie_continuations_batch", - &(&prefixes, &continuations), - |b, input| { - let (words, continuations) = input; - b.iter(|| trie.batch_contains_continuations(words, &continuations)); - }, - ); - group.bench_with_input( - "patricia_trie_continuations_batch_optimized", - &(&prefixes, &continuations), + &prefixes, |b, input| { - let (words, continuations) = input; - b.iter(|| { - trie.batch_contains_continuations_optimized( - words, - continuations, - &permutation, - &skips, - ) - }); + b.iter(|| trie.batch_contains_continuations(input)); }, ); group.bench_with_input( - "patricia_trie_continuations_batch_optimized_parallel", - &(&prefixes, &continuations), + "patricia_trie_continuations_batch_parallel", + &prefixes, |b, input| { - let (words, continuations) = input; - b.iter(|| { - trie.batch_contains_continuations_optimized_parallel( - words, - continuations, - &permutation, - &skips, - ) - }); + b.iter(|| trie.batch_contains_continuations_parallel(input)); }, ); @@ -141,66 +104,28 @@ fn bench_prefix(c: &mut Criterion) { group.bench_with_input("adaptive_radix_trie_contains", word, |b, input| { b.iter(|| trie.contains_prefix(&input[..input.len().saturating_sub(3)])); }); - group.bench_with_input( - "adaptive_radix_trie_continuations", - &(prefix, &continuations), - |b, input| { - let (word, continuations) = input; - b.iter(|| trie.contains_continuations(&word, &continuations)); - }, - ); - group.bench_with_input( - "adaptive_radix_trie_continuations_optimized", - &(prefix, &continuations), - |b, input| { - let (word, continuations) = input; - b.iter(|| { - trie.contains_continuations_optimized(&word, &continuations, &permutation, &skips) - }); - }, - ); + + let trie: ContinuationTrie<_> = ContinuationTrie::new(trie, &continuations); + group.bench_with_input("adaptive_radix_trie_continuations", prefix, |b, input| { + b.iter(|| trie.contains_continuations(input)); + }); group.bench_with_input( "adaptive_radix_trie_continuations_batch", - &(&prefixes, &continuations), - |b, input| { - let (words, continuations) = input; - b.iter(|| trie.batch_contains_continuations(words, continuations)); - }, - ); - group.bench_with_input( - "adaptive_radix_trie_continuations_batch_optimized", - &(&prefixes, &continuations), + &prefixes, |b, input| { - let (words, continuations) = input; - b.iter(|| { - trie.batch_contains_continuations_optimized( - words, - continuations, - &permutation, - &skips, - ) - }); + b.iter(|| trie.batch_contains_continuations(input)); }, ); group.bench_with_input( - "adaptive_radix_trie_continuations_batch_optimized_parallel", - &(&prefixes, &continuations), + "adaptive_radix_trie_continuations_batch_parallel", + &prefixes, |b, input| { - let (words, continuations) = input; - b.iter(|| { - trie.batch_contains_continuations_optimized_parallel( - words, - continuations, - &permutation, - &skips, - ) - }); + b.iter(|| trie.batch_contains_continuations_parallel(input)); }, ); // benchmark prefix vec continuations - let vec = - PrefixVecContinuations::new(words.iter().zip(0..words.len()).collect(), &continuations); + let vec = ContinuationsVec::new(words.iter().zip(0..words.len()).collect(), &continuations); group.bench_with_input("prefix_vec_continuations", word, |b, input| { b.iter(|| vec.contains_continuations(input)); }); diff --git a/text-utils-prefix/src/art.rs b/text-utils-prefix/src/art.rs index 3397747..8318a12 100644 --- a/text-utils-prefix/src/art.rs +++ b/text-utils-prefix/src/art.rs @@ -3,7 +3,7 @@ use std::{ iter::{empty, once}, }; -use crate::{ContinuationSearch, PrefixSearch}; +use crate::{ContinuationsTrie, PrefixSearch}; type Index = Box<[u8; N]>; type Children = Box<[Option>>; N]>; @@ -316,7 +316,7 @@ impl Node { fn contains_prefix_iter( &self, mut key: impl Iterator, - offset: usize, + mut offset: usize, ) -> Option<(&Self, usize)> { let mut node = self; loop { @@ -326,6 +326,8 @@ impl Node { Matching::FullPrefix(k) => k, Matching::Partial(..) => break, }; + // reset offset after first node + offset = 0; let Some(child) = node.find_child(k) else { break; @@ -609,7 +611,8 @@ impl PrefixSearch for AdaptiveRadixTrie { let Node { inner: NodeType::Leaf(value), .. - } = node.remove_child(k) else { + } = node.remove_child(k) + else { unreachable!("should not happen"); }; return Some(value); @@ -638,10 +641,7 @@ impl PrefixSearch for AdaptiveRadixTrie { root.contains_prefix_iter(key, 0).is_some() } - fn path<'a>(&'a self, prefix: &[u8]) -> Vec<(usize, &'a Self::Value)> - where - Self::Value: 'a, - { + fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)> { let Some(root) = &self.root else { return vec![]; }; @@ -682,9 +682,7 @@ impl PrefixSearch for AdaptiveRadixTrie { } path } -} -impl ContinuationSearch for AdaptiveRadixTrie { fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { let Some(root) = &self.root else { return Box::new(empty()); @@ -713,41 +711,10 @@ impl ContinuationSearch for AdaptiveRadixTrie { node.leaves_recursive(prefix) } +} - fn contains_continuation(&self, prefix: &[u8], continuation: &[u8]) -> bool { - let Some(root) = &self.root else { - return false; - }; - - let key = prefix.iter().chain(continuation.iter()).copied(); - root.contains_prefix_iter(key, 0).is_some() - } - - fn contains_continuations(&self, prefix: &[u8], continuations: &[Vec]) -> Vec { - let Some(root) = &self.root else { - return vec![]; - }; - - let key = prefix.iter().copied(); - let Some((node, n)) = root.contains_prefix_iter(key, 0) else { - return vec![]; - }; - - continuations - .iter() - .enumerate() - .filter_map(|(i, c)| { - let key = c.iter().copied(); - if node.contains_prefix_iter(key, n).is_some() { - Some(i) - } else { - None - } - }) - .collect() - } - - fn contains_continuations_optimized( +impl ContinuationsTrie for AdaptiveRadixTrie { + fn contains_continuations( &self, prefix: &[u8], continuations: &[Vec], diff --git a/text-utils-prefix/src/lib.rs b/text-utils-prefix/src/lib.rs index 57f6554..54f883b 100644 --- a/text-utils-prefix/src/lib.rs +++ b/text-utils-prefix/src/lib.rs @@ -1,13 +1,13 @@ -use itertools::Itertools; use rayon::prelude::*; pub mod art; pub mod patricia; +pub mod utils; pub mod vec; pub use art::AdaptiveRadixTrie; pub use patricia::PatriciaTrie; -pub use vec::{PrefixVec, PrefixVecContinuations}; +pub use vec::{ContinuationsVec, PrefixVec}; pub trait PrefixSearch { type Value; @@ -20,171 +20,106 @@ pub trait PrefixSearch { fn contains_prefix(&self, prefix: &[u8]) -> bool; - fn path<'a>(&'a self, prefix: &[u8]) -> Vec<(usize, &'a Self::Value)> - where - Self::Value: 'a; -} + fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)>; -pub trait ContinuationSearch: PrefixSearch { fn continuations( &self, prefix: &[u8], ) -> Box, &Self::Value)> + '_>; +} - fn contains_continuation(&self, prefix: &[u8], continuation: &[u8]) -> bool; - - fn contains_continuations(&self, prefix: &[u8], continuations: &[Vec]) -> Vec { - // default naive implementation, should be overridden if there is a more efficient way - continuations - .iter() - .enumerate() - .filter_map(|(i, c)| { - if self.contains_continuation(prefix.as_ref(), c.as_ref()) { - Some(i) - } else { - None - } - }) - .collect() - } - - fn contains_continuations_optimized( - &self, - prefix: &[u8], - continuations: &[Vec], - permutation: &[usize], - skips: &[usize], - ) -> Vec { - // default naive implementation, should be overridden if there is a more efficient way - assert_eq!(continuations.len(), permutation.len()); - assert_eq!(continuations.len(), skips.len()); - let mut result = vec![]; - let mut i = 0; - while let Some(&j) = permutation.get(i) { - let continuation = continuations[j].as_ref(); - if self.contains_continuation(prefix.as_ref(), continuation) { - result.push(j); - } else { - i += skips[i]; - }; - i += 1; - } - result - } - - fn batch_contains_continuations( - &self, - prefixes: &[Vec], - continuations: &[Vec], - ) -> Vec> { - prefixes - .iter() - .map(|p| self.contains_continuations(p, continuations)) - .collect() - } +pub trait ContinuationSearch { + fn contains_continuations(&self, prefix: &[u8]) -> Vec; - fn batch_contains_continuations_optimized( - &self, - prefixes: &[Vec], - continuations: &[Vec], - permutation: &[usize], - skips: &[usize], - ) -> Vec> { + fn batch_contains_continuations(&self, prefixes: &[Vec]) -> Vec> { prefixes .iter() - .map(|p| self.contains_continuations_optimized(p, continuations, permutation, skips)) + .map(|prefix| self.contains_continuations(prefix)) .collect() } - fn batch_contains_continuations_optimized_parallel( - &self, - prefixes: &[Vec], - continuations: &[Vec], - permutation: &[usize], - skips: &[usize], - ) -> Vec> + fn batch_contains_continuations_parallel(&self, prefixes: &[Vec]) -> Vec> where Self: Sync, { prefixes .par_iter() - .map(|p| self.contains_continuations_optimized(p, continuations, permutation, skips)) + .map(|prefix| self.contains_continuations(prefix)) .collect() } } -pub fn optimized_prefix_order(continuations: &[C]) -> (Vec, Vec) -where - C: AsRef<[u8]>, -{ - let permutation: Vec<_> = continuations - .iter() - .enumerate() - .sorted_by(|(_, a), (_, b)| a.as_ref().cmp(b.as_ref())) - .map(|(i, _)| i) - .collect(); - let mut skips = vec![0; continuations.len()]; - for i in 0..permutation.len() { - // if the current key is a prefix of the next one, we can skip the - // latter - let continuation = continuations[permutation[i]].as_ref(); - while let Some(next) = permutation.get(i + skips[i] + 1) { - let next_continuation = continuations[*next].as_ref(); - if next_continuation.starts_with(continuation) { - skips[i] += 1; - } else { - break; - } - } - } - (permutation, skips) +pub trait ContinuationsTrie { + fn contains_continuations( + &self, + prefix: &[u8], + continuations: &[Vec], + permutation: &[usize], + skips: &[usize], + ) -> Vec; } pub struct ContinuationTrie { - trie: T, - continuations: (Vec>, Vec, Vec), + pub trie: T, + conts: Vec>, + optimized: (Vec, Vec), } -impl ContinuationTrie -where - T: ContinuationSearch + Sync, -{ +impl ContinuationTrie { pub fn new(trie: T, continuations: &[C]) -> Self where C: AsRef<[u8]>, { - let (permutation, skips) = optimized_prefix_order(continuations); Self { trie, - continuations: ( - continuations.iter().map(|c| c.as_ref().to_vec()).collect(), - permutation, - skips, - ), + conts: continuations.iter().map(|c| c.as_ref().to_vec()).collect(), + optimized: utils::optimized_prefix_order(continuations), } } +} - pub fn continuation_indices

(&self, prefix: P) -> Vec - where - P: AsRef<[u8]>, - { - let (continuations, permutation, skips) = &self.continuations; - self.trie.contains_continuations_optimized( - prefix.as_ref(), - continuations, - permutation, - skips, - ) +impl ContinuationSearch for ContinuationTrie +where + T: ContinuationsTrie, +{ + fn contains_continuations(&self, prefix: &[u8]) -> Vec { + let (permutation, skips) = &self.optimized; + self.trie + .contains_continuations(prefix, &self.conts, permutation, skips) + } +} + +impl PrefixSearch for ContinuationTrie +where + T: PrefixSearch, +{ + type Value = T::Value; + + fn insert(&mut self, key: &[u8], value: Self::Value) -> Option { + self.trie.insert(key, value) } - pub fn batch_continuation_indices(&self, prefixes: &[Vec]) -> Vec> { - let (continuations, permutation, skips) = &self.continuations; - self.trie.batch_contains_continuations_optimized_parallel( - prefixes, - continuations, - permutation, - skips, - ) + fn delete(&mut self, key: &[u8]) -> Option { + self.trie.delete(key) + } + + fn get(&self, key: &[u8]) -> Option<&Self::Value> { + self.trie.get(key) + } + + fn contains_prefix(&self, prefix: &[u8]) -> bool { + self.trie.contains_prefix(prefix) + } + + fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)> { + self.trie.path(prefix) + } + + fn continuations( + &self, + prefix: &[u8], + ) -> Box, &Self::Value)> + '_> { + self.trie.continuations(prefix) } } @@ -193,13 +128,14 @@ mod test { use std::{fs, path::PathBuf}; use itertools::Itertools; + use rand::{seq::SliceRandom, Rng}; use crate::{ - optimized_prefix_order, AdaptiveRadixTrie, ContinuationSearch, PatriciaTrie, PrefixSearch, - PrefixVec, PrefixVecContinuations, + AdaptiveRadixTrie, ContinuationSearch, ContinuationTrie, ContinuationsTrie, + ContinuationsVec, PatriciaTrie, PrefixSearch, PrefixVec, }; - fn get_tries() -> Vec<(&'static str, Box>)> { + fn get_prefix_searchers() -> Vec<(&'static str, Box>)> { vec![ ("art", Box::new(AdaptiveRadixTrie::default())), ("patricia", Box::new(PatriciaTrie::default())), @@ -207,9 +143,18 @@ mod test { ] } - fn load_prefixes() -> Vec<&'static [u8]> { - [b"Albert".as_slice(), b"Ber", b"Frank"] + fn load_prefixes(words: &[String], n: usize) -> Vec<&[u8]> { + // sample n random prefixes from the words + let mut rng = rand::thread_rng(); + words + .choose_multiple(&mut rng, n) .into_iter() + .map(|s| { + let s = s.as_bytes(); + // choose random prefix length + let len = rng.gen_range(0..s.len()); + &s[..len.max(2).min(s.len())] + }) .collect() } @@ -234,59 +179,48 @@ mod test { .collect() } - #[test] - fn test_optimized_prefix_order() { - let items = ["de", "a", "d", "ab", "abc", "b"]; - let (permutation, skips) = optimized_prefix_order(&items); - assert_eq!(permutation, vec![1, 3, 4, 5, 2, 0]); - assert_eq!(skips, vec![2, 1, 0, 0, 1, 0]); - } - #[test] fn test_prefix_search() { - for (_, mut trie) in get_tries() { - assert_eq!(trie.get(b"hello"), None); - assert_eq!(trie.get(b""), None); - assert!(!trie.contains_prefix(b"")); - trie.insert(b"", 4); - trie.insert(b"h", 5); - trie.insert(b"hello", 1); - assert_eq!(trie.delete(b"hello"), Some(1)); - assert_eq!(trie.delete(b"hello "), None); - trie.insert(b"hello", 1); - trie.insert(b"hell", 2); - trie.insert(b"hello world", 3); - assert_eq!(trie.path(b""), vec![(0, &4)]); - assert_eq!( - trie.path(b"hello"), - vec![(0, &4), (1, &5), (4, &2), (5, &1)] - ); - assert_eq!(trie.get(b"hello"), Some(&1)); - assert_eq!(trie.get(b"hell"), Some(&2)); - assert_eq!(trie.get(b"hello world"), Some(&3)); - assert_eq!(trie.contains_prefix(b"hell"), true); - assert_eq!(trie.contains_prefix(b"hello"), true); - assert_eq!(trie.contains_prefix(b""), true); - assert_eq!(trie.contains_prefix(b"hello world!"), false); - assert_eq!(trie.contains_prefix(b"test"), false); - assert_eq!(trie.get(b"hello"), Some(&1)); - assert_eq!(trie.delete(b"hello"), Some(1)); - assert_eq!(trie.get(b"hello"), None); + for (_, mut pfx) in get_prefix_searchers() { + assert_eq!(pfx.get(b"hello"), None); + assert_eq!(pfx.get(b""), None); + assert!(!pfx.contains_prefix(b"")); + pfx.insert(b"", 4); + pfx.insert(b"h", 5); + pfx.insert(b"hello", 1); + assert_eq!(pfx.delete(b"hello"), Some(1)); + assert_eq!(pfx.delete(b"hello "), None); + pfx.insert(b"hello", 1); + pfx.insert(b"hell", 2); + pfx.insert(b"hello world", 3); + assert_eq!(pfx.path(b""), vec![(0, &4)]); + assert_eq!(pfx.path(b"hello"), vec![(0, &4), (1, &5), (4, &2), (5, &1)]); + assert_eq!(pfx.get(b"hello"), Some(&1)); + assert_eq!(pfx.get(b"hell"), Some(&2)); + assert_eq!(pfx.get(b"hello world"), Some(&3)); + assert_eq!(pfx.contains_prefix(b"hell"), true); + assert_eq!(pfx.contains_prefix(b"hello"), true); + assert_eq!(pfx.contains_prefix(b""), true); + assert_eq!(pfx.contains_prefix(b"hello world!"), false); + assert_eq!(pfx.contains_prefix(b"test"), false); + assert_eq!(pfx.get(b"hello"), Some(&1)); + assert_eq!(pfx.delete(b"hello"), Some(1)); + assert_eq!(pfx.get(b"hello"), None); } } #[test] fn test_path() { let words = load_words(); - let prefixes = load_prefixes(); + let prefixes = load_prefixes(&words, 1000); - for (_, mut trie) in get_tries() { + for (_, mut pfx) in get_prefix_searchers() { words.iter().enumerate().for_each(|(i, w)| { - trie.insert(w.as_bytes(), i); + pfx.insert(w.as_bytes(), i); }); for prefix in &prefixes { - let path = trie.path(prefix); + let path = pfx.path(prefix); assert!(path .iter() .all(|&(n, i)| { &prefix[..n] == words[*i].as_bytes() })); @@ -305,24 +239,24 @@ mod test { fn test_insert_delete_contains_prefix() { let words = load_words(); - for (_, mut trie) in get_tries() { + for (_, mut pfx) in get_prefix_searchers() { words.iter().enumerate().for_each(|(i, w)| { - trie.insert(w.as_bytes(), i); + pfx.insert(w.as_bytes(), i); }); for (i, word) in words.iter().enumerate() { - assert_eq!(trie.get(word.as_bytes()), Some(&i)); + assert_eq!(pfx.get(word.as_bytes()), Some(&i)); let bytes = word.as_bytes(); - assert!(trie.contains_prefix(&bytes[..=bytes.len() / 2])); + assert!(pfx.contains_prefix(&bytes[..=bytes.len() / 2])); } for (i, word) in words.iter().enumerate() { let even = i % 2 == 0; if even { - assert_eq!(trie.delete(word.as_bytes()), Some(i)); - assert_eq!(trie.get(word.as_bytes()), None); + assert_eq!(pfx.delete(word.as_bytes()), Some(i)); + assert_eq!(pfx.get(word.as_bytes()), None); } else { - assert_eq!(trie.get(word.as_bytes()), Some(&i)); + assert_eq!(pfx.get(word.as_bytes()), Some(&i)); } } } @@ -331,16 +265,20 @@ mod test { #[test] fn test_continuation_vec() { let words = load_words(); - let prefixes = load_prefixes(); - let continuations = load_continuations().into_iter().skip(4).collect::>(); + let prefixes = load_prefixes(&words, 10); + let continuations = load_continuations(); - let vec = PrefixVecContinuations::new( + let vec = ContinuationsVec::new( words.iter().enumerate().map(|(i, w)| (w, i)).collect(), &continuations, ); for prefix in prefixes { - let conts: Vec<_> = vec.continuations(prefix).map(|(w, v)| (w, *v)).collect(); + let conts: Vec<_> = vec + .vec + .continuations(prefix) + .map(|(w, v)| (w, *v)) + .collect(); // check that no other words than the given conts start with the prefix assert!(words.iter().all(|w| { let w = w.as_bytes(); @@ -376,22 +314,85 @@ mod test { } } + trait PrefixContTrie: PrefixSearch + ContinuationsTrie {} + impl PrefixContTrie for T where T: PrefixSearch + ContinuationsTrie + ?Sized {} + impl PrefixSearch for Box { + type Value = T::Value; + + fn insert(&mut self, key: &[u8], value: Self::Value) -> Option { + self.as_mut().insert(key, value) + } + + fn delete(&mut self, key: &[u8]) -> Option { + self.as_mut().delete(key) + } + + fn get(&self, key: &[u8]) -> Option<&Self::Value> { + self.as_ref().get(key) + } + + fn contains_prefix(&self, prefix: &[u8]) -> bool { + self.as_ref().contains_prefix(prefix) + } + + fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)> { + self.as_ref().path(prefix) + } + + fn continuations( + &self, + prefix: &[u8], + ) -> Box, &Self::Value)> + '_> { + self.as_ref().continuations(prefix) + } + } + impl ContinuationsTrie for Box { + fn contains_continuations( + &self, + prefix: &[u8], + continuations: &[Vec], + permutation: &[usize], + skips: &[usize], + ) -> Vec { + self.as_ref() + .contains_continuations(prefix, continuations, permutation, skips) + } + } + #[test] - fn test_continuation_search() { + fn test_continuation_tries() { let words = load_words(); - let prefixes = load_prefixes(); + let prefixes = load_prefixes(&words, 10); let continuations = load_continuations(); - let tries: Vec<(_, Box>)> = vec![ - ("art", Box::new(AdaptiveRadixTrie::default())), - ("patricia", Box::new(AdaptiveRadixTrie::default())), + let tries: Vec<(_, ContinuationTrie>>)> = vec![ + ( + "art", + ContinuationTrie::new( + Box::new( + words + .iter() + .zip(0..words.len()) + .collect::>(), + ), + &continuations, + ), + ), + ( + "patricia", + ContinuationTrie::new( + Box::new( + words + .iter() + .zip(0..words.len()) + .collect::>(), + ), + &continuations, + ), + ), ]; - for (_, mut trie) in tries { - words.iter().enumerate().for_each(|(i, w)| { - trie.insert(w.as_bytes(), i); - }); - + for (_, trie) in tries { for prefix in &prefixes { let conts: Vec<_> = trie.continuations(prefix).map(|(w, v)| (w, *v)).collect(); // check that no other words than the given conts start with the prefix @@ -408,15 +409,73 @@ mod test { assert_eq!(trie.get(word), Some(idx)); assert_eq!(words[*idx].as_bytes(), word); } - let cont_indices = trie.contains_continuations(prefix, &continuations); + let cont_indices = trie.contains_continuations(prefix); for (i, cont) in continuations.iter().enumerate() { let full_prefix: Vec<_> = prefix.iter().chain(cont.iter()).copied().collect(); - let contains_cont = trie.contains_continuation(prefix, cont); let in_conts = conts.iter().any(|(w, _)| w.starts_with(&full_prefix)); - let all = contains_cont && in_conts; - assert!(if cont_indices.contains(&i) { all } else { !all }); + assert!(if cont_indices.contains(&i) { + in_conts + } else { + !in_conts + }); } } } } + + #[test] + fn test_continuation_search() { + let words = load_words(); + let prefixes = load_prefixes(&words, 100); + let continuations = load_continuations(); + + let cont_search: Vec<(_, Box)> = vec![ + ( + "art", + Box::new(ContinuationTrie::new( + words + .iter() + .zip(0..words.len()) + .collect::>(), + &continuations, + )), + ), + ( + "patricia", + Box::new(ContinuationTrie::new( + words + .iter() + .zip(0..words.len()) + .collect::>(), + &continuations, + )), + ), + ( + "vec", + Box::new(ContinuationsVec::new( + words.iter().zip(0..words.len()).collect(), + &continuations, + )), + ), + ]; + + for prefix in &prefixes { + let conts: Vec<_> = cont_search + .iter() + .map(|(_, c)| { + let mut conts = c.contains_continuations(prefix); + conts.sort(); + conts + }) + .collect(); + assert!(conts.windows(2).all(|w| w[0] == w[1]),); + assert!(conts[0].iter().all(|&i| { + let extended_prefix: Vec<_> = + prefix.iter().chain(&continuations[i]).copied().collect(); + words + .iter() + .any(|w| w.as_bytes().starts_with(&extended_prefix)) + })); + } + } } diff --git a/text-utils-prefix/src/patricia.rs b/text-utils-prefix/src/patricia.rs index 6b61f1b..dbde705 100644 --- a/text-utils-prefix/src/patricia.rs +++ b/text-utils-prefix/src/patricia.rs @@ -3,7 +3,7 @@ use std::{ iter::{empty, once}, }; -use crate::{ContinuationSearch, PrefixSearch}; +use crate::{ContinuationsTrie, PrefixSearch}; #[derive(Debug)] enum NodeType { @@ -212,7 +212,7 @@ impl Node { fn contains_prefix_iter( &self, mut key: impl Iterator, - offset: usize, + mut offset: usize, ) -> Option<(&Self, usize)> { let mut node = self; // extend given key with null byte @@ -225,6 +225,8 @@ impl Node { Matching::FullPrefix(k) => k, Matching::Partial(..) => break, }; + // reset offset after first node + offset = 0; let Some(child) = node.find_child(k) else { break; @@ -397,10 +399,7 @@ impl PrefixSearch for PatriciaTrie { root.contains_prefix_iter(key, 0).is_some() } - fn path<'a>(&'a self, prefix: &[u8]) -> Vec<(usize, &'a Self::Value)> - where - Self::Value: 'a, - { + fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)> { let Some(root) = &self.root else { return vec![]; }; @@ -441,9 +440,7 @@ impl PrefixSearch for PatriciaTrie { } path } -} -impl ContinuationSearch for PatriciaTrie { fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { let Some(root) = &self.root else { return Box::new(empty()); @@ -472,41 +469,10 @@ impl ContinuationSearch for PatriciaTrie { node.leaves_recursive(prefix) } +} - fn contains_continuation(&self, prefix: &[u8], continuation: &[u8]) -> bool { - let Some(root) = &self.root else { - return false; - }; - - let key = prefix.iter().chain(continuation.iter()).copied(); - root.contains_prefix_iter(key, 0).is_some() - } - - fn contains_continuations(&self, prefix: &[u8], continuations: &[Vec]) -> Vec { - let Some(root) = &self.root else { - return vec![]; - }; - - let key = prefix.iter().copied(); - let Some((node, n)) = root.contains_prefix_iter(key, 0) else { - return vec![]; - }; - - continuations - .iter() - .enumerate() - .filter_map(|(i, c)| { - let key = c.iter().copied(); - if node.contains_prefix_iter(key, n).is_some() { - Some(i) - } else { - None - } - }) - .collect() - } - - fn contains_continuations_optimized( +impl ContinuationsTrie for PatriciaTrie { + fn contains_continuations( &self, prefix: &[u8], continuations: &[Vec], diff --git a/text-utils-prefix/src/utils.rs b/text-utils-prefix/src/utils.rs new file mode 100644 index 0000000..575309e --- /dev/null +++ b/text-utils-prefix/src/utils.rs @@ -0,0 +1,41 @@ +use itertools::Itertools; + +pub fn optimized_prefix_order(continuations: &[C]) -> (Vec, Vec) +where + C: AsRef<[u8]>, +{ + let permutation: Vec<_> = continuations + .iter() + .enumerate() + .sorted_by(|(_, a), (_, b)| a.as_ref().cmp(b.as_ref())) + .map(|(i, _)| i) + .collect(); + let mut skips = vec![0; continuations.len()]; + for i in 0..permutation.len() { + // if the current key is a prefix of the next one, we can skip the + // latter + let continuation = continuations[permutation[i]].as_ref(); + while let Some(next) = permutation.get(i + skips[i] + 1) { + let next_continuation = continuations[*next].as_ref(); + if next_continuation.starts_with(continuation) { + skips[i] += 1; + } else { + break; + } + } + } + (permutation, skips) +} + +#[cfg(test)] +mod test { + use crate::utils::optimized_prefix_order; + + #[test] + fn test_optimized_prefix_order() { + let items = ["de", "a", "d", "ab", "abc", "b"]; + let (permutation, skips) = optimized_prefix_order(&items); + assert_eq!(permutation, vec![1, 3, 4, 5, 2, 0]); + assert_eq!(skips, vec![2, 1, 0, 0, 1, 0]); + } +} diff --git a/text-utils-prefix/src/vec.rs b/text-utils-prefix/src/vec.rs index 9aa7556..2004e2c 100644 --- a/text-utils-prefix/src/vec.rs +++ b/text-utils-prefix/src/vec.rs @@ -1,14 +1,12 @@ use itertools::Itertools; -use rayon::prelude::*; -use std::hash::Hash; use std::{cmp::Ordering, iter::empty}; use serde::{Deserialize, Serialize}; -use crate::PrefixSearch; +use crate::{ContinuationSearch, PrefixSearch}; #[derive(Serialize, Deserialize)] -pub struct PrefixVec { +pub struct PrefixVec { data: Vec<(Box<[u8]>, V)>, } @@ -17,7 +15,7 @@ pub struct PrefixVecStats { pub num_keys: usize, } -impl Default for PrefixVec { +impl Default for PrefixVec { fn default() -> Self { Self { data: vec![] } } @@ -28,7 +26,7 @@ enum FindResult { NotFound(usize), } -impl PrefixVec { +impl PrefixVec { pub fn stats(&self) -> PrefixVecStats { PrefixVecStats { num_keys: self.data.len(), @@ -146,7 +144,7 @@ impl PrefixVec { } } -impl PrefixSearch for PrefixVec { +impl PrefixSearch for PrefixVec { type Value = V; #[inline] @@ -165,7 +163,10 @@ impl PrefixSearch for PrefixVec { } fn delete(&mut self, key: &[u8]) -> Option { - let Ok(idx) = self.data.binary_search_by(|(prefix, _)| prefix.as_ref().cmp(key)) else { + let Ok(idx) = self + .data + .binary_search_by(|(prefix, _)| prefix.as_ref().cmp(key)) + else { return None; }; let (_, value) = self.data.remove(idx); @@ -192,10 +193,7 @@ impl PrefixSearch for PrefixVec { matches!(self.find_range(prefix, 0, self.data.len(), 0), FindResult::Found(left, _) if left < self.data.len()) } - fn path<'a>(&'a self, prefix: &[u8]) -> Vec<(usize, &'a Self::Value)> - where - Self::Value: 'a, - { + fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)> { let mut left = 0; let mut right = self.data.len(); let mut path = vec![]; @@ -207,7 +205,7 @@ impl PrefixSearch for PrefixVec { _ => (), } for (i, k) in prefix.iter().enumerate() { - let Some((l, r)) = self.range_search(k, i, left, right) else { + let Some((l, r)) = self.range_search(k, i, left, right) else { break; }; left = l; @@ -221,12 +219,22 @@ impl PrefixSearch for PrefixVec { } path } + + fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { + match self.find_range(prefix, 0, self.data.len(), 0) { + FindResult::Found(left, right) => Box::new( + self.data[left..right] + .iter() + .map(|(key, value)| (key.to_vec(), value)), + ), + FindResult::NotFound(_) => Box::new(empty()), + } + } } impl FromIterator<(K, V)> for PrefixVec where K: AsRef<[u8]>, - V: Hash + Eq, { fn from_iter>(iter: T) -> Self { let mut pfx = Self::default(); @@ -264,12 +272,12 @@ impl Default for Node { } } -pub struct PrefixVecContinuations { +pub struct ContinuationsVec { pub(crate) vec: PrefixVec, pub(crate) continuation_trie: Node, } -impl PrefixVecContinuations { +impl ContinuationsVec { pub fn new(vec: PrefixVec, continuations: &[C]) -> Self where C: AsRef<[u8]>, @@ -287,29 +295,38 @@ impl PrefixVecContinuations { continuation_trie, } } +} - pub fn continuations

(&self, prefix: P) -> Box, &V)> + '_> - where - P: AsRef<[u8]>, - { - match self - .vec - .find_range(prefix.as_ref(), 0, self.vec.data.len(), 0) - { - FindResult::Found(left, right) => Box::new( - self.vec.data[left..right] - .iter() - .map(|(key, value)| (key.to_vec(), value)), - ), - FindResult::NotFound(_) => Box::new(empty()), - } +impl PrefixSearch for ContinuationsVec { + type Value = V; + + fn insert(&mut self, key: &[u8], value: V) -> Option { + self.vec.insert(key, value) } - pub fn contains_continuations

(&self, prefix: P) -> Vec - where - P: AsRef<[u8]>, - { - let prefix = prefix.as_ref(); + fn delete(&mut self, key: &[u8]) -> Option { + self.vec.delete(key) + } + + fn get(&self, prefix: &[u8]) -> Option<&V> { + self.vec.get(prefix) + } + + fn contains_prefix(&self, prefix: &[u8]) -> bool { + self.vec.contains_prefix(prefix) + } + + fn path(&self, prefix: &[u8]) -> Vec<(usize, &Self::Value)> { + self.vec.path(prefix) + } + + fn continuations(&self, prefix: &[u8]) -> Box, &V)> + '_> { + self.vec.continuations(prefix) + } +} + +impl ContinuationSearch for ContinuationsVec { + fn contains_continuations(&self, prefix: &[u8]) -> Vec { let FindResult::Found(left, right) = self.vec.find_range(prefix, 0, self.vec.data.len(), 0) else { return vec![]; @@ -334,24 +351,4 @@ impl PrefixVecContinuations { .unique() .collect() } - - pub fn batch_contains_continuations

(&self, prefixes: &[P]) -> Vec> - where - P: AsRef<[u8]>, - { - prefixes - .iter() - .map(|p| self.contains_continuations(p)) - .collect() - } - - pub fn batch_contains_continuations_parallel(&self, prefixes: &[Vec]) -> Vec> - where - Self: Sync, - { - prefixes - .par_iter() - .map(|p| self.contains_continuations(p)) - .collect() - } }