-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
de883b5
commit 97c8144
Showing
6 changed files
with
1,035 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[package] | ||
name = "text-utils-prefix" | ||
version = "0.1.0" | ||
edition = "2021" | ||
|
||
[dependencies] | ||
|
||
[dev-dependencies] | ||
criterion = "0.5" | ||
art-tree = "0.2.0" | ||
patricia_tree = "0.8.0" | ||
rand = "0.8" | ||
rand_distr = "0.4" | ||
rand_chacha = "0.3" | ||
|
||
[profile.release] | ||
lto = true | ||
codegen-units = 1 | ||
strip = true | ||
|
||
[[bench]] | ||
name = "benchmark" | ||
harness = false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
use std::fs; | ||
use std::path::PathBuf; | ||
|
||
use criterion::{criterion_group, criterion_main, Criterion}; | ||
use rand::seq::SliceRandom; | ||
use rand::SeedableRng; | ||
use rand_chacha::ChaCha8Rng; | ||
use text_utils_prefix::{patricia_trie::PatriciaTrie, trie::Trie}; | ||
use text_utils_prefix::{ContinuationSearch, PrefixSearch}; | ||
|
||
use art_tree::{Art, ByteString}; | ||
use patricia_tree::PatriciaMap; | ||
|
||
const ASCII_LETTERS: &[u8; 52] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; | ||
|
||
fn bench_prefix(c: &mut Criterion) { | ||
let dir = env!("CARGO_MANIFEST_DIR"); | ||
let index = fs::read_to_string(PathBuf::from(dir).join("resources/test/index.txt")) | ||
.expect("failed to read file"); | ||
let words: Vec<_> = index.lines().map(|s| s.as_bytes()).take(100_000).collect(); | ||
let mut rng = ChaCha8Rng::seed_from_u64(22); | ||
// sample random word from all words | ||
let word = *words.choose(&mut rng).unwrap(); | ||
println!("choose word {}", String::from_utf8_lossy(word)); | ||
let mut group = c.benchmark_group("prefix_search"); | ||
let continuations: Vec<_> = ASCII_LETTERS.iter().map(|&c| [c]).collect(); | ||
|
||
// benchmark art-tree | ||
let mut trie: Art<_, _> = Art::new(); | ||
for (i, word) in words.iter().enumerate() { | ||
trie.insert(ByteString::new(word), i); | ||
} | ||
group.bench_with_input("art_tree_insert", word, |b, input| { | ||
b.iter(|| trie.insert(ByteString::new(input), 1)); | ||
}); | ||
group.bench_with_input("art_tree_get", word, |b, input| { | ||
b.iter(|| trie.get(&ByteString::new(input))); | ||
}); | ||
|
||
// benchmark patricia_tree | ||
let mut trie: PatriciaMap<_> = PatriciaMap::new(); | ||
for (i, word) in words.iter().enumerate() { | ||
trie.insert(word, i); | ||
} | ||
group.bench_with_input("patricia_tree_insert", word, |b, input| { | ||
b.iter(|| trie.insert(input, 1)); | ||
}); | ||
group.bench_with_input("patricia_tree_get", word, |b, input| { | ||
b.iter(|| trie.get(input)); | ||
}); | ||
|
||
// benchmark prefix tries | ||
let mut trie: Trie<_> = words.iter().zip(0..words.len()).collect(); | ||
group.bench_with_input("trie_insert", word, |b, input| { | ||
b.iter(|| trie.insert(input, 1)); | ||
}); | ||
group.bench_with_input("trie_get", word, |b, input| { | ||
b.iter(|| trie.get(input)); | ||
}); | ||
group.bench_with_input("trie_contains", word, |b, input| { | ||
b.iter(|| trie.contains_prefix(&input[..input.len().saturating_sub(3)])); | ||
}); | ||
|
||
// benchmark patricia trie | ||
let mut trie: PatriciaTrie<_> = words.iter().zip(0..words.len()).collect(); | ||
group.bench_with_input("patricia_trie_insert", word, |b, input| { | ||
b.iter(|| trie.insert(input, 1)); | ||
}); | ||
group.bench_with_input("patricia_trie_get", word, |b, input| { | ||
b.iter(|| trie.get(input)); | ||
}); | ||
group.bench_with_input("patricia_trie_contains", word, |b, input| { | ||
b.iter(|| trie.contains_prefix(&input[..input.len().saturating_sub(3)])); | ||
}); | ||
let conts = trie.contains_continuations("Albert", &continuations); | ||
assert_eq!( | ||
conts.iter().map(|&b| if b { 1 } else { 0 }).sum::<usize>(), | ||
4 | ||
); | ||
group.bench_with_input( | ||
"patricia_trie_continuations", | ||
&("Albert", &continuations), | ||
|b, input| { | ||
let (word, continuations) = input; | ||
b.iter(|| trie.contains_continuations(&word, &continuations)); | ||
}, | ||
); | ||
group.bench_with_input( | ||
"patricia_trie_batch_continuations", | ||
&(["Albert"; 64], &continuations), | ||
|b, input| { | ||
let (words, continuations) = input; | ||
b.iter(|| trie.batch_contains_continuations(words, &continuations)); | ||
}, | ||
); | ||
|
||
// benchmark build, load, and save | ||
drop(group); | ||
let mut group = c.benchmark_group("prefix_io"); | ||
let n = 10_000; | ||
|
||
// let trie: RadixTrie<_> = words.iter().zip(0..words.len()).take(n).collect(); | ||
// let path = PathBuf::from(dir).join("resources/test/byte_trie.bin"); | ||
// group.bench_with_input("byte_trie_build", &words, |b, input| { | ||
// b.iter(|| { | ||
// input | ||
// .iter() | ||
// .zip(0..input.len()) | ||
// .take(n) | ||
// .collect::<RadixTrie<_>>() | ||
// }); | ||
// }); | ||
} | ||
|
||
criterion_group!(benches, bench_prefix); | ||
criterion_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
use std::iter::once; | ||
|
||
use crate::PrefixSearch; | ||
|
||
/// Fixed-size array of key bytes belonging to a node.
type Index<const N: usize> = [u8; N];
/// Fixed-size array of optional, heap-allocated child nodes.
type Children<V, const N: usize> = [Option<Box<Node<V>>>; N];

/// Payload of a trie node: either a stored value or one of four child layouts.
///
/// The child arrays of the two big layouts are boxed. Stored inline, the 256
/// child pointers of `N256` would make *every* `NodeType` (and therefore every
/// leaf allocation) larger than 2 KiB on 64-bit targets.
enum NodeType<V> {
    /// Terminal node holding the value.
    Leaf(V),
    /// Key bytes, the child stored at the same position as its key byte, and
    /// the number of slots in use (child lookup scans the used keys linearly).
    N4(Index<4>, Children<V, 4>, usize),
    /// Same layout as `N4` with 16 slots; child lookup binary-searches the
    /// used part of the key array, so those keys are expected to be sorted.
    N16(Index<16>, Children<V, 16>, usize),
    /// 256-entry table indexed by key byte that yields a slot number into the
    /// 48-slot child array; a slot number outside the array means "no child".
    /// The last field is presumably the child count (it is not read yet).
    N48(Box<Index<256>>, Box<Children<V, 48>>, usize),
    /// Child array indexed directly by key byte. The last field is presumably
    /// the child count (it is not read yet).
    N256(Box<Children<V, 256>>, usize),
}

/// A trie node: the key bytes shared by everything below it plus its payload.
struct Node<V> {
    /// Bytes that a key must match before the payload of this node applies.
    prefix: Box<[u8]>,
    /// Value (for leaves) or children (for inner nodes).
    inner: NodeType<V>,
}
|
||
pub struct AdaptiveRadixTrie<V> { | ||
root: Option<Node<V>>, | ||
} | ||
|
||
impl<V> Default for AdaptiveRadixTrie<V> { | ||
fn default() -> Self { | ||
Self { root: None } | ||
} | ||
} | ||
|
||
impl<K, V> FromIterator<(K, V)> for AdaptiveRadixTrie<V> | ||
where | ||
K: AsRef<[u8]>, | ||
{ | ||
fn from_iter<T: IntoIterator<Item = (K, V)>>(iter: T) -> Self { | ||
let mut trie = Self::default(); | ||
for (k, v) in iter { | ||
trie.insert(k, v); | ||
} | ||
trie | ||
} | ||
} | ||
|
||
enum Matching { | ||
FullKey(usize), | ||
FullNode, | ||
Partial(usize, u8), | ||
} | ||
|
||
impl<V> Node<V> { | ||
#[inline] | ||
fn is_leaf(&self) -> bool { | ||
matches!(self.inner, NodeType::Leaf(_)) | ||
} | ||
|
||
#[inline] | ||
fn advance_key<'a>(&self, key: &mut impl Iterator<Item = &'a u8>) -> Matching { | ||
let mut i = 0; | ||
while i < self.prefix.len() { | ||
let Some(k) = key.next() else { | ||
return Matching::FullKey(i); | ||
}; | ||
if k != &self.prefix[i] { | ||
return Matching::Partial(i, *k); | ||
} | ||
i += 1; | ||
} | ||
Matching::FullNode | ||
} | ||
|
||
#[inline] | ||
fn exact_match<'a>(&self, key: &mut impl Iterator<Item = &'a u8>) -> bool { | ||
let mut i = 0; | ||
while i < self.prefix.len() { | ||
let Some(k) = key.next() else { | ||
return false; | ||
}; | ||
if k != &self.prefix[i] { | ||
return false; | ||
} | ||
i += 1; | ||
} | ||
// we have to be at the end of the key for an exact match | ||
key.next().is_none() | ||
} | ||
|
||
#[inline] | ||
fn find(&self, key: &[u8]) -> Option<&Self> { | ||
let mut node = self; | ||
// extend given key with null byte | ||
// because its needed for the correctness of the algorithm | ||
// when it comes to key lookup | ||
let mut key = key.iter().chain(once(&0)); | ||
loop { | ||
if node.is_leaf() { | ||
if self.exact_match(&mut key) { | ||
return Some(node); | ||
} | ||
break; | ||
} | ||
|
||
let Matching::FullNode = self.advance_key(&mut key) else { | ||
// if we have not a full node match, | ||
// we can return early | ||
return None; | ||
}; | ||
|
||
let k = key.next()?; | ||
let Some(child) = node.find_child(k) else { | ||
break; | ||
}; | ||
node = child; | ||
} | ||
None | ||
} | ||
|
||
#[inline] | ||
fn find_child(&self, key: &u8) -> Option<&Self> { | ||
match &self.inner { | ||
NodeType::Leaf(_) => None, | ||
NodeType::N4(keys, children, num_children) => { | ||
for i in 0..*num_children { | ||
if &keys[i] == key { | ||
return children[i].as_deref(); | ||
} | ||
} | ||
None | ||
} | ||
NodeType::N16(keys, children, num_children) => { | ||
let idx = keys[..*num_children].binary_search(key).ok()?; | ||
children[idx].as_deref() | ||
} | ||
NodeType::N48(keys, children, _) => { | ||
children.get(keys[*key as usize] as usize)?.as_deref() | ||
} | ||
NodeType::N256(children, _) => children[*key as usize].as_deref(), | ||
} | ||
} | ||
|
||
fn upgrade(self) -> Result<Self, Self> { | ||
todo!() | ||
} | ||
|
||
fn downgrade(self) -> Result<Self, Self> { | ||
todo!() | ||
} | ||
} | ||
|
||
impl<V> PrefixSearch<V> for AdaptiveRadixTrie<V> { | ||
fn insert<K>(&mut self, key: K, value: V) | ||
where | ||
K: AsRef<[u8]>, | ||
{ | ||
todo!() | ||
} | ||
|
||
fn delete<K>(&mut self, key: K) -> Option<V> | ||
where | ||
K: AsRef<[u8]>, | ||
{ | ||
todo!() | ||
} | ||
|
||
fn get<K>(&self, key: K) -> Option<&V> | ||
where | ||
K: AsRef<[u8]>, | ||
{ | ||
let Some(root) = &self.root else { | ||
return None; | ||
}; | ||
|
||
root.find(key.as_ref()).and_then(|node| match &node.inner { | ||
NodeType::Leaf(v) => Some(v), | ||
_ => None, | ||
}) | ||
} | ||
|
||
fn contains_prefix<P>(&self, prefix: P) -> bool | ||
where | ||
P: AsRef<[u8]>, | ||
{ | ||
let Some(root) = &self.root else { | ||
return false; | ||
}; | ||
|
||
todo!(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
pub mod adaptive_radix_trie; | ||
pub mod patricia_trie; | ||
pub mod trie; | ||
|
||
/// Byte-keyed map interface with an additional prefix query.
///
/// All keys and prefixes are accepted as anything that can be viewed as a
/// byte slice.
pub trait PrefixSearch<V> {
    /// Stores `value` under `key`. What happens to a value already stored
    /// under the same key is left to the implementation.
    fn insert<K: AsRef<[u8]>>(&mut self, key: K, value: V);

    /// Removes `key`; the returned option is presumably the value that was
    /// stored under it (`None` if there was none) — confirm per implementation.
    fn delete<K: AsRef<[u8]>>(&mut self, key: K) -> Option<V>;

    /// Returns a reference to the value stored under `key`, if any.
    fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&V>;

    /// Reports whether `prefix` is a prefix of something stored; the exact
    /// treatment of full keys and the empty prefix is implementation-defined.
    fn contains_prefix<P: AsRef<[u8]>>(&self, prefix: P) -> bool;
}
|
||
pub trait ContinuationSearch<V>: PrefixSearch<V> { | ||
fn continuations<'a, P>(&'a self, prefix: P) -> impl Iterator<Item = (Vec<u8>, &'a V)> | ||
where | ||
P: AsRef<[u8]>, | ||
V: 'a; | ||
|
||
fn contains_continuation<P, C>(&self, prefix: P, continuation: C) -> bool | ||
where | ||
P: AsRef<[u8]>, | ||
C: AsRef<[u8]>; | ||
|
||
fn contains_continuations<P, C>(&self, prefix: P, continuations: &[C]) -> Vec<bool> | ||
where | ||
P: AsRef<[u8]>, | ||
C: AsRef<[u8]>, | ||
{ | ||
// default naive implementation, should be overridden if there is a more efficient way | ||
continuations | ||
.iter() | ||
.map(|c| self.contains_continuation(prefix.as_ref(), c.as_ref())) | ||
.collect() | ||
} | ||
|
||
fn batch_contains_continuations<P, C>( | ||
&self, | ||
prefixes: &[P], | ||
continuations: &[C], | ||
) -> Vec<Vec<bool>> | ||
where | ||
P: AsRef<[u8]>, | ||
C: AsRef<[u8]>, | ||
Self: Sync, | ||
{ | ||
// default naive implementation, should be overridden if there is a more efficient way | ||
prefixes | ||
.iter() | ||
.map(|p| self.contains_continuations(p, continuations)) | ||
.collect() | ||
} | ||
} |
Oops, something went wrong.