Skip to content

Commit

Permalink
add prefix subcrate
Browse files Browse the repository at this point in the history
  • Loading branch information
bastiscode committed Jan 28, 2024
1 parent de883b5 commit 97c8144
Show file tree
Hide file tree
Showing 6 changed files with 1,035 additions and 0 deletions.
23 changes: 23 additions & 0 deletions text-utils-prefix/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[package]
name = "text-utils-prefix"
version = "0.1.0"
edition = "2021"

[dependencies]

[dev-dependencies]
criterion = "0.5"
art-tree = "0.2.0"
patricia_tree = "0.8.0"
rand = "0.8"
rand_distr = "0.4"
rand_chacha = "0.3"

[profile.release]
lto = true
codegen-units = 1
strip = true

[[bench]]
name = "benchmark"
harness = false
116 changes: 116 additions & 0 deletions text-utils-prefix/benches/benchmark.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
use std::fs;
use std::path::PathBuf;

use criterion::{criterion_group, criterion_main, Criterion};
use rand::seq::SliceRandom;
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
use text_utils_prefix::{patricia_trie::PatriciaTrie, trie::Trie};
use text_utils_prefix::{ContinuationSearch, PrefixSearch};

use art_tree::{Art, ByteString};
use patricia_tree::PatriciaMap;

const ASCII_LETTERS: &[u8; 52] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

/// Criterion benchmark comparing insert/get/prefix lookups of the third-party
/// `art_tree::Art` and `patricia_tree::PatriciaMap` against this crate's
/// `Trie` and `PatriciaTrie`.
///
/// All structures are filled with (at most) the first 100,000 lines of
/// `resources/test/index.txt`, located relative to the crate manifest directory.
/// A single word, picked with a fixed-seed RNG, is used as the benchmark input.
fn bench_prefix(c: &mut Criterion) {
let dir = env!("CARGO_MANIFEST_DIR");
let index = fs::read_to_string(PathBuf::from(dir).join("resources/test/index.txt"))
.expect("failed to read file");
// keys are the raw bytes of each line; only the first 100,000 lines are used
let words: Vec<_> = index.lines().map(|s| s.as_bytes()).take(100_000).collect();
// fixed seed so that every run benchmarks the same word
let mut rng = ChaCha8Rng::seed_from_u64(22);
// sample random word from all words
let word = *words.choose(&mut rng).unwrap();
println!("choose word {}", String::from_utf8_lossy(word));
let mut group = c.benchmark_group("prefix_search");
// one single-byte continuation per ASCII letter (52 in total)
let continuations: Vec<_> = ASCII_LETTERS.iter().map(|&c| [c]).collect();

// benchmark art-tree
let mut trie: Art<_, _> = Art::new();
for (i, word) in words.iter().enumerate() {
trie.insert(ByteString::new(word), i);
}
// NOTE: the insert benchmarks repeatedly insert the sampled word with value 1,
// which is a key that was already inserted by the build loop above
group.bench_with_input("art_tree_insert", word, |b, input| {
b.iter(|| trie.insert(ByteString::new(input), 1));
});
group.bench_with_input("art_tree_get", word, |b, input| {
b.iter(|| trie.get(&ByteString::new(input)));
});

// benchmark patricia_tree
let mut trie: PatriciaMap<_> = PatriciaMap::new();
for (i, word) in words.iter().enumerate() {
trie.insert(word, i);
}
group.bench_with_input("patricia_tree_insert", word, |b, input| {
b.iter(|| trie.insert(input, 1));
});
group.bench_with_input("patricia_tree_get", word, |b, input| {
b.iter(|| trie.get(input));
});

// benchmark prefix tries
// values are the positions of the words in the index
let mut trie: Trie<_> = words.iter().zip(0..words.len()).collect();
group.bench_with_input("trie_insert", word, |b, input| {
b.iter(|| trie.insert(input, 1));
});
group.bench_with_input("trie_get", word, |b, input| {
b.iter(|| trie.get(input));
});
// the queried prefix is the sampled word without its last three bytes
// (the empty prefix if the word has three bytes or fewer)
group.bench_with_input("trie_contains", word, |b, input| {
b.iter(|| trie.contains_prefix(&input[..input.len().saturating_sub(3)]));
});

// benchmark patricia trie
let mut trie: PatriciaTrie<_> = words.iter().zip(0..words.len()).collect();
group.bench_with_input("patricia_trie_insert", word, |b, input| {
b.iter(|| trie.insert(input, 1));
});
group.bench_with_input("patricia_trie_get", word, |b, input| {
b.iter(|| trie.get(input));
});
group.bench_with_input("patricia_trie_contains", word, |b, input| {
b.iter(|| trie.contains_prefix(&input[..input.len().saturating_sub(3)]));
});
// sanity check before benchmarking: exactly four of the single-letter
// continuations are expected to follow "Albert"
// NOTE(review): the expected count of 4 depends on the contents of the index file
let conts = trie.contains_continuations("Albert", &continuations);
assert_eq!(
conts.iter().map(|&b| if b { 1 } else { 0 }).sum::<usize>(),
4
);
group.bench_with_input(
"patricia_trie_continuations",
&("Albert", &continuations),
|b, input| {
let (word, continuations) = input;
b.iter(|| trie.contains_continuations(&word, &continuations));
},
);
// same query as above, but issued for a batch of 64 identical prefixes
group.bench_with_input(
"patricia_trie_batch_continuations",
&(["Albert"; 64], &continuations),
|b, input| {
let (words, continuations) = input;
b.iter(|| trie.batch_contains_continuations(words, &continuations));
},
);

// benchmark build, load, and save
// the first group is dropped before the second one is created from `c`
drop(group);
// NOTE(review): no benchmark is registered on this group yet, and `n` is only
// referenced by the commented-out code below
let mut group = c.benchmark_group("prefix_io");
let n = 10_000;

// let trie: RadixTrie<_> = words.iter().zip(0..words.len()).take(n).collect();
// let path = PathBuf::from(dir).join("resources/test/byte_trie.bin");
// group.bench_with_input("byte_trie_build", &words, |b, input| {
// b.iter(|| {
// input
// .iter()
// .zip(0..input.len())
// .take(n)
// .collect::<RadixTrie<_>>()
// });
// });
}

criterion_group!(benches, bench_prefix);
criterion_main!(benches);
188 changes: 188 additions & 0 deletions text-utils-prefix/src/adaptive_radix_trie.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
use std::iter::once;

use crate::PrefixSearch;

// Fixed-size array of `N` bytes; used by the inner node variants to look up children by key byte.
type Index<const N: usize> = [u8; N];
// Fixed-size array of `N` optional, heap-allocated child nodes.
type Children<V, const N: usize> = [Option<Box<Node<V>>>; N];

// Payload of a node: either a stored value (leaf) or one of four inner node
// layouts with room for 4, 16, 48 or 256 children.
enum NodeType<V> {
// leaf holding the value
Leaf(V),
// key bytes, children at the same positions, and number of used slots;
// lookup scans the used keys linearly
N4(Index<4>, Children<V, 4>, usize),
// same layout as N4 with 16 slots; lookup uses a binary search over the
// used keys, so they have to be kept sorted
N16(Index<16>, Children<V, 16>, usize),
// 256-entry table indexed by key byte that yields a slot in the 48 children;
// the trailing usize is presumably the number of children — TODO confirm
N48(Box<Index<256>>, Children<V, 48>, usize),
// children indexed directly by key byte;
// the trailing usize is presumably the number of children — TODO confirm
N256(Children<V, 256>, usize),
}

// A single node of the adaptive radix trie.
struct Node<V> {
// bytes that a key has to match at this node before its value (leaf)
// or its children (inner node) are considered
prefix: Box<[u8]>,
// the value of a leaf or the children of an inner node
inner: NodeType<V>,
}

/// Trie over byte keys built from nodes with differently sized child tables.
///
/// Only lookups via `get` are implemented so far; the remaining
/// `PrefixSearch` operations are still `todo!()`.
pub struct AdaptiveRadixTrie<V> {
// `None` for an empty trie
root: Option<Node<V>>,
}

impl<V> Default for AdaptiveRadixTrie<V> {
fn default() -> Self {
Self { root: None }
}
}

impl<K, V> FromIterator<(K, V)> for AdaptiveRadixTrie<V>
where
    K: AsRef<[u8]>,
{
    /// Builds a trie by inserting every `(key, value)` pair of `iter`,
    /// in iteration order, into an initially empty trie.
    fn from_iter<T: IntoIterator<Item = (K, V)>>(iter: T) -> Self {
        iter.into_iter()
            .fold(Self::default(), |mut trie, (key, value)| {
                trie.insert(key, value);
                trie
            })
    }
}

// Outcome of matching a key against the prefix of a node.
enum Matching {
// the key ran out after matching this many bytes of the node prefix
FullKey(usize),
// the whole node prefix was matched
FullNode,
// mismatch at this index of the node prefix, together with the offending key byte
Partial(usize, u8),
}

impl<V> Node<V> {
    /// Returns `true` if this node stores a value instead of children.
    #[inline]
    fn is_leaf(&self) -> bool {
        matches!(self.inner, NodeType::Leaf(_))
    }

    /// Consumes bytes from `key` for as long as they match this node's prefix.
    ///
    /// Returns
    /// - `Matching::FullNode` if the whole prefix was matched,
    /// - `Matching::FullKey(i)` if the key ran out after `i` matching bytes,
    /// - `Matching::Partial(i, byte)` if the key byte `byte` differs from the
    ///   prefix byte at index `i` (the differing byte is consumed as well).
    #[inline]
    fn advance_key<'a>(&self, key: &mut impl Iterator<Item = &'a u8>) -> Matching {
        for (i, expected) in self.prefix.iter().enumerate() {
            let Some(k) = key.next() else {
                return Matching::FullKey(i);
            };
            if k != expected {
                return Matching::Partial(i, *k);
            }
        }
        Matching::FullNode
    }

    /// Returns `true` if the remaining bytes of `key` are exactly this node's prefix.
    #[inline]
    fn exact_match<'a>(&self, key: &mut impl Iterator<Item = &'a u8>) -> bool {
        for expected in self.prefix.iter() {
            if key.next() != Some(expected) {
                return false;
            }
        }
        // we have to be at the end of the key for an exact match
        key.next().is_none()
    }

    /// Walks down from this node and returns the leaf stored for `key`, if any.
    ///
    /// At every inner node the node's prefix has to match the next key bytes,
    /// and the key byte after that selects the child to descend into. At a leaf
    /// the remaining key bytes have to be equal to the leaf's prefix.
    #[inline]
    fn find(&self, key: &[u8]) -> Option<&Self> {
        let mut node = self;
        // extend given key with null byte
        // because it is needed for the correctness of the algorithm
        // when it comes to key lookup
        let mut key = key.iter().chain(once(&0));
        loop {
            // all prefix comparisons have to be made against the node that is
            // currently visited, not against the node the search started from
            if node.is_leaf() {
                return node.exact_match(&mut key).then_some(node);
            }

            let Matching::FullNode = node.advance_key(&mut key) else {
                // if we have not a full node match,
                // we can return early
                return None;
            };

            let k = key.next()?;
            node = node.find_child(k)?;
        }
    }

    /// Returns the child stored for the key byte `key`, or `None` if this node
    /// is a leaf or has no such child.
    #[inline]
    fn find_child(&self, key: &u8) -> Option<&Self> {
        match &self.inner {
            NodeType::Leaf(_) => None,
            NodeType::N4(keys, children, num_children) => {
                // linear scan over the used key slots only
                let idx = keys[..*num_children].iter().position(|k| k == key)?;
                children[idx].as_deref()
            }
            NodeType::N16(keys, children, num_children) => {
                // binary search requires the used key slots to be sorted
                let idx = keys[..*num_children].binary_search(key).ok()?;
                children[idx].as_deref()
            }
            NodeType::N48(keys, children, _) => {
                // the table maps the key byte to a child slot; a slot outside of
                // the child array or an empty slot means that there is no child
                children.get(keys[*key as usize] as usize)?.as_deref()
            }
            NodeType::N256(children, _) => children[*key as usize].as_deref(),
        }
    }

    /// Not implemented yet; calling this panics via `todo!()`.
    fn upgrade(self) -> Result<Self, Self> {
        todo!()
    }

    /// Not implemented yet; calling this panics via `todo!()`.
    fn downgrade(self) -> Result<Self, Self> {
        todo!()
    }
}

impl<V> PrefixSearch<V> for AdaptiveRadixTrie<V> {
    /// Not implemented yet; calling this panics via `todo!()`.
    fn insert<K>(&mut self, key: K, value: V)
    where
        K: AsRef<[u8]>,
    {
        todo!()
    }

    /// Not implemented yet; calling this panics via `todo!()`.
    fn delete<K>(&mut self, key: K) -> Option<V>
    where
        K: AsRef<[u8]>,
    {
        todo!()
    }

    /// Returns a reference to the value stored for `key`, or `None` if the
    /// trie is empty or the lookup does not end in a leaf.
    fn get<K>(&self, key: K) -> Option<&V>
    where
        K: AsRef<[u8]>,
    {
        let root = self.root.as_ref()?;
        match &root.find(key.as_ref())?.inner {
            NodeType::Leaf(value) => Some(value),
            _ => None,
        }
    }

    /// Returns `false` for an empty trie.
    ///
    /// The lookup in a non-empty trie is not implemented yet and panics
    /// via `todo!()`.
    fn contains_prefix<P>(&self, prefix: P) -> bool
    where
        P: AsRef<[u8]>,
    {
        if self.root.is_none() {
            return false;
        }

        todo!()
    }
}
62 changes: 62 additions & 0 deletions text-utils-prefix/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
pub mod adaptive_radix_trie;
pub mod patricia_trie;
pub mod trie;

/// Common interface of the prefix search structures in this crate.
///
/// Keys and prefixes are accepted as anything that can be viewed as a byte
/// slice, values are of type `V`.
pub trait PrefixSearch<V> {
/// Stores `value` under `key`.
// NOTE(review): the behavior for an already existing key is up to the
// implementor (presumably the value is replaced) — confirm
fn insert<K>(&mut self, key: K, value: V)
where
K: AsRef<[u8]>;

/// Deletes `key` and returns an optional value,
/// presumably the value that was stored for `key` — confirm with implementors.
fn delete<K>(&mut self, key: K) -> Option<V>
where
K: AsRef<[u8]>;

/// Returns a reference to the value for `key`, if there is one.
fn get<K>(&self, key: K) -> Option<&V>
where
K: AsRef<[u8]>;

/// Returns whether `prefix` is contained,
/// presumably meaning that at least one stored key starts with `prefix` — confirm.
fn contains_prefix<P>(&self, prefix: P) -> bool
where
P: AsRef<[u8]>;
}

/// Extension of [`PrefixSearch`] with queries about how a prefix can be continued.
pub trait ContinuationSearch<V>: PrefixSearch<V> {
    /// Returns an iterator over pairs of key bytes and value references for `prefix`.
    // NOTE(review): presumably yields the stored entries whose key starts with
    // `prefix` — confirm with implementors
    fn continuations<'a, P>(&'a self, prefix: P) -> impl Iterator<Item = (Vec<u8>, &'a V)>
    where
        P: AsRef<[u8]>,
        V: 'a;

    /// Returns whether `prefix` can be continued with `continuation`.
    fn contains_continuation<P, C>(&self, prefix: P, continuation: C) -> bool
    where
        P: AsRef<[u8]>,
        C: AsRef<[u8]>;

    /// Checks every continuation in `continuations` against `prefix`.
    ///
    /// The result has one entry per continuation, in the same order.
    fn contains_continuations<P, C>(&self, prefix: P, continuations: &[C]) -> Vec<bool>
    where
        P: AsRef<[u8]>,
        C: AsRef<[u8]>,
    {
        // naive fallback that issues one query per continuation;
        // implementors with a more efficient way should override it
        let mut found = Vec::with_capacity(continuations.len());
        for continuation in continuations {
            found.push(self.contains_continuation(prefix.as_ref(), continuation.as_ref()));
        }
        found
    }

    /// Checks every continuation in `continuations` against every prefix in `prefixes`.
    ///
    /// The result has one row per prefix, in the same order, and each row has
    /// one entry per continuation.
    fn batch_contains_continuations<P, C>(
        &self,
        prefixes: &[P],
        continuations: &[C],
    ) -> Vec<Vec<bool>>
    where
        P: AsRef<[u8]>,
        C: AsRef<[u8]>,
        Self: Sync,
    {
        // naive fallback that handles the prefixes one after another;
        // implementors with a more efficient way should override it
        let mut rows = Vec::with_capacity(prefixes.len());
        for prefix in prefixes {
            rows.push(self.contains_continuations(prefix, continuations));
        }
        rows
    }
}
Loading

0 comments on commit 97c8144

Please sign in to comment.