Skip to content

Commit

Permalink
refactor: switch from rsdict to sucds
Browse files Browse the repository at this point in the history
  • Loading branch information
KonradHoeffner committed Apr 11, 2024
1 parent 536421b commit 3ce07e2
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 34 deletions.
2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ crc = "3"
iref = "3"
langtag = "0.4"
ntriple = "0.1"
# rsdict "simd" feature does not build with nightly 1.78, see <https://github.com/KonradHoeffner/hdt/issues/41>
rsdict = { version = "0.0.7", features = [] }
sophia = { version = "0.8.0", optional = true }
sucds = "0.8"
thiserror = "1"
Expand Down
3 changes: 1 addition & 2 deletions src/containers/adj_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,8 @@ impl AdjList {
return 0;
}
// hdt counts from 1
//self.bitmap.dict.select1(x as u64).unwrap() as usize +1
// rsdict has nonzero value for 0, is that correct? adjust for that.
self.bitmap.dict.select1(x as u64 - 1).unwrap() as usize + 1
self.bitmap.select1(x - 1).unwrap() as usize + 1
}

/// Return the position of element within the given bounds.
Expand Down
39 changes: 27 additions & 12 deletions src/containers/bitmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,44 +2,59 @@
use crate::containers::vbyte::read_vbyte;
use bytesize::ByteSize;
use eyre::{eyre, Result};
use rsdict::RsDict;
use std::fmt;
use std::io::BufRead;
use std::mem::size_of;

//const USIZE_BITS: usize = usize::BITS as usize;
use sucds::bit_vectors::{Access, BitVector, Rank, Rank9Sel, Select};
use sucds::Serializable;

/// Compact bitmap representation with rank and select support.
#[derive(Clone)]
pub struct Bitmap {
//num_bits: usize,
// could also use sucds::rs_bit_vector::RsBitVector, that would be -1 dependency but that doesn't seem to have from_blocks
/// Currently using the rsdict crate.
pub dict: RsDict,
//pub data: Vec<u64>,
/// should be private but is set directly when building the OP index in triples.rs; otherwise use the methods provided by Bitmap
pub dict: Rank9Sel,
}

impl fmt::Debug for Bitmap {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", ByteSize(self.dict.heap_size() as u64))
write!(f, "{}", ByteSize(self.size_in_bytes() as u64))
}
}

impl Bitmap {
/// Construct a bitmap from an existing bitmap in form of a vector, which doesn't have rank and select support.
pub fn new(data: Vec<u64>) -> Self {
let dict = RsDict::from_blocks((data as Vec<u64>).into_iter());
let mut v = BitVector::new();
for d in data {
let _ = v.push_bits(d as usize, 64);
}
let dict = Rank9Sel::new(v).select1_hints();
Bitmap { dict }
}

/// Size in bytes on the heap.
pub fn size_in_bytes(&self) -> usize {
self.dict.heap_size()
self.dict.size_in_bytes()
}

/// Number of bits in the bitmap
pub const fn len(&self) -> usize {
self.dict.len()
}

/// Returns the position of the k-th one bit (counting from zero) or None if there aren't that many.
pub fn select1(&self, k: usize) -> Option<usize> {
self.dict.select1(k)
}

/// Returns the number of one bits from the 0-th bit to the (k-1)-th bit. Panics if self.len() < k.
pub fn rank(&self, k: usize) -> usize {
self.dict.rank1(k).unwrap_or_else(|| panic!("Out of bounds position: {} >= {}", k, self.dict.len()))
}

/// Whether the node at the given position is the last child of its parent.
pub fn at_last_sibling(&self, word_index: usize) -> bool {
self.dict.get_bit(word_index as u64)
self.dict.access(word_index).expect("word index out of bounds")
}

/// Read bitmap from a suitable point within HDT file data and verify checksums.
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ fn query(hdt: Hdt)
#![allow(clippy::doc_markdown)]
#![allow(clippy::if_not_else)]
#![allow(clippy::into_iter_without_iter)]
#![allow(clippy::len_without_is_empty)]
// multiple versions of syn crate in transitive dependencies
#![allow(clippy::multiple_crate_versions)]
/// Types for storing and reading data.
Expand Down
25 changes: 14 additions & 11 deletions src/triples.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ use crate::ControlInfo;
use bytesize::ByteSize;
use eyre::{eyre, Result, WrapErr};
use log::{debug, error};
use rsdict::RsDict;
use std::cmp::Ordering;
use std::fmt;

use std::io::BufRead;
use sucds::{bit_vectors::Rank9Sel, char_sequences::WaveletMatrix, int_vectors::CompactVector, Serializable};
use sucds::{
bit_vectors::{BitVector, Rank9Sel},
char_sequences::WaveletMatrix,
int_vectors::CompactVector,
Serializable,
};

mod subject_iter;
pub use subject_iter::SubjectIter;
Expand Down Expand Up @@ -80,13 +83,13 @@ impl OpIndex {
}
/// Find the first position in the OP index of the given object ID.
pub fn find(&self, o: Id) -> usize {
self.bitmap.dict.select1(o as u64 - 1).unwrap() as usize
self.bitmap.select1(o - 1).unwrap() as usize
}
/// Find the last position in the object index of the given object ID.
pub fn last(&self, o: Id) -> usize {
match self.bitmap.dict.select1(o as u64) {
match self.bitmap.select1(o) {
Some(index) => index as usize - 1,
None => self.bitmap.dict.len() - 1,
None => self.bitmap.len() - 1,
}
}
}
Expand Down Expand Up @@ -136,7 +139,7 @@ impl TriplesBitmap {
if subject_id == 0 {
return 0;
}
self.bitmap_y.dict.select1(subject_id as u64 - 1).unwrap() as usize + 1
self.bitmap_y.select1(subject_id - 1).unwrap() as usize + 1
}

/// Position in the wavelet index of the last predicate for the given subject ID.
Expand Down Expand Up @@ -225,11 +228,11 @@ impl TriplesBitmap {
error!("ERROR: There is a zero value in the Z level.");
continue;
}
let pos_y = bitmap_z.dict.rank(pos_z.to_owned() as u64, true);
let pos_y = bitmap_z.rank(pos_z.to_owned());
indicess[object - 1].push(pos_y as u32); // hdt index counts from 1 but we count from 0 for simplicity
}
// reduce memory consumption of index by using adjacency list
let mut bitmap_index_dict = RsDict::new();
let mut bitmap_index_bitvector = BitVector::new();
let mut cv = CompactVector::with_capacity(entries, sucds::utils::needed_bits(entries))
.map_err(|err| eyre!(Box::new(err)))?;
let wavelet_y = wavelet_thread.join().unwrap();
Expand All @@ -244,12 +247,12 @@ impl TriplesBitmap {
// sort by predicate
indices.sort_by_cached_key(|pos_y| wavelet_y.access(*pos_y as usize).unwrap());
for index in indices {
bitmap_index_dict.push(first);
bitmap_index_bitvector.push_bit(first);
first = false;
cv.push_int(index as usize).unwrap();
}
}
let bitmap_index = Bitmap { dict: bitmap_index_dict };
let bitmap_index = Bitmap { dict: Rank9Sel::new(bitmap_index_bitvector) };
let op_index = OpIndex { sequence: cv, bitmap: bitmap_index };
debug!("built OPS index");
assert!(sequence_z.crc_handle.take().unwrap().join().unwrap(), "sequence_z CRC check failed.");
Expand Down
2 changes: 1 addition & 1 deletion src/triples/object_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ impl<'a> Iterator for ObjectIter<'a> {
}
let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap();
let y = self.triples.wavelet_y.access(pos_y).unwrap() as Id;
let x = self.triples.bitmap_y.dict.rank(pos_y as u64, true) as Id + 1;
let x = self.triples.bitmap_y.rank(pos_y) as Id + 1;
self.pos_index += 1;
Some(TripleId::new(x, y, self.o))
//Some(self.triples.coord_to_triple(x, y, self.o).unwrap())
Expand Down
4 changes: 2 additions & 2 deletions src/triples/predicate_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ impl<'a> Iterator for PredicateIter<'a> {
}
if self.os == 0 {
// Algorithm 1 findSubj from Martinez et al. 2012 ******
let pos_y = self.triples.wavelet_y.select(self.i, self.p as usize).unwrap() as u64;
self.s = self.triples.bitmap_y.dict.rank(pos_y, true) as Id + 1;
let pos_y = self.triples.wavelet_y.select(self.i, self.p as usize).unwrap();
self.s = self.triples.bitmap_y.rank(pos_y) as Id + 1;
// *****************************************************
// SP can have multiple O
self.pos_z = self.triples.adjlist_z.find(pos_y as Id);
Expand Down
8 changes: 4 additions & 4 deletions src/triples/predicate_object_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ impl<'a> PredicateObjectIter<'a> {
let mut low = triples.op_index.find(o);
let mut high = triples.op_index.last(o);
let get_y = |pos_index| {
let pos_y = triples.op_index.sequence.access(pos_index).unwrap() as u64;
triples.wavelet_y.access(pos_y as usize).unwrap() as Id
let pos_y = triples.op_index.sequence.access(pos_index).unwrap();
triples.wavelet_y.access(pos_y).unwrap() as Id
};
// Binary search with a twist:
// Each value may occur multiple times, so we search for the left and right borders.
Expand Down Expand Up @@ -69,10 +69,10 @@ impl<'a> Iterator for PredicateObjectIter<'a> {
if self.pos_index > self.max_index {
return None;
}
let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap() as u64;
let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap();
//let y = self.triples.wavelet_y.get(pos_y as usize) as Id;
//println!(" op p {y}");
let s = self.triples.bitmap_y.dict.rank(pos_y, true) as Id + 1;
let s = self.triples.bitmap_y.rank(pos_y) as Id + 1;
self.pos_index += 1;
Some(s)
}
Expand Down

0 comments on commit 3ce07e2

Please sign in to comment.