diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml
index 85e41dc..f991de9 100644
--- a/nlprule/Cargo.toml
+++ b/nlprule/Cargo.toml
@@ -30,6 +30,8 @@ half = { version = "1.7", features = ["serde"] }
 srx = { version = "^0.1.3", features = ["serde"] }
 lazycell = "1"
 cfg-if = "1"
+fnv = "1"
+hashbrown = "0.11"
 rayon-cond = "0.1"
 rayon = "1.5"
diff --git a/nlprule/src/compile/impls.rs b/nlprule/src/compile/impls.rs
index 9584f0d..bfc6438 100644
--- a/nlprule/src/compile/impls.rs
+++ b/nlprule/src/compile/impls.rs
@@ -1,4 +1,5 @@
 use bimap::BiMap;
+use fnv::{FnvBuildHasher, FnvHashSet};
 use fs_err::File;
 use indexmap::IndexMap;
 use log::warn;
@@ -138,12 +139,12 @@ impl Tagger {
             tag_store.insert(i, special_pos);
         }

-        let word_store: BiMap<_, _> = word_store
+        let word_store: FastBiMap<_, _> = word_store
             .iter()
             .enumerate()
             .map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32)))
             .collect();
-        let tag_store: BiMap<_, _> = tag_store
+        let tag_store: FastBiMap<_, _> = tag_store
             .iter()
             .enumerate()
             .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16)))
@@ -154,9 +155,9 @@ impl Tagger {
             let inflection_id = word_store.get_by_left(inflection).unwrap();
             let pos_id = tag_store.get_by_left(tag).unwrap();

-            let group = groups.entry(*inflection_id).or_insert_with(Vec::new);
+            let group = groups.entry(*inflection_id).or_insert_with(HashSet::<_, FnvBuildHasher>::default);
             if !group.contains(word_id) {
-                group.push(*word_id);
+                group.insert(*word_id);
             }

             tags.entry(*word_id)
diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs
index dc53593..24d0c72 100644
--- a/nlprule/src/rule/engine/composition.rs
+++ b/nlprule/src/rule/engine/composition.rs
@@ -115,7 +115,7 @@ pub struct PosMatcher {

 impl PosMatcher {
     pub fn is_match(&self, pos: &PosId) -> bool {
-        self.mask[pos.id().value() as usize]
+        self.mask[pos.id().0 as usize]
     }
 }

diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/tokenizer/tag.rs
index d0b500a..be98293 100644
--- a/nlprule/src/tokenizer/tag.rs
+++ b/nlprule/src/tokenizer/tag.rs
@@ -2,39 +2,12 @@
 //! where each word typically has multiple entries with different part-of-speech tags.

 use crate::types::*;
-use bimap::BiMap;
 use fst::{IntoStreamer, Map, Streamer};
 use indexmap::IndexMap;
 use log::error;
 use serde::{Deserialize, Serialize};
 use std::{borrow::Cow, fmt, iter::once};

-#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
-#[serde(transparent)]
-pub(crate) struct WordIdInt(u32);
-
-impl WordIdInt {
-    #[allow(dead_code)] // used in compile module
-    pub(crate) fn from_value_unchecked(value: u32) -> Self {
-        WordIdInt(value)
-    }
-}
-
-#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
-#[serde(transparent)]
-pub(crate) struct PosIdInt(u16);
-
-impl PosIdInt {
-    #[allow(dead_code)] // used in compile module
-    pub(crate) fn from_value_unchecked(value: u16) -> Self {
-        PosIdInt(value)
-    }
-
-    pub fn value(&self) -> u16 {
-        self.0
-    }
-}
-
 /// A potentially identified word. If it is identified as a known word, many optimizations can be applied.
 #[derive(Clone, PartialEq)]
 pub struct WordId<'t>(pub(crate) Cow<'t, str>, pub(crate) Option<WordIdInt>);
@@ -181,7 +154,7 @@ impl Default for TaggerLangOptions {
 struct TaggerFields {
     tag_fst: Vec<u8>,
     word_store_fst: Vec<u8>,
-    tag_store: BiMap<String, PosIdInt>,
+    tag_store: FastBiMap<String, PosIdInt>,
     lang_options: TaggerLangOptions,
 }

@@ -198,20 +171,11 @@ impl From<Tagger> for TaggerFields {
                 assert!(i < 255);
                 i += 1;

-                let key: Vec<u8> = word.as_bytes().iter().chain(once(&i)).copied().collect();
-                let pos_bytes = pos_id.0.to_be_bytes();
-                let inflect_bytes = inflect_id.0.to_be_bytes();
-
-                let value = u64::from_be_bytes([
-                    inflect_bytes[0],
-                    inflect_bytes[1],
-                    inflect_bytes[2],
-                    inflect_bytes[3],
-                    0,
-                    0,
-                    pos_bytes[0],
-                    pos_bytes[1],
-                ]);
+                let key: Vec<u8> = word.as_bytes().iter().copied().chain(once(i)).collect();
+                let pos_bytes = pos_id.0 as u64;
+                let inflect_bytes = inflect_id.0 as u64;
+                let value = (pos_bytes & 0xFFFF) | (inflect_bytes & 0xFFFF_FFFF) << 32;
+
                 tag_fst_items.push((key, value));
             }
         }
@@ -249,42 +213,38 @@ impl From<Tagger> for TaggerFields {
 impl From<TaggerFields> for Tagger {
     fn from(data: TaggerFields) -> Self {
         let word_store_fst = Map::new(data.word_store_fst).unwrap();
-        let word_store: BiMap<String, WordIdInt> = word_store_fst
-            .into_stream()
-            .into_str_vec()
-            .unwrap()
-            .into_iter()
-            .map(|(key, value)| (key, WordIdInt(value as u32)))
-            .collect();
+        let mut word_store = FastBiMap::<String, WordIdInt>::with_capacity_and_hashers(
+            word_store_fst.len(),
+            Default::default(),
+            Default::default(),
+        );
+        let mut stream = word_store_fst.into_stream();
+        while let Some((key, value)) = stream.next() {
+            if let Ok(key) = std::str::from_utf8(key) {
+                word_store.insert(key.to_owned(), WordIdInt(value as u32));
+            }
+        }

-        let mut tags = DefaultHashMap::new();
-        let mut groups = DefaultHashMap::new();
+        let mut tags = FastHashMap::new();
+        let mut groups = FastHashMap::new();

         let tag_fst = Map::new(data.tag_fst).unwrap();
         let mut stream = tag_fst.into_stream();

         while let Some((key, value)) = stream.next() {
-            let word = std::str::from_utf8(&key[..key.len() - 1]).unwrap();
+            let word = std::str::from_utf8(&key[..(key.len().saturating_sub(1))]).unwrap();
             let word_id = *word_store.get_by_left(word).unwrap();

-            let value_bytes = value.to_be_bytes();
-            let inflection_id = WordIdInt(u32::from_be_bytes([
-                value_bytes[0],
-                value_bytes[1],
-                value_bytes[2],
-                value_bytes[3],
-            ]));
-            let pos_id = PosIdInt(u16::from_be_bytes([value_bytes[6], value_bytes[7]]));
-
-            let group = groups.entry(inflection_id).or_insert_with(Vec::new);
-            if !group.contains(&word_id) {
-                group.push(word_id);
-            }
+            let inflection_id = WordIdInt((value >> 32) as u32);
+            let pos_id = PosIdInt((value & 0xFFFF_u64) as u16);
+
+            let group = groups.entry(inflection_id).or_insert_with(FastHashSet::default);
+            let _ = group.insert(word_id);

             tags.entry(word_id)
                 .or_insert_with(IndexMap::new)
                 .entry(inflection_id)
-                .or_insert_with(Vec::new)
+                .or_insert_with(|| Vec::with_capacity(32))
                 .push(pos_id);
         }

@@ -302,10 +262,10 @@ impl From<TaggerFields> for Tagger {
 #[derive(Default, Serialize, Deserialize, Clone)]
 #[serde(from = "TaggerFields", into = "TaggerFields")]
 pub struct Tagger {
-    pub(crate) tags: DefaultHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
-    pub(crate) tag_store: BiMap<String, PosIdInt>,
-    pub(crate) word_store: BiMap<String, WordIdInt>,
-    pub(crate) groups: DefaultHashMap<WordIdInt, Vec<WordIdInt>>,
+    pub(crate) tags: FastHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
+    pub(crate) tag_store: FastBiMap<String, PosIdInt>,
+    pub(crate) word_store: FastBiMap<String, WordIdInt>,
+    pub(crate) groups: FastHashMap<WordIdInt, FastHashSet<WordIdInt>>,
     pub(crate) lang_options: TaggerLangOptions,
 }

@@ -358,12 +318,12 @@ impl Tagger {
     }

     #[allow(dead_code)] // used by compile module
-    pub(crate) fn tag_store(&self) -> &BiMap<String, PosIdInt> {
+    pub(crate) fn tag_store(&self) -> &FastBiMap<String, PosIdInt> {
         &self.tag_store
     }

     #[allow(dead_code)] // used by compile module
-    pub(crate) fn word_store(&self) -> &BiMap<String, WordIdInt> {
+    pub(crate) fn word_store(&self) -> &FastBiMap<String, WordIdInt> {
         &self.word_store
     }
diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs
index 775ddb0..cf4ed17 100644
--- a/nlprule/src/types.rs
+++ b/nlprule/src/types.rs
@@ -2,7 +2,7 @@
 use crate::tokenizer::tag::Tagger;
 pub use crate::tokenizer::tag::{PosId, WordId};
-pub(crate) use crate::tokenizer::tag::{PosIdInt, SpecialPos, WordIdInt};
+pub(crate) use crate::tokenizer::tag::SpecialPos;
 use derivative::Derivative;
 use lazy_static::lazy_static;
 use serde::{Deserialize, Serialize};
@@ -11,11 +11,24 @@ use std::{
     collections::{hash_map, HashMap, HashSet},
     ops::{Add, AddAssign, Range, Sub},
 };
+use bimap::BiHashMap;
+
 pub(crate) type DefaultHashMap<K, V> = HashMap<K, V>;
 pub(crate) type DefaultHashSet<T> = HashSet<T>;
 pub(crate) type DefaultHasher = hash_map::DefaultHasher;

+pub(crate) type FastBiMap<K, V> = BiHashMap<K, V, fnv::FnvBuildHasher, fnv::FnvBuildHasher>;
+pub(crate) type FastHashSet<T> = hashbrown::HashSet<T>;
+pub(crate) type FastHashMap<K, V> = hashbrown::HashMap<K, V>;
+
+#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
+#[serde(transparent)]
+pub(crate) struct WordIdInt(pub u32);
+#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
+#[serde(transparent)]
+pub(crate) struct PosIdInt(pub u16);
+
 /// Owned versions of the types for use in longer-living structures not bound to the `'t` lifetime e.g. rule tests.
 pub mod owned {
     use super::*;
diff --git a/nlprule/src/utils/regex.rs b/nlprule/src/utils/regex.rs
index c743f45..145eb04 100644
--- a/nlprule/src/utils/regex.rs
+++ b/nlprule/src/utils/regex.rs
@@ -491,7 +491,9 @@ cfg_if::cfg_if! {
         use regex_impl_all as regex_impl;
     } else if #[cfg(feature = "regex-onig")] {
         use regex_impl_onig as regex_impl;
-    } else {
+    } else if #[cfg(feature = "regex-fancy")] {
         use regex_impl_fancy as regex_impl;
+    } else {
+        compile_error!{"Must select exactly one regex impl via features: regex-onig OR regex-fancy"}
     }
 }