Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor/speed: avoid unnecessary collection #60

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions nlprule/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ half = { version = "1.7", features = ["serde"] }
srx = { version = "^0.1.3", features = ["serde"] }
lazycell = "1"
cfg-if = "1"
fnv = "1"
hashbrown = "0.11"

rayon-cond = "0.1"
rayon = "1.5"
Expand Down
9 changes: 5 additions & 4 deletions nlprule/src/compile/impls.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use bimap::BiMap;
use fnv::{FnvBuildHasher, FnvHashSet};
use fs_err::File;
use indexmap::IndexMap;
use log::warn;
Expand Down Expand Up @@ -138,12 +139,12 @@ impl Tagger {
tag_store.insert(i, special_pos);
}

let word_store: BiMap<_, _> = word_store
let word_store: FastBiMap<_, _> = word_store
.iter()
.enumerate()
.map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32)))
.collect();
let tag_store: BiMap<_, _> = tag_store
let tag_store: FastBiMap<_, _> = tag_store
.iter()
.enumerate()
.map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16)))
Expand All @@ -154,9 +155,9 @@ impl Tagger {
let inflection_id = word_store.get_by_left(inflection).unwrap();
let pos_id = tag_store.get_by_left(tag).unwrap();

let group = groups.entry(*inflection_id).or_insert_with(Vec::new);
let group = groups.entry(*inflection_id).or_insert_with(HashSet::<WordIdInt, FnvBuildHasher>::default);
if !group.contains(word_id) {
group.push(*word_id);
group.insert(*word_id);
}

tags.entry(*word_id)
Expand Down
2 changes: 1 addition & 1 deletion nlprule/src/rule/engine/composition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ pub struct PosMatcher {

impl PosMatcher {
    /// Returns whether the given part-of-speech id is matched.
    ///
    /// `mask` is a boolean lookup table indexed by the numeric POS id, so
    /// this is a constant-time check. Indexing will panic if the id is out
    /// of range for the mask — presumably the mask covers the whole tag
    /// store; confirm at the construction site.
    pub fn is_match(&self, pos: &PosId) -> bool {
        // `PosIdInt` is now a tuple struct with a public `u16` field, so the
        // payload is read via `.0` (the former `value()` accessor was removed).
        self.mask[pos.id().0 as usize]
    }
}

Expand Down
104 changes: 32 additions & 72 deletions nlprule/src/tokenizer/tag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,12 @@
//! where each word typically has multiple entries with different part-of-speech tags.

use crate::types::*;
use bimap::BiMap;
use fst::{IntoStreamer, Map, Streamer};
use indexmap::IndexMap;
use log::error;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, fmt, iter::once};

#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct WordIdInt(u32);

impl WordIdInt {
#[allow(dead_code)] // used in compile module
pub(crate) fn from_value_unchecked(value: u32) -> Self {
WordIdInt(value)
}
}

#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct PosIdInt(u16);

impl PosIdInt {
#[allow(dead_code)] // used in compile module
pub(crate) fn from_value_unchecked(value: u16) -> Self {
PosIdInt(value)
}

pub fn value(&self) -> u16 {
self.0
}
}

/// A potentially identified word. If it is identified as a known word, many optimizations can be applied.
#[derive(Clone, PartialEq)]
pub struct WordId<'t>(pub(crate) Cow<'t, str>, pub(crate) Option<WordIdInt>);
Expand Down Expand Up @@ -181,7 +154,7 @@ impl Default for TaggerLangOptions {
struct TaggerFields {
tag_fst: Vec<u8>,
word_store_fst: Vec<u8>,
tag_store: BiMap<String, PosIdInt>,
tag_store: FastBiMap<String, PosIdInt>,
lang_options: TaggerLangOptions,
}

Expand All @@ -198,20 +171,11 @@ impl From<Tagger> for TaggerFields {
assert!(i < 255);
i += 1;

let key: Vec<u8> = word.as_bytes().iter().chain(once(&i)).copied().collect();
let pos_bytes = pos_id.0.to_be_bytes();
let inflect_bytes = inflect_id.0.to_be_bytes();

let value = u64::from_be_bytes([
inflect_bytes[0],
inflect_bytes[1],
inflect_bytes[2],
inflect_bytes[3],
0,
0,
pos_bytes[0],
pos_bytes[1],
]);
let key: Vec<u8> = word.as_bytes().iter().copied().chain(once(i)).collect();
let pos_bytes = pos_id.0 as u64;
let inflect_bytes = inflect_id.0 as u64;
let value = (pos_bytes & 0xFFFF) | (inflect_bytes & 0xFFFF_FFFF) << 32;

tag_fst_items.push((key, value));
}
}
Expand Down Expand Up @@ -249,42 +213,38 @@ impl From<Tagger> for TaggerFields {
impl From<TaggerFields> for Tagger {
fn from(data: TaggerFields) -> Self {
let word_store_fst = Map::new(data.word_store_fst).unwrap();
let word_store: BiMap<String, WordIdInt> = word_store_fst
.into_stream()
.into_str_vec()
.unwrap()
.into_iter()
.map(|(key, value)| (key, WordIdInt(value as u32)))
.collect();
let mut word_store = FastBiMap::<String, WordIdInt>::with_capacity_and_hashers(
word_store_fst.len(),
Default::default(),
Default::default(),
);
let mut stream = word_store_fst.into_stream();
while let Some((key, value)) = stream.next() {
if let Some(key) = std::str::from_utf8(key).ok() {
word_store.insert(key.to_owned(), WordIdInt(value as u32));
}
};

let mut tags = DefaultHashMap::new();
let mut groups = DefaultHashMap::new();
let mut tags = FastHashMap::new();
let mut groups = FastHashMap::new();

let tag_fst = Map::new(data.tag_fst).unwrap();
let mut stream = tag_fst.into_stream();

while let Some((key, value)) = stream.next() {
let word = std::str::from_utf8(&key[..key.len() - 1]).unwrap();
let word = std::str::from_utf8(&key[..(key.len().saturating_sub(1))]).unwrap();
let word_id = *word_store.get_by_left(word).unwrap();

let value_bytes = value.to_be_bytes();
let inflection_id = WordIdInt(u32::from_be_bytes([
value_bytes[0],
value_bytes[1],
value_bytes[2],
value_bytes[3],
]));
let pos_id = PosIdInt(u16::from_be_bytes([value_bytes[6], value_bytes[7]]));

let group = groups.entry(inflection_id).or_insert_with(Vec::new);
if !group.contains(&word_id) {
group.push(word_id);
}
let inflection_id = WordIdInt((value >> 32) as u32);
let pos_id = PosIdInt((value & 0xFF_u64) as u16);

let group = groups.entry(inflection_id).or_insert_with(FastHashSet::default);
let _ = group.insert(word_id);

tags.entry(word_id)
.or_insert_with(IndexMap::new)
.entry(inflection_id)
.or_insert_with(Vec::new)
.or_insert_with(|| Vec::with_capacity(32))
.push(pos_id);
}

Expand All @@ -302,10 +262,10 @@ impl From<TaggerFields> for Tagger {
#[derive(Default, Serialize, Deserialize, Clone)]
#[serde(from = "TaggerFields", into = "TaggerFields")]
pub struct Tagger {
pub(crate) tags: DefaultHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
pub(crate) tag_store: BiMap<String, PosIdInt>,
pub(crate) word_store: BiMap<String, WordIdInt>,
pub(crate) groups: DefaultHashMap<WordIdInt, Vec<WordIdInt>>,
pub(crate) tags: FastHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
pub(crate) tag_store: FastBiMap<String, PosIdInt>,
pub(crate) word_store: FastBiMap<String, WordIdInt>,
pub(crate) groups: FastHashMap<WordIdInt, FastHashSet<WordIdInt>>,
pub(crate) lang_options: TaggerLangOptions,
}

Expand Down Expand Up @@ -358,12 +318,12 @@ impl Tagger {
}

#[allow(dead_code)] // used by compile module
pub(crate) fn tag_store(&self) -> &BiMap<String, PosIdInt> {
pub(crate) fn tag_store(&self) -> &FastBiMap<String, PosIdInt> {
&self.tag_store
}

#[allow(dead_code)] // used by compile module
pub(crate) fn word_store(&self) -> &BiMap<String, WordIdInt> {
pub(crate) fn word_store(&self) -> &FastBiMap<String, WordIdInt> {
&self.word_store
}

Expand Down
15 changes: 14 additions & 1 deletion nlprule/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use crate::tokenizer::tag::Tagger;
pub use crate::tokenizer::tag::{PosId, WordId};
pub(crate) use crate::tokenizer::tag::{PosIdInt, SpecialPos, WordIdInt};
pub(crate) use crate::tokenizer::tag::SpecialPos;
use derivative::Derivative;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
Expand All @@ -11,11 +11,24 @@ use std::{
collections::{hash_map, HashMap, HashSet},
ops::{Add, AddAssign, Range, Sub},
};
use bimap::BiHashMap;


pub(crate) type DefaultHashMap<K, V> = HashMap<K, V>;
pub(crate) type DefaultHashSet<T> = HashSet<T>;
pub(crate) type DefaultHasher = hash_map::DefaultHasher;

pub(crate) type FastBiMap<L,R> = BiHashMap<L, R, hashbrown::hash_map::DefaultHashBuilder, hashbrown::hash_map::DefaultHashBuilder>;
pub(crate) type FastHashSet<I> = hashbrown::HashSet<I>;
pub(crate) type FastHashMap<K,V> = hashbrown::HashMap<K,V>;

#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct WordIdInt(pub u32);
#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct PosIdInt(pub u16);

/// Owned versions of the types for use in longer-living structures not bound to the `'t` lifetime e.g. rule tests.
pub mod owned {
use super::*;
Expand Down
4 changes: 3 additions & 1 deletion nlprule/src/utils/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,9 @@ cfg_if::cfg_if! {
use regex_impl_all as regex_impl;
} else if #[cfg(feature = "regex-onig")] {
use regex_impl_onig as regex_impl;
} else {
} else if #[cfg(feature = "regex-fancy")] {
use regex_impl_fancy as regex_impl;
} else {
compile_error!{"Must select exactly one regex impl via features: regex-onig OR regex-fancy"}
}
}