Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor/speed: avoid unnecessary collection #60

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions nlprule/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ half = { version = "1.7", features = ["serde"] }
srx = { version = "^0.1.3", features = ["serde"] }
lazycell = "1"
cfg-if = "1"
fnv = "1"
hashbrown = "0.11"

rayon-cond = "0.1"
rayon = "1.5"
Expand Down
9 changes: 5 additions & 4 deletions nlprule/src/compile/impls.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use bimap::BiMap;
use fnv::{FnvBuildHasher, FnvHashSet};
use fs_err::File;
use indexmap::IndexMap;
use log::warn;
Expand Down Expand Up @@ -138,12 +139,12 @@ impl Tagger {
tag_store.insert(i, special_pos);
}

let word_store: BiMap<_, _> = word_store
let word_store: FastBiMap<_, _> = word_store
.iter()
.enumerate()
.map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32)))
.collect();
let tag_store: BiMap<_, _> = tag_store
let tag_store: FastBiMap<_, _> = tag_store
.iter()
.enumerate()
.map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16)))
Expand All @@ -154,9 +155,9 @@ impl Tagger {
let inflection_id = word_store.get_by_left(inflection).unwrap();
let pos_id = tag_store.get_by_left(tag).unwrap();

let group = groups.entry(*inflection_id).or_insert_with(Vec::new);
let group = groups.entry(*inflection_id).or_insert_with(HashSet::<WordIdInt, FnvBuildHasher>::default);
if !group.contains(word_id) {
group.push(*word_id);
group.insert(*word_id);
}

tags.entry(*word_id)
Expand Down
2 changes: 1 addition & 1 deletion nlprule/src/rule/engine/composition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ pub struct PosMatcher {

impl PosMatcher {
    /// Returns whether the given part-of-speech id is matched.
    ///
    /// `mask` is a boolean lookup table indexed by the numeric POS id, so
    /// this is a constant-time check. Indexing will panic if the id is out
    /// of range for the mask — presumably the mask covers the whole tag
    /// store; confirm at the construction site.
    pub fn is_match(&self, pos: &PosId) -> bool {
        // `PosIdInt` is now a tuple struct with a public `u16` field, so the
        // payload is read via `.0` (the former `value()` accessor was removed).
        self.mask[pos.id().0 as usize]
    }
}

Expand Down
104 changes: 32 additions & 72 deletions nlprule/src/tokenizer/tag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,12 @@
//! where each word typically has multiple entries with different part-of-speech tags.

use crate::types::*;
use bimap::BiMap;
use fst::{IntoStreamer, Map, Streamer};
use indexmap::IndexMap;
use log::error;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, fmt, iter::once};

#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct WordIdInt(u32);

impl WordIdInt {
#[allow(dead_code)] // used in compile module
pub(crate) fn from_value_unchecked(value: u32) -> Self {
WordIdInt(value)
}
}

#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct PosIdInt(u16);

impl PosIdInt {
#[allow(dead_code)] // used in compile module
pub(crate) fn from_value_unchecked(value: u16) -> Self {
PosIdInt(value)
}

pub fn value(&self) -> u16 {
self.0
}
}

/// A potentially identified word. If it is identified as a known word, many optimizations can be applied.
#[derive(Clone, PartialEq)]
pub struct WordId<'t>(pub(crate) Cow<'t, str>, pub(crate) Option<WordIdInt>);
Expand Down Expand Up @@ -181,7 +154,7 @@ impl Default for TaggerLangOptions {
struct TaggerFields {
tag_fst: Vec<u8>,
word_store_fst: Vec<u8>,
tag_store: BiMap<String, PosIdInt>,
tag_store: FastBiMap<String, PosIdInt>,
lang_options: TaggerLangOptions,
}

Expand All @@ -198,20 +171,11 @@ impl From<Tagger> for TaggerFields {
assert!(i < 255);
i += 1;

let key: Vec<u8> = word.as_bytes().iter().chain(once(&i)).copied().collect();
let pos_bytes = pos_id.0.to_be_bytes();
let inflect_bytes = inflect_id.0.to_be_bytes();

let value = u64::from_be_bytes([
inflect_bytes[0],
inflect_bytes[1],
inflect_bytes[2],
inflect_bytes[3],
0,
0,
pos_bytes[0],
pos_bytes[1],
]);
let key: Vec<u8> = word.as_bytes().iter().copied().chain(once(i)).collect();
let pos_bytes = pos_id.0 as u64;
let inflect_bytes = inflect_id.0 as u64;
let value = (pos_bytes & 0xFFFF) | (inflect_bytes & 0xFFFF_FFFF) << 32;

tag_fst_items.push((key, value));
}
}
Expand Down Expand Up @@ -249,42 +213,38 @@ impl From<Tagger> for TaggerFields {
impl From<TaggerFields> for Tagger {
fn from(data: TaggerFields) -> Self {
let word_store_fst = Map::new(data.word_store_fst).unwrap();
let word_store: BiMap<String, WordIdInt> = word_store_fst
.into_stream()
.into_str_vec()
.unwrap()
.into_iter()
.map(|(key, value)| (key, WordIdInt(value as u32)))
.collect();
let mut word_store = FastBiMap::<String, WordIdInt>::with_capacity_and_hashers(
word_store_fst.len(),
Default::default(),
Default::default(),
);
let mut stream = word_store_fst.into_stream();
while let Some((key, value)) = stream.next() {
if let Some(key) = std::str::from_utf8(key).ok() {
word_store.insert(key.to_owned(), WordIdInt(value as u32));
}
};

let mut tags = DefaultHashMap::new();
let mut groups = DefaultHashMap::new();
let mut tags = FastHashMap::new();
let mut groups = FastHashMap::new();

let tag_fst = Map::new(data.tag_fst).unwrap();
let mut stream = tag_fst.into_stream();

while let Some((key, value)) = stream.next() {
let word = std::str::from_utf8(&key[..key.len() - 1]).unwrap();
let word = std::str::from_utf8(&key[..(key.len().saturating_sub(1))]).unwrap();
let word_id = *word_store.get_by_left(word).unwrap();

let value_bytes = value.to_be_bytes();
let inflection_id = WordIdInt(u32::from_be_bytes([
value_bytes[0],
value_bytes[1],
value_bytes[2],
value_bytes[3],
]));
let pos_id = PosIdInt(u16::from_be_bytes([value_bytes[6], value_bytes[7]]));

let group = groups.entry(inflection_id).or_insert_with(Vec::new);
if !group.contains(&word_id) {
group.push(word_id);
}
let inflection_id = WordIdInt((value >> 32) as u32);
let pos_id = PosIdInt((value & 0xFF_u64) as u16);

let group = groups.entry(inflection_id).or_insert_with(FastHashSet::default);
let _ = group.insert(word_id);

tags.entry(word_id)
.or_insert_with(IndexMap::new)
.entry(inflection_id)
.or_insert_with(Vec::new)
.or_insert_with(|| Vec::with_capacity(32))
.push(pos_id);
}

Expand All @@ -302,10 +262,10 @@ impl From<TaggerFields> for Tagger {
#[derive(Default, Serialize, Deserialize, Clone)]
#[serde(from = "TaggerFields", into = "TaggerFields")]
pub struct Tagger {
pub(crate) tags: DefaultHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
pub(crate) tag_store: BiMap<String, PosIdInt>,
pub(crate) word_store: BiMap<String, WordIdInt>,
pub(crate) groups: DefaultHashMap<WordIdInt, Vec<WordIdInt>>,
pub(crate) tags: FastHashMap<WordIdInt, IndexMap<WordIdInt, Vec<PosIdInt>>>,
pub(crate) tag_store: FastBiMap<String, PosIdInt>,
pub(crate) word_store: FastBiMap<String, WordIdInt>,
pub(crate) groups: FastHashMap<WordIdInt, FastHashSet<WordIdInt>>,
pub(crate) lang_options: TaggerLangOptions,
}

Expand Down Expand Up @@ -358,12 +318,12 @@ impl Tagger {
}

#[allow(dead_code)] // used by compile module
pub(crate) fn tag_store(&self) -> &BiMap<String, PosIdInt> {
pub(crate) fn tag_store(&self) -> &FastBiMap<String, PosIdInt> {
&self.tag_store
}

#[allow(dead_code)] // used by compile module
pub(crate) fn word_store(&self) -> &BiMap<String, WordIdInt> {
pub(crate) fn word_store(&self) -> &FastBiMap<String, WordIdInt> {
&self.word_store
}

Expand Down
15 changes: 14 additions & 1 deletion nlprule/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use crate::tokenizer::tag::Tagger;
pub use crate::tokenizer::tag::{PosId, WordId};
pub(crate) use crate::tokenizer::tag::{PosIdInt, SpecialPos, WordIdInt};
pub(crate) use crate::tokenizer::tag::SpecialPos;
use derivative::Derivative;
use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
Expand All @@ -11,11 +11,24 @@ use std::{
collections::{hash_map, HashMap, HashSet},
ops::{Add, AddAssign, Range, Sub},
};
use bimap::BiHashMap;


pub(crate) type DefaultHashMap<K, V> = HashMap<K, V>;
pub(crate) type DefaultHashSet<T> = HashSet<T>;
pub(crate) type DefaultHasher = hash_map::DefaultHasher;

pub(crate) type FastBiMap<L,R> = BiHashMap<L, R, hashbrown::hash_map::DefaultHashBuilder, hashbrown::hash_map::DefaultHashBuilder>;
pub(crate) type FastHashSet<I> = hashbrown::HashSet<I>;
pub(crate) type FastHashMap<K,V> = hashbrown::HashMap<K,V>;

#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct WordIdInt(pub u32);
#[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)]
#[serde(transparent)]
pub(crate) struct PosIdInt(pub u16);

/// Owned versions of the types for use in longer-living structures not bound to the `'t` lifetime e.g. rule tests.
pub mod owned {
use super::*;
Expand Down
4 changes: 3 additions & 1 deletion nlprule/src/utils/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,9 @@ cfg_if::cfg_if! {
use regex_impl_all as regex_impl;
} else if #[cfg(feature = "regex-onig")] {
use regex_impl_onig as regex_impl;
} else {
} else if #[cfg(feature = "regex-fancy")] {
use regex_impl_fancy as regex_impl;
} else {
compile_error!{"Must select exactly one regex impl via features: regex-onig OR regex-fancy"}
}
}