Skip to content

Commit

Permalink
feat(core): improved performance by improving cache locality
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-potter committed Jan 7, 2025
1 parent 4b6f7e5 commit 3b43861
Show file tree
Hide file tree
Showing 4 changed files with 168 additions and 88 deletions.
7 changes: 2 additions & 5 deletions harper-core/src/linting/oxford_comma.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::{
patterns::{EitherPattern, Pattern, SequencePattern},
patterns::{Pattern, SequencePattern, WordSet},
Document, Token, TokenStringExt,
};

Expand All @@ -21,10 +21,7 @@ impl OxfordComma {
))
.then_noun_phrase()
.then_whitespace()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("and")),
Box::new(SequencePattern::aco("or")),
])))
.then(Box::new(WordSet::all(&["and", "or"])))
.then_whitespace()
.then_noun_phrase(),
}
Expand Down
154 changes: 71 additions & 83 deletions harper-core/src/linting/proper_noun_capitalization_linters.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use super::PatternLinter;
use super::{Lint, LintKind, Suggestion};
use crate::make_title_case;
use crate::patterns::{EitherPattern, IsNotTitleCase, Pattern, SequencePattern};
use crate::patterns::{EitherPattern, IsNotTitleCase, Pattern, SequencePattern, WordSet};
use crate::FstDictionary;
use crate::{Token, TokenStringExt};
use std::sync::Arc;
Expand Down Expand Up @@ -62,10 +62,7 @@ macro_rules! create_linter_for {
create_linter_for!(
Americas,
SequencePattern::default()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("South")),
Box::new(SequencePattern::aco("North"))
])))
.then(Box::new(WordSet::all(&["South", "North",])))
.then_whitespace()
.t_aco("America"),
"When referring to the continents, make sure to treat them as a proper noun."
Expand All @@ -74,10 +71,7 @@ create_linter_for!(
create_linter_for!(
Koreas,
SequencePattern::default()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("South")),
Box::new(SequencePattern::aco("North"))
])))
.then(Box::new(WordSet::all(&["South", "North",])))
.then_whitespace()
.t_aco("Korea"),
"When referring to the nations, make sure to treat them as a proper noun."
Expand Down Expand Up @@ -119,25 +113,27 @@ create_linter_for!(
Box::new(
SequencePattern::default()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("Presidents'")),
Box::new(SequencePattern::aco("Valentines")),
Box::new(SequencePattern::aco("Christmas")),
Box::new(SequencePattern::aco("Easter")),
Box::new(SequencePattern::aco("Flag")),
Box::new(SequencePattern::aco("Independence")),
Box::new(SequencePattern::aco("Mothers'")),
Box::new(SequencePattern::aco("New").t_aco("Years")),
Box::new(SequencePattern::aco("Fathers'")),
Box::new(SequencePattern::aco("Columbus")),
Box::new(SequencePattern::aco("Thanksgiving")),
Box::new(SequencePattern::aco("Memorial")),
Box::new(SequencePattern::aco("May")),
Box::new(SequencePattern::aco("Halloween")),
Box::new(SequencePattern::aco("Tax")),
Box::new(SequencePattern::aco("Parents")),
Box::new(SequencePattern::aco("Veterans")),
Box::new(SequencePattern::aco("Armistice")),
Box::new(SequencePattern::aco("Groundhog")),
Box::new(WordSet::all(&[
"Presidents'",
"Valentines",
"Christmas",
"Easter",
"Flag",
"Independence",
"Mothers'",
"Years",
"Fathers'",
"Columbus",
"Thanksgiving",
"Memorial",
"May",
"Halloween",
"Tax",
"Parents",
"Veterans",
"Armistice",
"Groundhog"
])),
Box::new(
SequencePattern::default()
.t_aco("National")
Expand Down Expand Up @@ -249,30 +245,30 @@ create_linter_for!(
SequencePattern::default()
.t_aco("Google")
.then_whitespace()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("Search")),
Box::new(SequencePattern::aco("Cloud")),
Box::new(SequencePattern::aco("Maps")),
Box::new(SequencePattern::aco("Docs")),
Box::new(SequencePattern::aco("Sheets")),
Box::new(SequencePattern::aco("Slides")),
Box::new(SequencePattern::aco("Drive")),
Box::new(SequencePattern::aco("Meet")),
Box::new(SequencePattern::aco("Gmail")),
Box::new(SequencePattern::aco("Calendar")),
Box::new(SequencePattern::aco("Chrome")),
Box::new(SequencePattern::aco("ChromeOS")),
Box::new(SequencePattern::aco("Android")),
Box::new(SequencePattern::aco("Play")),
Box::new(SequencePattern::aco("Bard")),
Box::new(SequencePattern::aco("Gemini")),
Box::new(SequencePattern::aco("YouTube")),
Box::new(SequencePattern::aco("Photos")),
Box::new(SequencePattern::aco("Analytics")),
Box::new(SequencePattern::aco("AdSense")),
Box::new(SequencePattern::aco("Pixel")),
Box::new(SequencePattern::aco("Nest")),
Box::new(SequencePattern::aco("Workspace"))
.then(Box::new(WordSet::all(&[
"Search",
"Cloud",
"Maps",
"Docs",
"Sheets",
"Slides",
"Drive",
"Meet",
"Gmail",
"Calendar",
"Chrome",
"ChromeOS",
"Android",
"Play",
"Bard",
"Gemini",
"YouTube",
"Photos",
"Analytics",
"AdSense",
"Pixel",
"Nest",
"Workspace",
]))),
"When referring to Google products and services, make sure to treat them as proper nouns."
);
Expand Down Expand Up @@ -357,20 +353,22 @@ create_linter_for!(
.t_aco("Microsoft")
.then_whitespace()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("Windows")),
Box::new(SequencePattern::aco("Office")),
Box::new(SequencePattern::aco("Teams")),
Box::new(SequencePattern::aco("Excel")),
Box::new(SequencePattern::aco("PowerPoint")),
Box::new(SequencePattern::aco("Word")),
Box::new(SequencePattern::aco("Outlook")),
Box::new(SequencePattern::aco("OneDrive")),
Box::new(SequencePattern::aco("SharePoint")),
Box::new(SequencePattern::aco("Xbox")),
Box::new(SequencePattern::aco("Surface")),
Box::new(SequencePattern::aco("Edge")),
Box::new(SequencePattern::aco("Bing")),
Box::new(SequencePattern::aco("Dynamics")),
Box::new(WordSet::all(&[
"Windows",
"Office",
"Teams",
"Excel",
"PowerPoint",
"Word",
"Outlook",
"OneDrive",
"SharePoint",
"Xbox",
"Surface",
"Edge",
"Bing",
"Dynamics",
])),
Box::new(
SequencePattern::default()
.t_aco("Visual")
Expand All @@ -387,10 +385,10 @@ create_linter_for!(
.t_aco("Apple")
.then_whitespace()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("iPhone")),
Box::new(SequencePattern::aco("iPad")),
Box::new(SequencePattern::aco("iMac")),
Box::new(SequencePattern::aco("MacBook")),
Box::new(WordSet::all(&[
"iPhone", "iPad", "iMac", "MacBook", "Watch", "TV", "Music", "Arcade", "iCloud",
"Safari", "HomeKit", "CarPlay",
])),
Box::new(
SequencePattern::aco("MacBook")
.then_whitespace()
Expand All @@ -414,14 +412,6 @@ create_linter_for!(
.then_whitespace()
.t_aco("Max")
),
Box::new(SequencePattern::aco("Watch")),
Box::new(SequencePattern::aco("TV")),
Box::new(SequencePattern::aco("Music")),
Box::new(SequencePattern::aco("Arcade")),
Box::new(SequencePattern::aco("iCloud")),
Box::new(SequencePattern::aco("Safari")),
Box::new(SequencePattern::aco("HomeKit")),
Box::new(SequencePattern::aco("CarPlay")),
Box::new(
SequencePattern::default()
.t_aco("Vision")
Expand All @@ -437,11 +427,9 @@ create_linter_for!(
SequencePattern::aco("Meta")
.then_whitespace()
.then(Box::new(EitherPattern::new(vec![
Box::new(SequencePattern::aco("Oculus")),
Box::new(SequencePattern::aco("Portals")),
Box::new(SequencePattern::aco("Quest")),
Box::new(SequencePattern::aco("Gaming")),
Box::new(SequencePattern::aco("Horizon")),
Box::new(WordSet::all(&[
"Oculus", "Portals", "Quest", "Gaming", "Horizon",
])),
Box::new(
SequencePattern::default()
.t_aco("Reality")
Expand Down
2 changes: 2 additions & 0 deletions harper-core/src/patterns/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ mod sequence_pattern;
mod token_kind_pattern_group;
mod whitespace_pattern;
mod word_pattern_group;
mod word_set;

pub use any_pattern::AnyPattern;
use blanket::blanket;
Expand All @@ -28,6 +29,7 @@ pub use sequence_pattern::SequencePattern;
pub use token_kind_pattern_group::TokenKindPatternGroup;
pub use whitespace_pattern::WhitespacePattern;
pub use word_pattern_group::WordPatternGroup;
pub use word_set::WordSet;

#[cfg(not(feature = "concurrent"))]
#[blanket(derive(Rc, Arc))]
Expand Down
93 changes: 93 additions & 0 deletions harper-core/src/patterns/word_set.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
use super::Pattern;
use smallvec::SmallVec;

use crate::{CharString, Token};

/// A [`Pattern`] that matches a single word token against any of a set of
/// provided words.
///
/// The words are held in a [`SmallVec`] with inline room for four entries, so
/// small sets avoid a heap allocation for the list itself.
/// NOTE(review): whether the words themselves also stay inline depends on how
/// `CharString` is defined, which is not visible from this file — confirm.
///
/// Matching ignores ASCII case, so any (ASCII) capitalization of the contained
/// words will result in a match.
#[derive(Debug, Default, Clone)]
pub struct WordSet {
/// The candidate words, stored as character sequences in insertion order.
words: SmallVec<[CharString; 4]>,
}

impl WordSet {
pub fn add(&mut self, word: &str) {
let chars = word.chars().collect();

if !self.words.contains(&chars) {
self.words.push(chars);
}
}

pub fn all(words: &[&'static str]) -> Self {
let mut set = Self::default();

for str in words {
set.add(str);
}

set
}
}

impl Pattern for WordSet {
    /// Reports how many leading tokens this pattern consumes.
    ///
    /// Returns `1` when the first token is a word whose text equals one of the
    /// stored words, ignoring ASCII case. Returns `0` when there are no
    /// tokens, the first token is not a word, or no stored word matches.
    fn matches(&self, tokens: &[Token], source: &[char]) -> usize {
        // Only a leading word token can match; anything else bails out early.
        let candidate = match tokens.first() {
            Some(first) if first.kind.is_word() => first.span.get_content(source),
            _ => return 0,
        };

        let found = self.words.iter().any(|word| {
            // `zip` stops at the shorter side, so the lengths must agree
            // before the character-wise comparison is meaningful.
            word.len() == candidate.len()
                && candidate
                    .iter()
                    .zip(word.iter())
                    .all(|(have, want)| have.eq_ignore_ascii_case(want))
        });

        usize::from(found)
    }
}

#[cfg(test)]
mod tests {
    use crate::{patterns::DocPattern, Document, Span};

    use super::WordSet;

    /// Runs the fruit word set over `text` and checks that exactly the
    /// "banana" and "apple" tokens are reported.
    ///
    /// The expected spans appear to be token indices rather than character
    /// offsets: in both sentences "banana" is the token at index 6 and
    /// "apple" the token at index 12 when whitespace counts as a token.
    fn assert_finds_banana_and_apple(text: &str) {
        let fruit = WordSet::all(&["banana", "apple", "orange"]);
        let doc = Document::new_markdown_curated(text);

        assert_eq!(
            fruit.find_all_matches_in_doc(&doc),
            vec![Span::new(6, 7), Span::new(12, 13)]
        );
    }

    /// Lowercase input matches the lowercase words in the set.
    #[test]
    fn fruit() {
        assert_finds_banana_and_apple("I ate a banana and an apple today.");
    }

    /// Arbitrary capitalization of the input still matches.
    #[test]
    fn fruit_whack_capitalization() {
        assert_finds_banana_and_apple("I Ate A bAnaNa And aN apPlE today.");
    }
}

0 comments on commit 3b43861

Please sign in to comment.