Added matcher lint

Automattic · Jan 27, 2024 · f9d96c3 · f9d96c3
1 parent 2d03f8e
commit f9d96c3
Show file tree

Hide file tree

Showing 8 changed files with 250 additions and 34 deletions.
diff --git a/demo.md b/demo.md
@@ -1,5 +1,7 @@
 Harper is a language checker for artists. it can detect
-improper capitalization and mispelled words. There are some   cases, 
+improper capitalization and misspelled words. There are some cases, 
 where the the standard grammar checkers don't cut it.
 
 That's where Harper comes in handy.
+
+kid regards, Elijah
diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs
@@ -156,6 +156,11 @@ impl Document {
         )
     }
 
+    /// Look up the token at `index` without panicking.
+    ///
+    /// Yields a copy of the token, or `None` when the underlying token
+    /// collection has nothing at `index`.
+    pub fn get_token(&self, index: usize) -> Option<Token> {
+        let token = self.tokens.get(index)?;
+        Some(*token)
+    }
+
     pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
         self.tokens.iter().copied()
     }

diff --git a/harper-core/src/linting/lint_set.rs b/harper-core/src/linting/lint_set.rs
@@ -4,7 +4,7 @@ use super::{spaces::Spaces, Linter};
 use paste::paste;
 
 use super::{
-    long_sentences::LongSentences, repeated_words::RepeatedWords,
+    long_sentences::LongSentences, matcher::Matcher, repeated_words::RepeatedWords,
     sentence_capitalization::SentenceCapitalization, spell_check::SpellCheck,
     unclosed_quotes::UnclosedQuotes, wrong_quotes::WrongQuotes,
 };
@@ -40,6 +40,7 @@ impl LintSet {
             .add_unclosed_quotes()
             .add_sentence_capitalization()
             .add_spell_check(dictionary)
+            .add_matcher()
             .add_spaces();
         self
     }
@@ -99,5 +100,6 @@ create_simple_builder_methods!(
     WrongQuotes,
     LongSentences,
     RepeatedWords,
-    Spaces
+    Spaces,
+    Matcher
 );
diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs
@@ -0,0 +1,206 @@
+use crate::{
+    spell::DictWord, Document, Lint, LintKind, Linter, Punctuation, Span, Suggestion, Token,
+    TokenKind,
+};
+
+/// One element of a rule's pattern: a token kind plus, for words, the exact
+/// text that must be present.
+#[derive(Debug, PartialEq, PartialOrd, Clone)]
+struct PatternToken {
+    /// The kind a document token must have in order to match.
+    kind: TokenKind,
+    /// The word's characters for word tokens; `None` for every other kind.
+    content: Option<DictWord>,
+}
+
+impl PatternToken {
+    /// Convert a document token into its pattern form so it can be compared
+    /// against a rule.
+    ///
+    /// Word tokens carry the text their span covers in `document`; every
+    /// other kind of token carries no content.
+    fn from_token(token: Token, document: &Document) -> Self {
+        let content: Option<DictWord> = token
+            .kind
+            .is_word()
+            .then(|| document.get_span_content(token.span).into());
+
+        Self {
+            kind: token.kind,
+            content,
+        }
+    }
+}
+
+/// Expand a string literal into a collection of its characters.
+///
+/// The target collection type is inferred from where the macro is used.
+macro_rules! vecword {
+    ($word:literal) => {
+        $word.chars().collect()
+    };
+}
+
+/// Shorthand for building match patterns.
+///
+/// * `pt!("word")` yields a word `PatternToken` carrying that exact text.
+/// * `pt!(Hyphen)` and `pt!(Space)` yield a hyphen token and a single-space
+///   token respectively.
+/// * `pt! { "a","b" => "ab", ... }` yields a `Vec<Rule>`; within each rule the
+///   listed words are separated by single-space tokens.
+macro_rules! pt {
+    ($word:literal) => {
+        PatternToken {
+            kind: TokenKind::Word,
+            content: Some($word.chars().collect()),
+        }
+    };
+    (Hyphen) => {
+        PatternToken {
+            kind: TokenKind::Punctuation(Punctuation::Hyphen),
+            content: None,
+        }
+    };
+    (Space) => {
+        PatternToken {
+            kind: TokenKind::Space(1),
+            content: None,
+        }
+    };
+    ( $($($word:literal),* => $replacement:literal),*) => {
+        vec![
+            $(
+                {
+                    let mut pattern = vec![$(
+                        pt!($word),
+                        pt!(Space),
+                    )*];
+
+                    // A space was emitted after every word, including the last
+                    // one; `pop` discards it and does nothing on an empty pattern.
+                    pattern.pop();
+
+                    Rule {
+                        pattern,
+                        replace_with: $replacement.chars().collect(),
+                    }
+                },
+            )*
+        ]
+    };
+}
+
+/// A single find-and-replace rule: a token sequence to look for and the text
+/// suggested in its place.
+struct Rule {
+    /// The token sequence that must appear contiguously in the document.
+    pattern: Vec<PatternToken>,
+    /// The characters offered as the replacement for the matched span.
+    replace_with: Vec<char>,
+}
+
+/// A linter that uses a variety of curated pattern matches to find and fix common
+/// grammatical issues.
+pub struct Matcher {
+    /// Every rule checked against the document, in the order they are tried.
+    triggers: Vec<Rule>,
+}
+
+impl Matcher {
+    /// Create a matcher preloaded with the built-in rule table.
+    ///
+    /// Most rules are written as space-separated words via `pt!`; the
+    /// hyphenated "break-up" rule is appended by hand afterwards because the
+    /// table syntax only inserts spaces between words.
+    pub fn new() -> Self {
+        let mut triggers = pt! {
+            "There","fore" => "Therefore",
+            "south","America" => "South America",
+            "South","america" => "South America",
+            "south","america" => "South America",
+            "North","america" => "North America",
+            "north","America" => "North America",
+            "north","america" => "North America",
+            "fatal","outcome" => "death",
+            "geiger","counter" => "Geiger counter",
+            "veterans","day" => "Veterans Day",
+            "presidents","day" => "Presidents' Day",
+            "president's","day" => "Presidents' Day",
+            "valentines","day" => "Valentine's Day",
+            // NOTE(review): `pt!("2")` builds a `TokenKind::Word` pattern; if the
+            // lexer does not classify "2" as a word, this rule can never fire.
+            // Confirm against the tokenizer.
+            "world","war","2" => "World War II",
+            "World","war","ii" => "World War II",
+            "world","War","ii" => "World War II",
+            "World","War","Ii" => "World War II",
+            "World","War","iI" => "World War II",
+            "black","sea" => "Black Sea",
+            "I","a","m" => "I am",
+            "We","a","re" => "We are",
+            "The","re" => "There",
+            "my","french" => "my French",
+            "It","cam" => "It can",
+            "can","be","seem" => "can be seen",
+            "mu","house" => "my house",
+            "kid","regards" => "kind regards",
+            "miss","understand" => "misunderstand",
+            "miss","use" => "misuse",
+            "miss","used" => "misused",
+            "bee","there" => "been there",
+            "want","be" => "won't be",
+            "more","then" => "more than",
+            "gong","to" => "going to",
+            "then","others" => "than others",
+            "then","before" => "than before",
+            "then","last","week" => "than last week",
+            "then","her" => "than her",
+            "then","hers" => "than hers",
+            "then","him" => "than him",
+            "then","his" => "than his"
+        };
+
+        // NOTE(review): assuming `Punctuation::Hyphen` is the "-" character, this
+        // pattern matches the text "break-up" and then suggests "break-up", i.e.
+        // a replacement identical to the input. Confirm the intended replacement.
+        triggers.push(Rule {
+            pattern: vec![pt!("break"), pt!(Hyphen), pt!("up")],
+            replace_with: vecword!("break-up"),
+        });
+
+        Self { triggers }
+    }
+}
+
+impl Default for Matcher {
+    /// The default matcher is the one built by [`Matcher::new`].
+    fn default() -> Self {
+        Matcher::new()
+    }
+}
+
+impl Linter for Matcher {
+    /// Report every place in `document` where one of the matcher's rules
+    /// applies.
+    ///
+    /// Each token position is tried as the start of each rule, in rule order.
+    /// A rule applies when the tokens starting there equal its pattern
+    /// one-for-one; the resulting lint spans from the start of the first
+    /// matched token to the end of the last and suggests the rule's
+    /// replacement text.
+    fn lint(&mut self, document: &Document) -> Vec<Lint> {
+        let mut lints = Vec::new();
+
+        for (index, first) in document.tokens().enumerate() {
+            for trigger in &self.triggers {
+                // A rule without a pattern has nothing to match.
+                if trigger.pattern.is_empty() {
+                    continue;
+                }
+
+                // Only the two ends of the match are needed to build the span,
+                // so track the most recent token instead of collecting them all.
+                let mut last = first;
+
+                let matched = trigger
+                    .pattern
+                    .iter()
+                    .enumerate()
+                    .all(|(offset, pattern)| {
+                        // Running off the end of the document is a failed match.
+                        let Some(token) = document.get_token(index + offset) else {
+                            return false;
+                        };
+
+                        last = token;
+                        PatternToken::from_token(token, document) == *pattern
+                    });
+
+                if !matched {
+                    continue;
+                }
+
+                lints.push(Lint {
+                    span: Span::new(first.span.start, last.span.end),
+                    lint_kind: LintKind::Miscellaneous,
+                    suggestions: vec![Suggestion::ReplaceWith(trigger.replace_with.clone())],
+                    message: format!(
+                        "Did you mean “{}”?",
+                        trigger.replace_with.iter().collect::<String>()
+                    ),
+                });
+            }
+        }
+
+        lints
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{Document, Linter};
+
+    use super::Matcher;
+
+    /// Lint `text` with a fresh matcher and report how many lints it produced.
+    fn lint_count(text: &str) -> usize {
+        let document = Document::new_plain_english(text);
+        Matcher::new().lint(&document).len()
+    }
+
+    #[test]
+    fn matches_therefore() {
+        // `assert_eq!` prints the actual count when the expectation fails.
+        assert_eq!(lint_count("There fore."), 1);
+    }
+
+    #[test]
+    fn ignores_correct_therefore() {
+        // No rule has a single-word pattern, so the joined word must pass.
+        assert_eq!(lint_count("Therefore."), 0);
+    }
+}
diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs
@@ -1,6 +1,7 @@
 mod lint;
 mod lint_set;
 mod long_sentences;
+mod matcher;
 mod repeated_words;
 mod sentence_capitalization;
 mod spaces;

diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs
@@ -1,6 +1,8 @@
 use hashbrown::HashSet;
+use smallvec::smallvec;
 
 use crate::{
+    spell::DictWord,
     token::{Token, TokenKind, TokenStringExt},
     Document, Span, Suggestion,
 };
@@ -10,38 +12,35 @@ use super::{Lint, LintKind, Linter};
 #[derive(Debug, Clone)]
 pub struct RepeatedWords {
     /// The set of words that can be considered for repetition checking.
-    set: HashSet<Vec<char>>,
+    set: HashSet<DictWord>,
 }
 
 impl RepeatedWords {
     pub fn new() -> Self {
         let mut set = HashSet::new();
 
-        set.insert(vec!['t', 'h', 'e']);
-        set.insert(vec!['T', 'h', 'e']);
-        set.insert(vec!['a']);
-        set.insert(vec!['A']);
-        set.insert(vec!['a', 'n']);
-        set.insert(vec!['A', 'n']);
-        set.insert(vec!['i', 's']);
-        set.insert(vec!['I', 's']);
-        set.insert(vec!['w', 'i', 'l', 'l']);
-        set.insert(vec!['W', 'i', 'l', 'l']);
-        set.insert(vec!['l', 'i', 'k', 'e']);
-        set.insert(vec!['L', 'i', 'k', 'e']);
-        set.insert(vec!['t', 'h', 'a', 't']);
-        set.insert(vec!['T', 'h', 'a', 't']);
-        set.insert(vec!['w', 'h', 'a', 't']);
-        set.insert(vec!['W', 'h', 'a', 't']);
-        set.insert(vec!['w', 'h', 'i', 'c', 'h']);
-        set.insert(vec!['W', 'h', 'i', 'c', 'h']);
-        set.insert(vec!['b', 'e']);
-        set.insert(vec!['B', 'e']);
-        set.insert(vec!['a', 'n', 'd']);
-        set.insert(vec!['A', 'n', 'd']);
-        set.insert(vec!['I']);
-        set.insert(vec!['a', 't']);
-        set.insert(vec!['A', 't']);
+        // Insert each string literal into `set` as a collection of its characters.
+        macro_rules! add_set {
+            ($($word:literal),*) => {
+                $(
+                    set.insert($word.chars().collect());
+                )*
+            };
+        }
+
+        // Words that are flagged when they appear twice in a row.
+        // NOTE(review): apart from "I", every entry is lowercase. If the lookup
+        // against this set is case-sensitive, a capitalised repeat such as
+        // "The The" would not be found. Confirm how the lint compares words.
+        add_set!(
+            "the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not",
+            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
+            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
+            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
+            "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
+            "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
+            "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back",
+            "after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new",
+            "want", "because", "any", "these", "give", "day", "most", "us"
+        );
 
         Self { set }
     }

diff --git a/harper-core/src/spell/mod.rs b/harper-core/src/spell/mod.rs
@@ -6,7 +6,8 @@ pub use self::dictionary::Dictionary;
 mod dictionary;
 mod hunspell;
 
-type DictWord = SmallVec<[char; 6]>;
+/// A word from a dictionary or other similar structure.
+pub type DictWord = SmallVec<[char; 6]>;
 
 /// Suggest a correct spelling for a given misspelled word.
 /// [`misspelled_word`] is assumed to be quite small (n < 100).

diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs
@@ -23,13 +23,13 @@ impl Token {
 }
 
 /// A [`Token`] that holds its content as a fat [`Vec<char>`] rather than as a [`Span`].
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
 pub struct FatToken {
     pub content: Vec<char>,
     pub kind: TokenKind,
 }
 
-#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default)]
+#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default, PartialOrd)]
 #[serde(tag = "kind", content = "value")]
 pub enum TokenKind {
     #[default]
@@ -59,7 +59,7 @@ impl TokenKind {
     }
 }
 
-#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd)]
 #[serde(tag = "kind")]
 pub enum Punctuation {
     /// .
@@ -104,7 +104,7 @@ pub enum Punctuation {
     Equal,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd)]
 pub struct Quote {
     /// The location of the matching quote, if it exists.
     pub twin_loc: Option<usize>,