From f9d96c3e4479c64e964167637194825faa4b53c3 Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Fri, 26 Jan 2024 23:52:10 -0700 Subject: [PATCH] Added matcher lint --- demo.md | 4 +- harper-core/src/document.rs | 5 + harper-core/src/linting/lint_set.rs | 6 +- harper-core/src/linting/matcher.rs | 206 ++++++++++++++++++++++ harper-core/src/linting/mod.rs | 1 + harper-core/src/linting/repeated_words.rs | 51 +++--- harper-core/src/spell/mod.rs | 3 +- harper-core/src/token.rs | 8 +- 8 files changed, 250 insertions(+), 34 deletions(-) create mode 100644 harper-core/src/linting/matcher.rs diff --git a/demo.md b/demo.md index 0367cc26..640a4287 100644 --- a/demo.md +++ b/demo.md @@ -1,5 +1,7 @@ Harper is a language checker for artists. it can detect -improper capitalization and mispelled words. There are some cases, +improper capitalization and misspelled words. There are some cases, where the the standard grammar checkers don't cut it. That's where Harper comes in handy. + +kid regards, Elijah diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 3716c92a..08be7e68 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -156,6 +156,11 @@ impl Document { ) } + /// Defensively attempt to grab a specific token. + pub fn get_token(&self, index: usize) -> Option { + self.tokens.get(index).copied() + } + pub fn tokens(&self) -> impl Iterator + '_ { self.tokens.iter().copied() } diff --git a/harper-core/src/linting/lint_set.rs b/harper-core/src/linting/lint_set.rs index 0ace5dd9..0feb02cd 100644 --- a/harper-core/src/linting/lint_set.rs +++ b/harper-core/src/linting/lint_set.rs @@ -4,7 +4,7 @@ use super::{spaces::Spaces, Linter}; use paste::paste; use super::{ - long_sentences::LongSentences, repeated_words::RepeatedWords, + long_sentences::LongSentences, matcher::Matcher, repeated_words::RepeatedWords, sentence_capitalization::SentenceCapitalization, spell_check::SpellCheck, unclosed_quotes::UnclosedQuotes, wrong_quotes::WrongQuotes, }; @@ -40,6 +40,7 @@ impl LintSet { .add_unclosed_quotes() .add_sentence_capitalization() .add_spell_check(dictionary) + .add_matcher() .add_spaces(); self } @@ -99,5 +100,6 @@ create_simple_builder_methods!( WrongQuotes, LongSentences, RepeatedWords, - Spaces + Spaces, + Matcher ); diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs new file mode 100644 index 00000000..a98482a9 --- /dev/null +++ b/harper-core/src/linting/matcher.rs @@ -0,0 +1,206 @@ +use crate::{ + spell::DictWord, Document, Lint, LintKind, Linter, Punctuation, Span, Suggestion, Token, + TokenKind, +}; + +#[derive(Debug, PartialEq, PartialOrd, Clone)] +struct PatternToken { + kind: TokenKind, + content: Option, +} + +impl PatternToken { + fn from_token(token: Token, document: &Document) -> Self { + if token.kind.is_word() { + Self { + kind: token.kind, + content: Some(document.get_span_content(token.span).into()), + } + } else { + Self { + kind: token.kind, + content: None, + } + } + } +} + +macro_rules! vecword { + ($lit:literal) => { + $lit.chars().collect() + }; +} + +macro_rules! pt { + ($str:literal) => { + PatternToken { + kind: TokenKind::Word, + content: Some($str.chars().collect()), + } + }; + (Hyphen) => { + PatternToken { + kind: TokenKind::Punctuation(Punctuation::Hyphen), + content: None, + } + }; + (Space) => { + PatternToken { + kind: TokenKind::Space(1), + content: None, + } + }; + ( $($($str:literal),* => $repl:literal),*) => { + vec![ + $( + { + let mut rule = Rule { + pattern: vec![$( + pt!($str), + pt!(Space), + )*], + replace_with: $repl.chars().collect() + }; + + if rule.pattern.len() > 0{ + rule.pattern.pop(); + } + + rule + }, + )* + ] + }; +} + +struct Rule { + pattern: Vec, + replace_with: Vec, +} + +/// A linter that uses a variety of curated pattern matches to find and fix common +/// grammatical issues. +pub struct Matcher { + triggers: Vec, +} + +impl Matcher { + pub fn new() -> Self { + let mut triggers = pt! { + "There","fore" => "Therefore", + "south","America" => "South America", + "South","america" => "South America", + "south","america" => "South America", + "North","america" => "North America", + "north","America" => "North America", + "north","america" => "North America", + "fatal","outcome" => "death", + "geiger","counter" => "Geiger counter", + "veterans","day" => "Veterans Day", + "presidents","day" => "Presidents' Day", + "president's","day" => "Presidents' Day", + "valentines","day" => "Valentine's Day", + "world","war","2" => "World War II", + "World","war","ii" => "World War II", + "world","War","ii" => "World War II", + "World","War","Ii" => "World War II", + "World","War","iI" => "World War II", + "black","sea" => "Black Sea", + "I","a","m" => "I am", + "We","a","re" => "We are", + "The","re" => "There", + "my","french" => "my French", + "It","cam" => "It can", + "can","be","seem" => "can be seen", + "mu","house" => "my house", + "kid","regards" => "kind regards", + "miss","understand" => "misunderstand", + "miss","use" => "misuse", + "miss","used" => "misused", + "bee","there" => "been there", + "want","be" => "won't be", + "more","then" => "more than", + "gong","to" => "going to", + "then","others" => "than others", + "then","before" => "than before", + "then","last","week" => "than last week", + "then","her" => "than her", + "then","hers" => "than hers", + "then","him" => "than him", + "then","his" => "than his" + }; + + triggers.push(Rule { + pattern: vec![pt!("break"), pt!(Hyphen), pt!("up")], + replace_with: vecword!("break-up"), + }); + + Self { triggers } + } +} + +impl Default for Matcher { + fn default() -> Self { + Self::new() + } +} + +impl Linter for Matcher { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + for (index, _) in document.tokens().enumerate() { + for trigger in &self.triggers { + let mut match_tokens = Vec::new(); + + for (p_index, pattern) in trigger.pattern.iter().enumerate() { + let Some(token) = document.get_token(index + p_index) else { + break; + }; + + let t_pattern = PatternToken::from_token(token, document); + + if t_pattern != *pattern { + break; + } + + match_tokens.push(token); + } + + if match_tokens.len() == trigger.pattern.len() && !match_tokens.is_empty() { + let span = Span::new( + match_tokens.first().unwrap().span.start, + match_tokens.last().unwrap().span.end, + ); + + lints.push(Lint { + span, + lint_kind: LintKind::Miscellaneous, + suggestions: vec![Suggestion::ReplaceWith(trigger.replace_with.to_owned())], + message: format!( + "Did you mean “{}”?", + trigger.replace_with.iter().collect::() + ), + }) + } + } + } + + lints + } +} + +#[cfg(test)] +mod tests { + use crate::{Document, Linter}; + + use super::Matcher; + + #[test] + fn matches_therefore() { + let document = Document::new_plain_english("There fore."); + let mut matcher = Matcher::new(); + let lints = matcher.lint(&document); + assert!(lints.len() == 1) + } +} diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 13bfb898..5234f214 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -1,6 +1,7 @@ mod lint; mod lint_set; mod long_sentences; +mod matcher; mod repeated_words; mod sentence_capitalization; mod spaces; diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs index e86ff97a..6f833816 100644 --- a/harper-core/src/linting/repeated_words.rs +++ b/harper-core/src/linting/repeated_words.rs @@ -1,6 +1,8 @@ use hashbrown::HashSet; +use smallvec::smallvec; use crate::{ + spell::DictWord, token::{Token, TokenKind, TokenStringExt}, Document, Span, Suggestion, }; @@ -10,38 +12,35 @@ use super::{Lint, LintKind, Linter}; #[derive(Debug, Clone)] pub struct RepeatedWords { /// The set of words that can be considered for repetition checking. - set: HashSet>, + set: HashSet, } impl RepeatedWords { pub fn new() -> Self { let mut set = HashSet::new(); - set.insert(vec!['t', 'h', 'e']); - set.insert(vec!['T', 'h', 'e']); - set.insert(vec!['a']); - set.insert(vec!['A']); - set.insert(vec!['a', 'n']); - set.insert(vec!['A', 'n']); - set.insert(vec!['i', 's']); - set.insert(vec!['I', 's']); - set.insert(vec!['w', 'i', 'l', 'l']); - set.insert(vec!['W', 'i', 'l', 'l']); - set.insert(vec!['l', 'i', 'k', 'e']); - set.insert(vec!['L', 'i', 'k', 'e']); - set.insert(vec!['t', 'h', 'a', 't']); - set.insert(vec!['T', 'h', 'a', 't']); - set.insert(vec!['w', 'h', 'a', 't']); - set.insert(vec!['W', 'h', 'a', 't']); - set.insert(vec!['w', 'h', 'i', 'c', 'h']); - set.insert(vec!['W', 'h', 'i', 'c', 'h']); - set.insert(vec!['b', 'e']); - set.insert(vec!['B', 'e']); - set.insert(vec!['a', 'n', 'd']); - set.insert(vec!['A', 'n', 'd']); - set.insert(vec!['I']); - set.insert(vec!['a', 't']); - set.insert(vec!['A', 't']); + macro_rules! add_set { + ($lit:literal) => { + set.insert($lit.chars().collect()); + }; + ($($lit:literal),*) => { + $( + add_set!($lit); + )* + } + } + + add_set!( + "the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not", + "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from", + "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would", + "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which", + "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take", + "people", "into", "year", "your", "good", "some", "could", "them", "see", "other", + "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back", + "after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new", + "want", "because", "any", "these", "give", "day", "most", "us" + ); Self { set } } diff --git a/harper-core/src/spell/mod.rs b/harper-core/src/spell/mod.rs index 4502b277..cf21cec9 100644 --- a/harper-core/src/spell/mod.rs +++ b/harper-core/src/spell/mod.rs @@ -6,7 +6,8 @@ pub use self::dictionary::Dictionary; mod dictionary; mod hunspell; -type DictWord = SmallVec<[char; 6]>; +/// A word from a dictionary or other similar structure. +pub type DictWord = SmallVec<[char; 6]>; /// Suggest a correct spelling for a given misspelled word. /// [`misspelled_word`] is assumed to be quite small (n < 100). diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs index 35e9893c..7db10c28 100644 --- a/harper-core/src/token.rs +++ b/harper-core/src/token.rs @@ -23,13 +23,13 @@ impl Token { } /// A [`Token`] that holds its content as a fat [`Vec`] rather than as a [`Span`]. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)] pub struct FatToken { pub content: Vec, pub kind: TokenKind, } -#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default)] +#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default, PartialOrd)] #[serde(tag = "kind", content = "value")] pub enum TokenKind { #[default] @@ -59,7 +59,7 @@ impl TokenKind { } } -#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd)] #[serde(tag = "kind")] pub enum Punctuation { /// . @@ -104,7 +104,7 @@ pub enum Punctuation { Equal, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd)] pub struct Quote { /// The location of the matching quote, if it exists. pub twin_loc: Option,