diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 2396e1ce..4fbe3d08 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -502,6 +502,7 @@ impl TokenStringExt for Document { create_fns_on_doc!(sentence_terminator); create_fns_on_doc!(chunk_terminator); create_fns_on_doc!(punctuation); + create_fns_on_doc!(likely_homograph); fn first_sentence_word(&self) -> Option { self.tokens.first_sentence_word() diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs index 4271442a..be9d0bb2 100644 --- a/harper-core/src/linting/repeated_words.rs +++ b/harper-core/src/linting/repeated_words.rs @@ -1,66 +1,42 @@ -use super::{Lint, LintKind, PatternLinter, Suggestion}; -use crate::patterns::{Pattern, SequencePattern, WordPatternGroup}; -use crate::token::{Token, TokenStringExt}; - -pub struct RepeatedWords { - pattern: Box, -} - -impl RepeatedWords { - pub fn new() -> Self { - Self::default() - } -} - -impl Default for RepeatedWords { - fn default() -> Self { - let words = [ - "the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not", - "on", "with", "he", "as", "you", "do", "at", "this", "is", "but", "his", "by", "from", - "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would", - "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which", - "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take", - "people", "into", "year", "your", "good", "some", "could", "them", "see", "other", - "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back", - "after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new", - "want", "because", "any", "these", "give", "day", "most", "us", "are", - ]; - - let mut pattern = WordPatternGroup::default(); - - for word in words { - pattern.add( - word, - Box::new( - SequencePattern::default() - .then_exact_word(word) - .then_whitespace() - .then_exact_word(word), - ), - ); +use super::{Lint, LintKind, Linter, Suggestion}; +use crate::token::TokenStringExt; +use crate::{CharStringExt, Document, Span}; + +#[derive(Debug, Clone, Default)] +pub struct RepeatedWords; + +impl Linter for RepeatedWords { + fn lint(&mut self, document: &Document) -> Vec { + let mut lints = Vec::new(); + + for chunk in document.iter_chunks() { + let mut iter = chunk.iter_word_indices().zip(chunk.iter_words()).peekable(); + + while let (Some((idx_a, tok_a)), Some((idx_b, tok_b))) = (iter.next(), iter.peek()) { + let word_a = document.get_span_content(tok_a.span); + let word_b = document.get_span_content(tok_b.span); + + if !tok_a.kind.is_likely_homograph() && word_a.to_lower() == word_b.to_lower() { + let intervening_tokens = &chunk[idx_a + 1..*idx_b]; + + if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) { + continue; + } + + lints.push(Lint { + span: Span::new(tok_a.span.start, tok_b.span.end), + lint_kind: LintKind::Repetition, + suggestions: vec![Suggestion::ReplaceWith( + document.get_span_content(tok_a.span).to_vec(), + )], + message: "Did you mean to repeat this word?".to_string(), + ..Default::default() + }) + } + } } - Self { - pattern: Box::new(pattern), - } - } -} - -impl PatternLinter for RepeatedWords { - fn pattern(&self) -> &dyn Pattern { - self.pattern.as_ref() - } - - fn match_to_lint(&self, matched_tokens: &[Token], source: &[char]) -> Lint { - Lint { - span: matched_tokens.span().unwrap(), - lint_kind: LintKind::Repetition, - suggestions: vec![Suggestion::ReplaceWith( - matched_tokens[0].span.get_content(source).to_vec(), - )], - message: "Did you mean to repeat this word?".to_string(), - ..Default::default() - } + lints } } @@ -71,6 +47,16 @@ mod tests { #[test] fn catches_basic() { - assert_lint_count("I wanted the the banana.", RepeatedWords::new(), 1) + assert_lint_count("I wanted the the banana.", RepeatedWords, 1) + } + + #[test] + fn does_not_lint_homographs_address() { + assert_lint_count("To address address problems.", RepeatedWords, 0); + } + + #[test] + fn does_not_lint_homographs_record() { + assert_lint_count("To record record profits.", RepeatedWords, 0); } } diff --git a/harper-core/src/spell/full_dictionary.rs b/harper-core/src/spell/full_dictionary.rs index a11cd516..21e7476e 100644 --- a/harper-core/src/spell/full_dictionary.rs +++ b/harper-core/src/spell/full_dictionary.rs @@ -300,8 +300,8 @@ mod tests { #[test] fn herself_is_pronoun() { let dict = FullDictionary::curated(); - assert!(dict.get_word_metadata_str("herself").is_pronoun()); - assert!(dict.get_word_metadata_str("Herself").is_pronoun()); + assert!(dict.get_word_metadata_str("herself").is_pronoun_noun()); + assert!(dict.get_word_metadata_str("Herself").is_pronoun_noun()); } #[test] diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs index ce1e70fb..cade6298 100644 --- a/harper-core/src/token.rs +++ b/harper-core/src/token.rs @@ -323,6 +323,14 @@ impl TokenKind { metadata.is_noun() } + pub fn is_likely_homograph(&self) -> bool { + let TokenKind::Word(metadata) = self else { + return false; + }; + + metadata.is_likely_homograph() + } + pub fn is_comma(&self) -> bool { matches!(self, TokenKind::Punctuation(Punctuation::Comma)) } @@ -397,6 +405,7 @@ pub trait TokenStringExt { create_decl_for!(sentence_terminator); create_decl_for!(chunk_terminator); create_decl_for!(punctuation); + create_decl_for!(likely_homograph); fn iter_linking_verb_indices(&self) -> impl Iterator + '_; fn iter_linking_verbs(&self) -> impl Iterator + '_; @@ -429,6 +438,7 @@ impl TokenStringExt for [Token] { create_fns_for!(unlintable); create_fns_for!(sentence_terminator); create_fns_for!(chunk_terminator); + create_fns_for!(likely_homograph); fn first_non_whitespace(&self) -> Option { self.iter().find(|t| !t.kind.is_whitespace()).copied() diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs index ad34cafd..314f855d 100644 --- a/harper-core/src/word_metadata.rs +++ b/harper-core/src/word_metadata.rs @@ -1,4 +1,5 @@ use is_macro::Is; +use paste::paste; use serde::{Deserialize, Serialize}; #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)] @@ -14,10 +15,59 @@ pub struct WordMetadata { pub common: bool, } +/// Needed for `serde` fn default_common() -> bool { false } +macro_rules! generate_metadata_queries { + ($($category:ident has $($sub:ident),*).*) => { + paste! { + pub fn is_likely_homograph(&self) -> bool { + if [$($(self.[< is_ $sub _ $category >](),)*)*].iter().map(|b| *b as u8).sum::() > 1 { + return true; + } + + [$( + self.[< is_ $category >](), + )*].iter().map(|b| *b as u8).sum::() > 1 + } + + $( + #[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")] + pub fn [< is_ $category >](&self) -> bool { + self.$category.is_some() + } + + $( + #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as (a) ", stringify!($sub), ".")] + pub fn [< is_ $sub _ $category >](&self) -> bool { + matches!( + self.$category, + Some([< $category:camel Data >]{ + [< is_ $sub >]: Some(true), + .. + }) + ) + } + + + #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as __not__ (a) ", stringify!($sub), ".")] + pub fn [< is_not_ $sub _ $category >](&self) -> bool { + matches!( + self.$category, + Some([< $category:camel Data >]{ + [< is_ $sub >]: Some(false), + .. + }) + ) + } + )* + )* + } + }; +} + impl WordMetadata { /// Produce a copy of `self` with the known properties of `other` set. pub fn or(&self, other: &Self) -> Self { @@ -43,75 +93,13 @@ impl WordMetadata { } } - pub fn is_noun(&self) -> bool { - self.noun.is_some() - } - - pub fn is_conjunction(&self) -> bool { - self.conjunction.is_some() - } - - pub fn is_verb(&self) -> bool { - self.verb.is_some() - } - - pub fn is_adjective(&self) -> bool { - self.adjective.is_some() - } - - pub fn is_adverb(&self) -> bool { - self.adverb.is_some() - } - - pub fn is_possessive_noun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_possessive: Some(true), - .. - }) - ) - } - - pub fn is_plural_noun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_plural: Some(true), - .. - }) - ) - } - - pub fn is_proper_noun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_proper: Some(true), - .. - }) - ) - } - - pub fn is_pronoun(&self) -> bool { - matches!( - self.noun, - Some(NounData { - is_pronoun: Some(true), - .. - }) - ) - } - - pub fn is_linking_verb(&self) -> bool { - matches!( - self.verb, - Some(VerbData { - is_linking: Some(true), - .. - }) - ) - } + generate_metadata_queries!( + noun has proper, plural, possessive, pronoun. + verb has linking. + conjunction has. + adjective has. + adverb has + ); /// Checks whether a word is _definitely_ a swear. pub fn is_swear(&self) -> bool {