Merge pull request #277 from elijah-potter/all-repeated

feat(#253): remove requirement that word be in list for repetition
Automattic · Nov 24, 2024
Commit 5a3b88e (2 parents: c26c129 + f8064ae)

Showing 5 changed files with 118 additions and 133 deletions.
diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs
@@ -502,6 +502,7 @@ impl TokenStringExt for Document {
     create_fns_on_doc!(sentence_terminator);
     create_fns_on_doc!(chunk_terminator);
     create_fns_on_doc!(punctuation);
+    create_fns_on_doc!(likely_homograph);
 
     fn first_sentence_word(&self) -> Option<Token> {
         self.tokens.first_sentence_word()

diff --git a/harper-core/src/linting/repeated_words.rs b/harper-core/src/linting/repeated_words.rs
@@ -1,66 +1,42 @@
-use super::{Lint, LintKind, PatternLinter, Suggestion};
-use crate::patterns::{Pattern, SequencePattern, WordPatternGroup};
-use crate::token::{Token, TokenStringExt};
-
-pub struct RepeatedWords {
-    pattern: Box<dyn Pattern>,
-}
-
-impl RepeatedWords {
-    pub fn new() -> Self {
-        Self::default()
-    }
-}
-
-impl Default for RepeatedWords {
-    fn default() -> Self {
-        let words = [
-            "the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not",
-            "on", "with", "he", "as", "you", "do", "at", "this", "is", "but", "his", "by", "from",
-            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
-            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
-            "go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
-            "people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
-            "than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back",
-            "after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new",
-            "want", "because", "any", "these", "give", "day", "most", "us", "are",
-        ];
-
-        let mut pattern = WordPatternGroup::default();
-
-        for word in words {
-            pattern.add(
-                word,
-                Box::new(
-                    SequencePattern::default()
-                        .then_exact_word(word)
-                        .then_whitespace()
-                        .then_exact_word(word),
-                ),
-            );
+use super::{Lint, LintKind, Linter, Suggestion};
+use crate::token::TokenStringExt;
+use crate::{CharStringExt, Document, Span};
+
+#[derive(Debug, Clone, Default)]
+pub struct RepeatedWords;
+
+impl Linter for RepeatedWords {
+    fn lint(&mut self, document: &Document) -> Vec<Lint> {
+        let mut lints = Vec::new();
+
+        for chunk in document.iter_chunks() {
+            let mut iter = chunk.iter_word_indices().zip(chunk.iter_words()).peekable();
+
+            while let (Some((idx_a, tok_a)), Some((idx_b, tok_b))) = (iter.next(), iter.peek()) {
+                let word_a = document.get_span_content(tok_a.span);
+                let word_b = document.get_span_content(tok_b.span);
+
+                if !tok_a.kind.is_likely_homograph() && word_a.to_lower() == word_b.to_lower() {
+                    let intervening_tokens = &chunk[idx_a + 1..*idx_b];
+
+                    if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) {
+                        continue;
+                    }
+
+                    lints.push(Lint {
+                        span: Span::new(tok_a.span.start, tok_b.span.end),
+                        lint_kind: LintKind::Repetition,
+                        suggestions: vec![Suggestion::ReplaceWith(
+                            document.get_span_content(tok_a.span).to_vec(),
+                        )],
+                        message: "Did you mean to repeat this word?".to_string(),
+                        ..Default::default()
+                    })
+                }
+            }
         }
 
-        Self {
-            pattern: Box::new(pattern),
-        }
-    }
-}
-
-impl PatternLinter for RepeatedWords {
-    fn pattern(&self) -> &dyn Pattern {
-        self.pattern.as_ref()
-    }
-
-    fn match_to_lint(&self, matched_tokens: &[Token], source: &[char]) -> Lint {
-        Lint {
-            span: matched_tokens.span().unwrap(),
-            lint_kind: LintKind::Repetition,
-            suggestions: vec![Suggestion::ReplaceWith(
-                matched_tokens[0].span.get_content(source).to_vec(),
-            )],
-            message: "Did you mean to repeat this word?".to_string(),
-            ..Default::default()
-        }
+        lints
     }
 }
 
@@ -71,6 +47,16 @@ mod tests {
 
     #[test]
     fn catches_basic() {
-        assert_lint_count("I wanted the the banana.", RepeatedWords::new(), 1)
+        assert_lint_count("I wanted the the banana.", RepeatedWords, 1)
+    }
+
+    #[test]
+    fn does_not_lint_homographs_address() {
+        assert_lint_count("To address address problems.", RepeatedWords, 0);
+    }
+
+    #[test]
+    fn does_not_lint_homographs_record() {
+        assert_lint_count("To record record profits.", RepeatedWords, 0);
     }
 }
diff --git a/harper-core/src/spell/full_dictionary.rs b/harper-core/src/spell/full_dictionary.rs
@@ -300,8 +300,8 @@ mod tests {
     #[test]
     fn herself_is_pronoun() {
         let dict = FullDictionary::curated();
-        assert!(dict.get_word_metadata_str("herself").is_pronoun());
-        assert!(dict.get_word_metadata_str("Herself").is_pronoun());
+        assert!(dict.get_word_metadata_str("herself").is_pronoun_noun());
+        assert!(dict.get_word_metadata_str("Herself").is_pronoun_noun());
     }
 
     #[test]

diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs
@@ -323,6 +323,14 @@ impl TokenKind {
         metadata.is_noun()
     }
 
+    pub fn is_likely_homograph(&self) -> bool {
+        let TokenKind::Word(metadata) = self else {
+            return false;
+        };
+
+        metadata.is_likely_homograph()
+    }
+
     pub fn is_comma(&self) -> bool {
         matches!(self, TokenKind::Punctuation(Punctuation::Comma))
     }
@@ -397,6 +405,7 @@ pub trait TokenStringExt {
     create_decl_for!(sentence_terminator);
     create_decl_for!(chunk_terminator);
     create_decl_for!(punctuation);
+    create_decl_for!(likely_homograph);
 
     fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
     fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_;
@@ -429,6 +438,7 @@ impl TokenStringExt for [Token] {
     create_fns_for!(unlintable);
     create_fns_for!(sentence_terminator);
     create_fns_for!(chunk_terminator);
+    create_fns_for!(likely_homograph);
 
     fn first_non_whitespace(&self) -> Option<Token> {
         self.iter().find(|t| !t.kind.is_whitespace()).copied()

diff --git a/harper-core/src/word_metadata.rs b/harper-core/src/word_metadata.rs
@@ -1,4 +1,5 @@
 use is_macro::Is;
+use paste::paste;
 use serde::{Deserialize, Serialize};
 
 #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
@@ -14,10 +15,59 @@ pub struct WordMetadata {
     pub common: bool,
 }
 
+/// Needed for `serde`
 fn default_common() -> bool {
     false
 }
 
+macro_rules! generate_metadata_queries {
+    ($($category:ident has $($sub:ident),*).*) => {
+        paste! {
+            pub fn is_likely_homograph(&self) -> bool {
+                if [$($(self.[< is_ $sub _ $category >](),)*)*].iter().map(|b| *b as u8).sum::<u8>() > 1 {
+                    return true;
+                }
+
+                [$(
+                    self.[< is_ $category >](),
+                )*].iter().map(|b| *b as u8).sum::<u8>() > 1
+            }
+
+            $(
+                #[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")]
+                pub fn [< is_ $category >](&self) -> bool {
+                    self.$category.is_some()
+                }
+
+                $(
+                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as (a) ", stringify!($sub), ".")]
+                    pub fn [< is_ $sub _ $category >](&self) -> bool {
+                        matches!(
+                            self.$category,
+                            Some([< $category:camel Data >]{
+                                [< is_ $sub >]: Some(true),
+                                ..
+                            })
+                        )
+                    }
+
+
+                    #[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as __not__ (a) ", stringify!($sub), ".")]
+                    pub fn [< is_not_ $sub _ $category >](&self) -> bool {
+                        matches!(
+                            self.$category,
+                            Some([< $category:camel Data >]{
+                                [< is_ $sub >]: Some(false),
+                                ..
+                            })
+                        )
+                    }
+                )*
+            )*
+        }
+    };
+}
+
 impl WordMetadata {
     /// Produce a copy of `self` with the known properties of `other` set.
     pub fn or(&self, other: &Self) -> Self {
@@ -43,75 +93,13 @@ impl WordMetadata {
         }
     }
 
-    pub fn is_noun(&self) -> bool {
-        self.noun.is_some()
-    }
-
-    pub fn is_conjunction(&self) -> bool {
-        self.conjunction.is_some()
-    }
-
-    pub fn is_verb(&self) -> bool {
-        self.verb.is_some()
-    }
-
-    pub fn is_adjective(&self) -> bool {
-        self.adjective.is_some()
-    }
-
-    pub fn is_adverb(&self) -> bool {
-        self.adverb.is_some()
-    }
-
-    pub fn is_possessive_noun(&self) -> bool {
-        matches!(
-            self.noun,
-            Some(NounData {
-                is_possessive: Some(true),
-                ..
-            })
-        )
-    }
-
-    pub fn is_plural_noun(&self) -> bool {
-        matches!(
-            self.noun,
-            Some(NounData {
-                is_plural: Some(true),
-                ..
-            })
-        )
-    }
-
-    pub fn is_proper_noun(&self) -> bool {
-        matches!(
-            self.noun,
-            Some(NounData {
-                is_proper: Some(true),
-                ..
-            })
-        )
-    }
-
-    pub fn is_pronoun(&self) -> bool {
-        matches!(
-            self.noun,
-            Some(NounData {
-                is_pronoun: Some(true),
-                ..
-            })
-        )
-    }
-
-    pub fn is_linking_verb(&self) -> bool {
-        matches!(
-            self.verb,
-            Some(VerbData {
-                is_linking: Some(true),
-                ..
-            })
-        )
-    }
+    generate_metadata_queries!(
+        noun has proper, plural, possessive, pronoun.
+        verb has linking.
+        conjunction has.
+        adjective has.
+        adverb has
+    );
 
     /// Checks whether a word is _definitely_ a swear.
     pub fn is_swear(&self) -> bool {