Skip to content

Commit

Permalink
Merge pull request #277 from elijah-potter/all-repeated
Browse files Browse the repository at this point in the history
feat(#253): remove requirement that word be in list for repetition
  • Loading branch information
elijah-potter authored Nov 24, 2024
2 parents c26c129 + f8064ae commit 5a3b88e
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 133 deletions.
1 change: 1 addition & 0 deletions harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ impl TokenStringExt for Document {
create_fns_on_doc!(sentence_terminator);
create_fns_on_doc!(chunk_terminator);
create_fns_on_doc!(punctuation);
create_fns_on_doc!(likely_homograph);

fn first_sentence_word(&self) -> Option<Token> {
self.tokens.first_sentence_word()
Expand Down
110 changes: 48 additions & 62 deletions harper-core/src/linting/repeated_words.rs
Original file line number Diff line number Diff line change
@@ -1,66 +1,42 @@
use super::{Lint, LintKind, PatternLinter, Suggestion};
use crate::patterns::{Pattern, SequencePattern, WordPatternGroup};
use crate::token::{Token, TokenStringExt};

pub struct RepeatedWords {
pattern: Box<dyn Pattern>,
}

impl RepeatedWords {
pub fn new() -> Self {
Self::default()
}
}

impl Default for RepeatedWords {
fn default() -> Self {
let words = [
"the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not",
"on", "with", "he", "as", "you", "do", "at", "this", "is", "but", "his", "by", "from",
"they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
"there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
"go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
"people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
"than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back",
"after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new",
"want", "because", "any", "these", "give", "day", "most", "us", "are",
];

let mut pattern = WordPatternGroup::default();

for word in words {
pattern.add(
word,
Box::new(
SequencePattern::default()
.then_exact_word(word)
.then_whitespace()
.then_exact_word(word),
),
);
use super::{Lint, LintKind, Linter, Suggestion};
use crate::token::TokenStringExt;
use crate::{CharStringExt, Document, Span};

/// Linter that flags a word which is immediately repeated within the same
/// chunk (e.g. "the the"). Words are compared case-insensitively, only
/// whitespace may sit between the two occurrences, and a word whose metadata
/// marks it as a likely homograph is not reported.
#[derive(Debug, Clone, Default)]
pub struct RepeatedWords;

impl Linter for RepeatedWords {
fn lint(&mut self, document: &Document) -> Vec<Lint> {
let mut lints = Vec::new();

for chunk in document.iter_chunks() {
let mut iter = chunk.iter_word_indices().zip(chunk.iter_words()).peekable();

while let (Some((idx_a, tok_a)), Some((idx_b, tok_b))) = (iter.next(), iter.peek()) {
let word_a = document.get_span_content(tok_a.span);
let word_b = document.get_span_content(tok_b.span);

if !tok_a.kind.is_likely_homograph() && word_a.to_lower() == word_b.to_lower() {
let intervening_tokens = &chunk[idx_a + 1..*idx_b];

if intervening_tokens.iter().any(|t| !t.kind.is_whitespace()) {
continue;
}

lints.push(Lint {
span: Span::new(tok_a.span.start, tok_b.span.end),
lint_kind: LintKind::Repetition,
suggestions: vec![Suggestion::ReplaceWith(
document.get_span_content(tok_a.span).to_vec(),
)],
message: "Did you mean to repeat this word?".to_string(),
..Default::default()
})
}
}
}

Self {
pattern: Box::new(pattern),
}
}
}

impl PatternLinter for RepeatedWords {
fn pattern(&self) -> &dyn Pattern {
self.pattern.as_ref()
}

fn match_to_lint(&self, matched_tokens: &[Token], source: &[char]) -> Lint {
Lint {
span: matched_tokens.span().unwrap(),
lint_kind: LintKind::Repetition,
suggestions: vec![Suggestion::ReplaceWith(
matched_tokens[0].span.get_content(source).to_vec(),
)],
message: "Did you mean to repeat this word?".to_string(),
..Default::default()
}
lints
}
}

Expand All @@ -71,6 +47,16 @@ mod tests {

#[test]
fn catches_basic() {
assert_lint_count("I wanted the the banana.", RepeatedWords::new(), 1)
assert_lint_count("I wanted the the banana.", RepeatedWords, 1)
}

// "address" occurs twice in a row here. The linter skips repeated words that
// are flagged as likely homographs, so no lint is expected.
// NOTE(review): relies on the dictionary tagging "address" with more than one
// part of speech — confirm against the curated dictionary.
#[test]
fn does_not_lint_homographs_address() {
assert_lint_count("To address address problems.", RepeatedWords, 0);
}

// "record" occurs twice in a row here. The linter skips repeated words that
// are flagged as likely homographs, so no lint is expected.
// NOTE(review): relies on the dictionary tagging "record" with more than one
// part of speech — confirm against the curated dictionary.
#[test]
fn does_not_lint_homographs_record() {
assert_lint_count("To record record profits.", RepeatedWords, 0);
}
}
4 changes: 2 additions & 2 deletions harper-core/src/spell/full_dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,8 @@ mod tests {
#[test]
fn herself_is_pronoun() {
let dict = FullDictionary::curated();
assert!(dict.get_word_metadata_str("herself").is_pronoun());
assert!(dict.get_word_metadata_str("Herself").is_pronoun());
assert!(dict.get_word_metadata_str("herself").is_pronoun_noun());
assert!(dict.get_word_metadata_str("Herself").is_pronoun_noun());
}

#[test]
Expand Down
10 changes: 10 additions & 0 deletions harper-core/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,14 @@ impl TokenKind {
metadata.is_noun()
}

/// Reports whether this token is a word whose metadata marks it as a likely
/// homograph. Any non-word token yields `false`.
pub fn is_likely_homograph(&self) -> bool {
    match self {
        TokenKind::Word(metadata) => metadata.is_likely_homograph(),
        _ => false,
    }
}

/// Checks whether this token is the comma punctuation mark.
pub fn is_comma(&self) -> bool {
matches!(self, TokenKind::Punctuation(Punctuation::Comma))
}
Expand Down Expand Up @@ -397,6 +405,7 @@ pub trait TokenStringExt {
create_decl_for!(sentence_terminator);
create_decl_for!(chunk_terminator);
create_decl_for!(punctuation);
create_decl_for!(likely_homograph);

fn iter_linking_verb_indices(&self) -> impl Iterator<Item = usize> + '_;
fn iter_linking_verbs(&self) -> impl Iterator<Item = Token> + '_;
Expand Down Expand Up @@ -429,6 +438,7 @@ impl TokenStringExt for [Token] {
create_fns_for!(unlintable);
create_fns_for!(sentence_terminator);
create_fns_for!(chunk_terminator);
create_fns_for!(likely_homograph);

fn first_non_whitespace(&self) -> Option<Token> {
self.iter().find(|t| !t.kind.is_whitespace()).copied()
Expand Down
126 changes: 57 additions & 69 deletions harper-core/src/word_metadata.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use is_macro::Is;
use paste::paste;
use serde::{Deserialize, Serialize};

#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd, Hash)]
Expand All @@ -14,10 +15,59 @@ pub struct WordMetadata {
pub common: bool,
}

/// Needed for `serde`.
///
/// Always returns `false`.
/// NOTE(review): presumably referenced by a `#[serde(default = "...")]`
/// attribute on `WordMetadata::common`; the attribute is not visible here —
/// confirm.
fn default_common() -> bool {
false
}

/// Generates the boolean query methods for a metadata struct.
///
/// Input is a `.`-separated list of entries of the form
/// `category has sub_a, sub_b, ...` (the list of subs may be empty).
/// For every `category` it emits `is_<category>()`, and for every `sub` of
/// that category it emits `is_<sub>_<category>()` and
/// `is_not_<sub>_<category>()`. It also emits a single
/// `is_likely_homograph()` built from all of those queries.
///
/// The expansion expects `self.<category>` to be an `Option` of a struct
/// named `<Category>Data` that has `is_<sub>: Option<bool>` fields.
macro_rules! generate_metadata_queries {
($($category:ident has $($sub:ident),*).*) => {
// `paste!` is used to build the method, field and type identifiers by concatenation.
paste! {
// True when more than one generated query answers `true`.
pub fn is_likely_homograph(&self) -> bool {
// First check: count every `is_<sub>_<category>()` across all categories.
// NOTE(review): this also fires when two sub-flags of the *same* category
// are set (e.g. a noun that is both plural and possessive) — confirm intended.
if [$($(self.[< is_ $sub _ $category >](),)*)*].iter().map(|b| *b as u8).sum::<u8>() > 1 {
return true;
}

// Second check: the word belongs to more than one top-level category.
[$(
self.[< is_ $category >](),
)*].iter().map(|b| *b as u8).sum::<u8>() > 1
}

// One group of methods per category.
$(
#[doc = concat!("Checks if the word is definitely a ", stringify!($category), ".")]
pub fn [< is_ $category >](&self) -> bool {
self.$category.is_some()
}

// One positive and one negative query per sub-property of the category.
$(
#[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as (a) ", stringify!($sub), ".")]
pub fn [< is_ $sub _ $category >](&self) -> bool {
// Only an explicit `Some(true)` counts; `None` (unknown) does not.
matches!(
self.$category,
Some([< $category:camel Data >]{
[< is_ $sub >]: Some(true),
..
})
)
}


#[doc = concat!("Checks if the word is definitely a ", stringify!($category), " and more specifically is labeled as __not__ (a) ", stringify!($sub), ".")]
pub fn [< is_not_ $sub _ $category >](&self) -> bool {
// Only an explicit `Some(false)` counts; `None` (unknown) does not.
matches!(
self.$category,
Some([< $category:camel Data >]{
[< is_ $sub >]: Some(false),
..
})
)
}
)*
)*
}
};
}

impl WordMetadata {
/// Produce a copy of `self` with the known properties of `other` set.
pub fn or(&self, other: &Self) -> Self {
Expand All @@ -43,75 +93,13 @@ impl WordMetadata {
}
}

pub fn is_noun(&self) -> bool {
self.noun.is_some()
}

pub fn is_conjunction(&self) -> bool {
self.conjunction.is_some()
}

pub fn is_verb(&self) -> bool {
self.verb.is_some()
}

pub fn is_adjective(&self) -> bool {
self.adjective.is_some()
}

pub fn is_adverb(&self) -> bool {
self.adverb.is_some()
}

pub fn is_possessive_noun(&self) -> bool {
matches!(
self.noun,
Some(NounData {
is_possessive: Some(true),
..
})
)
}

pub fn is_plural_noun(&self) -> bool {
matches!(
self.noun,
Some(NounData {
is_plural: Some(true),
..
})
)
}

pub fn is_proper_noun(&self) -> bool {
matches!(
self.noun,
Some(NounData {
is_proper: Some(true),
..
})
)
}

pub fn is_pronoun(&self) -> bool {
matches!(
self.noun,
Some(NounData {
is_pronoun: Some(true),
..
})
)
}

pub fn is_linking_verb(&self) -> bool {
matches!(
self.verb,
Some(VerbData {
is_linking: Some(true),
..
})
)
}
// Expands to the category queries `is_noun`, `is_verb`, `is_conjunction`,
// `is_adjective` and `is_adverb`; the sub-property queries
// `is_{proper,plural,possessive,pronoun}_noun` and `is_linking_verb`, each
// with an `is_not_…` counterpart; and `is_likely_homograph`.
generate_metadata_queries!(
noun has proper, plural, possessive, pronoun.
verb has linking.
conjunction has.
adjective has.
adverb has
);

/// Checks whether a word is _definitely_ a swear.
pub fn is_swear(&self) -> bool {
Expand Down

0 comments on commit 5a3b88e

Please sign in to comment.