Skip to content

Commit

Permalink
Added matcher lint
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-potter committed Jan 27, 2024
1 parent 2d03f8e commit f9d96c3
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 34 deletions.
4 changes: 3 additions & 1 deletion demo.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Harper is a language checker for artists. it can detect
improper capitalization and mispelled words. There are some cases,
improper capitalization and misspelled words. There are some cases,
where the the standard grammar checkers don't cut it.

That's where Harper comes in handy.

kid regards, Elijah
5 changes: 5 additions & 0 deletions harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ impl Document {
)
}

/// Look up the token at position `index`.
///
/// Returns `None` instead of panicking when `index` is past the end of the
/// document's token list.
pub fn get_token(&self, index: usize) -> Option<Token> {
    let token = self.tokens.get(index)?;
    Some(*token)
}

/// Iterate over the document's tokens in order, yielding each one by value
/// (tokens are copied out of the backing list, not borrowed).
pub fn tokens(&self) -> impl Iterator<Item = Token> + '_ {
self.tokens.iter().copied()
}
Expand Down
6 changes: 4 additions & 2 deletions harper-core/src/linting/lint_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use super::{spaces::Spaces, Linter};
use paste::paste;

use super::{
long_sentences::LongSentences, repeated_words::RepeatedWords,
long_sentences::LongSentences, matcher::Matcher, repeated_words::RepeatedWords,
sentence_capitalization::SentenceCapitalization, spell_check::SpellCheck,
unclosed_quotes::UnclosedQuotes, wrong_quotes::WrongQuotes,
};
Expand Down Expand Up @@ -40,6 +40,7 @@ impl LintSet {
.add_unclosed_quotes()
.add_sentence_capitalization()
.add_spell_check(dictionary)
.add_matcher()
.add_spaces();
self
}
Expand Down Expand Up @@ -99,5 +100,6 @@ create_simple_builder_methods!(
WrongQuotes,
LongSentences,
RepeatedWords,
Spaces
Spaces,
Matcher
);
206 changes: 206 additions & 0 deletions harper-core/src/linting/matcher.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
use crate::{
spell::DictWord, Document, Lint, LintKind, Linter, Punctuation, Span, Suggestion, Token,
TokenKind,
};

/// One element of a rule pattern: a token kind plus, for word tokens, the text
/// the word must have.
#[derive(Debug, PartialEq, PartialOrd, Clone)]
struct PatternToken {
    /// The kind of token this element stands for.
    kind: TokenKind,
    /// The word's characters; `None` when built from a non-word token.
    content: Option<DictWord>,
}

impl PatternToken {
    /// Describe `token` as a pattern element, copying its text out of
    /// `document` only when the token is a word.
    fn from_token(token: Token, document: &Document) -> Self {
        let kind = token.kind;
        let content: Option<DictWord> = kind
            .is_word()
            .then(|| document.get_span_content(token.span).into());

        Self { kind, content }
    }
}

/// Expand a string literal into a collection of its `char`s. The concrete
/// collection type is inferred from where the macro is used.
macro_rules! vecword {
    ($word:literal) => {
        ::std::iter::FromIterator::from_iter($word.chars())
    };
}

/// Shorthand for building [`PatternToken`]s and whole [`Rule`] lists.
///
/// * `pt!("word")` — a word token whose text must equal the literal exactly.
/// * `pt!(Hyphen)` — a hyphen punctuation token.
/// * `pt!(Space)` — a `TokenKind::Space(1)` token (presumably a single space;
///   confirm against the lexer).
/// * `pt!("a","b" => "ab", "c","d" => "cd")` — a `Vec<Rule>`. Each rule's
///   pattern is the listed words with a `pt!(Space)` between neighbours, and
///   its replacement is the text after `=>`.
macro_rules! pt {
    ($str:literal) => {
        PatternToken {
            kind: TokenKind::Word,
            content: Some($str.chars().collect()),
        }
    };
    (Hyphen) => {
        PatternToken {
            kind: TokenKind::Punctuation(Punctuation::Hyphen),
            content: None,
        }
    };
    (Space) => {
        PatternToken {
            kind: TokenKind::Space(1),
            content: None,
        }
    };
    ( $($($str:literal),* => $repl:literal),*) => {
        vec![
            $(
                {
                    // Emit every word followed by a separator...
                    let mut pattern = vec![$(
                        pt!($str),
                        pt!(Space),
                    )*];

                    // ...then drop the separator after the last word. `pop` on
                    // an empty `Vec` is a no-op, so no emptiness check is needed.
                    pattern.pop();

                    Rule {
                        pattern,
                        replace_with: $repl.chars().collect(),
                    }
                },
            )*
        ]
    };
}

/// A single curated correction: a token sequence to look for and the text to
/// suggest in its place.
struct Rule {
/// The consecutive tokens that must all match for the rule to fire.
pattern: Vec<PatternToken>,
/// The characters offered as the replacement for the matched span.
replace_with: Vec<char>,
}

/// Flags known multi-token mistakes using a hand-curated table of patterns and
/// offers the corrected text for each one.
pub struct Matcher {
    /// Every rule that is tried at every token position.
    triggers: Vec<Rule>,
}

impl Matcher {
    /// Create a matcher loaded with the built-in rule table.
    pub fn new() -> Self {
        // The only rule that is not a plain space-separated word sequence.
        // NOTE(review): the suggestion spells the same text the pattern
        // matches ("break-up") — confirm the intended replacement.
        let hyphenated = Rule {
            pattern: vec![pt!("break"), pt!(Hyphen), pt!("up")],
            replace_with: vecword!("break-up"),
        };

        // Word sequences (joined by single spaces) and their corrections.
        let mut triggers: Vec<Rule> = pt! {
            "There","fore" => "Therefore",
            "south","America" => "South America",
            "South","america" => "South America",
            "south","america" => "South America",
            "North","america" => "North America",
            "north","America" => "North America",
            "north","america" => "North America",
            "fatal","outcome" => "death",
            "geiger","counter" => "Geiger counter",
            "veterans","day" => "Veterans Day",
            "presidents","day" => "Presidents' Day",
            "president's","day" => "Presidents' Day",
            "valentines","day" => "Valentine's Day",
            "world","war","2" => "World War II",
            "World","war","ii" => "World War II",
            "world","War","ii" => "World War II",
            "World","War","Ii" => "World War II",
            "World","War","iI" => "World War II",
            "black","sea" => "Black Sea",
            "I","a","m" => "I am",
            "We","a","re" => "We are",
            "The","re" => "There",
            "my","french" => "my French",
            "It","cam" => "It can",
            "can","be","seem" => "can be seen",
            "mu","house" => "my house",
            "kid","regards" => "kind regards",
            "miss","understand" => "misunderstand",
            "miss","use" => "misuse",
            "miss","used" => "misused",
            "bee","there" => "been there",
            "want","be" => "won't be",
            "more","then" => "more than",
            "gong","to" => "going to",
            "then","others" => "than others",
            "then","before" => "than before",
            "then","last","week" => "than last week",
            "then","her" => "than her",
            "then","hers" => "than hers",
            "then","him" => "than him",
            "then","his" => "than his"
        };

        // Keep the hyphenated rule last, after the whole table.
        triggers.push(hyphenated);

        Matcher { triggers }
    }
}

impl Default for Matcher {
fn default() -> Self {
Self::new()
}
}

impl Linter for Matcher {
fn lint(&mut self, document: &Document) -> Vec<Lint> {
let mut lints = Vec::new();

for (index, _) in document.tokens().enumerate() {
for trigger in &self.triggers {
let mut match_tokens = Vec::new();

for (p_index, pattern) in trigger.pattern.iter().enumerate() {
let Some(token) = document.get_token(index + p_index) else {
break;
};

let t_pattern = PatternToken::from_token(token, document);

if t_pattern != *pattern {
break;
}

match_tokens.push(token);
}

if match_tokens.len() == trigger.pattern.len() && !match_tokens.is_empty() {
let span = Span::new(
match_tokens.first().unwrap().span.start,
match_tokens.last().unwrap().span.end,
);

lints.push(Lint {
span,
lint_kind: LintKind::Miscellaneous,
suggestions: vec![Suggestion::ReplaceWith(trigger.replace_with.to_owned())],
message: format!(
"Did you mean “{}”?",
trigger.replace_with.iter().collect::<String>()
),
})
}
}
}

lints
}
}

#[cfg(test)]
mod tests {
    use crate::{Document, Linter};

    use super::Matcher;

    /// A split "There fore" must be reported exactly once.
    #[test]
    fn matches_therefore() {
        let document = Document::new_plain_english("There fore.");
        let mut matcher = Matcher::new();
        let lints = matcher.lint(&document);
        // `assert_eq!` prints the actual count on failure, unlike a bare
        // boolean `assert!`.
        assert_eq!(lints.len(), 1);
    }
}
1 change: 1 addition & 0 deletions harper-core/src/linting/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod lint;
mod lint_set;
mod long_sentences;
mod matcher;
mod repeated_words;
mod sentence_capitalization;
mod spaces;
Expand Down
51 changes: 25 additions & 26 deletions harper-core/src/linting/repeated_words.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use hashbrown::HashSet;
use smallvec::smallvec;

use crate::{
spell::DictWord,
token::{Token, TokenKind, TokenStringExt},
Document, Span, Suggestion,
};
Expand All @@ -10,38 +12,35 @@ use super::{Lint, LintKind, Linter};
#[derive(Debug, Clone)]
pub struct RepeatedWords {
/// The set of words that can be considered for repetition checking.
set: HashSet<Vec<char>>,
set: HashSet<DictWord>,
}

impl RepeatedWords {
pub fn new() -> Self {
let mut set = HashSet::new();

set.insert(vec!['t', 'h', 'e']);
set.insert(vec!['T', 'h', 'e']);
set.insert(vec!['a']);
set.insert(vec!['A']);
set.insert(vec!['a', 'n']);
set.insert(vec!['A', 'n']);
set.insert(vec!['i', 's']);
set.insert(vec!['I', 's']);
set.insert(vec!['w', 'i', 'l', 'l']);
set.insert(vec!['W', 'i', 'l', 'l']);
set.insert(vec!['l', 'i', 'k', 'e']);
set.insert(vec!['L', 'i', 'k', 'e']);
set.insert(vec!['t', 'h', 'a', 't']);
set.insert(vec!['T', 'h', 'a', 't']);
set.insert(vec!['w', 'h', 'a', 't']);
set.insert(vec!['W', 'h', 'a', 't']);
set.insert(vec!['w', 'h', 'i', 'c', 'h']);
set.insert(vec!['W', 'h', 'i', 'c', 'h']);
set.insert(vec!['b', 'e']);
set.insert(vec!['B', 'e']);
set.insert(vec!['a', 'n', 'd']);
set.insert(vec!['A', 'n', 'd']);
set.insert(vec!['I']);
set.insert(vec!['a', 't']);
set.insert(vec!['A', 't']);
macro_rules! add_set {
($lit:literal) => {
set.insert($lit.chars().collect());
};
($($lit:literal),*) => {
$(
add_set!($lit);
)*
}
}

add_set!(
"the", "be", "to", "of", "and", "a", "in", "that", "have", "I", "it", "for", "not",
"on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
"they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
"there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
"go", "me", "when", "make", "can", "like", "time", "no", "just", "him", "know", "take",
"people", "into", "year", "your", "good", "some", "could", "them", "see", "other",
"than", "then", "now", "look", "only", "come", "its", "over", "think", "also", "back",
"after", "use", "two", "how", "our", "work", "first", "well", "way", "even", "new",
"want", "because", "any", "these", "give", "day", "most", "us"
);

Self { set }
}
Expand Down
3 changes: 2 additions & 1 deletion harper-core/src/spell/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ pub use self::dictionary::Dictionary;
mod dictionary;
mod hunspell;

type DictWord = SmallVec<[char; 6]>;
/// A word from a dictionary or other similar structure.
pub type DictWord = SmallVec<[char; 6]>;

/// Suggest a correct spelling for a given misspelled word.
/// [`misspelled_word`] is assumed to be quite small (n < 100).
Expand Down
8 changes: 4 additions & 4 deletions harper-core/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ impl Token {
}

/// A [`Token`] that holds its content as a fat [`Vec<char>`] rather than as a [`Span`].
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)]
pub struct FatToken {
pub content: Vec<char>,
pub kind: TokenKind,
}

#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default)]
#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Default, PartialOrd)]
#[serde(tag = "kind", content = "value")]
pub enum TokenKind {
#[default]
Expand Down Expand Up @@ -59,7 +59,7 @@ impl TokenKind {
}
}

#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd)]
#[serde(tag = "kind")]
pub enum Punctuation {
/// .
Expand Down Expand Up @@ -104,7 +104,7 @@ pub enum Punctuation {
Equal,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd)]
pub struct Quote {
/// The location of the matching quote, if it exists.
pub twin_loc: Option<usize>,
Expand Down

0 comments on commit f9d96c3

Please sign in to comment.