diff --git a/harper-core/src/linting/compound_words.rs b/harper-core/src/linting/compound_words.rs
deleted file mode 100644
index 1d50f60a..00000000
--- a/harper-core/src/linting/compound_words.rs
+++ /dev/null
@@ -1,162 +0,0 @@
-use std::sync::Arc;
-
-use itertools::Itertools;
-
-use crate::{CharString, Dictionary, Document, FstDictionary, Span};
-
-use super::{Lint, LintKind, Linter, Suggestion};
-
-pub struct CompoundWords {
-    dict: Arc<FstDictionary>,
-}
-
-impl CompoundWords {
-    pub fn new() -> Self {
-        Self {
-            dict: FstDictionary::curated(),
-        }
-    }
-}
-
-impl Default for CompoundWords {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl Linter for CompoundWords {
-    fn lint(&mut self, document: &Document) -> Vec<Lint> {
-        let mut lints = Vec::new();
-
-        let mut merged_word = CharString::new();
-        let mut potential_compounds = Vec::new();
-
-        for (a, w, b) in document.tokens().tuple_windows() {
-            if !a.kind.is_word() || !w.kind.is_whitespace() || !b.kind.is_word() {
-                continue;
-            }
-
-            let a_chars = document.get_span_content(a.span);
-            let b_chars = document.get_span_content(b.span);
-
-            // Not super helpful in this case, so we skip it
-            if matches!(a_chars, ['a']) {
-                continue;
-            }
-
-            potential_compounds.clear();
-
-            merged_word.clear();
-            merged_word.extend_from_slice(a_chars);
-            merged_word.extend_from_slice(b_chars);
-
-            // Check for closed compound words
-            if self.dict.contains_word(&merged_word)
-                && !a.kind.is_common_word()
-                && !b.kind.is_common_word()
-            {
-                potential_compounds.push(merged_word.clone());
-            }
-
-            if !potential_compounds.is_empty() {
-                lints.push(Lint {
-                    span: Span::new(a.span.start, b.span.end),
-                    lint_kind: LintKind::Spelling,
-                    suggestions: potential_compounds
-                        .drain(..)
-                        .map(|v| Suggestion::ReplaceWith(v.to_vec()))
-                        .collect(),
-                    message:
-                        "These two words are often combined to form a hyphenated compound word."
-                            .to_owned(),
-                    priority: 63,
-                });
-            }
-        }
-
-        lints
-    }
-
-    fn description(&self) -> &str {
-        "Accidentally inserting a space inside a word is common. This rule looks for valid words that are split by whitespace."
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::linting::tests::{assert_lint_count, assert_suggestion_count};
-
-    use super::CompoundWords;
-
-    #[test]
-    fn scarecrow() {
-        assert_lint_count(
-            "I saw a scare crow in the field today.",
-            CompoundWords::default(),
-            1,
-        );
-    }
-
-    #[test]
-    fn clean() {
-        assert_lint_count(
-            "When referring to the political party, make sure to treat them as a proper noun.",
-            CompoundWords::default(),
-            0,
-        );
-    }
-
-    #[test]
-    fn bookshelf() {
-        assert_lint_count(
-            "I have a big book shelf in my room.",
-            CompoundWords::default(),
-            1,
-        );
-    }
-
-    #[test]
-    fn sunscreen() {
-        assert_lint_count(
-            "Don't forget to apply your sunscreen before going out.",
-            CompoundWords::default(),
-            0,
-        );
-    }
-
-    #[test]
-    fn birthday() {
-        assert_lint_count(
-            "We're having a big party to celebrate the couple's birthday today.",
-            CompoundWords::default(),
-            0,
-        );
-    }
-
-    #[test]
-    fn hometown() {
-        assert_lint_count(
-            "My home town is a beautiful place with many historical land marks.",
-            CompoundWords::default(),
-            2,
-        );
-    }
-
-    #[test]
-    fn assertions() {
-        assert_lint_count(
-            "Make sure to compile with debug ass ertions disabled.",
-            CompoundWords::default(),
-            1,
-        );
-    }
-
-    #[test]
-    fn break_up() {
-        assert_suggestion_count(
-            "Like if you break up words you shouldn't.",
-            CompoundWords::default(),
-            0,
-        );
-    }
-}
diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs
index 753df32d..b6b4aa93 100644
--- a/harper-core/src/linting/lint_group.rs
+++ b/harper-core/src/linting/lint_group.rs
@@ -5,13 +5,13 @@ use super::an_a::AnA;
 use super::avoid_curses::AvoidCurses;
 use super::boring_words::BoringWords;
 use super::capitalize_personal_pronouns::CapitalizePersonalPronouns;
-use super::compound_words::CompoundWords;
 use super::correct_number_suffix::CorrectNumberSuffix;
 use super::dot_initialisms::DotInitialisms;
 use super::ellipsis_length::EllipsisLength;
 use super::linking_verbs::LinkingVerbs;
 use super::long_sentences::LongSentences;
 use super::matcher::Matcher;
+use super::merge_words::MergeWords;
 use super::multiple_sequential_pronouns::MultipleSequentialPronouns;
 use super::number_suffix_capitalization::NumberSuffixCapitalization;
 use super::plural_conjugate::PluralConjugate;
@@ -182,7 +182,7 @@ create_lint_group_config!(
     MicrosoftNames => true,
     AppleNames => true,
     AzureNames => true,
-    CompoundWords => true,
+    MergeWords => true,
     PluralConjugate => false
 );
 
diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs
index 82582635..369a05be 100644
--- a/harper-core/src/linting/matcher.rs
+++ b/harper-core/src/linting/matcher.rs
@@ -175,7 +175,6 @@ impl Matcher {
             "that","s" => "that is",
             "That","s" => "that is",
             "ms" => "milliseconds",
-            "t","he" => "the",
             "the","hing" => "the thing",
             "The","hing" => "The thing",
             "need","helps" => "need help",
diff --git a/harper-core/src/linting/merge_words.rs b/harper-core/src/linting/merge_words.rs
new file mode 100644
index 00000000..104a233f
--- /dev/null
+++ b/harper-core/src/linting/merge_words.rs
@@ -0,0 +1,104 @@
+use std::sync::Arc;
+
+use itertools::Itertools;
+
+use crate::{CharString, CharStringExt, Dictionary, Document, FstDictionary, Span};
+
+use super::{Lint, LintKind, Linter, Suggestion};
+
+/// Flags two whitespace-separated words that form a single dictionary word when joined,
+/// provided at least one of the two parts is not a dictionary word on its own.
+pub struct MergeWords {
+    dict: Arc<FstDictionary>,
+}
+
+impl MergeWords {
+    pub fn new() -> Self {
+        Self {
+            dict: FstDictionary::curated(),
+        }
+    }
+}
+
+impl Default for MergeWords {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Linter for MergeWords {
+    fn lint(&mut self, document: &Document) -> Vec<Lint> {
+        let mut lints = Vec::new();
+
+        let mut merged_word = CharString::new();
+
+        for (a, w, b) in document.tokens().tuple_windows() {
+            if !a.kind.is_word() || !w.kind.is_whitespace() || !b.kind.is_word() {
+                continue;
+            }
+
+            let a_chars = document.get_span_content(a.span);
+            let b_chars = document.get_span_content(b.span);
+
+            // Not super helpful in this case, so we skip it
+            if matches!(a_chars, ['a']) || matches!(b_chars, ['a']) {
+                continue;
+            }
+
+            // The dictionary lookup uses the lowercased merge; the suggestion keeps the
+            // original casing so a capitalized first word (e.g. "The refore") stays capitalized.
+            merged_word.clear();
+            merged_word.extend_from_slice(&a_chars.to_lower());
+            merged_word.extend_from_slice(&b_chars.to_lower());
+
+            if self.dict.contains_word(&merged_word)
+                && (!self.dict.contains_word(a_chars) || !self.dict.contains_word(b_chars))
+            {
+                lints.push(Lint {
+                    span: Span::new(a.span.start, b.span.end),
+                    lint_kind: LintKind::Spelling,
+                    suggestions: vec![Suggestion::ReplaceWith([a_chars, b_chars].concat())],
+                    message: "These two words are often combined to form a closed compound word."
+                        .to_owned(),
+                    priority: 63,
+                });
+            }
+        }
+
+        lints
+    }
+
+    fn description(&self) -> &str {
+        "Accidentally inserting a space inside a word is common. This rule looks for valid words that are split by whitespace."
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::linting::tests::assert_lint_count;
+
+    use super::MergeWords;
+
+    #[test]
+    fn clean() {
+        assert_lint_count(
+            "When referring to the political party, make sure to treat them as a proper noun.",
+            MergeWords::default(),
+            0,
+        );
+    }
+
+    #[test]
+    fn heretofore() {
+        assert_lint_count(
+            "This is a her etofore unseen problem.",
+            MergeWords::default(),
+            1,
+        );
+    }
+
+    #[test]
+    fn therefore() {
+        assert_lint_count("The refore", MergeWords::default(), 1);
+    }
+}
diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs
index 03ce9bbe..deec6fa1 100644
--- a/harper-core/src/linting/mod.rs
+++ b/harper-core/src/linting/mod.rs
@@ -2,7 +2,6 @@ mod an_a;
 mod avoid_curses;
 mod boring_words;
 mod capitalize_personal_pronouns;
-mod compound_words;
 mod correct_number_suffix;
 mod dashes;
 mod dot_initialisms;
@@ -12,6 +11,7 @@ mod lint;
 mod lint_group;
 mod long_sentences;
 mod matcher;
+mod merge_words;
 mod multiple_sequential_pronouns;
 mod number_suffix_capitalization;
 mod pattern_linter;
@@ -40,6 +40,7 @@ pub use lint::{Lint, LintKind, Suggestion};
 pub use lint_group::{LintGroup, LintGroupConfig};
 pub use long_sentences::LongSentences;
 pub use matcher::Matcher;
+pub use merge_words::MergeWords;
 pub use multiple_sequential_pronouns::MultipleSequentialPronouns;
 pub use number_suffix_capitalization::NumberSuffixCapitalization;
 pub use pattern_linter::PatternLinter;