Now correctly recognizes numbers when they open sentences

Automattic · Jan 29, 2024 · ccebbd0 · ccebbd0
1 parent ae429e8
commit ccebbd0
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 4 deletions.
diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict
@@ -10271,12 +10271,10 @@ Vonnegut/M
 Voronezh/M
 Vorster/M
 Voyager/M
-Vt
 Vuitton/M
 Vulcan/M
 Vulg
 Vulgate/SM
-W/MDT
 WA
 WAC
 WASP/M
@@ -40244,7 +40242,6 @@ rutted
 rutting
 rutty/RT
 rye/M
-s/NYXB
 sabbath/M
 sabbaths
 sabbatical/SM

diff --git a/harper-core/src/lexing/mod.rs b/harper-core/src/lexing/mod.rs
@@ -117,6 +117,7 @@ fn lex_punctuation(source: &[char]) -> Option<FoundToken> {
     use Punctuation::*;
 
     let punct = match c {
+        '~' => Tilde,
         '=' => Equal,
         '<' => LessThan,
         '>' => GreaterThan,

diff --git a/harper-core/src/linting/sentence_capitalization.rs b/harper-core/src/linting/sentence_capitalization.rs
@@ -14,7 +14,11 @@ impl Linter for SentenceCapitalization {
         let mut lints = Vec::new();
 
         for sentence in document.sentences() {
-            if let Some(first_word) = sentence.first_sentence_word() {
+            if let Some(first_word) = sentence.first_non_whitespace() {
+                if !first_word.kind.is_word() {
+                    continue;
+                }
+
                 let letters = document.get_span_content(first_word.span);
 
                 if let Some(first_letter) = letters.first() {
@@ -61,6 +65,15 @@ mod tests {
         )
     }
 
+    /// A sentence that opens with a number must not produce any
+    /// capitalization lints.
+    #[test]
+    fn start_with_number() {
+        let text = "53 is the length of the longest word.";
+
+        assert_lint_count(text, SentenceCapitalization, 0);
+    }
+
     #[test]
     fn ignores_unlintable() {
         assert_lint_count(

diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs
@@ -57,6 +57,15 @@ impl TokenKind {
     pub fn is_apostrophe(&self) -> bool {
         matches!(self, TokenKind::Punctuation(Punctuation::Apostrophe))
     }
+
+    /// Checks whether the token is whitespace.
+    ///
+    /// A token counts as whitespace when it is either a
+    /// [`TokenKind::Space`] or a [`TokenKind::Newline`]; every
+    /// other kind of token yields `false`.
+    pub fn is_whitespace(&self) -> bool {
+        matches!(self, TokenKind::Space(_) | TokenKind::Newline(_))
+    }
 }
 
 #[derive(Debug, Is, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd)]
@@ -104,6 +113,8 @@ pub enum Punctuation {
     Equal,
     /// *
     Star,
+    /// ~
+    Tilde,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, PartialOrd)]
@@ -118,6 +129,8 @@ pub trait TokenStringExt {
     /// Will also return [`None`] if there is an unlintable token in the position of the first
     /// word.
     fn first_sentence_word(&self) -> Option<Token>;
+    /// Grabs the first token that isn't whitespace from the token string.
+    fn first_non_whitespace(&self) -> Option<Token>;
     fn iter_word_indices(&self) -> impl Iterator<Item = usize> + '_;
     fn iter_words(&self) -> impl Iterator<Item = &Token> + '_;
     fn iter_space_indices(&self) -> impl Iterator<Item = usize> + '_;
@@ -134,6 +147,10 @@ impl TokenStringExt for [Token] {
         self.iter().find(|v| v.kind.is_word()).copied()
     }
 
+    fn first_non_whitespace(&self) -> Option<Token> {
+        self.iter().copied().find(|token| !token.kind.is_whitespace())
+    }
+
     fn first_sentence_word(&self) -> Option<Token> {
         let (w_idx, word) = self.iter().find_position(|v| v.kind.is_word())?;