From 1077ceb73b69b0b126f714bde6295c49da08a1ad Mon Sep 17 00:00:00 2001
From: Elijah Potter <elijah.sirius@protonmail.com>
Date: Tue, 30 Jan 2024 09:47:10 -0700
Subject: [PATCH] Markdown parser now properly parses lists

---
 harper-core/dictionary.aff          |  5 +++-
 harper-core/dictionary.dict         |  3 ++-
 harper-core/src/document.rs         | 41 ++++++++++++-----------------
 harper-core/src/parsers/markdown.rs | 22 +++++++++++-----
 harper-core/src/parsers/mod.rs      | 19 +++++++++++++
 harper-core/src/span.rs             |  7 +++++
 6 files changed, 65 insertions(+), 32 deletions(-)
diff --git a/harper-core/dictionary.aff b/harper-core/dictionary.aff
index af8eb37e..5c6acba9 100644
--- a/harper-core/dictionary.aff
+++ b/harper-core/dictionary.aff
@@ -113,7 +113,10 @@ SFX L Y 1
 SFX L   0     ment       .
 
 SFX O Y 1
-SFX O   0     ful         .
+SFX O   0     ful        .
+
+SFX Q Y 1
+SFX Q   0     ally       .
 
 REP 90
 REP a ei
diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict
index dbc6b27e..0cfe142b 100644
--- a/harper-core/dictionary.dict
+++ b/harper-core/dictionary.dict
@@ -11632,7 +11632,7 @@ algebra/SM
 algebraic
 algebraically
 algorithm/SM
-algorithmic
+algorithmic/Q
 alias/GMDS
 alibi/GMDS
 alien/BGMDS
@@ -49591,3 +49591,4 @@ raytracer
 viewport
 backend
 frontend
+automata
diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs
index 08be7e68..b9d249c1 100644
--- a/harper-core/src/document.rs
+++ b/harper-core/src/document.rs
@@ -172,14 +172,8 @@ impl Document {
     /// Iterate over the locations of the sentence terminators in the document.
     fn sentence_terminators(&self) -> impl Iterator<Item = usize> + '_ {
         self.tokens.iter().enumerate().filter_map(|(index, token)| {
-            if let Token {
-                kind: TokenKind::Punctuation(punct),
-                ..
-            } = token
-            {
-                if is_sentence_terminator(punct) {
-                    return Some(index);
-                }
+            if is_sentence_terminator(&token.kind) {
+                return Some(index);
             }
             None
         })
@@ -192,14 +186,8 @@ impl Document {
             .enumerate()
             .rev()
             .find_map(|(index, token)| {
-                if let Token {
-                    kind: TokenKind::Punctuation(punct),
-                    ..
-                } = token
-                {
-                    if is_sentence_terminator(punct) {
-                        return Some(index);
-                    }
+                if is_sentence_terminator(&token.kind) {
+                    return Some(index);
                 }
                 None
             })
@@ -287,13 +275,17 @@ impl Display for Document {
     }
 }
 
-fn is_sentence_terminator(punctuation: &Punctuation) -> bool {
-    [
-        Punctuation::Period,
-        Punctuation::Bang,
-        Punctuation::Question,
-    ]
-    .contains(punctuation)
+/// Reports whether a token kind ends a sentence.
+///
+/// Periods, bangs, and question marks terminate a sentence, as does any
+/// newline token, whatever its count.
+fn is_sentence_terminator(token: &TokenKind) -> bool {
+    matches!(
+        token,
+        TokenKind::Punctuation(
+            Punctuation::Period | Punctuation::Bang | Punctuation::Question
+        ) | TokenKind::Newline(_)
+    )
+}
 
 #[cfg(test)]
@@ -313,7 +305,8 @@ mod tests {
         let mut document = Document::new(text, Box::new(Markdown));
         document.condense_contractions();
 
-        assert_eq!(document.tokens.len(), final_tok_count);
+        // We add one because the Markdown parser inserts a newline at end-of-input.
+        assert_eq!(document.tokens.len(), final_tok_count + 1);
     }
 
     #[test]
diff --git a/harper-core/src/parsers/markdown.rs b/harper-core/src/parsers/markdown.rs
index 29dab5f6..9fb39a09 100644
--- a/harper-core/src/parsers/markdown.rs
+++ b/harper-core/src/parsers/markdown.rs
@@ -24,15 +24,33 @@ impl Parser for Markdown {
         // NOTE: the range spits out __byte__ indices, not char indices.
         // This is why we keep track above.
         for (event, range) in md_parser.into_offset_iter() {
+            if range.start > traversed_bytes {
+                traversed_chars += source_str[traversed_bytes..range.start].chars().count();
+                traversed_bytes = range.start;
+            }
+
             match event {
+                pulldown_cmark::Event::HardBreak => {
+                    tokens.push(Token {
+                        span: Span::new_with_len(traversed_chars, 1),
+                        kind: TokenKind::Newline(1),
+                    });
+                }
                 pulldown_cmark::Event::Start(tag) => stack.push(tag),
+                pulldown_cmark::Event::End(pulldown_cmark::Tag::Paragraph)
+                | pulldown_cmark::Event::End(pulldown_cmark::Tag::Item) => {
+                    tokens.push(Token {
+                        span: Span::new_with_len(traversed_chars, 1),
+                        kind: TokenKind::Newline(1),
+                    });
+                    // These tags were pushed by `Start`, so pop them like any other
+                    // `End`; otherwise the stack grows and `stack.last()` goes stale.
+                    stack.pop();
+                }
                 pulldown_cmark::Event::End(_) => {
                     stack.pop();
                 }
                 pulldown_cmark::Event::Code(code) => {
-                    traversed_chars += source_str[traversed_bytes..range.start].chars().count();
-                    traversed_bytes = range.start;
-
                     let chunk_len = code.chars().count();
 
                     tokens.push(Token {
@@ -41,9 +59,6 @@ impl Parser for Markdown {
                     });
                 }
                 pulldown_cmark::Event::Text(text) => {
-                    traversed_chars += source_str[traversed_bytes..range.start].chars().count();
-                    traversed_bytes = range.start;
-
                     let chunk_len = text.chars().count();
 
                     if let Some(tag) = stack.last() {
diff --git a/harper-core/src/parsers/mod.rs b/harper-core/src/parsers/mod.rs
index 69a3b3dc..9e608c7c 100644
--- a/harper-core/src/parsers/mod.rs
+++ b/harper-core/src/parsers/mod.rs
@@ -89,6 +89,25 @@ mod tests {
                 Word,
                 Space(1),
                 Word,
+                Newline(1),
+            ],
+        );
+    }
+
+    #[test]
+    fn inserts_newlines() {
+        assert_tokens_eq_md(
+            "__hello__ world,\n\n[my]() friend",
+            &[
+                Word,
+                Space(1),
+                Word,
+                Punctuation(Punctuation::Comma),
+                Newline(1),
+                Word,
+                Space(1),
+                Word,
+                Newline(1),
             ],
         );
     }
diff --git a/harper-core/src/span.rs b/harper-core/src/span.rs
index 0ec7e89f..1c5ebef6 100644
--- a/harper-core/src/span.rs
+++ b/harper-core/src/span.rs
@@ -14,6 +14,13 @@ impl Span {
         Self { start, end }
     }
 
+    /// Builds the span that begins at `start` and extends for `len`
+    /// indices, so that `len()` on the result returns `len`.
+    pub fn new_with_len(start: usize, len: usize) -> Self {
+        let end = start + len;
+        Self { start, end }
+    }
+
     pub fn len(&self) -> usize {
         self.end - self.start
     }