From 1077ceb73b69b0b126f714bde6295c49da08a1ad Mon Sep 17 00:00:00 2001 From: Elijah Potter Date: Tue, 30 Jan 2024 09:47:10 -0700 Subject: [PATCH] Markdown parser now properly parses lists --- harper-core/dictionary.aff | 5 +++- harper-core/dictionary.dict | 3 ++- harper-core/src/document.rs | 41 ++++++++++++----------------- harper-core/src/parsers/markdown.rs | 22 +++++++++++----- harper-core/src/parsers/mod.rs | 19 +++++++++++++ harper-core/src/span.rs | 7 +++++ 6 files changed, 65 insertions(+), 32 deletions(-) diff --git a/harper-core/dictionary.aff b/harper-core/dictionary.aff index af8eb37e..5c6acba9 100644 --- a/harper-core/dictionary.aff +++ b/harper-core/dictionary.aff @@ -113,7 +113,10 @@ SFX L Y 1 SFX L 0 ment . SFX O Y 1 -SFX O 0 ful . +SFX O 0 ful . + +SFX Q Y 1 +SFX Q 0 ally . REP 90 REP a ei diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index dbc6b27e..0cfe142b 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -11632,7 +11632,7 @@ algebra/SM algebraic algebraically algorithm/SM -algorithmic +algorithmic/Q alias/GMDS alibi/GMDS alien/BGMDS @@ -49591,3 +49591,4 @@ raytracer viewport backend frontend +automata diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs index 08be7e68..b9d249c1 100644 --- a/harper-core/src/document.rs +++ b/harper-core/src/document.rs @@ -172,14 +172,8 @@ impl Document { /// Iterate over the locations of the sentence terminators in the document. fn sentence_terminators(&self) -> impl Iterator<Item = usize> + '_ { self.tokens.iter().enumerate().filter_map(|(index, token)| { - if let Token { - kind: TokenKind::Punctuation(punct), - .. - } = token - { - if is_sentence_terminator(punct) { - return Some(index); - } + if is_sentence_terminator(&token.kind) { + return Some(index); } None }) @@ -192,14 +186,8 @@ impl Document { .enumerate() .rev() .find_map(|(index, token)| { - if let Token { - kind: TokenKind::Punctuation(punct), - ..
- } = token - { - if is_sentence_terminator(punct) { - return Some(index); - } + if is_sentence_terminator(&token.kind) { + return Some(index); } None }) @@ -287,13 +275,17 @@ impl Display for Document { } } -fn is_sentence_terminator(punctuation: &Punctuation) -> bool { - [ - Punctuation::Period, - Punctuation::Bang, - Punctuation::Question, - ] - .contains(punctuation) +fn is_sentence_terminator(token: &TokenKind) -> bool { + match token { + TokenKind::Punctuation(punct) => [ + Punctuation::Period, + Punctuation::Bang, + Punctuation::Question, + ] + .contains(punct), + TokenKind::Newline(_) => true, + _ => false, + } } #[cfg(test)] @@ -313,7 +305,8 @@ mod tests { let mut document = Document::new(text, Box::new(Markdown)); document.condense_contractions(); - assert_eq!(document.tokens.len(), final_tok_count); + // We add one because the Markdown parser inserts a newline at end-of-input. + assert_eq!(document.tokens.len(), final_tok_count + 1); } #[test] diff --git a/harper-core/src/parsers/markdown.rs b/harper-core/src/parsers/markdown.rs index 29dab5f6..9fb39a09 100644 --- a/harper-core/src/parsers/markdown.rs +++ b/harper-core/src/parsers/markdown.rs @@ -24,15 +24,28 @@ impl Parser for Markdown { // NOTE: the range spits out __byte__ indices, not char indices. // This is why we keep track above. 
for (event, range) in md_parser.into_offset_iter() { + if range.start > traversed_bytes { + traversed_chars += source_str[traversed_bytes..range.start].chars().count(); + traversed_bytes = range.start; + } + match event { + pulldown_cmark::Event::HardBreak => { + tokens.push(Token { + span: Span::new_with_len(traversed_chars, 1), + kind: TokenKind::Newline(1), + }); + } pulldown_cmark::Event::Start(tag) => stack.push(tag), + pulldown_cmark::Event::End(pulldown_cmark::Tag::Paragraph) + | pulldown_cmark::Event::End(pulldown_cmark::Tag::Item) => tokens.push(Token { + span: Span::new_with_len(traversed_chars, 1), + kind: TokenKind::Newline(1), + }), pulldown_cmark::Event::End(_) => { stack.pop(); } pulldown_cmark::Event::Code(code) => { - traversed_chars += source_str[traversed_bytes..range.start].chars().count(); - traversed_bytes = range.start; - let chunk_len = code.chars().count(); tokens.push(Token { @@ -41,9 +54,6 @@ impl Parser for Markdown { }); } pulldown_cmark::Event::Text(text) => { - traversed_chars += source_str[traversed_bytes..range.start].chars().count(); - traversed_bytes = range.start; - let chunk_len = text.chars().count(); if let Some(tag) = stack.last() { diff --git a/harper-core/src/parsers/mod.rs b/harper-core/src/parsers/mod.rs index 69a3b3dc..9e608c7c 100644 --- a/harper-core/src/parsers/mod.rs +++ b/harper-core/src/parsers/mod.rs @@ -89,6 +89,25 @@ mod tests { Word, Space(1), Word, + Newline(1), + ], + ); + } + + #[test] + fn inserts_newlines() { + assert_tokens_eq_md( + "__hello__ world,\n\n[my]() friend", + &[ + Word, + Space(1), + Word, + Punctuation(Punctuation::Comma), + Newline(1), + Word, + Space(1), + Word, + Newline(1), ], ); } diff --git a/harper-core/src/span.rs b/harper-core/src/span.rs index 0ec7e89f..1c5ebef6 100644 --- a/harper-core/src/span.rs +++ b/harper-core/src/span.rs @@ -14,6 +14,13 @@ impl Span { Self { start, end } } + pub fn new_with_len(start: usize, len: usize) -> Self { + Self { + start, + end: start + len, + } + } 
+ pub fn len(&self) -> usize { self.end - self.start }