Added Markdown support
elijah-potter committed Jan 19, 2024
1 parent bdba195 commit b3b6346
Showing 11 changed files with 167 additions and 31 deletions.
37 changes: 37 additions & 0 deletions Cargo.lock

Generated file; diff not rendered.

2 changes: 0 additions & 2 deletions english_words.txt
@@ -365403,8 +365403,6 @@ whuther
 whutter
 whuttering
 whuz
-wi
-wy
 wyandot
 wyandotte
 wibble

1 change: 1 addition & 0 deletions harper-core/Cargo.toml
@@ -8,6 +8,7 @@ ahash = "0.8.7"
 is-macro = "0.3.0"
 itertools = "0.11.0"
 once_cell = "1.19.0"
+pulldown-cmark = "0.9.3"
 serde = { version = "1.0.190", features = ["derive"] }
 smallvec = "1.12.0"

34 changes: 26 additions & 8 deletions harper-core/src/document.rs
@@ -5,6 +5,7 @@ use itertools::Itertools;
 use crate::{
     lex_to_end,
     linting::Suggestion,
+    parsing::lex_to_end_md,
     span::Span,
     FatToken,
     Punctuation::{self},
@@ -14,15 +15,21 @@ use crate::{
 pub struct Document {
     source: Vec<char>,
     tokens: Vec<Token>,
+    markdown: bool,
 }
 
 impl Document {
     // Lexes and parses text to produce a document.
-    pub fn new(text: &str) -> Self {
+    //
+    // Choosing to parse with markdown may have a performance penalty
+    pub fn new(text: &str, markdown: bool) -> Self {
         let source: Vec<_> = text.chars().collect();
-        let tokens = lex_to_end(&source);
 
-        let mut doc = Self { source, tokens };
+        let mut doc = Self {
+            source,
+            tokens: Vec::new(),
+            markdown,
+        };
         doc.parse();
 
         doc
@@ -32,6 +39,12 @@ impl Document {
     ///
     /// Should be run after every change to the underlying [`Self::source`].
     fn parse(&mut self) {
+        if self.markdown {
+            self.tokens = lex_to_end_md(&self.source);
+        } else {
+            self.tokens = lex_to_end(&self.source);
+        }
+
         self.match_quotes();
     }
 
@@ -182,21 +195,26 @@
     use crate::Token;
 
     impl Document {
-        fn from_raw_parts(source: Vec<char>, tokens: Vec<Token>) -> Self {
-            Self { source, tokens }
+        fn from_raw_parts(source: Vec<char>, tokens: Vec<Token>, markdown: bool) -> Self {
+            Self {
+                source,
+                tokens,
+                markdown,
+            }
         }
     }
 
     #[test]
-    fn parses_sentances_correctly() {
+    fn parses_sentences_correctly() {
         let text = "There were three little pigs. They built three little homes.";
-        let document = Document::new(text);
+        let document = Document::new(text, false);
 
         let mut sentence_strs = vec![];
 
         for sentence in document.sentences() {
             sentence_strs.push(
-                Document::from_raw_parts(document.source.clone(), sentence.to_vec()).to_string(),
+                Document::from_raw_parts(document.source.clone(), sentence.to_vec(), false)
+                    .to_string(),
             );
         }
 
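To put the new flag in context, here is a minimal usage sketch of the changed constructor (hedged: it assumes `Document` and its `fat_tokens` iterator are exported from `harper_core`, as the harper-serve diff below suggests):

    use harper_core::Document;

    fn main() {
        let text = "**hello** world";

        // Plain-text parsing: the `**` delimiters are lexed like any other characters.
        let plain = Document::new(text, false);

        // Markdown-aware parsing: lexing runs only on Markdown text events,
        // so the delimiters never become tokens.
        let markdown = Document::new(text, true);

        println!("plain: {} tokens", plain.fat_tokens().count());
        println!("markdown: {} tokens", markdown.fat_tokens().count());
    }
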
79 changes: 75 additions & 4 deletions harper-core/src/parsing/lexer.rs
@@ -13,6 +13,52 @@ pub struct FoundToken {
     pub token: TokenKind,
 }
 
+/// Same as [`lex_to_end`], but with additional infrastructure to intelligently ignore Markdown.
+pub fn lex_to_end_md(source: &[char]) -> Vec<Token> {
+    let source_str: String = source.iter().collect();
+    let md_parser = pulldown_cmark::Parser::new(&source_str);
+
+    let mut tokens = Vec::new();
+
+    let mut traversed_bytes = 0;
+    let mut traversed_chars = 0;
+
+    // NOTE: the range spits out __byte__ indices, not char indices.
+    // This is why we keep track above.
+    for (event, range) in md_parser.into_offset_iter() {
+        if let pulldown_cmark::Event::Text(text) = event {
+            traversed_chars += source_str[traversed_bytes..range.start].chars().count();
+            traversed_bytes = range.start;
+
+            let mut new_tokens = lex_to_end_str(text);
+
+            new_tokens
+                .iter_mut()
+                .for_each(|token| token.span.offset(traversed_chars));
+
+            for token in new_tokens.iter() {
+                dbg!(token.span);
+            }
+
+            tokens.append(&mut new_tokens);
+        }
+    }
+
+    tokens
+}
+
+/// Same as [`lex_to_end_str`], but with additional infrastructure to intelligently ignore Markdown.
+///
+/// Yes, I am aware this implementation is doubly redundant, but I prefer to have a consistent API.
+/// If it's an issue, we can use a different Markdown parser.
+pub fn lex_to_end_md_str(source: impl AsRef<str>) -> Vec<Token> {
+    let r = source.as_ref();
+
+    let chars: Vec<_> = r.chars().collect();
+
+    lex_to_end_md(&chars)
+}
+
 pub fn lex_to_end_str(source: impl AsRef<str>) -> Vec<Token> {
     let r = source.as_ref();
 
@@ -200,26 +246,34 @@ fn lex_quote(source: &[char]) -> Option<FoundToken> {
 
 #[cfg(test)]
 mod tests {
+    use super::{lex_to_end_md_str, lex_to_end_str};
     use crate::{
-        lex_to_end_str, Punctuation,
+        Punctuation,
         TokenKind::{self, *},
     };
 
-    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
         let tokens = lex_to_end_str(test_str);
         let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
 
         assert_eq!(&kinds, expected)
     }
 
+    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+        let tokens = lex_to_end_md_str(test_str);
+        let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
+
+        assert_eq!(&kinds, expected)
+    }
+
     #[test]
     fn single_letter() {
-        assert_tokens_eq("a", &[Word])
+        assert_tokens_eq_plain("a", &[Word])
     }
 
     #[test]
     fn sentence() {
-        assert_tokens_eq(
+        assert_tokens_eq_plain(
             "hello world, my friend",
             &[
                 Word,
@@ -233,4 +287,21 @@
             ],
         )
     }
+
+    #[test]
+    fn sentence_md() {
+        assert_tokens_eq_md(
+            "__hello__ world, [my]() friend",
+            &[
+                Word,
+                Space(1),
+                Word,
+                Punctuation(Punctuation::Comma),
+                Space(1),
+                Word,
+                Space(1),
+                Word,
+            ],
+        );
+    }
 }
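The heart of `lex_to_end_md` above is offset bookkeeping: `pulldown_cmark::Parser::into_offset_iter` reports byte ranges into the source string, while Harper's spans index into a `Vec<char>`. A self-contained sketch of that byte-to-char conversion, assuming pulldown-cmark 0.9 (the `char_offsets` helper is illustrative, not part of either crate):

    use pulldown_cmark::{Event, Parser};

    /// Report the *char* offset of each Markdown text event, converting from
    /// the byte offsets that pulldown-cmark emits.
    fn char_offsets(source: &str) -> Vec<(usize, String)> {
        let mut traversed_bytes = 0;
        let mut traversed_chars = 0;
        let mut out = Vec::new();

        for (event, range) in Parser::new(source).into_offset_iter() {
            if let Event::Text(text) = event {
                // Only count chars over the newly traversed byte region, keeping
                // the running total linear in the document length.
                traversed_chars += source[traversed_bytes..range.start].chars().count();
                traversed_bytes = range.start;

                out.push((traversed_chars, text.to_string()));
            }
        }

        out
    }

    fn main() {
        // "é" is two bytes but one char, so byte and char offsets diverge here.
        for (offset, text) in char_offsets("_é_ **hello** world") {
            println!("char {offset}: {text:?}");
        }
    }

This is why the committed lexer tracks `traversed_bytes` and `traversed_chars` as a pair rather than recounting from the start of the string for every event.
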
2 changes: 1 addition & 1 deletion harper-core/src/parsing/mod.rs
@@ -1,5 +1,5 @@
 mod lexer;
 mod token;
 
-pub use lexer::{lex_to_end, lex_to_end_str};
+pub use lexer::{lex_to_end, lex_to_end_md, lex_to_end_md_str, lex_to_end_str};
 pub use token::{FatToken, Punctuation, Quote, Token, TokenKind, TokenStringExt};

7 changes: 7 additions & 0 deletions harper-core/src/span.rs
@@ -27,6 +27,7 @@
     pub fn get_content<'a>(&self, source: &'a [char]) -> &'a [char] {
         if cfg!(debug_assertions) {
             assert!(self.start < self.end);
+            assert!(self.start < source.len());
             assert!(self.end <= source.len());
         }
 
@@ -46,6 +47,12 @@
         cloned.set_len(length);
         cloned
     }
+
+    // Add an amount to both [`Self::start`] and [`Self::end`]
+    pub fn offset(&mut self, by: usize) {
+        self.start += by;
+        self.end += by;
+    }
 }
 
 #[cfg(test)]
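The new `offset` method is what lets the Markdown lexer shift spans from event-local coordinates back into whole-document coordinates. A tiny sketch, assuming `Span` is exported from `harper_core` and its `start`/`end` fields are public, as the diff implies:

    use harper_core::Span;

    fn shift_span_example() {
        // A token lexed at chars 2..7 of a Markdown text event that itself
        // begins at char 9 of the full document.
        let mut span = Span { start: 2, end: 7 };
        span.offset(9);
        assert_eq!((span.start, span.end), (11, 16));
    }
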
2 changes: 1 addition & 1 deletion harper-ls/src/diagnostics.rs
@@ -77,7 +77,7 @@ fn open_url(url: &Url) -> Result<String> {
 
 #[cached::proc_macro::cached]
 fn lint_string(text: String) -> Vec<Lint> {
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
     let dictionary = Dictionary::new();
     all_linters(&document, dictionary)
 }

12 changes: 7 additions & 5 deletions harper-serve/src/main.rs
@@ -3,7 +3,7 @@
 use harper_core::{all_linters, Dictionary, Document, FatToken, Lint, Span, Suggestion};
 use std::net::SocketAddr;
 use tokio::time::Instant;
-use tracing::{debug, info, Level};
+use tracing::{info, Level};
 use tracing_subscriber::FmtSubscriber;
 
 use axum::{
@@ -12,7 +12,7 @@
     http::StatusCode,
     middleware::{self, Next},
     response::Response,
-    routing::{get, post},
+    routing::post,
     Json, Router,
 };
 use serde::{Deserialize, Serialize};
@@ -41,6 +41,8 @@ async fn main() {
 }
 
 async fn timing_middleware(request: Request<Body>, next: Next<Body>) -> Response {
+    info!("Handling request at endpoint: {}", request.uri().path());
+
     let uri = request.uri().clone();
 
     let start = Instant::now();
@@ -69,7 +71,7 @@ async fn root() -> &'static str {
 async fn parse_text(Json(payload): Json<ParseRequest>) -> (StatusCode, Json<ParseResponse>) {
     let text = payload.text;
 
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
     let tokens: Vec<_> = document.fat_tokens().collect();
 
     (StatusCode::ACCEPTED, Json(ParseResponse { tokens }))
@@ -89,7 +91,7 @@ async fn lint(Json(payload): Json<LintRequest>) -> (StatusCode, Json<LintResponse>) {
     let text = payload.text;
 
     let dictionary = Dictionary::new();
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
 
     let lints = all_linters(&document, dictionary);
 
@@ -110,7 +112,7 @@ async fn apply_suggestion(
     Json(payload): Json<ApplySuggestionRequest>,
 ) -> (StatusCode, Json<ApplySuggestionResponse>) {
     let text = payload.text;
-    let mut document = Document::new(&text);
+    let mut document = Document::new(&text, true);
     document.apply_suggestion(&payload.suggestion, payload.span);
 
     (
(Diffs for the remaining 2 of the 11 changed files were not loaded.)