Added Markdown support
elijah-potter committed Jan 19, 2024
1 parent bdba195 commit b3b6346
Showing 11 changed files with 167 additions and 31 deletions.
37 changes: 37 additions & 0 deletions Cargo.lock

Generated file; diff not rendered.

2 changes: 0 additions & 2 deletions english_words.txt
@@ -365403,8 +365403,6 @@ whuther
 whutter
 whuttering
 whuz
-wi
-wy
 wyandot
 wyandotte
 wibble

1 change: 1 addition & 0 deletions harper-core/Cargo.toml
@@ -8,6 +8,7 @@ ahash = "0.8.7"
 is-macro = "0.3.0"
 itertools = "0.11.0"
 once_cell = "1.19.0"
+pulldown-cmark = "0.9.3"
 serde = { version = "1.0.190", features = ["derive"] }
 smallvec = "1.12.0"

34 changes: 26 additions & 8 deletions harper-core/src/document.rs
@@ -5,6 +5,7 @@ use itertools::Itertools;
 use crate::{
     lex_to_end,
     linting::Suggestion,
+    parsing::lex_to_end_md,
     span::Span,
     FatToken,
     Punctuation::{self},
@@ -14,15 +15,21 @@ use crate::{
 pub struct Document {
     source: Vec<char>,
     tokens: Vec<Token>,
+    markdown: bool,
 }
 
 impl Document {
     // Lexes and parses text to produce a document.
-    pub fn new(text: &str) -> Self {
+    //
+    // Choosing to parse with markdown may have a performance penalty
+    pub fn new(text: &str, markdown: bool) -> Self {
         let source: Vec<_> = text.chars().collect();
-        let tokens = lex_to_end(&source);
 
-        let mut doc = Self { source, tokens };
+        let mut doc = Self {
+            source,
+            tokens: Vec::new(),
+            markdown,
+        };
         doc.parse();
 
         doc
@@ -32,6 +39,12 @@ impl Document {
     ///
     /// Should be run after every change to the underlying [`Self::source`].
     fn parse(&mut self) {
+        if self.markdown {
+            self.tokens = lex_to_end_md(&self.source);
+        } else {
+            self.tokens = lex_to_end(&self.source);
+        }
+
         self.match_quotes();
     }
 
@@ -182,21 +195,26 @@
     use crate::Token;
 
     impl Document {
-        fn from_raw_parts(source: Vec<char>, tokens: Vec<Token>) -> Self {
-            Self { source, tokens }
+        fn from_raw_parts(source: Vec<char>, tokens: Vec<Token>, markdown: bool) -> Self {
+            Self {
+                source,
+                tokens,
+                markdown,
+            }
         }
     }
 
     #[test]
-    fn parses_sentances_correctly() {
+    fn parses_sentences_correctly() {
         let text = "There were three little pigs. They built three little homes.";
-        let document = Document::new(text);
+        let document = Document::new(text, false);
 
         let mut sentence_strs = vec![];
 
         for sentence in document.sentences() {
             sentence_strs.push(
-                Document::from_raw_parts(document.source.clone(), sentence.to_vec()).to_string(),
+                Document::from_raw_parts(document.source.clone(), sentence.to_vec(), false)
+                    .to_string(),
             );
         }
 
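To put the new flag in context, here is a minimal usage sketch of the changed constructor (hedged: it assumes `Document` and its `fat_tokens` iterator are exported from `harper_core`, as the harper-serve diff below suggests):

    use harper_core::Document;

    fn main() {
        let text = "**hello** world";

        // Plain-text parsing: the `**` delimiters are lexed like any other characters.
        let plain = Document::new(text, false);

        // Markdown-aware parsing: lexing runs only on Markdown text events,
        // so the delimiters never become tokens.
        let markdown = Document::new(text, true);

        println!("plain: {} tokens", plain.fat_tokens().count());
        println!("markdown: {} tokens", markdown.fat_tokens().count());
    }
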
79 changes: 75 additions & 4 deletions harper-core/src/parsing/lexer.rs
@@ -13,6 +13,52 @@ pub struct FoundToken {
     pub token: TokenKind,
 }
 
+/// Same as [`lex_to_end`], but with additional infrastructure to intelligently ignore Markdown.
+pub fn lex_to_end_md(source: &[char]) -> Vec<Token> {
+    let source_str: String = source.iter().collect();
+    let md_parser = pulldown_cmark::Parser::new(&source_str);
+
+    let mut tokens = Vec::new();
+
+    let mut traversed_bytes = 0;
+    let mut traversed_chars = 0;
+
+    // NOTE: the range spits out __byte__ indices, not char indices.
+    // This is why we keep track above.
+    for (event, range) in md_parser.into_offset_iter() {
+        if let pulldown_cmark::Event::Text(text) = event {
+            traversed_chars += source_str[traversed_bytes..range.start].chars().count();
+            traversed_bytes = range.start;
+
+            let mut new_tokens = lex_to_end_str(text);
+
+            new_tokens
+                .iter_mut()
+                .for_each(|token| token.span.offset(traversed_chars));
+
+            for token in new_tokens.iter() {
+                dbg!(token.span);
+            }
+
+            tokens.append(&mut new_tokens);
+        }
+    }
+
+    tokens
+}
+
+/// Same as [`lex_to_end_str`], but with additional infrastructure to intelligently ignore Markdown.
+///
+/// Yes, I am aware this implementation is doubly redundant, but I prefer to have a consistent API.
+/// If it's an issue, we can use a different Markdown parser.
+pub fn lex_to_end_md_str(source: impl AsRef<str>) -> Vec<Token> {
+    let r = source.as_ref();
+
+    let chars: Vec<_> = r.chars().collect();
+
+    lex_to_end_md(&chars)
+}
+
 pub fn lex_to_end_str(source: impl AsRef<str>) -> Vec<Token> {
     let r = source.as_ref();
 
@@ -200,26 +246,34 @@ fn lex_quote(source: &[char]) -> Option<FoundToken> {
 
 #[cfg(test)]
 mod tests {
+    use super::{lex_to_end_md_str, lex_to_end_str};
     use crate::{
-        lex_to_end_str, Punctuation,
+        Punctuation,
         TokenKind::{self, *},
     };
 
-    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
         let tokens = lex_to_end_str(test_str);
         let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
 
         assert_eq!(&kinds, expected)
     }
 
+    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
+        let tokens = lex_to_end_md_str(test_str);
+        let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();
+
+        assert_eq!(&kinds, expected)
+    }
+
     #[test]
     fn single_letter() {
-        assert_tokens_eq("a", &[Word])
+        assert_tokens_eq_plain("a", &[Word])
     }
 
     #[test]
     fn sentence() {
-        assert_tokens_eq(
+        assert_tokens_eq_plain(
             "hello world, my friend",
             &[
                 Word,
@@ -233,4 +287,21 @@
             ],
         )
     }
+
+    #[test]
+    fn sentence_md() {
+        assert_tokens_eq_md(
+            "__hello__ world, [my]() friend",
+            &[
+                Word,
+                Space(1),
+                Word,
+                Punctuation(Punctuation::Comma),
+                Space(1),
+                Word,
+                Space(1),
+                Word,
+            ],
+        );
+    }
 }
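The heart of `lex_to_end_md` above is offset bookkeeping: `pulldown_cmark::Parser::into_offset_iter` reports byte ranges into the source string, while Harper's spans index into a `Vec<char>`. A self-contained sketch of that byte-to-char conversion, assuming pulldown-cmark 0.9 (the `char_offsets` helper is illustrative, not part of either crate):

    use pulldown_cmark::{Event, Parser};

    /// Report the *char* offset of each Markdown text event, converting from
    /// the byte offsets that pulldown-cmark emits.
    fn char_offsets(source: &str) -> Vec<(usize, String)> {
        let mut traversed_bytes = 0;
        let mut traversed_chars = 0;
        let mut out = Vec::new();

        for (event, range) in Parser::new(source).into_offset_iter() {
            if let Event::Text(text) = event {
                // Only count chars over the newly traversed byte region, keeping
                // the running total linear in the document length.
                traversed_chars += source[traversed_bytes..range.start].chars().count();
                traversed_bytes = range.start;

                out.push((traversed_chars, text.to_string()));
            }
        }

        out
    }

    fn main() {
        // "é" is two bytes but one char, so byte and char offsets diverge here.
        for (offset, text) in char_offsets("_é_ **hello** world") {
            println!("char {offset}: {text:?}");
        }
    }

This is why the committed lexer tracks `traversed_bytes` and `traversed_chars` as a pair rather than recounting from the start of the string for every event.
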
2 changes: 1 addition & 1 deletion harper-core/src/parsing/mod.rs
@@ -1,5 +1,5 @@
 mod lexer;
 mod token;
 
-pub use lexer::{lex_to_end, lex_to_end_str};
+pub use lexer::{lex_to_end, lex_to_end_md, lex_to_end_md_str, lex_to_end_str};
 pub use token::{FatToken, Punctuation, Quote, Token, TokenKind, TokenStringExt};

7 changes: 7 additions & 0 deletions harper-core/src/span.rs
@@ -27,6 +27,7 @@
     pub fn get_content<'a>(&self, source: &'a [char]) -> &'a [char] {
         if cfg!(debug_assertions) {
             assert!(self.start < self.end);
+            assert!(self.start < source.len());
             assert!(self.end <= source.len());
         }
 
@@ -46,6 +47,12 @@
         cloned.set_len(length);
         cloned
     }
+
+    // Add an amount to both [`Self::start`] and [`Self::end`]
+    pub fn offset(&mut self, by: usize) {
+        self.start += by;
+        self.end += by;
+    }
 }
 
 #[cfg(test)]
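The new `offset` method is what lets the Markdown lexer shift spans from event-local coordinates back into whole-document coordinates. A tiny sketch, assuming `Span` is exported from `harper_core` and its `start`/`end` fields are public, as the diff implies:

    use harper_core::Span;

    fn shift_span_example() {
        // A token lexed at chars 2..7 of a Markdown text event that itself
        // begins at char 9 of the full document.
        let mut span = Span { start: 2, end: 7 };
        span.offset(9);
        assert_eq!((span.start, span.end), (11, 16));
    }
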
2 changes: 1 addition & 1 deletion harper-ls/src/diagnostics.rs
@@ -77,7 +77,7 @@ fn open_url(url: &Url) -> Result<String> {
 
 #[cached::proc_macro::cached]
 fn lint_string(text: String) -> Vec<Lint> {
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
     let dictionary = Dictionary::new();
     all_linters(&document, dictionary)
 }

12 changes: 7 additions & 5 deletions harper-serve/src/main.rs
@@ -3,7 +3,7 @@
 use harper_core::{all_linters, Dictionary, Document, FatToken, Lint, Span, Suggestion};
 use std::net::SocketAddr;
 use tokio::time::Instant;
-use tracing::{debug, info, Level};
+use tracing::{info, Level};
 use tracing_subscriber::FmtSubscriber;
 
 use axum::{
@@ -12,7 +12,7 @@
     http::StatusCode,
     middleware::{self, Next},
     response::Response,
-    routing::{get, post},
+    routing::post,
     Json, Router,
 };
 use serde::{Deserialize, Serialize};
@@ -41,6 +41,8 @@ async fn main() {
 }
 
 async fn timing_middleware(request: Request<Body>, next: Next<Body>) -> Response {
+    info!("Handling request at endpoint: {}", request.uri().path());
+
     let uri = request.uri().clone();
 
     let start = Instant::now();
@@ -69,7 +71,7 @@ async fn root() -> &'static str {
 async fn parse_text(Json(payload): Json<ParseRequest>) -> (StatusCode, Json<ParseResponse>) {
     let text = payload.text;
 
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
     let tokens: Vec<_> = document.fat_tokens().collect();
 
     (StatusCode::ACCEPTED, Json(ParseResponse { tokens }))
@@ -89,7 +91,7 @@ async fn lint(Json(payload): Json<LintRequest>) -> (StatusCode, Json<LintResponse>) {
     let text = payload.text;
 
     let dictionary = Dictionary::new();
-    let document = Document::new(&text);
+    let document = Document::new(&text, true);
 
     let lints = all_linters(&document, dictionary);
 
@@ -110,7 +112,7 @@ async fn apply_suggestion(
     Json(payload): Json<ApplySuggestionRequest>,
 ) -> (StatusCode, Json<ApplySuggestionResponse>) {
     let text = payload.text;
-    let mut document = Document::new(&text);
+    let mut document = Document::new(&text, true);
     document.apply_suggestion(&payload.suggestion, payload.span);
 
     (
(Diffs for the remaining 2 of the 11 changed files were not loaded.)