Can now parse and lint Rust comments

Automattic · Jan 25, 2024 · 17353b2 · 17353b2
1 parent 61dcf22
commit 17353b2
Show file tree

Hide file tree

Showing 9 changed files with 113 additions and 56 deletions.
diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs
@@ -2,10 +2,11 @@ use std::fmt::Display;
 
 use itertools::Itertools;
 
+use crate::parsers::{MarkdownParser, Parser, PlainEnglishParser};
 use crate::{
     linting::Suggestion,
     span::Span,
-    FatToken, MarkdownParser, Parser, PlainEnglishParser,
+    FatToken,
     Punctuation::{self},
     Token, TokenKind,
 };
@@ -18,7 +19,6 @@ pub struct Document {
 
 impl Document {
     /// Lexes and parses text to produce a document.
-    ///
     /// Choosing to parse with markdown may have a performance penalty
     pub fn new(text: &str, parser: Box<dyn Parser>) -> Self {
         let source: Vec<_> = text.chars().collect();

diff --git a/harper-core/src/lib.rs b/harper-core/src/lib.rs
@@ -3,15 +3,14 @@
 mod document;
 mod lexing;
 mod linting;
-mod parsers;
+pub mod parsers;
 mod span;
 mod spell;
 mod token;
 
 pub use document::Document;
 pub use linting::LintSet;
 pub use linting::{Lint, LintKind, Linter, Suggestion};
-pub use parsers::{MarkdownParser, Parser, PlainEnglishParser};
 pub use span::Span;
 pub use spell::Dictionary;
 pub use token::{FatToken, Punctuation, Token, TokenKind, TokenStringExt};
diff --git a/harper-core/src/linting/lint_set.rs b/harper-core/src/linting/lint_set.rs
@@ -60,6 +60,12 @@ impl LintSet {
     }
 }
 
+impl Default for LintSet {
+    /// Equivalent to [`LintSet::new`].
+    fn default() -> Self {
+        LintSet::new()
+    }
+}
+
 /// Create builder methods for the linters that do not take any arguments.
 macro_rules! create_simple_builder_methods {
     ($($linter:ident),*) => {

diff --git a/harper-core/src/linting/spell_check.rs b/harper-core/src/linting/spell_check.rs
@@ -29,7 +29,7 @@ impl SpellCheck {
         self.word_cache
             .entry(word.clone())
             .or_insert_with(|| {
-                suggest_correct_spelling(&word, 100, 3, &self.dictionary)
+                suggest_correct_spelling(&word, 100, 2, &self.dictionary)
                     .into_iter()
                     .map(|v| v.to_vec())
                     .collect()

diff --git a/harper-core/src/parsers/mod.rs b/harper-core/src/parsers/mod.rs
@@ -25,9 +25,9 @@ where
 
 #[cfg(test)]
 mod tests {
-    use super::{MarkdownParser, PlainEnglishParser};
+    use super::{MarkdownParser, Parser, PlainEnglishParser};
     use crate::{
-        Parser, Punctuation,
+        Punctuation,
         TokenKind::{self, *},
     };
 

diff --git a/harper-ls/src/backend.rs b/harper-ls/src/backend.rs
@@ -1,6 +1,9 @@
-use std::{borrow::Borrow, collections::HashMap, fs};
+use std::{collections::HashMap, fs};
 
-use harper_core::{Dictionary, Document, Lint, LintSet, Linter, MarkdownParser};
+use harper_core::{
+    parsers::{MarkdownParser, Parser},
+    Dictionary, Document, LintSet, Linter,
+};
 use tokio::sync::Mutex;
 use tower_lsp::{
     jsonrpc::Result,
@@ -18,6 +21,7 @@ use tower_lsp::{
 use crate::{
     diagnostics::{lint_to_code_actions, lints_to_diagnostics},
     pos_conv::range_to_span,
+    rust_parser::RustParser,
 };
 
 pub struct Backend {
@@ -33,17 +37,17 @@ impl Backend {
     }
 
     async fn update_document(&self, url: &Url, text: &str) {
-        let doc = Document::new(text, Box::new(MarkdownParser));
-        let mut files = self.files.lock().await;
-        files.insert(url.clone(), doc);
-    }
+        let mut parser: Box<dyn Parser> = Box::new(MarkdownParser);
 
-    async fn generate_lints_for_url(&self, url: &Url) -> Option<Vec<Lint>> {
-        let files = self.files.lock().await;
-        let file_contents = files.get(url)?;
+        if let Some(extension) = url.to_file_path().unwrap().extension() {
+            if extension == "rs" {
+                parser = Box::new(RustParser)
+            }
+        }
 
-        let mut linter = self.linter.lock().await;
-        Some(linter.lint(file_contents))
+        let doc = Document::new(text, parser);
+        let mut files = self.files.lock().await;
+        files.insert(url.clone(), doc);
     }
 
     async fn generate_code_actions(&self, url: &Url, range: Range) -> Result<Vec<CodeAction>> {

diff --git a/harper-ls/src/comments.rs b/harper-ls/src/comments.rs
diff --git a/harper-ls/src/main.rs b/harper-ls/src/main.rs
@@ -1,8 +1,8 @@
 use tokio::net::TcpListener;
 mod backend;
-mod comments;
 mod diagnostics;
 mod pos_conv;
+mod rust_parser;
 
 use backend::Backend;
 use clap::Parser;

diff --git a/harper-ls/src/rust_parser.rs b/harper-ls/src/rust_parser.rs
@@ -0,0 +1,85 @@
+use harper_core::{
+    parsers::{Parser, PlainEnglishParser},
+    Span,
+};
+use tree_sitter::TreeCursor;
+
+/// A [`Parser`] that only looks at the comments of Rust source code.
+///
+/// The source is parsed with TreeSitter, and every comment found is handed
+/// to [`PlainEnglishParser`].
+pub struct RustParser;
+
+impl Parser for RustParser {
+    /// Tokenizes the comments found in `source`.
+    ///
+    /// Returns an empty list when TreeSitter produces no syntax tree. Token
+    /// spans are shifted so that they index into `source` rather than into
+    /// the individual comment they came from.
+    fn parse(&mut self, source: &[char]) -> Vec<harper_core::Token> {
+        // TreeSitter is given the text as a string, not as a slice of chars.
+        let text: String = source.iter().collect();
+
+        let mut tree_sitter_parser = tree_sitter::Parser::new();
+        tree_sitter_parser
+            .set_language(tree_sitter_rust::language())
+            .unwrap();
+
+        // TODO: Use incremental parsing
+        let tree = match tree_sitter_parser.parse(&text, None) {
+            Some(tree) => tree,
+            None => return vec![],
+        };
+
+        let mut comment_spans = Vec::new();
+        extract_comments(&mut tree.walk(), &mut comment_spans);
+        // The collected spans are byte ranges, while `source` is indexed by char.
+        byte_spans_to_char_spans(&mut comment_spans, &text);
+
+        let mut comment_parser = PlainEnglishParser;
+        let mut tokens = Vec::new();
+
+        for comment in comment_spans {
+            let mut comment_tokens = comment_parser.parse(&source[comment.start..comment.end]);
+
+            // Spans come back relative to the comment; make them relative to `source`.
+            for token in comment_tokens.iter_mut() {
+                token.span.offset(comment.start);
+            }
+
+            tokens.append(&mut comment_tokens);
+        }
+
+        tokens
+    }
+}
+
+/// Rewrites byte-indexed [`Span`]s as char-indexed [`Span`]s, in-place.
+/// NOTE: As a side effect, the slice ends up sorted by [`Span::start`].
+///
+/// The spans must not overlap: each one is measured from the end of the
+/// previous one.
+fn byte_spans_to_char_spans(byte_spans: &mut [Span], source: &str) {
+    byte_spans.sort_by_key(|span| span.start);
+
+    // Byte offset where the previous span ended, and the char index of that
+    // same position.
+    let mut byte_cursor = 0;
+    let mut char_cursor = 0;
+
+    for span in byte_spans.iter_mut() {
+        let (byte_start, byte_end) = (span.start, span.end);
+
+        // Chars in the gap between the previous span and this one.
+        char_cursor += source[byte_cursor..byte_start].chars().count();
+        span.start = char_cursor;
+
+        // Chars covered by the span itself.
+        char_cursor += source[byte_start..byte_end].chars().count();
+        span.end = char_cursor;
+
+        byte_cursor = byte_end;
+    }
+}
+
+/// Walks the subtree below the cursor's current node, collecting comments.
+///
+/// Every node whose kind contains `"comment"` has its BYTE range pushed onto
+/// `comments`. The cursor is back on its starting node when this returns.
+fn extract_comments(cursor: &mut TreeCursor, comments: &mut Vec<Span>) {
+    if !cursor.goto_first_child() {
+        return;
+    }
+
+    // Handle the current node *before* advancing, so that the first child is
+    // visited too; advancing in the loop condition would skip it.
+    loop {
+        let node = cursor.node();
+
+        if node.kind().contains("comment") {
+            comments.push(node.byte_range().into());
+        } else {
+            // Comments are not descended into: a span nested inside another
+            // comment would overlap it, and `byte_spans_to_char_spans`
+            // assumes that spans never overlap.
+            extract_comments(cursor, comments);
+        }
+
+        if !cursor.goto_next_sibling() {
+            break;
+        }
+    }
+
+    cursor.goto_parent();
+}