Skip to content

Commit

Permalink
feat: created language detection demo page
Browse files Browse the repository at this point in the history
  • Loading branch information
elijah-potter committed Oct 22, 2024
1 parent 9713f5d commit 178bfe0
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 5 deletions.
2 changes: 1 addition & 1 deletion harper-core/src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ impl Document {
/// [`Punctuation::Quote::twin_loc`] field. This is on a best effort
/// basis.
///
/// Current algorithm is very basic and could use some work.
/// Current algorithm is basic and could use some work.
fn match_quotes(&mut self) {
let quote_indices: Vec<usize> = self.tokens.iter_quote_indices().collect();

Expand Down
6 changes: 6 additions & 0 deletions harper-core/src/language_detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary
let mut total_words = 0;
let mut valid_words = 0;
let mut punctuation = 0;
let mut unlintable = 0;

for token in toks {
match token.kind {
Expand All @@ -22,10 +23,15 @@ pub fn is_likely_english(toks: &[Token], source: &[char], dict: &impl Dictionary
}
}
TokenKind::Punctuation(_) => punctuation += 1,
TokenKind::Unlintable => unlintable += 1,
_ => (),
}
}

if unlintable > valid_words {
return false;
}

if (punctuation as f32 * 1.25) > valid_words as f32 {
return false;
}
Expand Down
35 changes: 35 additions & 0 deletions harper-core/src/parsers/isolate_english.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use crate::{language_detection::is_likely_english, Dictionary};

use super::{Parser, Token, TokenStringExt};

/// A parser that wraps another, using heuristics to quickly redact the parts of a document that
/// aren't intended to be English text.
///
/// The wrapped parser runs first; its tokens are then filtered sentence by sentence, keeping
/// only the sentences that [`is_likely_english`] accepts.
pub struct IsolateEnglish<D: Dictionary> {
/// The wrapped parser that produces the initial token stream.
inner: Box<dyn Parser>,
/// Dictionary passed to [`is_likely_english`] when judging each sentence.
dict: D,
}

impl<D: Dictionary> IsolateEnglish<D> {
    /// Creates an [`IsolateEnglish`] around an existing parser.
    ///
    /// * `inner` - the parser whose output will be filtered.
    /// * `dictionary` - the dictionary stored for the English-likelihood check.
    pub fn new(inner: Box<dyn Parser>, dictionary: D) -> Self {
        let dict = dictionary;

        Self { inner, dict }
    }
}

impl<D: Dictionary> Parser for IsolateEnglish<D> {
    /// Parses `source` with the wrapped parser, then returns only the tokens belonging to
    /// sentences that are longer than five tokens and that [`is_likely_english`] accepts.
    fn parse(&mut self, source: &[char]) -> Vec<Token> {
        let parsed = self.inner.parse(source);

        // At most every parsed token survives, so this capacity is an upper bound.
        let mut kept: Vec<Token> = Vec::with_capacity(parsed.len());

        for sentence in parsed.iter_sentences() {
            // Sentences of five tokens or fewer are always dropped.
            if sentence.len() <= 5 {
                continue;
            }

            if !is_likely_english(sentence, source, &self.dict) {
                continue;
            }

            kept.extend_from_slice(sentence);
        }

        kept
    }
}
2 changes: 2 additions & 0 deletions harper-core/src/parsers/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
mod collapse_identifiers;
mod isolate_english;
mod markdown;
mod mask;
mod plain_english;

use blanket::blanket;
pub use collapse_identifiers::CollapseIdentifiers;
pub use isolate_english::IsolateEnglish;
pub use markdown::Markdown;
pub use mask::Mask;
pub use plain_english::PlainEnglish;
Expand Down
19 changes: 16 additions & 3 deletions harper-wasm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::sync::Mutex;

use harper_core::language_detection::is_doc_likely_english;
use harper_core::linting::{LintGroup, LintGroupConfig, Linter};
use harper_core::parsers::Markdown;
use harper_core::parsers::{IsolateEnglish, Markdown, PlainEnglish};
use harper_core::{remove_overlaps, Document, FullDictionary, Lrc};
use once_cell::sync::Lazy;
use wasm_bindgen::prelude::wasm_bindgen;
Expand All @@ -28,13 +28,26 @@ pub fn setup() {
tracing_wasm::set_as_global_default();
}

/// Helper method to quickly check if a plain string is likely intended to be English.
///
/// Builds a plain-English [`Document`] from `text` and passes it, together with the curated
/// [`FullDictionary`], to [`is_doc_likely_english`], returning that function's verdict.
#[wasm_bindgen]
pub fn is_likely_english(text: String) -> bool {
    let document = Document::new_plain_english_curated(&text);
    is_doc_likely_english(&document, &FullDictionary::curated())
}

/// Helper method to remove non-English text from a plain English document.
///
/// Parses `text` with [`PlainEnglish`] wrapped in [`IsolateEnglish`], then returns the
/// resulting [`Document`] converted back to a `String`.
#[wasm_bindgen]
pub fn isolate_english(text: String) -> String {
    // Only the parser needs the dictionary, so it is moved in rather than cloned.
    let mut parser = IsolateEnglish::new(Box::new(PlainEnglish), FullDictionary::curated());

    let document = Document::new_curated(&text, &mut parser);

    document.to_string()
}

#[wasm_bindgen]
pub fn get_lint_config_as_object() -> JsValue {
let linter = LINTER.lock().unwrap();
Expand Down
6 changes: 6 additions & 0 deletions packages/web/src/lib/analysis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,9 @@ export async function isLikelyEnglish(text: string): Promise<boolean> {

return wasm.is_likely_english(text);
}

/**
 * Dynamically imports the `wasm` module and forwards `text` to its
 * `isolate_english` export, resolving with the string it returns.
 */
export async function isolateEnglish(text: string): Promise<string> {
	return import('wasm').then((wasm) => wasm.isolate_english(text));
}
6 changes: 5 additions & 1 deletion packages/web/src/routes/languagedetection/+page.svelte
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
<script lang="ts">
import { isLikelyEnglish } from '$lib/analysis';
import { isLikelyEnglish, isolateEnglish } from '$lib/analysis';
import { Textarea, Select } from 'flowbite-svelte';
import demoText from '../../../../../demo.md?raw';
let isEnglish: boolean | null = null;
let text = '';
let strippedText = '';
$: isLikelyEnglish(text).then((v) => (isEnglish = v));
$: isolateEnglish(text).then((t) => (strippedText = t));
$: color = isEnglish == null ? '' : isEnglish ? 'bg-green-100' : 'bg-red-100';
Expand Down Expand Up @@ -49,4 +51,6 @@
bind:value={text}
placeholder="Is your text supposed to be English?"
/>

<Textarea rows={8} value={strippedText} />
</div>

0 comments on commit 178bfe0

Please sign in to comment.