Skip to content

Commit

Permalink
Merge pull request #49 from allo-media/master
Browse files Browse the repository at this point in the history
Version 2.6.0: Portuguese support
  • Loading branch information
rtxm authored Jan 10, 2025
2 parents 4f4238b + c7f0a9f commit 82b48f0
Show file tree
Hide file tree
Showing 15 changed files with 502 additions and 47 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[package]
name = "text2num"
version = "2.5.2"
version = "2.6.0"
authors = ["Allo-Media <contact@allo-media.fr>"]
edition = "2021"
license = "MIT"
description = "Parse and convert numbers written in English, Dutch, Spanish, German, Italian or French into their digit representation."
description = "Parse and convert numbers written in English, Dutch, Spanish, Portuguese, German, Italian or French into their digit representation."
keywords = ["NLP", "words-to-numbers"]
categories = ["text-processing"]
repository = "https://github.com/allo-media/text2num-rs"
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Parse and convert numbers written in English, Dutch, Spanish, German, Italian or French into their digit representation.
# Parse and convert numbers written in English, Dutch, Spanish, Portuguese (Europe & Brazil), German, Italian or French into their digit representation.

This crate provides a library for recognizing, parsing and transcribing into digits (base 10) numbers expressed in natural language.

Expand Down
6 changes: 6 additions & 0 deletions src/digit_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,10 +169,16 @@ impl DigitString {
position > max_pos || self.buffer[max_pos - position] == b'0'
}

/// Returns `true` only when the digit string is strictly empty: no leading
/// zeroes have been recorded and the digit buffer holds nothing.
pub fn is_empty(&self) -> bool {
    self.leading_zeroes == 0 && self.buffer.is_empty()
}

/// Returns `true` when the digit buffer is empty, regardless of how many
/// leading zeroes were recorded.
///
/// Unlike [`is_empty`](Self::is_empty), a string made only of leading zeroes
/// is still considered null.
pub fn is_null(&self) -> bool {
self.buffer.is_empty()
}

/// Total length of the digit string: the recorded leading zeroes plus the
/// number of bytes held in the buffer.
pub fn len(&self) -> usize {
    self.leading_zeroes + self.buffer.len()
}
Expand Down
12 changes: 7 additions & 5 deletions src/lang/de/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! German number interpretor
//! German number interpreter
//!
//! This interpretor is tolerant and accepts splitted words, that is "ein und zwanzig" is treated like "einundzwanzig", as
//! This interpreter is tolerant and accepts split words, that is "ein und zwanzig" is treated like "einundzwanzig", as
//! the main application, Speech-to-text recognition, may introduce spurious spaces.
use bitflags::bitflags;
Expand All @@ -11,7 +11,7 @@ use crate::tokenizer::WordSplitter;

mod vocabulary;

use super::{LangInterpretor, MorphologicalMarker};
use super::{LangInterpreter, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

fn lemmatize(word: &str) -> &str {
Expand All @@ -21,7 +21,7 @@ fn lemmatize(word: &str) -> &str {
|| word.ends_with("ten")
|| word.ends_with("tem")
{
word.trim_end_matches(&['s', 'n', 'm', 'r'])
word.trim_end_matches(['s', 'n', 'm', 'r'])
} else {
word
}
Expand Down Expand Up @@ -68,7 +68,7 @@ impl German {
}
}

impl LangInterpretor for German {
impl LangInterpreter for German {
fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
// In German, numbers are compounded to form a group
let lemma = lemmatize(num_func);
Expand Down Expand Up @@ -337,6 +337,7 @@ mod tests {
assert_invalid!("zwanzig zweitausend");
assert_invalid!("eine und zwanzig");
assert_invalid!("eins und zwanzig");
assert_invalid!("neun zwanzig");
}

#[test]
Expand Down Expand Up @@ -379,6 +380,7 @@ mod tests {
);
assert_replace_numbers!("Einhundert und Ende", "100 und Ende");
assert_replace_numbers!("Einhundert und und", "100 und und");
assert_replace_numbers!("neun zwanzig", "9 20");
}

#[test]
Expand Down
6 changes: 3 additions & 3 deletions src/lang/en/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
//! English number interpretor
//! English number interpreter
use crate::digit_string::DigitString;
use crate::error::Error;

mod vocabulary;

use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker};
use super::{BasicAnnotate, LangInterpreter, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

fn lemmatize(word: &str) -> &str {
Expand All @@ -26,7 +26,7 @@ impl English {
}
}

impl LangInterpretor for English {
impl LangInterpreter for English {
fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
// In English, numbers can be compounded to form a group with "-"
if num_func.contains('-') {
Expand Down
6 changes: 3 additions & 3 deletions src/lang/es/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
//! Spanish number interpretor
//! Spanish number interpreter
use crate::digit_string::DigitString;
use crate::error::Error;

mod vocabulary;

use super::{LangInterpretor, MorphologicalMarker};
use super::{LangInterpreter, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

fn lemmatize(word: &str) -> &str {
Expand All @@ -27,7 +27,7 @@ impl Spanish {
}
}

impl LangInterpretor for Spanish {
impl LangInterpreter for Spanish {
fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
let num_marker = self.get_morph_marker(num_func);
if !b.is_empty() && num_marker != b.marker && !num_marker.is_fraction() {
Expand Down
7 changes: 4 additions & 3 deletions src/lang/fr/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! French number interpretor.
//! French number interpreter.
//!
//! It supports regional variants.
use bitflags::bitflags;
Expand All @@ -8,7 +8,7 @@ use crate::error::Error;

mod vocabulary;

use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker};
use super::{BasicAnnotate, LangInterpreter, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

fn lemmatize(word: &str) -> &str {
Expand Down Expand Up @@ -43,7 +43,7 @@ bitflags! {
}
}

impl LangInterpretor for French {
impl LangInterpreter for French {
fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
// In French, numbers can be compounded to form a group with "-"
if num_func.contains('-') {
Expand Down Expand Up @@ -493,6 +493,7 @@ mod tests {
assert_replace_numbers!("cinq cent premiers", "500 premiers");
assert_replace_numbers!("cinq cent premier", "500 premier");
assert_replace_all_numbers!("une seconde", "une seconde");
assert_replace_numbers!("vingt-cinquième et trentième", "25ème et 30ème");
}

#[test]
Expand Down
6 changes: 3 additions & 3 deletions src/lang/it/mod.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
//! Italian number interpretor
//! Italian number interpreter
use crate::digit_string::DigitString;
use crate::error::Error;
use crate::tokenizer::WordSplitter;

mod vocabulary;

use super::{LangInterpretor, MorphologicalMarker};
use super::{LangInterpreter, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

pub struct Italian {
Expand Down Expand Up @@ -76,7 +76,7 @@ impl Italian {
}
}

impl LangInterpretor for Italian {
impl LangInterpreter for Italian {
fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
let lemma = lemmatize(num_func);
if self.word_splitter.is_splittable(lemma) {
Expand Down
17 changes: 12 additions & 5 deletions src/lang/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ a subset of the language that is "simple" and consistent enough to be interprete
A number expressed in words is then seen as a little program whose interpretation result is either a sequence of digits, if the number is valid, or an
error.
The common runtime for all interpretors is the [`DigitString`]. It provided the memory
The common runtime for all interpreters is the [`DigitString`]. It provides the memory
and the elementary functions to build a number in base 10 (even if the language to be interpreted counts otherwise).
The `DigitString` is responsible for checking the validity of the constructed number at each step (i.e. at each method call).
The intepretor part, which is specific to each language, is built by implementing the `LangInterpretor` trait, which
The interpreter part, which is specific to each language, is built by implementing the `LangInterpreter` trait, which
translates each number word into a sequence of elementary instructions on a `DigitString`.
A language is just an empty (stateless) type. Everything is provided by implementing the trait.
Expand All @@ -28,6 +28,7 @@ mod es;
mod fr;
mod it;
mod nl;
mod pt;

use crate::digit_string::DigitString;

Expand All @@ -39,6 +40,7 @@ pub use es::Spanish;
pub use fr::French;
pub use it::Italian;
pub use nl::Dutch;
pub use pt::Portuguese;

pub trait BasicAnnotate {
fn text_lowercase(&self) -> &str;
Expand Down Expand Up @@ -73,7 +75,7 @@ impl MorphologicalMarker {
///
/// All methods must be implemented except the [`exec_group`](Self::exec_group), which comes with a default implementation.
/// Self must be stateless.
pub trait LangInterpretor {
pub trait LangInterpreter {
/// Interpret the word `num_func`, that may be part of a larger sequence.
///
/// `num_func` is interpreted by calling the appropriate methods on `b`.
Expand Down Expand Up @@ -136,6 +138,7 @@ pub enum Language {
Italian(Italian),
Spanish(Spanish),
Dutch(Dutch),
Portuguese(Portuguese),
}

impl Language {
Expand All @@ -162,6 +165,10 @@ impl Language {
pub fn dutch() -> Self {
Language::Dutch(Dutch::default())
}

pub fn portuguese() -> Self {
Language::Portuguese(Portuguese::default())
}
}

macro_rules! delegate {
Expand Down Expand Up @@ -229,6 +236,6 @@ macro_rules! delegate {
};
}

impl LangInterpretor for Language {
delegate!(Dutch, French, English, German, Italian, Spanish);
impl LangInterpreter for Language {
delegate!(Dutch, French, English, German, Italian, Spanish, Portuguese);
}
8 changes: 4 additions & 4 deletions src/lang/nl/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Dutch number interpretor
//! Dutch number interpreter
//!
//! This interpretor is tolerant and accepts splitted words, that is "negen en zeventig" is treated like "negenenzeventig", as
//! This interpreter is tolerant and accepts split words, that is "negen en zeventig" is treated like "negenenzeventig", as
//! the main application, Speech-to-text recognition, may introduce spurious spaces.
use bitflags::bitflags;
Expand All @@ -11,7 +11,7 @@ use crate::tokenizer::WordSplitter;

mod vocabulary;

use super::{LangInterpretor, MorphologicalMarker};
use super::{LangInterpreter, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

bitflags! {
Expand Down Expand Up @@ -83,7 +83,7 @@ impl Dutch {
}
}

impl LangInterpretor for Dutch {
impl LangInterpreter for Dutch {
fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
// In Dutch, numbers are compounded to form a group
if self.word_splitter.is_splittable(num_func) {
Expand Down
Loading

0 comments on commit 82b48f0

Please sign in to comment.