Merge pull request #49 from allo-media/master

Version 2.6.0: Portuguese support
allo-media · Jan 10, 2025 · 82b48f0 · 82b48f0
2 parents 4f4238b + c7f0a9f
commit 82b48f0
Show file tree

Hide file tree

Showing 15 changed files with 502 additions and 47 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,10 +1,10 @@
 [package]
 name = "text2num"
-version = "2.5.2"
+version = "2.6.0"
 authors = ["Allo-Media <contact@allo-media.fr>"]
 edition = "2021"
 license = "MIT"
-description = "Parse and convert numbers written in English, Dutch, Spanish, German, Italian or French into their digit representation."
+description = "Parse and convert numbers written in English, Dutch, Spanish, Portuguese, German, Italian or French into their digit representation."
 keywords = ["NLP", "words-to-numbers"]
 categories = ["text-processing"]
 repository = "https://github.com/allo-media/text2num-rs"

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Parse and convert numbers written in English, Dutch, Spanish, German, Italian or French into their digit representation.
+# Parse and convert numbers written in English, Dutch, Spanish, Portuguese (Europe & Brazil), German, Italian or French into their digit representation.
 
 This crate provides a library for recognizing, parsing and transcribing into digits (base 10) numbers expressed in natural language.
 

diff --git a/src/digit_string.rs b/src/digit_string.rs
@@ -169,10 +169,16 @@ impl DigitString {
         position > max_pos || self.buffer[max_pos - position] == b'0'
     }
 
+    /// Check strict emptiness: the buffer is empty and there are no leading zeroes either.
     pub fn is_empty(&self) -> bool {
         self.buffer.is_empty() && self.leading_zeroes == 0
     }
 
+    /// Check whether the buffer is empty, i.e. there is nothing at all or only leading zeroes.
+    pub fn is_null(&self) -> bool {
+        self.buffer.is_empty()
+    }
+
     pub fn len(&self) -> usize {
         self.buffer.len() + self.leading_zeroes
     }

diff --git a/src/lang/de/mod.rs b/src/lang/de/mod.rs
@@ -1,6 +1,6 @@
-//! German number interpretor
+//! German number interpreter
 //!
-//! This interpretor is tolerant and accepts splitted words, that is "ein und zwanzig" is treated like "einundzwanzig", as
+//! This interpreter is tolerant and accepts split words, that is "ein und zwanzig" is treated like "einundzwanzig", as
 //! the main application, Speech-to-text recognition, may introduce spurious spaces.
 
 use bitflags::bitflags;
@@ -11,7 +11,7 @@ use crate::tokenizer::WordSplitter;
 
 mod vocabulary;
 
-use super::{LangInterpretor, MorphologicalMarker};
+use super::{LangInterpreter, MorphologicalMarker};
 use vocabulary::INSIGNIFICANT;
 
 fn lemmatize(word: &str) -> &str {
@@ -21,7 +21,7 @@ fn lemmatize(word: &str) -> &str {
         || word.ends_with("ten")
         || word.ends_with("tem")
     {
-        word.trim_end_matches(&['s', 'n', 'm', 'r'])
+        word.trim_end_matches(['s', 'n', 'm', 'r'])
     } else {
         word
     }
@@ -68,7 +68,7 @@ impl German {
     }
 }
 
-impl LangInterpretor for German {
+impl LangInterpreter for German {
     fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
         // In German, numbers are compounded to form a group
         let lemma = lemmatize(num_func);
@@ -337,6 +337,7 @@ mod tests {
         assert_invalid!("zwanzig zweitausend");
         assert_invalid!("eine und zwanzig");
         assert_invalid!("eins und zwanzig");
+        assert_invalid!("neun zwanzig");
     }
 
     #[test]
@@ -379,6 +380,7 @@ mod tests {
         );
         assert_replace_numbers!("Einhundert und Ende", "100 und Ende");
         assert_replace_numbers!("Einhundert und und", "100 und und");
+        assert_replace_numbers!("neun zwanzig", "9 20");
     }
 
     #[test]

diff --git a/src/lang/en/mod.rs b/src/lang/en/mod.rs
@@ -1,11 +1,11 @@
-//! English number interpretor
+//! English number interpreter
 
 use crate::digit_string::DigitString;
 use crate::error::Error;
 
 mod vocabulary;
 
-use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker};
+use super::{BasicAnnotate, LangInterpreter, MorphologicalMarker};
 use vocabulary::INSIGNIFICANT;
 
 fn lemmatize(word: &str) -> &str {
@@ -26,7 +26,7 @@ impl English {
     }
 }
 
-impl LangInterpretor for English {
+impl LangInterpreter for English {
     fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
         // In English, numbers can be compounded to form a group with "-"
         if num_func.contains('-') {

diff --git a/src/lang/es/mod.rs b/src/lang/es/mod.rs
@@ -1,10 +1,10 @@
-//! Spanish number interpretor
+//! Spanish number interpreter
 use crate::digit_string::DigitString;
 use crate::error::Error;
 
 mod vocabulary;
 
-use super::{LangInterpretor, MorphologicalMarker};
+use super::{LangInterpreter, MorphologicalMarker};
 use vocabulary::INSIGNIFICANT;
 
 fn lemmatize(word: &str) -> &str {
@@ -27,7 +27,7 @@ impl Spanish {
     }
 }
 
-impl LangInterpretor for Spanish {
+impl LangInterpreter for Spanish {
     fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
         let num_marker = self.get_morph_marker(num_func);
         if !b.is_empty() && num_marker != b.marker && !num_marker.is_fraction() {

diff --git a/src/lang/fr/mod.rs b/src/lang/fr/mod.rs
@@ -1,4 +1,4 @@
-//! French number interpretor.
+//! French number interpreter.
 //!
 //! It supports regional variants.
 use bitflags::bitflags;
@@ -8,7 +8,7 @@ use crate::error::Error;
 
 mod vocabulary;
 
-use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker};
+use super::{BasicAnnotate, LangInterpreter, MorphologicalMarker};
 use vocabulary::INSIGNIFICANT;
 
 fn lemmatize(word: &str) -> &str {
@@ -43,7 +43,7 @@ bitflags! {
     }
 }
 
-impl LangInterpretor for French {
+impl LangInterpreter for French {
     fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
         // In French, numbers can be compounded to form a group with "-"
         if num_func.contains('-') {
@@ -493,6 +493,7 @@ mod tests {
         assert_replace_numbers!("cinq cent premiers", "500 premiers");
         assert_replace_numbers!("cinq cent premier", "500 premier");
         assert_replace_all_numbers!("une seconde", "une seconde");
+        assert_replace_numbers!("vingt-cinquième et trentième", "25ème et 30ème");
     }
 
     #[test]

diff --git a/src/lang/it/mod.rs b/src/lang/it/mod.rs
@@ -1,12 +1,12 @@
-//! Italian number interpretor
+//! Italian number interpreter
 
 use crate::digit_string::DigitString;
 use crate::error::Error;
 use crate::tokenizer::WordSplitter;
 
 mod vocabulary;
 
-use super::{LangInterpretor, MorphologicalMarker};
+use super::{LangInterpreter, MorphologicalMarker};
 use vocabulary::INSIGNIFICANT;
 
 pub struct Italian {
@@ -76,7 +76,7 @@ impl Italian {
     }
 }
 
-impl LangInterpretor for Italian {
+impl LangInterpreter for Italian {
     fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
         let lemma = lemmatize(num_func);
         if self.word_splitter.is_splittable(lemma) {

diff --git a/src/lang/mod.rs b/src/lang/mod.rs
@@ -11,11 +11,11 @@ a subset of the language that is "simple" and consistent enough to be interprete
 A number expressed in words is then seen as a little program whose interpretation result is either a sequence of digits, if the number is valid, or an
 error.
 
-The common runtime for all interpretors is the [`DigitString`]. It provided the memory
+The common runtime for all interpreters is the [`DigitString`]. It provides the memory
 and the elementary functions to build a number in base 10 (even if the language to be interpreted counts otherwise).
 The `DigitString` is responsible for checking the validity of the constructed number at each step (i.e at each method call).
 
-The intepretor part, which is specific to each language, is built by implementing the `LangInterpretor` trait, which
+The interpreter part, which is specific to each language, is built by implementing the `LangInterpreter` trait, which
 translate each number word into a sequence of elementary instructions on a `DigitString`.
 
 A language is just an empty (stateless) type. Everything is provided by implementating the trait.
@@ -28,6 +28,7 @@ mod es;
 mod fr;
 mod it;
 mod nl;
+mod pt;
 
 use crate::digit_string::DigitString;
 
@@ -39,6 +40,7 @@ pub use es::Spanish;
 pub use fr::French;
 pub use it::Italian;
 pub use nl::Dutch;
+pub use pt::Portuguese;
 
 pub trait BasicAnnotate {
     fn text_lowercase(&self) -> &str;
@@ -73,7 +75,7 @@ impl MorphologicalMarker {
 ///
 /// All methods must be implemented except the [`exec_group`](Self::exec_group), which comes with a default implementation.
 /// Self must be stateless.
-pub trait LangInterpretor {
+pub trait LangInterpreter {
     /// Interpret the word `num_func`, that may be part of a larger sequence.
     ///
     /// `num_func` is interpreted by calling the appropriate methods on `b`.
@@ -136,6 +138,7 @@ pub enum Language {
     Italian(Italian),
     Spanish(Spanish),
     Dutch(Dutch),
+    Portuguese(Portuguese),
 }
 
 impl Language {
@@ -162,6 +165,10 @@ impl Language {
     pub fn dutch() -> Self {
         Language::Dutch(Dutch::default())
     }
+
+    pub fn portuguese() -> Self {
+        Language::Portuguese(Portuguese::default())
+    }
 }
 
 macro_rules! delegate {
@@ -229,6 +236,6 @@ macro_rules! delegate {
     };
 }
 
-impl LangInterpretor for Language {
-    delegate!(Dutch, French, English, German, Italian, Spanish);
+impl LangInterpreter for Language {
+    delegate!(Dutch, French, English, German, Italian, Spanish, Portuguese);
 }
diff --git a/src/lang/nl/mod.rs b/src/lang/nl/mod.rs
@@ -1,6 +1,6 @@
-//! Dutch number interpretor
+//! Dutch number interpreter
 //!
-//! This interpretor is tolerant and accepts splitted words, that is "negen en zeventig" is treated like "negenenzeventig", as
+//! This interpreter is tolerant and accepts split words, that is "negen en zeventig" is treated like "negenenzeventig", as
 //! the main application, Speech-to-text recognition, may introduce spurious spaces.
 
 use bitflags::bitflags;
@@ -11,7 +11,7 @@ use crate::tokenizer::WordSplitter;
 
 mod vocabulary;
 
-use super::{LangInterpretor, MorphologicalMarker};
+use super::{LangInterpreter, MorphologicalMarker};
 use vocabulary::INSIGNIFICANT;
 
 bitflags! {
@@ -83,7 +83,7 @@ impl Dutch {
     }
 }
 
-impl LangInterpretor for Dutch {
+impl LangInterpreter for Dutch {
     fn apply(&self, num_func: &str, b: &mut DigitString) -> Result<(), Error> {
         // In Dutch, numbers are compounded to form a group
         if self.word_splitter.is_splittable(num_func) {