Skip to content

Commit

Permalink
Merge pull request #41 from allo-media/master
Browse files Browse the repository at this point in the history
v2.5.0
  • Loading branch information
rtxm authored Aug 23, 2024
2 parents 368e52b + 90755f3 commit a9b0d95
Show file tree
Hide file tree
Showing 11 changed files with 263 additions and 138 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "text2num"
version = "2.4.1"
version = "2.5.0"
authors = ["Allo-Media <contact@allo-media.fr>"]
edition = "2021"
license = "MIT"
Expand Down
10 changes: 3 additions & 7 deletions src/lang/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,16 +245,12 @@ impl LangInterpretor for German {
fn is_linking(&self, word: &str) -> bool {
INSIGNIFICANT.contains(word)
}

fn is_ambiguous(&self, _number: &str) -> bool {
false
}
}

#[cfg(test)]
mod tests {
use super::German;
use crate::word_to_digit::{replace_numbers, text2digits};
use crate::word_to_digit::{replace_numbers_in_text, text2digits};

macro_rules! assert_text2digits {
($text:expr, $res:expr) => {
Expand All @@ -269,14 +265,14 @@ mod tests {
macro_rules! assert_replace_numbers {
($text:expr, $res:expr) => {
let f = German::new();
assert_eq!(replace_numbers($text, &f, 10.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 10.0), $res)
};
}

macro_rules! assert_replace_all_numbers {
($text:expr, $res:expr) => {
let f = German::new();
assert_eq!(replace_numbers($text, &f, 0.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res)
};
}

Expand Down
48 changes: 42 additions & 6 deletions src/lang/en/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use crate::error::Error;

mod vocabulary;

use super::{LangInterpretor, MorphologicalMarker};
use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

fn lemmatize(word: &str) -> &str {
Expand Down Expand Up @@ -162,15 +162,49 @@ impl LangInterpretor for English {
INSIGNIFICANT.contains(word)
}

fn is_ambiguous(&self, _number: &str) -> bool {
false
/// Flag stand-alone "o" tokens as not-a-number.
///
/// Whitespace-only tokens are ignored when looking for neighbours. An "o" token is
/// left untouched when the closest preceding or following non-whitespace token is
/// accepted by `self.apply`; otherwise `set_nan(true)` is called on it.
fn basic_annotate<T: BasicAnnotate>(&self, tokens: &mut Vec<T>) {
    // Scratch buffer used only to probe whether a neighbouring word is accepted as a number word.
    let mut scratch = DigitString::new();
    // Positions of every token holding at least one non-whitespace character.
    let word_positions: Vec<usize> = tokens
        .iter()
        .enumerate()
        .filter(|(_, tok)| {
            tok.text_lowercase()
                .chars()
                .any(|c| !c.is_ascii_whitespace())
        })
        .map(|(pos, _)| pos)
        .collect();
    for (rank, &pos) in word_positions.iter().enumerate() {
        if tokens[pos].text_lowercase() != "o" {
            continue;
        }
        // The previous word is probed first; the next word is only probed when that fails.
        let follows_number = rank > 0
            && self
                .apply(
                    tokens[word_positions[rank - 1]].text_lowercase(),
                    &mut scratch,
                )
                .is_ok();
        let beside_number = follows_number
            || (rank + 1 < word_positions.len()
                && self
                    .apply(
                        tokens[word_positions[rank + 1]].text_lowercase(),
                        &mut scratch,
                    )
                    .is_ok());
        if beside_number {
            // Discard whatever the successful probe wrote into the scratch buffer.
            scratch.reset();
        } else {
            tokens[pos].set_nan(true);
        }
    }
}
}

#[cfg(test)]
mod tests {
use super::English;
use crate::word_to_digit::{replace_numbers, text2digits};
use crate::word_to_digit::{replace_numbers_in_text, text2digits};

macro_rules! assert_text2digits {
($text:expr, $res:expr) => {
Expand All @@ -185,14 +219,14 @@ mod tests {
macro_rules! assert_replace_numbers {
($text:expr, $res:expr) => {
let f = English {};
assert_eq!(replace_numbers($text, &f, 10.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 10.0), $res)
};
}

macro_rules! assert_replace_all_numbers {
($text:expr, $res:expr) => {
let f = English {};
assert_eq!(replace_numbers($text, &f, 0.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res)
};
}

Expand Down Expand Up @@ -264,6 +298,8 @@ mod tests {
assert_invalid!("five o");
assert_invalid!("fifty zero three");
assert_invalid!("fifty three zero");
assert_replace_all_numbers!("zero a b c", "0 a b c");
assert_replace_all_numbers!("o a b c", "o a b c");
}

#[test]
Expand Down
10 changes: 3 additions & 7 deletions src/lang/es/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,16 +167,12 @@ impl LangInterpretor for Spanish {
fn is_linking(&self, word: &str) -> bool {
INSIGNIFICANT.contains(word)
}

fn is_ambiguous(&self, _number: &str) -> bool {
false
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::word_to_digit::{replace_numbers, text2digits};
use crate::word_to_digit::{replace_numbers_in_text, text2digits};

macro_rules! assert_text2digits {
($text:expr, $res:expr) => {
Expand All @@ -191,14 +187,14 @@ mod tests {
macro_rules! assert_replace_numbers {
($text:expr, $res:expr) => {
let f = Spanish {};
assert_eq!(replace_numbers($text, &f, 10.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 10.0), $res)
};
}

macro_rules! assert_replace_all_numbers {
($text:expr, $res:expr) => {
let f = Spanish {};
assert_eq!(replace_numbers($text, &f, 0.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res)
};
}

Expand Down
48 changes: 41 additions & 7 deletions src/lang/fr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::error::Error;

mod vocabulary;

use super::{LangInterpretor, MorphologicalMarker};
use super::{BasicAnnotate, LangInterpretor, MorphologicalMarker};
use vocabulary::INSIGNIFICANT;

fn lemmatize(word: &str) -> &str {
Expand Down Expand Up @@ -237,15 +237,48 @@ impl LangInterpretor for French {
INSIGNIFICANT.contains(word)
}

fn is_ambiguous(&self, number: &str) -> bool {
number == "9"
/// Flag occurrences of "neuf" that closely follow an article as not-a-number.
///
/// The tokens are scanned left to right, remembering the most recent article
/// ("un", "le", "du" or "l'"). When "neuf" is met, `set_nan(true)` is called on it if:
///
/// * the article was "un" and one or two non-whitespace tokens separate them, or
/// * the article was "le", "du" or "l'" and "neuf" comes directly after it.
///
/// The remembered article is consumed by the first "neuf" that follows it, and is
/// dropped as soon as an intervening token is accepted by `self.apply`.
/// Whitespace-only tokens never count as separating words.
fn basic_annotate<T: BasicAnnotate>(&self, tokens: &mut Vec<T>) {
    // Most recent article seen, together with its index in `tokens`.
    let mut iart_seen: Option<(&str, usize)> = None;
    // Number of whitespace-only tokens met since that article.
    let mut num_wsp = 0;
    // Scratch buffer used only to probe whether a single word is accepted as a number word.
    let mut b = DigitString::new();
    for (i, token) in tokens.iter_mut().enumerate() {
        if token
            .text_lowercase()
            .chars()
            .all(|c| c.is_ascii_whitespace())
        {
            if iart_seen.is_some() {
                num_wsp += 1;
            }
            continue;
        }
        if matches!(token.text_lowercase(), "un" | "le" | "du" | "l'") {
            iart_seen.replace((token.text_lowercase(), i));
            num_wsp = 0;
        } else if token.text_lowercase() == "neuf" {
            if let Some((art, pos)) = iart_seen.take() {
                // Non-whitespace tokens strictly between the article and "neuf".
                let sep_words = i - pos - num_wsp - 1;
                match art {
                    "un" => {
                        if sep_words > 0 && sep_words < 3 {
                            token.set_nan(true);
                        }
                    }
                    _ if sep_words == 0 => token.set_nan(true),
                    _ => (),
                }
            }
        } else {
            // Probe each word against a clean buffer: without the reset, the digits
            // accumulated from earlier number words would decide whether this word is
            // accepted, instead of the word itself.
            b.reset();
            if self.apply(token.text_lowercase(), &mut b).is_ok() {
                iart_seen.take();
            }
        }
    }
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::word_to_digit::{replace_numbers, text2digits};
use crate::word_to_digit::{replace_numbers_in_text, text2digits};

macro_rules! assert_text2digits {
($text:expr, $res:expr) => {
Expand All @@ -260,14 +293,14 @@ mod tests {
macro_rules! assert_replace_numbers {
($text:expr, $res:expr) => {
let f = French {};
assert_eq!(replace_numbers($text, &f, 7.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 7.0), $res)
};
}

macro_rules! assert_replace_all_numbers {
($text:expr, $res:expr) => {
let f = French {};
assert_eq!(replace_numbers($text, &f, 0.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res)
};
}

Expand Down Expand Up @@ -454,6 +487,7 @@ mod tests {
assert_replace_numbers!("cinq cent unième", "501ème");
assert_replace_numbers!("cinq cent premiers", "500 premiers");
assert_replace_numbers!("cinq cent premier", "500 premier");
assert_replace_all_numbers!("une seconde", "une seconde");
}

#[test]
Expand Down Expand Up @@ -500,7 +534,7 @@ mod tests {
assert_replace_numbers!("un peu moins", "un peu moins");
// assert_replace_numbers!("onze c'est un peu plus", "11 c'est un peu plus");

assert_replace_numbers!("le logement neuf", "le logement neuf");
assert_replace_numbers!("le logement neuf", "le logement 9");
assert_replace_numbers!("le logement neuf deux sept", "le logement 9 2 7");
}

Expand Down
10 changes: 3 additions & 7 deletions src/lang/it/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,16 +288,12 @@ impl LangInterpretor for Italian {
fn is_linking(&self, word: &str) -> bool {
INSIGNIFICANT.contains(word)
}

fn is_ambiguous(&self, _number: &str) -> bool {
false
}
}

#[cfg(test)]
mod tests {
use super::*;
use crate::word_to_digit::{replace_numbers, text2digits};
use crate::word_to_digit::{replace_numbers_in_text, text2digits};

macro_rules! assert_text2digits {
($text:expr, $res:expr) => {
Expand All @@ -312,14 +308,14 @@ mod tests {
macro_rules! assert_replace_numbers {
($text:expr, $res:expr) => {
let f = Italian::default();
assert_eq!(replace_numbers($text, &f, 10.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 10.0), $res)
};
}

macro_rules! assert_replace_all_numbers {
($text:expr, $res:expr) => {
let f = Italian::default();
assert_eq!(replace_numbers($text, &f, 0.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res)
};
}

Expand Down
14 changes: 9 additions & 5 deletions src/lang/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ pub use fr::French;
pub use it::Italian;
pub use nl::Dutch;

/// Token interface required by `LangInterpretor::basic_annotate`.
pub trait BasicAnnotate {
    /// Text of the token; the name suggests implementors return it lowercased
    /// (the language implementations compare it against lowercase words).
    fn text_lowercase(&self) -> &str;
    /// Set or clear the token's "nan" flag — presumably "not a number"; the language
    /// implementations call it with `true` on number homonyms they rule out.
    fn set_nan(&mut self, val: bool);
}

/// Model the Morphological markers that differentiate ordinals or fractions from cardinals,
/// and that must be retained on the digit form.
///
Expand Down Expand Up @@ -102,8 +107,6 @@ pub trait LangInterpretor {
/// that separate unrelated numbers. So the method would return `false` for them.
/// This function is used to find isolated numbers.
fn is_linking(&self, word: &str) -> bool;
/// In some languages, numbers can be homonyms to other words
fn is_ambiguous(&self, number: &str) -> bool;
/// Process the `group` as all or nothing.
fn exec_group<'a, I: Iterator<Item = &'a str>>(&self, group: I) -> Result<DigitString, Error> {
let mut b = DigitString::new();
Expand All @@ -121,6 +124,8 @@ pub trait LangInterpretor {
Ok(b)
}
}

/// Language-specific annotation pass over `tokens`.
///
/// The default implementation does nothing; languages override it to flag tokens
/// through the `BasicAnnotate` interface.
fn basic_annotate<T: BasicAnnotate>(&self, _tokens: &mut Vec<T>) {}
}

/// A convenience enum that encapsulates the builtin languages in a single type.
Expand Down Expand Up @@ -214,14 +219,13 @@ macro_rules! delegate {
}
}

fn is_ambiguous(&self, number: &str) -> bool {
fn basic_annotate<T: BasicAnnotate>(&self, tokens: &mut Vec<T>) {
match self {
$(
Language::$variant(l) => l.is_ambiguous(number),
Language::$variant(l) => l.basic_annotate(tokens),
)*
}
}

};
}

Expand Down
10 changes: 3 additions & 7 deletions src/lang/nl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -266,16 +266,12 @@ impl LangInterpretor for Dutch {
fn is_linking(&self, word: &str) -> bool {
INSIGNIFICANT.contains(word)
}

fn is_ambiguous(&self, _number: &str) -> bool {
false
}
}

#[cfg(test)]
mod tests {
use super::Dutch;
use crate::word_to_digit::{replace_numbers, text2digits};
use crate::word_to_digit::{replace_numbers_in_text, text2digits};

macro_rules! assert_text2digits {
($text:expr, $res:expr) => {
Expand All @@ -290,14 +286,14 @@ mod tests {
macro_rules! assert_replace_numbers {
($text:expr, $res:expr) => {
let f = Dutch::new();
assert_eq!(replace_numbers($text, &f, 10.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 10.0), $res)
};
}

macro_rules! assert_replace_all_numbers {
($text:expr, $res:expr) => {
let f = Dutch::new();
assert_eq!(replace_numbers($text, &f, 0.0), $res)
assert_eq!(replace_numbers_in_text($text, &f, 0.0), $res)
};
}

Expand Down
Loading

0 comments on commit a9b0d95

Please sign in to comment.