From c61c5cd35a249831c4521755f6ad02b40a6396bf Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Mon, 1 Jan 2024 19:50:49 +0100 Subject: [PATCH] Improve Osa implementation This reduces the binary size of osa_distance by more than 25% while improving the performance. --- CHANGELOG.md | 6 ++++++ src/lib.rs | 27 ++++++++++++--------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 603b6c3..ebde8cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org). ## [Unreleased] +### Changed + +- improve OSA implementation + - reduce runtime + - reduce binary size by more than `25%` + ### Fixed - Fix transposition counting in Jaro and Jaro-Winkler. diff --git a/src/lib.rs b/src/lib.rs index fe60018..16e15d2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,6 +8,7 @@ use std::collections::HashMap; use std::error::Error; use std::fmt::{self, Display, Formatter}; use std::hash::Hash; +use std::mem; use std::str::Chars; #[derive(Debug, PartialEq)] @@ -288,18 +289,11 @@ pub fn normalized_levenshtein(a: &str, b: &str) -> f64 { /// assert_eq!(3, osa_distance("ab", "bca")); /// ``` pub fn osa_distance(a: &str, b: &str) -> usize { - let a_len = a.chars().count(); let b_len = b.chars().count(); - if a == b { - return 0; - } else if a_len == 0 { - return b_len; - } else if b_len == 0 { - return a_len; - } - - let mut prev_two_distances: Vec = (0..=b_len).collect(); - let mut prev_distances: Vec = (0..=b_len).collect(); + // 0..=b_len behaves like 0..b_len.saturating_add(1) which could be a different size + // this leads to significantly worse code gen when swapping the vectors below + let mut prev_two_distances: Vec = (0..b_len + 1).collect(); + let mut prev_distances: Vec = (0..b_len + 1).collect(); let mut curr_distances: Vec = vec![0; b_len + 1]; let mut prev_a_char = char::MAX; @@ -309,7 +303,7 @@ pub fn osa_distance(a: &str, b: &str) -> usize { curr_distances[0] = i + 1; for (j, b_char) in b.chars().enumerate() { - let cost = if a_char == b_char { 0 } else { 1 }; + let cost = usize::from(a_char != b_char); curr_distances[j + 1] = min( curr_distances[j] + 1, min(prev_distances[j + 1] + 1, prev_distances[j] + cost), @@ -322,12 +316,15 @@ pub fn osa_distance(a: &str, b: &str) -> usize { prev_b_char = b_char; } - prev_two_distances.clone_from(&prev_distances); - prev_distances.clone_from(&curr_distances); + mem::swap(&mut prev_two_distances, &mut prev_distances); + mem::swap(&mut prev_distances, &mut curr_distances); prev_a_char = a_char; } - curr_distances[b_len] + // access prev_distances instead of curr_distances since we swapped + // them above. In case a is empty this would still contain the correct value + // from initializing the last element to b_len + prev_distances[b_len] } /* Returns the final index for a value in a single vector that represents a fixed