Skip to content

Commit

Permalink
Improve Osa implementation
Browse files Browse the repository at this point in the history
This reduces the binary size of osa_distance by more than 25% while
improving the performance.
  • Loading branch information
maxbachmann committed Jan 1, 2024
1 parent 5b512dc commit c61c5cd
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org).

## [Unreleased]

### Changed

- improve OSA implementation
  - reduce runtime
  - reduce binary size by more than `25%`

### Fixed

- Fix transposition counting in Jaro and Jaro-Winkler.
Expand Down
27 changes: 12 additions & 15 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::collections::HashMap;
use std::error::Error;
use std::fmt::{self, Display, Formatter};
use std::hash::Hash;
use std::mem;
use std::str::Chars;

#[derive(Debug, PartialEq)]
Expand Down Expand Up @@ -288,18 +289,11 @@ pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
let a_len = a.chars().count();
let b_len = b.chars().count();
if a == b {
return 0;
} else if a_len == 0 {
return b_len;
} else if b_len == 0 {
return a_len;
}

let mut prev_two_distances: Vec<usize> = (0..=b_len).collect();
let mut prev_distances: Vec<usize> = (0..=b_len).collect();
// Use `0..b_len + 1` rather than `0..=b_len`: the inclusive range behaves like
// `0..b_len.saturating_add(1)`, which could produce a vector of a different size.
// That leads to significantly worse code gen when swapping the vectors below.
let mut prev_two_distances: Vec<usize> = (0..b_len + 1).collect();
let mut prev_distances: Vec<usize> = (0..b_len + 1).collect();
let mut curr_distances: Vec<usize> = vec![0; b_len + 1];

let mut prev_a_char = char::MAX;
Expand All @@ -309,7 +303,7 @@ pub fn osa_distance(a: &str, b: &str) -> usize {
curr_distances[0] = i + 1;

for (j, b_char) in b.chars().enumerate() {
let cost = if a_char == b_char { 0 } else { 1 };
let cost = usize::from(a_char != b_char);
curr_distances[j + 1] = min(
curr_distances[j] + 1,
min(prev_distances[j + 1] + 1, prev_distances[j] + cost),
Expand All @@ -322,12 +316,15 @@ pub fn osa_distance(a: &str, b: &str) -> usize {
prev_b_char = b_char;
}

prev_two_distances.clone_from(&prev_distances);
prev_distances.clone_from(&curr_distances);
mem::swap(&mut prev_two_distances, &mut prev_distances);
mem::swap(&mut prev_distances, &mut curr_distances);
prev_a_char = a_char;
}

curr_distances[b_len]
// Read the result from prev_distances instead of curr_distances, since the two
// were swapped at the end of the loop body above. If a is empty, the loop never runs
// and prev_distances[b_len] still holds b_len from initialization, which is the correct value.
prev_distances[b_len]
}

/* Returns the final index for a value in a single vector that represents a fixed
Expand Down

0 comments on commit c61c5cd

Please sign in to comment.