Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add "similarity" methods and deprecate "distance" methods #55

Merged
merged 4 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
![test](https://github.com/tonytonyjan/jaro_winkler/actions/workflows/test.yml/badge.svg)

[jaro_winkler](https://rubygems.org/gems/jaro_winkler) is an implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm which is written in C extension and will fallback to pure Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. **Both of C and Ruby implementation support any kind of string encoding, such as UTF-8, EUC-JP, Big5, etc.**
[jaro_winkler](https://rubygems.org/gems/jaro_winkler) is an implementation of the [Jaro-Winkler similarity](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. It is written as a C extension and falls back to a pure Ruby version on platforms other than MRI/KRI, such as JRuby or Rubinius. **Both the C and Ruby implementations support any kind of string encoding, such as UTF-8, EUC-JP, Big5, etc.**

# Installation

Expand All @@ -13,30 +13,30 @@ gem install jaro_winkler
```ruby
require 'jaro_winkler'

# Jaro Winkler Distance
# Jaro Winkler Similarity

JaroWinkler.distance "MARTHA", "MARHTA"
JaroWinkler.similarity "MARTHA", "MARHTA"
# => 0.9611
JaroWinkler.distance "MARTHA", "marhta", ignore_case: true
JaroWinkler.similarity "MARTHA", "marhta", ignore_case: true
# => 0.9611
JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2
JaroWinkler.similarity "MARTHA", "MARHTA", weight: 0.2
# => 0.9778

# Jaro Distance
# Jaro Similarity

JaroWinkler.jaro_distance "MARTHA", "MARHTA"
JaroWinkler.jaro_similarity "MARTHA", "MARHTA"
# => 0.9444444444444445
```

There is no `JaroWinkler.jaro_winkler_distance`, it's tediously long.
There is no `JaroWinkler.jaro_winkler_similarity`; that name would be tediously long.

## Options

Name | Type | Default | Note
----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------
ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison.
weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes.
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold.
threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro similarity above the threshold.
adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0".

# Adjusting Table
Expand Down
16 changes: 16 additions & 0 deletions ext/jaro_winkler/jaro_winkler.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError;

VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self);
VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self);
VALUE distance(int argc, VALUE *argv, VALUE self,
double (*distance_fn)(uint32_t *codepoints1, size_t len1,
uint32_t *codepoints2, size_t len2,
Expand All @@ -25,6 +27,10 @@ void Init_jaro_winkler_ext(void) {
rb_jaro_winkler_distance, -1);
rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance,
-1);
rb_define_singleton_method(rb_mJaroWinkler, "similarity",
rb_jaro_winkler_similarity, -1);
rb_define_singleton_method(rb_mJaroWinkler, "jaro_similarity", rb_jaro_similarity,
-1);
}

VALUE distance(int argc, VALUE *argv, VALUE self,
Expand Down Expand Up @@ -69,9 +75,19 @@ VALUE distance(int argc, VALUE *argv, VALUE self,
}

/*
 * Deprecated entry point bound to JaroWinkler.jaro_distance.
 * Prints a deprecation warning with rb_warn, then hands the unchanged
 * argument vector to rb_jaro_similarity and returns that call's result.
 */
VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self) {
  VALUE score;

  rb_warn("JaroWinkler.jaro_distance is deprecated. "
          "Use JaroWinkler.jaro_similarity instead.");
  score = rb_jaro_similarity(argc, argv, self);
  return score;
}

/*
 * Entry point bound to JaroWinkler.jaro_similarity.
 * Forwards the Ruby argument vector to the shared distance() helper with
 * jaro_distance_from_codes as the scoring callback and returns its result.
 */
VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self) {
  VALUE score = distance(argc, argv, self, jaro_distance_from_codes);
  return score;
}

/*
 * Deprecated entry point (per its warning text, exposed to Ruby as
 * JaroWinkler.distance). Prints a deprecation warning with rb_warn, then
 * hands the unchanged argument vector to rb_jaro_winkler_similarity and
 * returns that call's result.
 */
VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self) {
  VALUE score;

  rb_warn("JaroWinkler.distance is deprecated. "
          "Use JaroWinkler.similarity instead.");
  score = rb_jaro_winkler_similarity(argc, argv, self);
  return score;
}

/*
 * Entry point bound to JaroWinkler.similarity.
 * Forwards the Ruby argument vector to the shared distance() helper with
 * jaro_winkler_distance_from_codes as the scoring callback and returns
 * its result.
 */
VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self) {
  VALUE score = distance(argc, argv, self, jaro_winkler_distance_from_codes);
  return score;
}
10 changes: 10 additions & 0 deletions lib/jaro_winkler/jaro_winkler_pure.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,21 @@ class InvalidWeightError < Error; end

class << self
# Deprecated: kept so existing callers keep working. Prints a deprecation
# notice through +warn+ and then delegates to +similarity+ with the same
# arguments, returning whatever +similarity+ returns.
def distance(str1, str2, options = {})
  notice = 'JaroWinkler.distance is deprecated. Use JaroWinkler.similarity instead.'
  warn notice
  similarity str1, str2, options
end

# Validates both arguments with +validate!+, converts each string to an
# array of codepoints, and returns the result of +_distance+ for those
# arrays and the given +options+.
def similarity(str1, str2, options = {})
  validate!(str1, str2)
  left_codes = str1.codepoints.to_a
  right_codes = str2.codepoints.to_a
  _distance(left_codes, right_codes, options)
end

# Deprecated: kept so existing callers keep working. Prints a deprecation
# notice through +warn+ and then delegates to +jaro_similarity+ with the
# same arguments, returning whatever +jaro_similarity+ returns.
def jaro_distance(str1, str2, options = {})
  notice = 'JaroWinkler.jaro_distance is deprecated. Use JaroWinkler.jaro_similarity instead.'
  warn notice
  jaro_similarity str1, str2, options
end

# Validates both arguments with +validate!+, converts each string to an
# array of codepoints, and returns the result of +_jaro_distance+ for
# those arrays and the given +options+.
def jaro_similarity(str1, str2, options = {})
  validate!(str1, str2)
  left_codes = str1.codepoints.to_a
  right_codes = str2.codepoints.to_a
  _jaro_distance(left_codes, right_codes, options)
end
Expand Down
154 changes: 85 additions & 69 deletions test/tests.rb
Original file line number Diff line number Diff line change
@@ -1,110 +1,118 @@
# encoding: utf-8
module Tests
# Runs +assert_similarity+ over a table of
# [expected score, first string, second string] rows, in order.
def test_similarity
  rows = [
    [0.9667, 'henka', 'henkan'],
    [1.0, 'al', 'al'],
    [0.9611, 'martha', 'marhta'],
    [0.8324, 'jones', 'johnson'],
    [0.9583, 'abcvwxyz', 'cabvwxyz'],
    [0.84, 'dwayne', 'duane'],
    [0.8133, 'dixon', 'dicksonx'],
    [0.0, 'fvie', 'ten'],
    [1.0, 'tony', 'tony'],
    [1.0, 'tonytonyjan', 'tonytonyjan'],
    [1.0, 'x', 'x'],
    [0.0, '', ''],
    [0.0, 'tony', ''],
    [0.0, '', 'tony'],
    [0.8727, 'tonytonyjan', 'tony'],
    [0.8727, 'tony', 'tonytonyjan'],
    [0.9407, 'necessary', 'nessecary'],
    [0.9067, 'does_exist', 'doesnt_exist'],
    [0.975, '12345678', '12345687'],
    [0.975, '12345678', '12345867'],
    [0.95, '12345678', '12348567']
  ]
  rows.each { |expected, left, right| assert_similarity expected, left, right }
end

# Runs +assert_jaro_similarity+ over a table of
# [expected score, first string, second string] rows, in order.
def test_jaro_similarity
  rows = [
    [0.9444, 'henka', 'henkan'],
    [1.0, 'al', 'al'],
    [0.9444, 'martha', 'marhta'],
    [0.7905, 'jones', 'johnson'],
    [0.9583, 'abcvwxyz', 'cabvwxyz'],
    [0.8222, 'dwayne', 'duane'],
    [0.7667, 'dixon', 'dicksonx'],
    [0.0, 'fvie', 'ten'],
    [1.0, 'tony', 'tony'],
    [1.0, 'tonytonyjan', 'tonytonyjan'],
    [1.0, 'x', 'x'],
    [0.0, '', ''],
    [0.0, 'tony', ''],
    [0.0, '', 'tony'],
    [0.7879, 'tonytonyjan', 'tony'],
    [0.7879, 'tony', 'tonytonyjan'],
    [0.9259, 'necessary', 'nessecary'],
    [0.8444, 'does_exist', 'doesnt_exist'],
    [0.9583, '12345678', '12345687'],
    [0.9583, '12345678', '12345867'],
    [0.9167, '12345678', '12348567'],
    [0.604, 'tonytonyjan', 'janjantony']
  ]
  rows.each { |expected, left, right| assert_jaro_similarity expected, left, right }
end

def test_distance
assert_distance 0.9667, 'henka', 'henkan'
assert_distance 1.0, 'al', 'al'
assert_distance 0.9611, 'martha', 'marhta'
assert_distance 0.8324, 'jones', 'johnson'
assert_distance 0.9583, 'abcvwxyz', 'cabvwxyz'
assert_distance 0.84, 'dwayne', 'duane'
assert_distance 0.8133, 'dixon', 'dicksonx'
assert_distance 0.0, 'fvie', 'ten'
assert_distance 1.0, 'tony', 'tony'
assert_distance 1.0, 'tonytonyjan', 'tonytonyjan'
assert_distance 1.0, 'x', 'x'
assert_distance 0.0, '', ''
assert_distance 0.0, 'tony', ''
assert_distance 0.0, '', 'tony'
assert_distance 0.8727, 'tonytonyjan', 'tony'
assert_distance 0.8727, 'tony', 'tonytonyjan'
assert_distance 0.9407, 'necessary', 'nessecary'
assert_distance 0.9067, 'does_exist', 'doesnt_exist'
assert_distance 0.975, '12345678', '12345687'
assert_distance 0.975, '12345678', '12345867'
assert_distance 0.95, '12345678', '12348567'
assert_distance 0.9667, 'henka', 'henkan'
end

def test_jaro_distance
assert_jaro_distance 0.9444, 'henka', 'henkan'
assert_jaro_distance 1.0, 'al', 'al'
assert_jaro_distance 0.9444, 'martha', 'marhta'
assert_jaro_distance 0.7905, 'jones', 'johnson'
assert_jaro_distance 0.9583, 'abcvwxyz', 'cabvwxyz'
assert_jaro_distance 0.8222, 'dwayne', 'duane'
assert_jaro_distance 0.7667, 'dixon', 'dicksonx'
assert_jaro_distance 0.0, 'fvie', 'ten'
assert_jaro_distance 1.0, 'tony', 'tony'
assert_jaro_distance 1.0, 'tonytonyjan', 'tonytonyjan'
assert_jaro_distance 1.0, 'x', 'x'
assert_jaro_distance 0.0, '', ''
assert_jaro_distance 0.0, 'tony', ''
assert_jaro_distance 0.0, '', 'tony'
assert_jaro_distance 0.7879, 'tonytonyjan', 'tony'
assert_jaro_distance 0.7879, 'tony', 'tonytonyjan'
assert_jaro_distance 0.9259, 'necessary', 'nessecary'
assert_jaro_distance 0.8444, 'does_exist', 'doesnt_exist'
assert_jaro_distance 0.9583, '12345678', '12345687'
assert_jaro_distance 0.9583, '12345678', '12345867'
assert_jaro_distance 0.9167, '12345678', '12348567'
assert_jaro_distance 0.604, 'tonytonyjan', 'janjantony'
assert_jaro_distance 0.9444, 'henka', 'henkan'
end

def test_unicode
assert_distance 0.9818, '變形金剛4:絕跡重生', '變形金剛4: 絕跡重生'
assert_distance 0.8222, '連勝文', '連勝丼'
assert_distance 0.8222, '馬英九', '馬英丸'
assert_distance 0.6667, '良い', 'いい'
assert_similarity 0.9818, '變形金剛4:絕跡重生', '變形金剛4: 絕跡重生'
assert_similarity 0.8222, '連勝文', '連勝丼'
assert_similarity 0.8222, '馬英九', '馬英丸'
assert_similarity 0.6667, '良い', 'いい'
end

def test_ignore_case
assert_distance 0.9611, 'MARTHA', 'marhta', ignore_case: true
assert_similarity 0.9611, 'MARTHA', 'marhta', ignore_case: true
end

def test_weight
assert_distance 0.9778, 'MARTHA', 'MARHTA', weight: 0.2
assert_similarity 0.9778, 'MARTHA', 'MARHTA', weight: 0.2
end

def test_threshold
assert_distance 0.9444, 'MARTHA', 'MARHTA', threshold: 0.99
assert_similarity 0.9444, 'MARTHA', 'MARHTA', threshold: 0.99
end


def test_adjusting_table
assert_distance 0.9667, 'HENKA', 'HENKAN', adj_table: true
assert_distance 1.0, 'AL', 'AL', adj_table: true
assert_distance 0.9611, 'MARTHA', 'MARHTA', adj_table: true
assert_distance 0.8598, 'JONES', 'JOHNSON', adj_table: true
assert_distance 0.9583, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true
assert_distance 0.8730, 'DWAYNE', 'DUANE', adj_table: true
assert_distance 0.8393, 'DIXON', 'DICKSONX', adj_table: true
assert_distance 0.0, 'FVIE', 'TEN', adj_table: true
assert_similarity 0.9667, 'HENKA', 'HENKAN', adj_table: true
assert_similarity 1.0, 'AL', 'AL', adj_table: true
assert_similarity 0.9611, 'MARTHA', 'MARHTA', adj_table: true
assert_similarity 0.8598, 'JONES', 'JOHNSON', adj_table: true
assert_similarity 0.9583, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true
assert_similarity 0.8730, 'DWAYNE', 'DUANE', adj_table: true
assert_similarity 0.8393, 'DIXON', 'DICKSONX', adj_table: true
assert_similarity 0.0, 'FVIE', 'TEN', adj_table: true
end

def test_error
assert_raises JaroWinkler::InvalidWeightError do
JaroWinkler.distance 'MARTHA', 'MARHTA', weight: 0.26
JaroWinkler.similarity 'MARTHA', 'MARHTA', weight: 0.26
end
end

def test_long_string
JaroWinkler.distance 'haisai' * 20, 'haisai' * 20
JaroWinkler.similarity 'haisai' * 20, 'haisai' * 20
end

def test_encoding
assert_encoding '焦玟綾', '焦紋綾', Encoding::Big5
assert_encoding '簡煒航', '簡偉航', Encoding::Big5_HKSCS
assert_encoding '西島之', '西鳥志', Encoding::EUCJP
assert_encoding '松本行弘', '枩本行弘', Encoding::Shift_JIS
assert_distance 1.0, "\xe8".force_encoding('iso8859-1'), 'è'
assert_similarity 1.0, "\xe8".force_encoding('iso8859-1'), 'è'
end

def test_raises_type_error
assert_raises(TypeError){ JaroWinkler.distance 'MARTHA', nil }
assert_raises(TypeError){ JaroWinkler.distance nil, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.distance nil, nil }
assert_raises(TypeError){ JaroWinkler.distance 'MARTHA', :non_string }
assert_raises(TypeError){ JaroWinkler.distance :non_string, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.distance :non_string, :non_string }
assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', nil }
assert_raises(TypeError){ JaroWinkler.similarity nil, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.similarity nil, nil }
assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', :non_string }
assert_raises(TypeError){ JaroWinkler.similarity :non_string, 'MARTHA' }
assert_raises(TypeError){ JaroWinkler.similarity :non_string, :non_string }
end

private
Expand All @@ -114,10 +122,18 @@ def assert_distance score, str1, str2, **options
end

# Asserts that the similarity score does not depend on string encoding:
# the score of +str1+ and +str2+ as given is the expected value for the same
# two strings transcoded to +encoding+.
#
# The expected value comes from JaroWinkler.similarity rather than the
# deprecated JaroWinkler.distance, so this helper does not print a
# deprecation warning on every call. +options+ is accepted for signature
# parity with the other assertion helpers but is not used.
def assert_encoding(str1, str2, encoding, **options)
  expected = JaroWinkler.similarity(str1, str2)
  assert_similarity expected, str1.encode(encoding), str2.encode(encoding)
end

def assert_jaro_distance score, str1, str2, **options
assert_in_delta score, JaroWinkler.jaro_distance(str1, str2, **options)
end
end

# Asserts, via +assert_in_delta+, that JaroWinkler.similarity of +str1+ and
# +str2+ (called with +options+) matches the expected +score+.
def assert_similarity(score, str1, str2, **options)
  actual = JaroWinkler.similarity(str1, str2, **options)
  assert_in_delta score, actual
end

# Asserts, via +assert_in_delta+, that JaroWinkler.jaro_similarity of +str1+
# and +str2+ (called with +options+) matches the expected +score+.
def assert_jaro_similarity(score, str1, str2, **options)
  actual = JaroWinkler.jaro_similarity(str1, str2, **options)
  assert_in_delta score, actual
end
end
Loading