diff --git a/README.md b/README.md index 9c1a86e..c2deb16 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![test](https://github.com/tonytonyjan/jaro_winkler/actions/workflows/test.yml/badge.svg) -[jaro_winkler](https://rubygems.org/gems/jaro_winkler) is an implementation of [Jaro-Winkler distance](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm which is written in C extension and will fallback to pure Ruby version in platforms other than MRI/KRI like JRuby or Rubinius. **Both of C and Ruby implementation support any kind of string encoding, such as UTF-8, EUC-JP, Big5, etc.** +[jaro_winkler](https://rubygems.org/gems/jaro_winkler) is an implementation of the [Jaro-Winkler similarity](http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm, written as a C extension, which falls back to a pure Ruby version on platforms other than MRI/KRI, such as JRuby or Rubinius. **Both the C and Ruby implementations support any kind of string encoding, such as UTF-8, EUC-JP, Big5, etc.** # Installation @@ -13,22 +13,22 @@ gem install jaro_winkler ```ruby require 'jaro_winkler' -# Jaro Winkler Distance +# Jaro Winkler Similarity -JaroWinkler.distance "MARTHA", "MARHTA" +JaroWinkler.similarity "MARTHA", "MARHTA" # => 0.9611 -JaroWinkler.distance "MARTHA", "marhta", ignore_case: true +JaroWinkler.similarity "MARTHA", "marhta", ignore_case: true # => 0.9611 -JaroWinkler.distance "MARTHA", "MARHTA", weight: 0.2 +JaroWinkler.similarity "MARTHA", "MARHTA", weight: 0.2 # => 0.9778 -# Jaro Distance +# Jaro Similarity -JaroWinkler.jaro_distance "MARTHA", "MARHTA" +JaroWinkler.jaro_similarity "MARTHA", "MARHTA" # => 0.9444444444444445 ``` -There is no `JaroWinkler.jaro_winkler_distance`, it's tediously long. +There is no `JaroWinkler.jaro_winkler_similarity`; it's tediously long. 
## Options @@ -36,7 +36,7 @@ Name | Type | Default | Note ----------- | ------ | ------- | ------------------------------------------------------------------------------------------------------------ ignore_case | boolean | false | All lower case characters are converted to upper case prior to the comparison. weight | number | 0.1 | A constant scaling factor for how much the score is adjusted upwards for having common prefixes. -threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro distance above the threshold. +threshold | number | 0.7 | The prefix bonus is only added when the compared strings have a Jaro similarity above the threshold. adj_table | boolean | false | The option is used to give partial credit for characters that may be errors due to known phonetic or character recognition errors. A typical example is to match the letter "O" with the number "0". # Adjusting Table diff --git a/ext/jaro_winkler/jaro_winkler.c b/ext/jaro_winkler/jaro_winkler.c index b8e2aaa..dc4cc4d 100644 --- a/ext/jaro_winkler/jaro_winkler.c +++ b/ext/jaro_winkler/jaro_winkler.c @@ -5,7 +5,9 @@ VALUE rb_mJaroWinkler, rb_eError, rb_eInvalidWeightError; VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self); +VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self); VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self); +VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self); VALUE distance(int argc, VALUE *argv, VALUE self, double (*distance_fn)(uint32_t *codepoints1, size_t len1, uint32_t *codepoints2, size_t len2, @@ -25,6 +27,10 @@ void Init_jaro_winkler_ext(void) { rb_jaro_winkler_distance, -1); rb_define_singleton_method(rb_mJaroWinkler, "jaro_distance", rb_jaro_distance, -1); + rb_define_singleton_method(rb_mJaroWinkler, "similarity", + rb_jaro_winkler_similarity, -1); + rb_define_singleton_method(rb_mJaroWinkler, "jaro_similarity", rb_jaro_similarity, + -1); } VALUE distance(int argc, VALUE *argv, VALUE 
self, @@ -69,9 +75,19 @@ VALUE distance(int argc, VALUE *argv, VALUE self, } VALUE rb_jaro_distance(int argc, VALUE *argv, VALUE self) { + rb_warn("JaroWinkler.jaro_distance is deprecated. Use JaroWinkler.jaro_similarity instead."); + return rb_jaro_similarity(argc, argv, self); +} + +VALUE rb_jaro_similarity(int argc, VALUE *argv, VALUE self) { return distance(argc, argv, self, jaro_distance_from_codes); } VALUE rb_jaro_winkler_distance(int argc, VALUE *argv, VALUE self) { + rb_warn("JaroWinkler.distance is deprecated. Use JaroWinkler.similarity instead."); + return rb_jaro_winkler_similarity(argc, argv, self); +} + +VALUE rb_jaro_winkler_similarity(int argc, VALUE *argv, VALUE self) { return distance(argc, argv, self, jaro_winkler_distance_from_codes); } diff --git a/lib/jaro_winkler/jaro_winkler_pure.rb b/lib/jaro_winkler/jaro_winkler_pure.rb index 64d0400..3afa0ca 100644 --- a/lib/jaro_winkler/jaro_winkler_pure.rb +++ b/lib/jaro_winkler/jaro_winkler_pure.rb @@ -14,11 +14,21 @@ class InvalidWeightError < Error; end class << self def distance(str1, str2, options = {}) + warn("JaroWinkler.distance is deprecated. Use JaroWinkler.similarity instead.") + similarity(str1, str2, options) + end + + def similarity(str1, str2, options = {}) validate!(str1, str2) _distance str1.codepoints.to_a, str2.codepoints.to_a, options end def jaro_distance(str1, str2, options = {}) + warn("JaroWinkler.jaro_distance is deprecated. 
Use JaroWinkler.jaro_similarity instead.") + jaro_similarity(str1, str2, options) + end + + def jaro_similarity(str1, str2, options = {}) validate!(str1, str2) _jaro_distance str1.codepoints.to_a, str2.codepoints.to_a, options end diff --git a/test/tests.rb b/test/tests.rb index 51b319b..a48dded 100644 --- a/test/tests.rb +++ b/test/tests.rb @@ -1,93 +1,101 @@ # encoding: utf-8 module Tests + def test_similarity + assert_similarity 0.9667, 'henka', 'henkan' + assert_similarity 1.0, 'al', 'al' + assert_similarity 0.9611, 'martha', 'marhta' + assert_similarity 0.8324, 'jones', 'johnson' + assert_similarity 0.9583, 'abcvwxyz', 'cabvwxyz' + assert_similarity 0.84, 'dwayne', 'duane' + assert_similarity 0.8133, 'dixon', 'dicksonx' + assert_similarity 0.0, 'fvie', 'ten' + assert_similarity 1.0, 'tony', 'tony' + assert_similarity 1.0, 'tonytonyjan', 'tonytonyjan' + assert_similarity 1.0, 'x', 'x' + assert_similarity 0.0, '', '' + assert_similarity 0.0, 'tony', '' + assert_similarity 0.0, '', 'tony' + assert_similarity 0.8727, 'tonytonyjan', 'tony' + assert_similarity 0.8727, 'tony', 'tonytonyjan' + assert_similarity 0.9407, 'necessary', 'nessecary' + assert_similarity 0.9067, 'does_exist', 'doesnt_exist' + assert_similarity 0.975, '12345678', '12345687' + assert_similarity 0.975, '12345678', '12345867' + assert_similarity 0.95, '12345678', '12348567' + end + + def test_jaro_similarity + assert_jaro_similarity 0.9444, 'henka', 'henkan' + assert_jaro_similarity 1.0, 'al', 'al' + assert_jaro_similarity 0.9444, 'martha', 'marhta' + assert_jaro_similarity 0.7905, 'jones', 'johnson' + assert_jaro_similarity 0.9583, 'abcvwxyz', 'cabvwxyz' + assert_jaro_similarity 0.8222, 'dwayne', 'duane' + assert_jaro_similarity 0.7667, 'dixon', 'dicksonx' + assert_jaro_similarity 0.0, 'fvie', 'ten' + assert_jaro_similarity 1.0, 'tony', 'tony' + assert_jaro_similarity 1.0, 'tonytonyjan', 'tonytonyjan' + assert_jaro_similarity 1.0, 'x', 'x' + assert_jaro_similarity 0.0, '', '' + 
assert_jaro_similarity 0.0, 'tony', '' + assert_jaro_similarity 0.0, '', 'tony' + assert_jaro_similarity 0.7879, 'tonytonyjan', 'tony' + assert_jaro_similarity 0.7879, 'tony', 'tonytonyjan' + assert_jaro_similarity 0.9259, 'necessary', 'nessecary' + assert_jaro_similarity 0.8444, 'does_exist', 'doesnt_exist' + assert_jaro_similarity 0.9583, '12345678', '12345687' + assert_jaro_similarity 0.9583, '12345678', '12345867' + assert_jaro_similarity 0.9167, '12345678', '12348567' + assert_jaro_similarity 0.604, 'tonytonyjan', 'janjantony' + end + def test_distance - assert_distance 0.9667, 'henka', 'henkan' - assert_distance 1.0, 'al', 'al' - assert_distance 0.9611, 'martha', 'marhta' - assert_distance 0.8324, 'jones', 'johnson' - assert_distance 0.9583, 'abcvwxyz', 'cabvwxyz' - assert_distance 0.84, 'dwayne', 'duane' - assert_distance 0.8133, 'dixon', 'dicksonx' - assert_distance 0.0, 'fvie', 'ten' - assert_distance 1.0, 'tony', 'tony' - assert_distance 1.0, 'tonytonyjan', 'tonytonyjan' - assert_distance 1.0, 'x', 'x' - assert_distance 0.0, '', '' - assert_distance 0.0, 'tony', '' - assert_distance 0.0, '', 'tony' - assert_distance 0.8727, 'tonytonyjan', 'tony' - assert_distance 0.8727, 'tony', 'tonytonyjan' - assert_distance 0.9407, 'necessary', 'nessecary' - assert_distance 0.9067, 'does_exist', 'doesnt_exist' - assert_distance 0.975, '12345678', '12345687' - assert_distance 0.975, '12345678', '12345867' - assert_distance 0.95, '12345678', '12348567' + assert_distance 0.9667, 'henka', 'henkan' end def test_jaro_distance - assert_jaro_distance 0.9444, 'henka', 'henkan' - assert_jaro_distance 1.0, 'al', 'al' - assert_jaro_distance 0.9444, 'martha', 'marhta' - assert_jaro_distance 0.7905, 'jones', 'johnson' - assert_jaro_distance 0.9583, 'abcvwxyz', 'cabvwxyz' - assert_jaro_distance 0.8222, 'dwayne', 'duane' - assert_jaro_distance 0.7667, 'dixon', 'dicksonx' - assert_jaro_distance 0.0, 'fvie', 'ten' - assert_jaro_distance 1.0, 'tony', 'tony' - assert_jaro_distance 1.0, 
'tonytonyjan', 'tonytonyjan' - assert_jaro_distance 1.0, 'x', 'x' - assert_jaro_distance 0.0, '', '' - assert_jaro_distance 0.0, 'tony', '' - assert_jaro_distance 0.0, '', 'tony' - assert_jaro_distance 0.7879, 'tonytonyjan', 'tony' - assert_jaro_distance 0.7879, 'tony', 'tonytonyjan' - assert_jaro_distance 0.9259, 'necessary', 'nessecary' - assert_jaro_distance 0.8444, 'does_exist', 'doesnt_exist' - assert_jaro_distance 0.9583, '12345678', '12345687' - assert_jaro_distance 0.9583, '12345678', '12345867' - assert_jaro_distance 0.9167, '12345678', '12348567' - assert_jaro_distance 0.604, 'tonytonyjan', 'janjantony' + assert_jaro_distance 0.9444, 'henka', 'henkan' end def test_unicode - assert_distance 0.9818, '變形金剛4:絕跡重生', '變形金剛4: 絕跡重生' - assert_distance 0.8222, '連勝文', '連勝丼' - assert_distance 0.8222, '馬英九', '馬英丸' - assert_distance 0.6667, '良い', 'いい' + assert_similarity 0.9818, '變形金剛4:絕跡重生', '變形金剛4: 絕跡重生' + assert_similarity 0.8222, '連勝文', '連勝丼' + assert_similarity 0.8222, '馬英九', '馬英丸' + assert_similarity 0.6667, '良い', 'いい' end def test_ignore_case - assert_distance 0.9611, 'MARTHA', 'marhta', ignore_case: true + assert_similarity 0.9611, 'MARTHA', 'marhta', ignore_case: true end def test_weight - assert_distance 0.9778, 'MARTHA', 'MARHTA', weight: 0.2 + assert_similarity 0.9778, 'MARTHA', 'MARHTA', weight: 0.2 end def test_threshold - assert_distance 0.9444, 'MARTHA', 'MARHTA', threshold: 0.99 + assert_similarity 0.9444, 'MARTHA', 'MARHTA', threshold: 0.99 end def test_adjusting_table - assert_distance 0.9667, 'HENKA', 'HENKAN', adj_table: true - assert_distance 1.0, 'AL', 'AL', adj_table: true - assert_distance 0.9611, 'MARTHA', 'MARHTA', adj_table: true - assert_distance 0.8598, 'JONES', 'JOHNSON', adj_table: true - assert_distance 0.9583, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true - assert_distance 0.8730, 'DWAYNE', 'DUANE', adj_table: true - assert_distance 0.8393, 'DIXON', 'DICKSONX', adj_table: true - assert_distance 0.0, 'FVIE', 'TEN', adj_table: true + 
assert_similarity 0.9667, 'HENKA', 'HENKAN', adj_table: true + assert_similarity 1.0, 'AL', 'AL', adj_table: true + assert_similarity 0.9611, 'MARTHA', 'MARHTA', adj_table: true + assert_similarity 0.8598, 'JONES', 'JOHNSON', adj_table: true + assert_similarity 0.9583, 'ABCVWXYZ', 'CABVWXYZ', adj_table: true + assert_similarity 0.8730, 'DWAYNE', 'DUANE', adj_table: true + assert_similarity 0.8393, 'DIXON', 'DICKSONX', adj_table: true + assert_similarity 0.0, 'FVIE', 'TEN', adj_table: true end def test_error assert_raises JaroWinkler::InvalidWeightError do - JaroWinkler.distance 'MARTHA', 'MARHTA', weight: 0.26 + JaroWinkler.similarity 'MARTHA', 'MARHTA', weight: 0.26 end end def test_long_string - JaroWinkler.distance 'haisai' * 20, 'haisai' * 20 + JaroWinkler.similarity 'haisai' * 20, 'haisai' * 20 end def test_encoding @@ -95,16 +103,16 @@ def test_encoding assert_encoding '簡煒航', '簡偉航', Encoding::Big5_HKSCS assert_encoding '西島之', '西鳥志', Encoding::EUCJP assert_encoding '松本行弘', '枩本行弘', Encoding::Shift_JIS - assert_distance 1.0, "\xe8".force_encoding('iso8859-1'), 'è' + assert_similarity 1.0, "\xe8".force_encoding('iso8859-1'), 'è' end def test_raises_type_error - assert_raises(TypeError){ JaroWinkler.distance 'MARTHA', nil } - assert_raises(TypeError){ JaroWinkler.distance nil, 'MARTHA' } - assert_raises(TypeError){ JaroWinkler.distance nil, nil } - assert_raises(TypeError){ JaroWinkler.distance 'MARTHA', :non_string } - assert_raises(TypeError){ JaroWinkler.distance :non_string, 'MARTHA' } - assert_raises(TypeError){ JaroWinkler.distance :non_string, :non_string } + assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', nil } + assert_raises(TypeError){ JaroWinkler.similarity nil, 'MARTHA' } + assert_raises(TypeError){ JaroWinkler.similarity nil, nil } + assert_raises(TypeError){ JaroWinkler.similarity 'MARTHA', :non_string } + assert_raises(TypeError){ JaroWinkler.similarity :non_string, 'MARTHA' } + assert_raises(TypeError){ JaroWinkler.similarity 
:non_string, :non_string } end private @@ -114,10 +122,18 @@ def assert_distance score, str1, str2, **options end def assert_encoding str1, str2, encoding, **options - assert_distance JaroWinkler.distance(str1, str2), str1.encode(encoding), str2.encode(encoding) + assert_similarity JaroWinkler.similarity(str1, str2), str1.encode(encoding), str2.encode(encoding) end def assert_jaro_distance score, str1, str2, **options assert_in_delta score, JaroWinkler.jaro_distance(str1, str2, **options) end -end \ No newline at end of file + + def assert_similarity score, str1, str2, **options + assert_in_delta score, JaroWinkler.similarity(str1, str2, **options) + end + + def assert_jaro_similarity score, str1, str2, **options + assert_in_delta score, JaroWinkler.jaro_similarity(str1, str2, **options) + end +end