-
-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: add qgram and sorensen-dice * fix: Change function names and add Qgram and SorensenDice to string-analysis * feat(qgram): add similarity function to return an index test: fix test cases for QGram Co-authored-by: hbollon <hugo.bollon@gmail.com>
- Loading branch information
1 parent
34fcab0
commit 5f65401
Showing
7 changed files
with
202 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
package edlib | ||
|
||
import ( | ||
"math" | ||
) | ||
|
||
// QgramDistance compute the q-gram similarity between two strings | ||
// Takes two strings as parameters, a split length which defines the k-gram shingle length | ||
func QgramDistance(str1, str2 string, splitLength int) int { | ||
splittedStr1 := Shingle(str1, splitLength) | ||
splittedStr2 := Shingle(str2, splitLength) | ||
|
||
union := make(map[string]int) | ||
for i := range splittedStr1 { | ||
union[i] = 0 | ||
} | ||
for i := range splittedStr2 { | ||
union[i] = 0 | ||
} | ||
|
||
res := 0 | ||
|
||
for i := range union { | ||
res += int(math.Abs(float64(splittedStr1[i] - splittedStr2[i]))) | ||
} | ||
|
||
return res | ||
} | ||
|
||
// QgramDistanceCustomNgram computes the q-gram distance between two n-gram
// profiles supplied by the caller.
//
// Each map associates an n-gram with an integer value. The distance is the
// sum, over every n-gram present in either map, of the absolute difference
// between the two values; an n-gram missing from a map counts as 0 there.
// Nil maps are treated as empty profiles.
func QgramDistanceCustomNgram(splittedStr1, splittedStr2 map[string]int) int {
	distance := 0

	// N-grams listed in the first profile. Looking them up in the second
	// profile yields 0 when they are absent there.
	for gram, count := range splittedStr1 {
		distance += int(math.Abs(float64(count - splittedStr2[gram])))
	}

	// N-grams listed only in the second profile; the shared ones were
	// already accounted for by the loop above.
	for gram, count := range splittedStr2 {
		if _, shared := splittedStr1[gram]; shared {
			continue
		}
		distance += int(math.Abs(float64(0 - count)))
	}

	return distance
}
|
||
// QgramSimilarity compute a similarity index (between 0 and 1) between two strings from a Qgram distance | ||
// Takes two strings as parameters, a split length which defines the k-gram shingle length | ||
func QgramSimilarity(str1, str2 string, splitLength int) float32 { | ||
splittedStr1 := Shingle(str1, splitLength) | ||
splittedStr2 := Shingle(str2, splitLength) | ||
res := float32(QgramDistanceCustomNgram(splittedStr1, splittedStr2)) | ||
return 1 - (res / float32(len(splittedStr1)+len(splittedStr2))) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package edlib | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestQgramDistance(t *testing.T) { | ||
type args struct { | ||
str1 string | ||
str2 string | ||
splitLength int | ||
} | ||
tests := []struct { | ||
name string | ||
args args | ||
want int | ||
}{ | ||
{"Qgram sim 1", args{"Radiohead", "Radiohead", 2}, 0.0}, | ||
{"Qgram sim 2", args{"ABCD", "ABCE", 2}, 2.0}, | ||
{"Qgram sim 3", args{"Radiohead", "Carly Rae Jepsen", 2}, 21.0}, | ||
{"Qgram sim 4", args{"I love horror movies", "Lights out is a horror movie", 2}, 22.0}, | ||
{"Qgram sim 5", args{"love horror movies", "Lights out horror movie", 2}, 15.0}, | ||
{"Qgram sim 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 5}, | ||
{"Qgram sim 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, 4}, | ||
{"Qgram sim 8", args{"", "", 2}, 0.0}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
if got := QgramDistance(tt.args.str1, tt.args.str2, tt.args.splitLength); got != tt.want { | ||
t.Errorf("QgramDistance() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
package edlib | ||
|
||
// SorensenDiceCoefficient computes the Sorensen-Dice coefficient between two strings | ||
// Takes two strings as parameters, a split length which defines the k-gram shingle length | ||
func SorensenDiceCoefficient(str1, str2 string, splitLength int) float32 { | ||
if str1 == "" && str2 == "" { | ||
return 0 | ||
} | ||
shingle1 := Shingle(str1, splitLength) | ||
shingle2 := Shingle(str2, splitLength) | ||
|
||
intersection := float32(0) | ||
for i := range shingle1 { | ||
if _, ok := shingle2[i]; ok { | ||
intersection++ | ||
} | ||
} | ||
return 2.0 * intersection / float32(len(shingle1)+len(shingle2)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package edlib | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestSorensenDiceCoefficient(t *testing.T) { | ||
type args struct { | ||
str1 string | ||
str2 string | ||
splitLength int | ||
} | ||
tests := []struct { | ||
name string | ||
args args | ||
want float32 | ||
}{ | ||
{"SorensenDiceCoefficient 1", args{"night", "nacht", 2}, 0.25}, | ||
{"SorensenDiceCoefficient 2", args{"Radiohead", "Radiohead", 2}, 1.0}, | ||
{"SorensenDiceCoefficient 3", args{"", "", 2}, 0.0}, | ||
{"SorensenDiceCoefficient 4", args{"Radiohead", "Carly Rae Jepsen", 2}, 0.09090909}, | ||
{"SorensenDiceCoefficient 5", args{"I love horror movies", "Lights out is a horror movie", 2}, 0.52380955}, | ||
{"SorensenDiceCoefficient 6", args{"love horror movies", "Lights out horror movie", 2}, 0.6111111}, | ||
{"SorensenDiceCoefficient 7", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 0.7619048}, | ||
{"SorensenDiceCoefficient 8", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, 0.8888889}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
if got := SorensenDiceCoefficient(tt.args.str1, tt.args.str2, tt.args.splitLength); got != tt.want { | ||
t.Errorf("SorensenDiceCoefficient() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters