diff --git a/README.md b/README.md
index b3cb01d..0f0f098 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ Accuracy was evaluated based on the first 100,000 items from this [dataset of mo
 
 |**Package**|**Time**|**Accuracy**|**Comment**|
 |:-----:|:-----:|:-----:|:-----:|
-|[finnbear/moderation](https://github.com/finnbear/moderation)|1.66s|91.12%|Current API version is not stable|
+|[finnbear/moderation](https://github.com/finnbear/moderation)|1.62s|91.13%|Current API version is not stable|
 |[TwinProduction/go-away](https://github.com/TwinProduction/go-away)|2.20s|82.14%|Many false positives from combined words like "push it"|
 
 
diff --git a/moderation_test.go b/moderation_test.go
index e593ff9..ccd2713 100644
--- a/moderation_test.go
+++ b/moderation_test.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"strings"
 	"testing"
 )
 
@@ -74,6 +75,22 @@ func TestAnalyze(t *testing.T) {
 	}
 }
 
+// TestRedundantReplacement reports every entry of replacements and
+// runeReplacements whose replacement string contains the very character the
+// entry is keyed by; such an entry is flagged as redundant.
+func TestRedundantReplacement(t *testing.T) {
+	for c, s := range replacements {
+		if strings.ContainsRune(s, rune(c)) {
+			t.Errorf("byte replacement %q is redundant", rune(c))
+		}
+	}
+	for c, s := range runeReplacements {
+		if strings.ContainsRune(s, c) {
+			t.Errorf("rune replacement %q is redundant", c)
+		}
+	}
+}
+
 func ExampleIs_shit_profane() {
 	fmt.Println(Is("shit", Profane))
 	// Output: true
diff --git a/replacements.go b/replacements.go
index a07c5f0..ad3f3d9 100644
--- a/replacements.go
+++ b/replacements.go
@@ -101,7 +101,6 @@ var (
 		'ℬ': "b",
 		'ℭ': "c",
 		'℮': "e",
-		'e': "e",
 		'ℰ': "e",
 		'ℱ': "f",
 		'ℳ': "m",
@@ -120,5 +119,44 @@ var (
 		'ⅇ': "e",
 		'ⅈ': "i",
 		'ⅉ': "ji",
+
+		// Confusable: http://www.unicode.org/reports/tr36/confusables.txt
+		'е': "e",
+		'о': "o",
+		'ѕ': "s",
+		'х': "x",
+		'і': "i",
+		'ј': "j",
+		'р': "p",
+		'с': "c",
+		'у': "y",
+		'ѵ': "v",
+		'ɑ': "a",
+		'ɡ': "g",
+		'ɩ': "li",
+		'ɒ': "o",
+		'г': "r",
+		'π': "n",
+		'ո': "n",
+		'հ': "h",
+		'ս': "u",
+		'ց': "g",
+		'ք': "fp",
+		'ყ': "y",
+		'୦': "o",
+		'০': "o",
+		'੦': "o",
+		'౦': "o",
+		'೦': "o",
+		'๐': "o",
+		'໐': "o",
+		'᠐': "o",
+		'〇': "o",
+		'օ': "o",
+		'б': "b",
+		'৪': "b",
+		'৭': "g",
+		'੧': "g",
+		'୨': "g",
 	}
 )