From 8e6ef79dac8a8d60294decdb4eafbf30f6373ec8 Mon Sep 17 00:00:00 2001 From: Finn Bear Date: Tue, 16 Mar 2021 23:35:06 -0700 Subject: [PATCH] Handle greek letters, math symbols, and lookalike characters. --- README.md | 4 +- generator/dictionary_extra.txt | 1 + generator/profanity.csv | 4 ++ moderation.go | 41 ++++------- moderation_test.go | 5 +- replacements.go | 124 +++++++++++++++++++++++++++++++++ wordlists.csv | 46 ++++++++++++ wordlists.go | 46 ++++++++++++ 8 files changed, 240 insertions(+), 31 deletions(-) create mode 100644 replacements.go diff --git a/README.md b/README.md index b712706..b3cb01d 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ 3. Minimum false negatives (including text like `h3110_w0r!d`) 4. Minimum false positives 5. Provide a way to censor text -6. (Future) Other analysis types than inappropriate, profane, offensive, sexual, mean (spam, violence, contact info, etc.) +6. (Future) Other analysis types than inappropriate, profane, offensive, sexual, mean, spam (violence, contact info, etc.) 7. (Future) Basic support for languages other than English ## Example @@ -53,7 +53,7 @@ Accuracy was evaluated based on the first 100,000 items from this [dataset of mo |**Package**|**Time**|**Accuracy**|**Comment**| |:-----:|:-----:|:-----:|:-----:| -|[finnbear/moderation](https://github.com/finnbear/moderation)|1.52s|91.18%|Current API version is not stable| +|[finnbear/moderation](https://github.com/finnbear/moderation)|1.66s|91.12%|Current API version is not stable| |[TwinProduction/go-away](https://github.com/TwinProduction/go-away)|2.20s|82.14%|Many false positives from combined words like "push it"| diff --git a/generator/dictionary_extra.txt b/generator/dictionary_extra.txt index 94cbbb6..a5f4990 100644 --- a/generator/dictionary_extra.txt +++ b/generator/dictionary_extra.txt @@ -9,6 +9,7 @@ cockburn glhf gtg harshitha +hellen ikr imho irl diff --git a/generator/profanity.csv b/generator/profanity.csv index a56567f..40906c8 100644 --- a/generator/profanity.csv +++ b/generator/profanity.csv @@ -3,12 +3,14 @@ anal,2,0,2,0 anus,2,0,0,0 arse,2,0,0,0 ass,2,0,0,0 +azz,1,0,0,0 balls,0,0,1,0 ballsack,0,0,3,0 bastard,2,3,0,0 biatch,0,2,2,0 bich,0,2,2,0 bitch,0,2,2,0 +bitsch,0,2,2,0 bloody,2,0,0,0 blowjob,0,0,3,0 bollock,2,0,2,0 @@ -106,6 +108,7 @@ nigr,3,5,0,0 nigs,2,3,0,0 nude,0,0,3,0 orgasm,0,0,2,0 +pedophile,0,0,2,0 peepee,2,0,2,0 penis,1,0,3,0 penus,1,0,3,0 @@ -131,6 +134,7 @@ retart,0,2,0,2 rimjob,0,0,3,0 scrotum,0,0,3,0 scum,0,0,2,2 +secs,0,0,1,0 semen,0,0,3,0 sex,0,0,1,0 shagger,0,0,3,0 diff --git a/moderation.go b/moderation.go index 99b738f..85d242a 100644 --- a/moderation.go +++ b/moderation.go @@ -32,27 +32,6 @@ const ( var ( tree radix.Tree = radix.New() - // Replace the key with any one of the characters in the value - replacements = [...]string{ - '!': "li", - '@': "a", - '4': "a", - '8': "b", - '6': "b", - '(': "c", - '<': "c", - '3': "eg", - '9': "gq", - '#': "h", - '1': "li", - '0': "o", - '5': "s", - '$': "s", - '+': "t", - '7': "t", - '2': "z", - } - removeAccentsTransform = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) ) @@ -112,16 +91,15 @@ func Is(text string, types Type) bool { var replacement string if int(textByte) < len(replacements) { replacement = replacements[textByte] + } else if textRune > maxMatchable { + replacement = runeReplacements[textRune] + if replacement == "" { + lowerRune := unicode.ToLower(textRune) + replacement = runeReplacements[lowerRune] + } } switch { - case textRune < minMatchable || maxMatchable < textRune: - // Unhandled runes (not printable, not representable as byte, etc.) - // matchable = false - switch textRune { - case '\n', '\r', '\t': - skippable = true - } case textByte >= 'A' && textByte <= 'Z': upperCount++ textByte += 'a' - 'A' @@ -132,6 +110,13 @@ func Is(text string, types Type) bool { textByte = replacement[0] textBytes = replacement matchable = true + case textRune < minMatchable || maxMatchable < textRune: + // Unhandled runes (not printable, not representable as byte, etc.) + // matchable = false + switch textRune { + case '\n', '\r', '\t': + skippable = true + } default: switch textByte { case '*': // these count as replacements diff --git a/moderation_test.go b/moderation_test.go index 822fa80..e593ff9 100644 --- a/moderation_test.go +++ b/moderation_test.go @@ -27,6 +27,7 @@ func TestAnalyze(t *testing.T) { {"ass", true}, {"glass", false}, {"ÄšŚ", true}, + {"ĂżŽ", true}, {"sex", true}, {"hello_world-sex_word", true}, {"sexy", true}, @@ -38,6 +39,8 @@ func TestAnalyze(t *testing.T) { {"push it", false}, {"carcass", false}, {"retarded", true}, + {"βιτ⊂η", true}, // greek letters + {"ⓅɄȿⓢⓨ", true}, {"I had called upon my friend, Mr. Sherlock Holmes, one day in the autumn of last year and found him in deep conversation with a very stout, florid-faced, elderly gentleman with fiery red hair.", false}, {"With an apology for my intrusion, I was about to withdraw when Holmes pulled me abruptly into the room and closed the door behind me.", false}, {"You could not possibly have come at a better time, my dear Watson, he said cordially", false}, @@ -133,7 +136,7 @@ func TestAnalyzeWikipedia(t *testing.T) { correctOk++ } } else if len(phrase) < 40 { - fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v\n", phrase, !offensive, offensive) + //fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v\n", phrase, !offensive, offensive) } //_, _ = Censor(phrase, Inappropriate) diff --git a/replacements.go b/replacements.go new file mode 100644 index 0000000..a07c5f0 --- /dev/null +++ b/replacements.go @@ -0,0 +1,124 @@ +package moderation + +var ( + // Replace the key with any one of the characters in the value + replacements = [...]string{ + '!': "li", + '@': "a", + '4': "a", + '8': "b", + '6': "b", + '(': "c", + '<': "c", + '3': "eg", + '9': "gq", + '#': "h", + '1': "li", + '0': "o", + '5': "s", + '$': "s", + '+': "t", + '7': "t", + '2': "z", + } + + runeReplacements = map[rune]string{ + // Greek letters + 'α': "a", + 'β': "b", + 'γ': "y", + '∆': "a", + 'δ': "d", + 'ε': "e", + 'ζ': "z", + 'η': "hn", + 'θ': "o", + 'ι': "i", + 'κ': "k", + 'λ': "l", + 'μ': "mu", + 'ν': "nv", + 'ο': "o", + 'ρ': "p", + 'ς': "s", + 'τ': "t", + 'υ': "u", + 'φ': "p", + 'χ': "x", + 'ψ': "t", + 'Ω': "o", + 'ω': "w", + + // Math symbols + '⊗': "o", + '⊕': "o", + 'σ': "o", + '∩': "n", + '∪': "u", + '⊂': "c", + '⊆': "c", + '⊄': "c", + '∈': "e", + '⊖': "o", + 'Ø': "o", + '∨': "v", + '∄': "ab", + '∫': "l", + + // Letterlike + 'ℂ': "c", + '℃': "c", + '℄': "c", + 'ℇ': "e", + '℉': "f", + 'ℊ': "g", + 'ℋ': "h", + 'ℌ': "h", + 'ℍ': "h", + 'ℎ': "h", + 'ℏ': "h", + 'ℐ': "j", + 'ℑ': "j", + 'ℒ': "l", + 'ℓ': "l", + '℔': "b", + 'ℕ': "n", + '№': "n", + '℗': "p", + '℘': "p", + 'ℙ': "p", + 'ℚ': "q", + 'ℛ': "r", + 'ℜ': "r", + 'ℝ': "r", + '℟': "r", + '℣': "v", + 'ℤ': "z", + '℧': "o", + '℩': "i", + 'K': "k", + 'Å': "a", + 'ℬ': "b", + 'ℭ': "c", + '℮': "e", + 'e': "e", + 'ℰ': "e", + 'ℱ': "f", + 'ℳ': "m", + 'ℴ': "o", + 'ℵ': "n", + 'ℹ': "i", + '℺': "o", + 'ℼ': "n", + 'ℽ': "v", + 'ℿ': "n", + '⅀': "e", + '⅁': "g", + '⅄': "l", + 'ⅅ': "d", + 'ⅆ': "d", + 'ⅇ': "e", + 'ⅈ': "i", + 'ⅉ': "ji", + } +) diff --git a/wordlists.csv b/wordlists.csv index 241445c..904e47d 100644 --- a/wordlists.csv +++ b/wordlists.csv @@ -1,6 +1,7 @@ word,profane,offensive,sexual,mean pp , 0, 0, 2, 0 ass , 2, 0, 0, 0 +azz , 1, 0, 0, 0 bra , 0, 0, 1, 0 cum , 0, 0, 3, 0 fag , 0, 3, 1, 0 @@ -79,6 +80,7 @@ pube , 0, 0, 3, 0 pusy , 0, 2, 2, 0 rape , 0, 0, 3, 0 scum , 0, 0, 2, 2 +secs , 0, 0, 1, 0 shit , 2, 0, 0, 0 slut , 0, 2, 2, 0 stfu , 0, 0, 0, 2 @@ -299,6 +301,7 @@ bedumb , 0, 0, 0, -1 biatch , 0, 2, 2, 0 bichir , 0, -2, -2, 0 bichos , 0, -2, -2, 0 +bitsch , 0, 2, 2, 0 bloody , 2, 0, 0, 0 boclit , 0, 0, -3, 0 bocrap , -1, 0, 0, 0 @@ -1113,6 +1116,8 @@ bitchef , 0, -2, -2, 0 bitchem , 0, -2, -2, 0 bitchen , 0, -2, -2, 0 bitchip , 0, -2, -2, 0 +bitschi , 0, -2, -2, 0 +bitscho , 0, -2, -2, 0 bladego , 0, -3, 0, 0 blahate , 0, 0, 0, -2 blowjob , 0, 0, 3, 0 @@ -2411,6 +2416,13 @@ bitchoir , 0, -2, -2, 0 bitchose , 0, -2, -2, 0 bitchris , 0, -2, -2, 0 bitchuck , 0, -2, -2, 0 +bitschad , 0, -2, -2, 0 +bitschan , 0, -2, -2, 0 +bitschar , 0, -2, -2, 0 +bitschat , 0, -2, -2, 0 +bitschef , 0, -2, -2, 0 +bitschem , 0, -2, -2, 0 +bitschen , 0, -2, -2, 0 blondego , 0, -3, 0, 0 bloodyea , -2, 0, 0, 0 bloodyen , -2, 0, 0, 0 @@ -4134,6 +4146,18 @@ bitchorus , 0, -2, -2, 0 bitchrome , 0, -2, -2, 0 bitchubby , 0, -2, -2, 0 bitchurch , 0, -2, -2, 0 +bitschain , 0, -2, -2, 0 +bitschair , 0, -2, -2, 0 +bitschaos , 0, -2, -2, 0 +bitschase , 0, -2, -2, 0 +bitscheap , 0, -2, -2, 0 +bitscheat , 0, -2, -2, 0 +bitscheck , 0, -2, -2, 0 +bitschess , 0, -2, -2, 0 +bitschest , 0, -2, -2, 0 +bitschevy , 0, -2, -2, 0 +bitschris , 0, -2, -2, 0 +bitschuck , 0, -2, -2, 0 blahellis , -1, 0, 0, 0 bloodyale , -2, 0, 0, 0 bloodyang , -2, 0, 0, 0 @@ -5248,6 +5272,7 @@ pathspunk , 0, 0, -3, 0 patwatson , 0, 0, -3, 0 pctwatson , 0, 0, -3, 0 pdtwatson , 0, 0, -3, 0 +pedophile , 0, 0, 2, 0 peepeeing , -2, 0, -2, 0 peerspunk , 0, 0, -3, 0 pelecanus , -2, 0, 0, 0 @@ -6045,6 +6070,14 @@ bitchicago , 0, -2, -2, 0 bitchinese , 0, -2, -2, 0 bitchiness , 0, -2, -2, 0 bitchronic , 0, -2, -2, 0 +bitschapel , 0, -2, -2, 0 +bitscheers , 0, -2, -2, 0 +bitscheese , 0, -2, -2, 0 +bitscheque , 0, -2, -2, 0 +bitscherry , 0, -2, -2, 0 +bitschrome , 0, -2, -2, 0 +bitschubby , 0, -2, -2, 0 +bitschurch , 0, -2, -2, 0 blackballs , 0, 0, -1, 0 blackspunk , 0, 0, -3, 0 bloodyacht , -2, 0, 0, 0 @@ -7745,6 +7778,12 @@ bitchampion , 0, -2, -2, 0 bitcharging , 0, -2, -2, 0 bitchoosing , 0, -2, -2, 0 bitchrysler , 0, -2, -2, 0 +bitschamber , 0, -2, -2, 0 +bitschapter , 0, -2, -2, 0 +bitschassis , 0, -2, -2, 0 +bitschedule , 0, -2, -2, 0 +bitschelsea , 0, -2, -2, 0 +bitschronic , 0, -2, -2, 0 blackbreast , 0, 0, -2, 0 blahelliott , -1, 0, 0, 0 blanketwatt , 0, 0, -3, 0 @@ -9151,6 +9190,8 @@ bitcharacter , 0, -2, -2, 0 bitcharlotte , 0, -2, -2, 0 bitchevrolet , 0, -2, -2, 0 bitchocolate , 0, -2, -2, 0 +bitschampion , 0, -2, -2, 0 +bitschrysler , 0, -2, -2, 0 blanketwatch , 0, 0, -3, 0 blanketwater , 0, 0, -3, 0 bloggerspunk , 0, 0, -3, 0 @@ -10210,6 +10251,10 @@ bikinigravity , -3, -5, 0, 0 bikinigriffin , -3, -5, 0, 0 bikinigrocery , -3, -5, 0, 0 bitcharitable , 0, -2, -2, 0 +bitschallenge , 0, -2, -2, 0 +bitschampagne , 0, -2, -2, 0 +bitscheduling , 0, -2, -2, 0 +bitschevrolet , 0, -2, -2, 0 blanketwatson , 0, 0, -3, 0 bonerelevance , 0, 0, -3, 0 bonereligious , 0, 0, -3, 0 @@ -11330,6 +11375,7 @@ ballsustainable , 0, 0, -1, 0 ballswitzerland , 0, 0, -1, 0 beanalbuquerque , -2, 0, -2, 0 biographiespunk , 0, 0, -3, 0 +bitschallenging , 0, -2, -2, 0 bloodyugoslavia , -2, 0, 0, 0 bonereliability , 0, 0, -3, 0 bonerenaissance , 0, 0, -3, 0 diff --git a/wordlists.go b/wordlists.go index afc8ee5..0f5d6dd 100644 --- a/wordlists.go +++ b/wordlists.go @@ -10,6 +10,7 @@ type wordValue struct { var wordValues = [...]wordValue{ {"pp", 0x20000}, {"ass", 0x2}, + {"azz", 0x1}, {"bra", 0x10000}, {"cum", 0x30000}, {"fag", 0x10300}, @@ -88,6 +89,7 @@ var wordValues = [...]wordValue{ {"pusy", 0x20200}, {"rape", 0x30000}, {"scum", 0x2020000}, + {"secs", 0x10000}, {"shit", 0x2}, {"slut", 0x20200}, {"stfu", 0x2000000}, @@ -308,6 +310,7 @@ var wordValues = [...]wordValue{ {"biatch", 0x20200}, {"bichir", 0xfefe00}, {"bichos", 0xfefe00}, + {"bitsch", 0x20200}, {"bloody", 0x2}, {"boclit", 0xfd0000}, {"bocrap", 0xff}, @@ -1122,6 +1125,8 @@ var wordValues = [...]wordValue{ {"bitchem", 0xfefe00}, {"bitchen", 0xfefe00}, {"bitchip", 0xfefe00}, + {"bitschi", 0xfefe00}, + {"bitscho", 0xfefe00}, {"bladego", 0xfd00}, {"blahate", 0xfe000000}, {"blowjob", 0x30000}, @@ -2420,6 +2425,13 @@ var wordValues = [...]wordValue{ {"bitchose", 0xfefe00}, {"bitchris", 0xfefe00}, {"bitchuck", 0xfefe00}, + {"bitschad", 0xfefe00}, + {"bitschan", 0xfefe00}, + {"bitschar", 0xfefe00}, + {"bitschat", 0xfefe00}, + {"bitschef", 0xfefe00}, + {"bitschem", 0xfefe00}, + {"bitschen", 0xfefe00}, {"blondego", 0xfd00}, {"bloodyea", 0xfe}, {"bloodyen", 0xfe}, @@ -4143,6 +4155,18 @@ var wordValues = [...]wordValue{ {"bitchrome", 0xfefe00}, {"bitchubby", 0xfefe00}, {"bitchurch", 0xfefe00}, + {"bitschain", 0xfefe00}, + {"bitschair", 0xfefe00}, + {"bitschaos", 0xfefe00}, + {"bitschase", 0xfefe00}, + {"bitscheap", 0xfefe00}, + {"bitscheat", 0xfefe00}, + {"bitscheck", 0xfefe00}, + {"bitschess", 0xfefe00}, + {"bitschest", 0xfefe00}, + {"bitschevy", 0xfefe00}, + {"bitschris", 0xfefe00}, + {"bitschuck", 0xfefe00}, {"blahellis", 0xff}, {"bloodyale", 0xfe}, {"bloodyang", 0xfe}, @@ -5257,6 +5281,7 @@ var wordValues = [...]wordValue{ {"patwatson", 0xfd0000}, {"pctwatson", 0xfd0000}, {"pdtwatson", 0xfd0000}, + {"pedophile", 0x20000}, {"peepeeing", 0xfe00fe}, {"peerspunk", 0xfd0000}, {"pelecanus", 0xfe}, @@ -6054,6 +6079,14 @@ var wordValues = [...]wordValue{ {"bitchinese", 0xfefe00}, {"bitchiness", 0xfefe00}, {"bitchronic", 0xfefe00}, + {"bitschapel", 0xfefe00}, + {"bitscheers", 0xfefe00}, + {"bitscheese", 0xfefe00}, + {"bitscheque", 0xfefe00}, + {"bitscherry", 0xfefe00}, + {"bitschrome", 0xfefe00}, + {"bitschubby", 0xfefe00}, + {"bitschurch", 0xfefe00}, {"blackballs", 0xff0000}, {"blackspunk", 0xfd0000}, {"bloodyacht", 0xfe}, @@ -7754,6 +7787,12 @@ var wordValues = [...]wordValue{ {"bitcharging", 0xfefe00}, {"bitchoosing", 0xfefe00}, {"bitchrysler", 0xfefe00}, + {"bitschamber", 0xfefe00}, + {"bitschapter", 0xfefe00}, + {"bitschassis", 0xfefe00}, + {"bitschedule", 0xfefe00}, + {"bitschelsea", 0xfefe00}, + {"bitschronic", 0xfefe00}, {"blackbreast", 0xfe0000}, {"blahelliott", 0xff}, {"blanketwatt", 0xfd0000}, @@ -9160,6 +9199,8 @@ var wordValues = [...]wordValue{ {"bitcharlotte", 0xfefe00}, {"bitchevrolet", 0xfefe00}, {"bitchocolate", 0xfefe00}, + {"bitschampion", 0xfefe00}, + {"bitschrysler", 0xfefe00}, {"blanketwatch", 0xfd0000}, {"blanketwater", 0xfd0000}, {"bloggerspunk", 0xfd0000}, @@ -10219,6 +10260,10 @@ var wordValues = [...]wordValue{ {"bikinigriffin", 0xfbfd}, {"bikinigrocery", 0xfbfd}, {"bitcharitable", 0xfefe00}, + {"bitschallenge", 0xfefe00}, + {"bitschampagne", 0xfefe00}, + {"bitscheduling", 0xfefe00}, + {"bitschevrolet", 0xfefe00}, {"blanketwatson", 0xfd0000}, {"bonerelevance", 0xfd0000}, {"bonereligious", 0xfd0000}, @@ -11339,6 +11384,7 @@ var wordValues = [...]wordValue{ {"ballswitzerland", 0xff0000}, {"beanalbuquerque", 0xfe00fe}, {"biographiespunk", 0xfd0000}, + {"bitschallenging", 0xfefe00}, {"bloodyugoslavia", 0xfe}, {"bonereliability", 0xfd0000}, {"bonerenaissance", 0xfd0000},