Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
Handle greek letters, math symbols, and lookalike characters.
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed Mar 17, 2021
1 parent f76d093 commit 8e6ef79
Show file tree
Hide file tree
Showing 8 changed files with 240 additions and 31 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
3. Minimum false negatives (including text like `h3110_w0r!d`)
4. Minimum false positives
5. Provide a way to censor text
6. (Future) Other analysis types than inappropriate, profane, offensive, sexual, mean (spam, violence, contact info, etc.)
6. (Future) Analysis types beyond inappropriate, profane, offensive, sexual, mean, and spam (e.g. violence, contact info, etc.)
7. (Future) Basic support for languages other than English

## Example
Expand Down Expand Up @@ -53,7 +53,7 @@ Accuracy was evaluated based on the first 100,000 items from this [dataset of mo

|**Package**|**Time**|**Accuracy**|**Comment**|
|:-----:|:-----:|:-----:|:-----:|
|[finnbear/moderation](https://github.com/finnbear/moderation)|1.52s|91.18%|Current API version is not stable|
|[finnbear/moderation](https://github.com/finnbear/moderation)|1.66s|91.12%|Current API version is not stable|
|[TwinProduction/go-away](https://github.com/TwinProduction/go-away)|2.20s|82.14%|Many false positives from combined words like "push it"|


Expand Down
1 change: 1 addition & 0 deletions generator/dictionary_extra.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ cockburn
glhf
gtg
harshitha
hellen
ikr
imho
irl
Expand Down
4 changes: 4 additions & 0 deletions generator/profanity.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@ anal,2,0,2,0
anus,2,0,0,0
arse,2,0,0,0
ass,2,0,0,0
azz,1,0,0,0
balls,0,0,1,0
ballsack,0,0,3,0
bastard,2,3,0,0
biatch,0,2,2,0
bich,0,2,2,0
bitch,0,2,2,0
bitsch,0,2,2,0
bloody,2,0,0,0
blowjob,0,0,3,0
bollock,2,0,2,0
Expand Down Expand Up @@ -106,6 +108,7 @@ nigr,3,5,0,0
nigs,2,3,0,0
nude,0,0,3,0
orgasm,0,0,2,0
pedophile,0,0,2,0
peepee,2,0,2,0
penis,1,0,3,0
penus,1,0,3,0
Expand All @@ -131,6 +134,7 @@ retart,0,2,0,2
rimjob,0,0,3,0
scrotum,0,0,3,0
scum,0,0,2,2
secs,0,0,1,0
semen,0,0,3,0
sex,0,0,1,0
shagger,0,0,3,0
Expand Down
41 changes: 13 additions & 28 deletions moderation.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,6 @@ const (
var (
tree radix.Tree = radix.New()

// Replace the key with any one of the characters in the value
replacements = [...]string{
'!': "li",
'@': "a",
'4': "a",
'8': "b",
'6': "b",
'(': "c",
'<': "c",
'3': "eg",
'9': "gq",
'#': "h",
'1': "li",
'0': "o",
'5': "s",
'$': "s",
'+': "t",
'7': "t",
'2': "z",
}

removeAccentsTransform = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
)

Expand Down Expand Up @@ -112,16 +91,15 @@ func Is(text string, types Type) bool {
var replacement string
if int(textByte) < len(replacements) {
replacement = replacements[textByte]
} else if textRune > maxMatchable {
replacement = runeReplacements[textRune]
if replacement == "" {
lowerRune := unicode.ToLower(textRune)
replacement = runeReplacements[lowerRune]
}
}

switch {
case textRune < minMatchable || maxMatchable < textRune:
// Unhandled runes (not printable, not representable as byte, etc.)
// matchable = false
switch textRune {
case '\n', '\r', '\t':
skippable = true
}
case textByte >= 'A' && textByte <= 'Z':
upperCount++
textByte += 'a' - 'A'
Expand All @@ -132,6 +110,13 @@ func Is(text string, types Type) bool {
textByte = replacement[0]
textBytes = replacement
matchable = true
case textRune < minMatchable || maxMatchable < textRune:
// Unhandled runes (not printable, not representable as byte, etc.)
// matchable = false
switch textRune {
case '\n', '\r', '\t':
skippable = true
}
default:
switch textByte {
case '*': // these count as replacements
Expand Down
5 changes: 4 additions & 1 deletion moderation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func TestAnalyze(t *testing.T) {
{"ass", true},
{"glass", false},
{"ÄšŚ", true},
{"ĂżŽ", true},
{"sex", true},
{"hello_world-sex_word", true},
{"sexy", true},
Expand All @@ -38,6 +39,8 @@ func TestAnalyze(t *testing.T) {
{"push it", false},
{"carcass", false},
{"retarded", true},
{"βιτ⊂η", true}, // greek letters
{"ⓅɄȿⓢⓨ", true},
{"I had called upon my friend, Mr. Sherlock Holmes, one day in the autumn of last year and found him in deep conversation with a very stout, florid-faced, elderly gentleman with fiery red hair.", false},
{"With an apology for my intrusion, I was about to withdraw when Holmes pulled me abruptly into the room and closed the door behind me.", false},
{"You could not possibly have come at a better time, my dear Watson, he said cordially", false},
Expand Down Expand Up @@ -133,7 +136,7 @@ func TestAnalyzeWikipedia(t *testing.T) {
correctOk++
}
} else if len(phrase) < 40 {
fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v\n", phrase, !offensive, offensive)
//fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v\n", phrase, !offensive, offensive)
}

//_, _ = Censor(phrase, Inappropriate)
Expand Down
124 changes: 124 additions & 0 deletions replacements.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package moderation

// replacements is indexed by a byte value and yields the letters that byte
// may stand in for. When the value holds more than one letter, any one of
// them is an acceptable substitute for the key. Bytes without an entry map
// to the empty string. Entries are listed in ascending code point order.
var replacements = [...]string{
	'!': "li",
	'#': "h",
	'$': "s",
	'(': "c",
	'+': "t",
	'0': "o",
	'1': "li",
	'2': "z",
	'3': "eg",
	'4': "a",
	'5': "s",
	'6': "b",
	'7': "t",
	'8': "b",
	'9': "gq",
	'<': "c",
	'@': "a",
}

// runeReplacements plays the same role as replacements for lookalike runes:
// each key maps to the letter(s) it resembles, and a multi-letter value
// lists alternatives. Runes that are absent yield the empty string.
var runeReplacements = map[rune]string{
	// Greek alphabet.
	'α': "a",
	'β': "b",
	'γ': "y",
	'δ': "d",
	'ε': "e",
	'ζ': "z",
	'η': "hn",
	'θ': "o",
	'ι': "i",
	'κ': "k",
	'λ': "l",
	'μ': "mu",
	'ν': "nv",
	'ο': "o",
	'ρ': "p",
	'σ': "o",
	'ς': "s",
	'τ': "t",
	'υ': "u",
	'φ': "p",
	'χ': "x",
	'ψ': "t",
	'Ω': "o",
	'ω': "w",

	// Mathematical operators, set notation and similar symbols.
	'∆': "a",
	'⊗': "o",
	'⊕': "o",
	'⊖': "o",
	'Ø': "o",
	'∩': "n",
	'∪': "u",
	'⊂': "c",
	'⊆': "c",
	'⊄': "c",
	'∈': "e",
	'∨': "v",
	'∄': "ab",
	'∫': "l",

	// Letterlike symbols.
	'ℂ': "c",
	'℃': "c",
	'℄': "c",
	'ℇ': "e",
	'℉': "f",
	'ℊ': "g",
	'ℋ': "h",
	'ℌ': "h",
	'ℍ': "h",
	'ℎ': "h",
	'ℏ': "h",
	'ℐ': "j",
	'ℑ': "j",
	'ℒ': "l",
	'ℓ': "l",
	'℔': "b",
	'ℕ': "n",
	'№': "n",
	'℗': "p",
	'℘': "p",
	'ℙ': "p",
	'ℚ': "q",
	'ℛ': "r",
	'ℜ': "r",
	'ℝ': "r",
	'℟': "r",
	'℣': "v",
	'ℤ': "z",
	'℧': "o",
	'℩': "i",
	'K': "k",
	'Å': "a",
	'ℬ': "b",
	'ℭ': "c",
	'℮': "e",
	'e': "e",
	'ℰ': "e",
	'ℱ': "f",
	'ℳ': "m",
	'ℴ': "o",
	'ℵ': "n",
	'ℹ': "i",
	'℺': "o",
	'ℼ': "n",
	'ℽ': "v",
	'ℿ': "n",
	'⅀': "e",
	'⅁': "g",
	'⅄': "l",
	'ⅅ': "d",
	'ⅆ': "d",
	'ⅇ': "e",
	'ⅈ': "i",
	'ⅉ': "ji",
}
Loading

0 comments on commit 8e6ef79

Please sign in to comment.