Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
Increase accuracy by about 1%, improve code.
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed Feb 24, 2021
1 parent 2c7d733 commit 87a47b7
Show file tree
Hide file tree
Showing 8 changed files with 1,756 additions and 97 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ $ go run hello_world.go
Accuracy was evaluated based on a [dataset of moderated comments](https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv).
**Package**|**Accuracy**|**Comment**
:-----:|:-----:|:-----:
https://github.com/finnbear/moderation|90.44%|Current version is not stable
https://github.com/TwinProduction/go-away|82.06%|Many false positives from combined words like "push it"
https://github.com/finnbear/moderation|91.78%|Current API version is not stable
https://github.com/TwinProduction/go-away|82.07%|Many false positives from combined words like "push it"


## Acknowledgements
Expand Down
34 changes: 34 additions & 0 deletions examples/detection_finder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package main

import (
"fmt"
"flag"
"github.com/finnbear/moderation"
)

// input is the text to scan; it is bound to the -input flag in init.
var input string

// init binds the -input flag to the input variable and parses the command
// line, so input is already populated by the time main runs.
//
// NOTE(review): calling flag.Parse from init is generally discouraged (it runs
// before main and can fail under "go test", whose flags are registered
// later); consider moving the Parse call into main.
func init() {
	flag.StringVar(&input, "input", "", "the input to find an inappropriate phrase in")
	flag.Parse()
}

// main greedily shrinks the -input text to a short substring that
// moderation.IsInappropriate still flags, and prints the result.
//
// The same predicate is used for both shrink passes and for the final check.
// Mixing a broader predicate into the first pass could trim the text down to
// a prefix that only the broader predicate flags, which would then be
// reported as "No inappropriate phrase found" even though the original input
// was inappropriate.
func main() {
	// Pass 1: drop trailing bytes for as long as the remaining prefix is
	// still flagged. input always holds the last flagged candidate. The
	// length guard prevents slicing an empty string should the empty string
	// ever be flagged.
	shorter := input
	for len(shorter) > 0 && moderation.IsInappropriate(shorter) {
		input = shorter
		shorter = shorter[:len(shorter)-1]
	}

	// Pass 2: drop leading bytes from the result of pass 1 in the same way.
	shorter = input
	for len(shorter) > 0 && moderation.IsInappropriate(shorter) {
		input = shorter
		shorter = shorter[1:]
	}

	// If the original text was never flagged, input is unchanged and this
	// check reports that nothing was found.
	if moderation.IsInappropriate(input) {
		fmt.Printf("Found inappropriate phrase: %s\n", input)
	} else {
		fmt.Println("No inappropriate phrase found")
	}
}
4 changes: 4 additions & 0 deletions generator/dictionary_blacklist.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
acock
asshole
(.*)bastard(.*)
blueballs
booby
Expand All @@ -10,11 +11,13 @@ crapper(.*)
crappie(.*)
crapy
dildo(.*)
ejaculation
fag(.*)
fuck(.*)
hell
jackass
livesex
motherfuck(.*)
(.*)nigger(.*)
nude(.*)
penises
Expand All @@ -28,6 +31,7 @@ shita
shithead
shitt(.*)
slutt(.*)
spunked
tite
tittie(s*)
(.*)vagina(.*)
Expand Down
1 change: 1 addition & 0 deletions generator/dictionary_extra.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
anna
afk
akshita
asap
Expand Down
24 changes: 24 additions & 0 deletions generator/profanity.csv
Original file line number Diff line number Diff line change
Expand Up @@ -19,52 +19,71 @@ btch,0,1,1,0
bugger,1,0,0,0
bullshit,1,0,0,0
butt,1,0,0,0
bra,0,0,1,0
chink,0,2,0,0
clitoris,0,0,2,0
cock,1,0,1,0
cok,1,0,1,0
condom,0,0,1,0
coon,0,3,0,0
cracker,0,1,0,0
crap,1,0,0,0
cum,0,0,2,0
cunt,0,1,1,0
dick,1,0,1,0
dildo,0,0,2,0
dong,0,0,1,0
douche,1,1,1,1
dumb,0,0,0,1
dyke,1,1,1,0
effing,1,0,1,0
ejaculat,0,0,1,0
fag,0,3,0,0
fatty,0,1,0,1
feck,1,0,1,0
felching,0,0,2,0
fellate,0,0,2,0
fellatio,0,0,2,0
flange,0,0,2,0
foursome,0,0,2,0
freak,0,0,0,1
frig,1,0,1,0
fuck,1,0,1,0
fudgepacker,0,0,2,0
fuk,1,0,1,0
gangbang,0,2,0,0
ghetto,0,1,0,0
handjob,0,0,2,0
hate,0,0,0,1
heshe,0,0,1,0
hoe,0,1,1,0
homo,0,1,0,0
horny,0,0,2,0
idiot,0,0,0,1
incest,0,0,2,0
jerk,1,0,0,0
jizz,0,0,2,0
killyourself,0,1,0,2
labia,0,0,2,0
loser,0,0,0,1
masterbat,0,0,1,0
motherfuck,0,1,1,0
muff,0,0,2,0
naked,0,0,1,0
nazi,0,1,0,0
nigga,1,2,0,0
nigger,2,3,0,0
nigguh,1,2,0,0
nude,0,0,2,0
orgasm,0,0,1,0
penis,2,0,2,0
phuck,1,0,1,0
phuk,1,0,1,0
pieceofgarbage,0,0,0,1
piss,1,0,0,0
poop,1,0,0,0
porn,0,0,2,0
pregnant,0,0,1,0
prick,2,0,0,0
prostitut,0,0,2,0
pube,0,0,2,0
Expand All @@ -80,17 +99,22 @@ sex,0,0,1,0
shagger,0,0,2,0
shagging,0,0,2,0
shit,1,0,0,0
sissy,0,0,0,1
slut,0,1,1,0
spunk,0,0,2,0
stfu,0,0,0,1
stupid,0,0,0,1
suckmy,0,0,2,0
threesome,0,0,2,0
tit,0,0,2,0
tohell,1,1,0,0
tosser,1,0,0,0
turd,1,0,0,0
twat,0,0,2,0
ugly,0,0,0,1
vagina,0,0,2,0
wank,0,0,2,0
whore,0,2,2,0
wigga,1,2,0,0
xrated,0,0,1,0
xxx,0,0,1,0
169 changes: 99 additions & 70 deletions moderation.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ func init() {
}
}

// Replace the key with any one of the characters in the value
var replacements = [...]string{
'!': "li",
'@': "a",
Expand Down Expand Up @@ -68,103 +69,131 @@ func IsInappropriate(text string) bool {
// Is returns whether a phrase contains enough words matching the
// types flag to meet or exceed InappropriateThreshold
func Is(text string, types Type) bool {
// Sanitize input
buf := make([]byte, 0, len(text))
_, n, _ := transform.Append(removeAccentsTransform, buf, []byte(text))
text = string(buf[:n])

// How many characters ago the last separator character was observed,
// expressed as an upper and a lower bound in relation to the current queue
// of matches
lastSepMin := 0
lastSepMax := 0

// Same as above, but for replacements
lastReplacementMin := 0
lastReplacementMax := 0

// Scan status
var matches radix.Queue
inappropriateLevel := 0

var lastMatchable byte

for _, textRune := range text {
if textRune >= 0x0020 && textRune <= 0x007E {
textByte := byte(textRune)
var textBytes string
lastSepMin++
lastSepMax++

ok := true
matchable := false
skippable := false

var replacement string
if int(textByte) < len(replacements) {
replacement = replacements[textByte]
}
// Unhandled runes (not printable, not representable as byte, etc.)
if textRune < 0x0020 || textRune > 0x007E {
continue
}

textByte := byte(textRune)
var textBytes string
lastSepMin++
lastSepMax++
lastReplacementMin++
lastReplacementMax++

switch {
case textByte >= 'a' && textByte <= 'z':
matchable = true
case textByte >= 'A' && textByte <= 'Z':
textByte += 'a' - 'A'
matchable = true
case replacement != "":
textByte = replacement[0]
textBytes = replacement
matchable = true
matchable := false
skippable := false

var replacement string
if int(textByte) < len(replacements) {
replacement = replacements[textByte]
}

switch {
case textByte >= 'A' && textByte <= 'Z':
textByte += 'a' - 'A'
fallthrough
case textByte >= 'a' && textByte <= 'z':
matchable = true
case replacement != "":
textByte = replacement[0]
textBytes = replacement
matchable = true
lastReplacementMin = 0
lastReplacementMax = 0
default:
switch textByte {
case '_', '.', ',', '*':
lastReplacementMin = 0
lastReplacementMax = 0
fallthrough
case ' ', '-':
skippable = true
lastSepMin = 0
lastSepMax = 0
default:
switch textByte {
case ' ', '_', '-', '.', ',', '*':
skippable = true
lastSepMin = 0
lastSepMax = 0
default:
ok = false
}
continue
}
}

if textByte == lastMatchable {
lastSepMin-- // this character doesn't count
}
if textByte == lastMatchable {
// this character doesn't count so cancel the increments to min
lastSepMin--
lastReplacementMin--
}

if ok {
if matchable {
matches.Append(tree.Root())
if matchable {
matches.Append(tree.Root())

originalLength := matches.Len()
for i := 0; i < originalLength; i++ {
match := matches.Remove()
originalLength := matches.Len()
for i := 0; i < originalLength; i++ {
match := matches.Remove()

if textByte == lastMatchable {
matches.Append(match)
}
if skippable || textByte == lastMatchable {
matches.Append(match)
}

// Process textBytes as multiple textBytes or textByte
loops := 1
if len(textBytes) > 1 {
loops = len(textBytes)
}
// Process textBytes as multiple textBytes or textByte
loops := 1
if len(textBytes) > 1 {
loops = len(textBytes)
}

for i := 0; i < loops; i++ {
loopTextByte := textByte
if len(textBytes) > 0 {
loopTextByte = textBytes[i]
}
next := match.Next(loopTextByte)

if next != nil {
if next.Word() {
if next.Depth() > 4 || (next.Depth() > 3 && next.Start() != 's') || (next.Depth() >= lastSepMin && next.Depth() <= lastSepMax) {
match := next.Data() & uint32(types)
for i := 0; i < 4; i++ {
inappropriateLevel += int(int8(match >> (i * 8)))
}
}
}
for i := 0; i < loops; i++ {
loopTextByte := textByte
if len(textBytes) > 0 {
loopTextByte = textBytes[i]
}
next := match.Next(loopTextByte)

matches.Append(next)
if next == nil {
continue
}

if next.Word() {
if next.Depth() > 4 || (next.Depth() > 3 && next.Start() != 's') || (next.Depth() >= lastSepMin && next.Depth() <= lastSepMax) {
match := next.Data() & uint32(types)
for i := 0; i < 4; i++ {
level := int(int8(match >> (i * 8)))

// False positives that contain replacements are not matched
if level > 0 || next.Depth() - 1 <= lastReplacementMax {
inappropriateLevel += level
}
}
}
}

lastMatchable = textByte
} else if !skippable {
matches.Clear()
matches.Append(next)
}
}

lastMatchable = textByte
} else if !skippable {
matches.Clear()
}
}

return inappropriateLevel >= InappropriateThreshold
}
4 changes: 2 additions & 2 deletions moderation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ func TestAnalyzeWikipedia(t *testing.T) {
} else {
correctOk++
}
} else {
//fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v", phrase, analysis.IsInappropriate(), offensive)
} else if len(phrase) < 40 {
//fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v\n", phrase, !offensive, offensive)
}

total++
Expand Down
Loading

0 comments on commit 87a47b7

Please sign in to comment.