Increase accuracy by about 1%, improve code.

finnbear · Feb 24, 2021 · 87a47b7 · 87a47b7
1 parent 2c7d733
commit 87a47b7
Show file tree

Hide file tree

Showing 8 changed files with 1,756 additions and 97 deletions.
diff --git a/README.md b/README.md
@@ -52,8 +52,8 @@ $ go run hello_world.go
 Accuracy was evaluated based on a [dataset of moderated comments](https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv).
 **Package**|**Accuracy**|**Comment**
 :-----:|:-----:|:-----:
-https://github.com/finnbear/moderation|90.44%|Current version is not stable
-https://github.com/TwinProduction/go-away|82.06%|Many false positives from combined words like "push it"
+https://github.com/finnbear/moderation|91.78%|Current API version is not stable
+https://github.com/TwinProduction/go-away|82.07%|Many false positives from combined words like "push it"
 
 
 ## Acknowledgements

diff --git a/examples/detection_finder.go b/examples/detection_finder.go
@@ -0,0 +1,34 @@
+package main
+
+import (
+	"fmt"
+	"flag"
+	"github.com/finnbear/moderation"
+)
+
+var input string
+
+// init registers the -input command-line flag. Parsing is left to main so
+// that every flag is defined before flag.Parse runs.
+func init() {
+	flag.StringVar(&input, "input", "", "the input to find an inappropriate phrase in")
+}
+
+// main narrows the -input text down to a shorter substring that is still
+// detected: it first drops trailing bytes, then leading bytes, for as long as
+// the remainder keeps being flagged, and finally prints the result.
+func main() {
+	flag.Parse()
+
+	// Trim from the end while the shortened text is still detected with every
+	// type bit set. The length guard prevents slicing an empty string should
+	// the detector ever report it.
+	shorter := input
+	for len(shorter) > 0 && moderation.Is(shorter, 0xffffffff) { // satisfies all bitmasks
+		input = shorter
+		shorter = shorter[:len(shorter)-1]
+	}
+
+	// Trim from the start while the shortened text is still inappropriate.
+	shorter = input
+	for len(shorter) > 0 && moderation.IsInappropriate(shorter) {
+		input = shorter
+		shorter = shorter[1:]
+	}
+
+	if moderation.IsInappropriate(input) {
+		fmt.Printf("Found inappropriate phrase: %s\n", input)
+	} else {
+		fmt.Println("No inappropriate phrase found")
+	}
+}
diff --git a/generator/dictionary_blacklist.txt b/generator/dictionary_blacklist.txt
@@ -1,4 +1,5 @@
 acock
+asshole
 (.*)bastard(.*)
 blueballs
 booby
@@ -10,11 +11,13 @@ crapper(.*)
 crappie(.*)
 crapy
 dildo(.*)
+ejaculation
 fag(.*)
 fuck(.*)
 hell
 jackass
 livesex
+motherfuck(.*)
 (.*)nigger(.*)
 nude(.*)
 penises
@@ -28,6 +31,7 @@ shita
 shithead
 shitt(.*)
 slutt(.*)
+spunked
 tite
 tittie(s*)
 (.*)vagina(.*)

diff --git a/generator/dictionary_extra.txt b/generator/dictionary_extra.txt
@@ -1,3 +1,4 @@
+anna
 afk
 akshita
 asap

diff --git a/generator/profanity.csv b/generator/profanity.csv
@@ -19,52 +19,71 @@ btch,0,1,1,0
 bugger,1,0,0,0
 bullshit,1,0,0,0
 butt,1,0,0,0
+bra,0,0,1,0
+chink,0,2,0,0
 clitoris,0,0,2,0
 cock,1,0,1,0
 cok,1,0,1,0
 condom,0,0,1,0
 coon,0,3,0,0
+cracker,0,1,0,0
 crap,1,0,0,0
 cum,0,0,2,0
 cunt,0,1,1,0
 dick,1,0,1,0
 dildo,0,0,2,0
 dong,0,0,1,0
+douche,1,1,1,1
 dumb,0,0,0,1
 dyke,1,1,1,0
 effing,1,0,1,0
+ejaculat,0,0,1,0
 fag,0,3,0,0
+fatty,0,1,0,1
 feck,1,0,1,0
 felching,0,0,2,0
 fellate,0,0,2,0
 fellatio,0,0,2,0
 flange,0,0,2,0
+foursome,0,0,2,0
+freak,0,0,0,1
 frig,1,0,1,0
 fuck,1,0,1,0
 fudgepacker,0,0,2,0
 fuk,1,0,1,0
 gangbang,0,2,0,0
+ghetto,0,1,0,0
 handjob,0,0,2,0
+hate,0,0,0,1
 heshe,0,0,1,0
+hoe,0,1,1,0
+homo,0,1,0,0
 horny,0,0,2,0
 idiot,0,0,0,1
 incest,0,0,2,0
 jerk,1,0,0,0
 jizz,0,0,2,0
+killyourself,0,1,0,2
 labia,0,0,2,0
+loser,0,0,0,1
 masterbat,0,0,1,0
 motherfuck,0,1,1,0
 muff,0,0,2,0
 naked,0,0,1,0
+nazi,0,1,0,0
 nigga,1,2,0,0
 nigger,2,3,0,0
+nigguh,1,2,0,0
 nude,0,0,2,0
 orgasm,0,0,1,0
 penis,2,0,2,0
+phuck,1,0,1,0
+phuk,1,0,1,0
 pieceofgarbage,0,0,0,1
 piss,1,0,0,0
 poop,1,0,0,0
 porn,0,0,2,0
+pregnant,0,0,1,0
 prick,2,0,0,0
 prostitut,0,0,2,0
 pube,0,0,2,0
@@ -80,17 +99,22 @@ sex,0,0,1,0
 shagger,0,0,2,0
 shagging,0,0,2,0
 shit,1,0,0,0
+sissy,0,0,0,1
 slut,0,1,1,0
 spunk,0,0,2,0
+stfu,0,0,0,1
 stupid,0,0,0,1
 suckmy,0,0,2,0
+threesome,0,0,2,0
 tit,0,0,2,0
 tohell,1,1,0,0
 tosser,1,0,0,0
 turd,1,0,0,0
 twat,0,0,2,0
+ugly,0,0,0,1
 vagina,0,0,2,0
 wank,0,0,2,0
 whore,0,2,2,0
+wigga,1,2,0,0
 xrated,0,0,1,0
 xxx,0,0,1,0
diff --git a/moderation.go b/moderation.go
@@ -33,6 +33,7 @@ func init() {
 	}
 }
 
+// Replace the key with any one of the characters in the value
 var replacements = [...]string{
 	'!': "li",
 	'@': "a",
@@ -68,103 +69,131 @@ func IsInappropriate(text string) bool {
 // Is returns whether a phrase contains enough words matching the
 // types flag to meet or exceed InappropriateThreshold
 func Is(text string, types Type) bool {
+	// Sanitize input
 	buf := make([]byte, 0, len(text))
 	_, n, _ := transform.Append(removeAccentsTransform, buf, []byte(text))
 	text = string(buf[:n])
+
+	// How many characters ago the last separator character was observed,
+	// expressed as an upper and a lower bound in relation to the current queue
+	// of matches
 	lastSepMin := 0
 	lastSepMax := 0
 
+	// Same as above, but for replacements
+	lastReplacementMin := 0
+	lastReplacementMax := 0
+
+	// Scan status
 	var matches radix.Queue
 	inappropriateLevel := 0
-
 	var lastMatchable byte
+
 	for _, textRune := range text {
-		if textRune >= 0x0020 && textRune <= 0x007E {
-			textByte := byte(textRune)
-			var textBytes string
-			lastSepMin++
-			lastSepMax++
-
-			ok := true
-			matchable := false
-			skippable := false
-
-			var replacement string
-			if int(textByte) < len(replacements) {
-				replacement = replacements[textByte]
-			}
+		// Unhandled runes (not printable, not representable as byte, etc.)
+		if textRune < 0x0020 || textRune > 0x007E {
+			continue
+		}
+
+		textByte := byte(textRune)
+		var textBytes string
+		lastSepMin++
+		lastSepMax++
+		lastReplacementMin++
+		lastReplacementMax++
 
-			switch {
-			case textByte >= 'a' && textByte <= 'z':
-				matchable = true
-			case textByte >= 'A' && textByte <= 'Z':
-				textByte += 'a' - 'A'
-				matchable = true
-			case replacement != "":
-				textByte = replacement[0]
-				textBytes = replacement
-				matchable = true
+		matchable := false
+		skippable := false
+
+		var replacement string
+		if int(textByte) < len(replacements) {
+			replacement = replacements[textByte]
+		}
+
+		switch {
+		case textByte >= 'A' && textByte <= 'Z':
+			textByte += 'a' - 'A'
+			fallthrough
+		case textByte >= 'a' && textByte <= 'z':
+			matchable = true
+		case replacement != "":
+			textByte = replacement[0]
+			textBytes = replacement
+			matchable = true
+			lastReplacementMin = 0
+			lastReplacementMax = 0
+		default:
+			switch textByte {
+			case '_', '.', ',', '*':
+				lastReplacementMin = 0
+				lastReplacementMax = 0
+				fallthrough
+			case ' ', '-':
+				skippable = true
+				lastSepMin = 0
+				lastSepMax = 0
 			default:
-				switch textByte {
-				case ' ', '_', '-', '.', ',', '*':
-					skippable = true
-					lastSepMin = 0
-					lastSepMax = 0
-				default:
-					ok = false
-				}
+				continue
 			}
+		}
 
-			if textByte == lastMatchable {
-				lastSepMin-- // this character doesn't count
-			}
+		if textByte == lastMatchable {
+			// This character repeats the previous one and doesn't count, so cancel the increments to the lower bounds
+			lastSepMin--
+			lastReplacementMin--
+		}
 
-			if ok {
-				if matchable {
-					matches.Append(tree.Root())
+		if matchable {
+			matches.Append(tree.Root())
 
-					originalLength := matches.Len()
-					for i := 0; i < originalLength; i++ {
-						match := matches.Remove()
+			originalLength := matches.Len()
+			for i := 0; i < originalLength; i++ {
+				match := matches.Remove()
 
-						if textByte == lastMatchable {
-							matches.Append(match)
-						}
+				if skippable || textByte == lastMatchable {
+					matches.Append(match)
+				}
 
-						// Process textBytes as multiple textBytes or textByte
-						loops := 1
-						if len(textBytes) > 1 {
-							loops = len(textBytes)
-						}
+				// Process textBytes as multiple textBytes or textByte
+				loops := 1
+				if len(textBytes) > 1 {
+					loops = len(textBytes)
+				}
 
-						for i := 0; i < loops; i++ {
-							loopTextByte := textByte
-							if len(textBytes) > 0 {
-								loopTextByte = textBytes[i]
-							}
-							next := match.Next(loopTextByte)
-
-							if next != nil {
-								if next.Word() {
-									if next.Depth() > 4 || (next.Depth() > 3 && next.Start() != 's') || (next.Depth() >= lastSepMin && next.Depth() <= lastSepMax) {
-										match := next.Data() & uint32(types)
-										for i := 0; i < 4; i++ {
-											inappropriateLevel += int(int8(match >> (i * 8)))
-										}
-									}
-								}
+				for i := 0; i < loops; i++ {
+					loopTextByte := textByte
+					if len(textBytes) > 0 {
+						loopTextByte = textBytes[i]
+					}
+					next := match.Next(loopTextByte)
 
-								matches.Append(next)
+					if next == nil {
+						continue
+					}
+
+					if next.Word() {
+						if next.Depth() > 4 || (next.Depth() > 3 && next.Start() != 's') || (next.Depth() >= lastSepMin && next.Depth() <= lastSepMax) {
+							match := next.Data() & uint32(types)
+							for i := 0; i < 4; i++ {
+								level := int(int8(match >> (i * 8)))
+
+								// False positives that contain replacements are not matched
+								if level > 0 || next.Depth() - 1 <= lastReplacementMax {
+									inappropriateLevel += level
+								}
 							}
 						}
 					}
 
-					lastMatchable = textByte
-				} else if !skippable {
-					matches.Clear()
+					matches.Append(next)
 				}
 			}
+
+			lastMatchable = textByte
+		} else if !skippable {
+			matches.Clear()
 		}
 	}
+
 	return inappropriateLevel >= InappropriateThreshold
 }
diff --git a/moderation_test.go b/moderation_test.go
@@ -122,8 +122,8 @@ func TestAnalyzeWikipedia(t *testing.T) {
 			} else {
 				correctOk++
 			}
-		} else {
-			//fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v", phrase, analysis.IsInappropriate(), offensive)
+		} else if len(phrase) < 40 {
+			//fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v\n", phrase, !offensive, offensive)
 		}
 
 		total++
-Original file line number
+Diff line change
@@ -1,3 +1,4 @@
+    anna
     afk
     akshita
     asap
@@ Expand Down @@