Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
1% accuracy improvement, add spam detection.
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed Feb 27, 2021
1 parent 6551981 commit f9703b8
Show file tree
Hide file tree
Showing 11 changed files with 1,253 additions and 144 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
2. Minimum possible allocations, processing time, and binary size
3. Minimum false negatives (including text like `h3110_w0r!d`)
4. Minimum false positives
5. (Future) Other analysis types than inappropriate, profane, offensive, sexual, mean (spam, violence, contact info, etc.)
6. (Future) Implement a way to censor text
7. (Future) basic support for languages other than English
5. Provide a way to censor text
6. (Future) Analysis types beyond inappropriate, profane, offensive, sexual, and mean (e.g. spam, violence, contact info)
7. (Future) Basic support for languages other than English

## Example
```go
Expand Down Expand Up @@ -49,12 +49,12 @@ $ go run hello_world.go
```

## Comparison
Accuracy was evaluated based on the first 50,000 items from this [dataset of moderated comments](https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv).
Accuracy was evaluated based on the first 100,000 items from this [dataset of moderated comments](https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv).

|**Package**|**Time**|**Accuracy**|**Comment**|
|:-----:|:-----:|:-----:|:-----:|
|[finnbear/moderation](https://github.com/finnbear/moderation)|0.63s|91.78%|Current API version is not stable|
|[TwinProduction/go-away](https://github.com/TwinProduction/go-away)|1.09s|82.07%|Many false positives from combined words like "push it"|
|[finnbear/moderation](https://github.com/finnbear/moderation)|1.23s|92.71%|Current API version is not stable|
|[TwinProduction/go-away](https://github.com/TwinProduction/go-away)|2.20s|82.14%|Many false positives from combined words like "push it"|


## Acknowledgements
Expand Down
4 changes: 3 additions & 1 deletion censor.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ var CensorReplacment rune = '*'
// Censor returns a string with all but the first character of any inappropriate
// segment replaced with CensorReplacment
//
// It is currently Experimental
// It is currently experimental and not fully tested.
func Censor(text string, types Type) (censoredText string, replaced int) {
// Fast path
if len(text) == 0 || !Is(text, types) {
Expand All @@ -20,6 +20,8 @@ func Censor(text string, types Type) (censoredText string, replaced int) {

start := 0

// TODO: scan ahead for false positives

for i := start; i <= str.RuneCount(); i++ {
slice := str.Slice(start, i)
if /* (i == str.RuneCount() || str.At(i) == ' ') && */ Is(slice, types) {
Expand Down
2 changes: 1 addition & 1 deletion comparison/comparison_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func testWikipedia(t *testing.T, isInappropriate func(string) bool) {
correct := 0
total := 0

for total < 50000 {
for total < 100000 {
fields, err := reader.Read()
if err != nil {
if err == io.EOF {
Expand Down
36 changes: 31 additions & 5 deletions comparison/go.sum
Original file line number Diff line number Diff line change
@@ -1,12 +1,38 @@
github.com/TwinProduction/go-away v1.0.1 h1:LDe6jPktucIz/dftNGL5x2LYRB6VXjVUtRsrlVHo+Ag=
github.com/TwinProduction/go-away v1.0.1/go.mod h1:VB/lNzhkzh7Xw2QgU+tYBjMheldukJaIJzVaIx2rh30=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20201218220906-28db891af037/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/TwinProduction/go-away v1.1.0 h1:AhkmMxDIxI4Dr0/Hki/qtFfLh/02MOtmDEqLuyA55lU=
github.com/TwinProduction/go-away v1.1.0/go.mod h1:VB/lNzhkzh7Xw2QgU+tYBjMheldukJaIJzVaIx2rh30=
github.com/finnbear/moderation v0.5.0 h1:k00252U3XaworO6EN/VRf1hasn0pcANWe7tNjEoyzsc=
github.com/finnbear/moderation v0.5.0/go.mod h1:zoexQHUm4TZCb9x/Re0TqV8SgvnDPZjMRdSMAjEqmgE=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/exp v0.0.0-20190731235908-ec7cb31e5a56/go.mod h1:JhuoJpWY28nO4Vef9tZUw9qufEGTyX1+7lmHxV5q5G4=
golang.org/x/exp v0.0.0-20210220032938-85be41e4509f h1:GrkO5AtFUU9U/1f5ctbIBXtBGeSJbWwIYfIsTcFMaX4=
golang.org/x/exp v0.0.0-20210220032938-85be41e4509f/go.mod h1:I6l2HNBLBZEcrOoCpyKLdY2lHoRZ8lI4x60KMCQDft4=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE=
golang.org/x/mobile v0.0.0-20201217150744-e6ae53a27f4f/go.mod h1:skQtrUTUwhdJvXM/2KKJzY8pDgNr9I/FOMqDVRPBUS4=
golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.1.1-0.20191209134235-331c550502dd/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.1-0.20200828183125-ce943fd02449/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.4 h1:0YWbFKbhXG/wIiuHDSKpS0Iy7FSA+u45VtBMfQcFTTc=
golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200117012304-6edc0a871e69/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
14 changes: 14 additions & 0 deletions generator/dictionary_blacklist.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
acock
arsehole
asshole
(.*)bastard(.*)
bitchi
bitches(.*)
bitcho
bitchy
blackbutt
blackcock
blueballs
booby
boobie(.*)
Expand All @@ -11,12 +18,18 @@ crapper(.*)
crappie(.*)
crapy
dildo(.*)
douche(.*)
ejaculation
fag(.*)
fellate
fellatio
frigg(.*)
fuck(.*)
gay
hellhole
jackass
livesex
masturbate
motherfuck(.*)
(.*)nigger(.*)
nude(.*)
Expand All @@ -25,6 +38,7 @@ penist
poop(.*)
porn(.?)
(.*)prostitut(.*)
rehoe
retarded
sex(.*)
shita
Expand Down
26 changes: 17 additions & 9 deletions generator/generate.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 23 additions & 3 deletions generator/profanity.csv
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ bra,0,0,1,0
chink,0,2,0,0
clit,0,0,2,0
cock,1,0,1,0
cok,1,0,1,0
commie,0,1,0,0
condom,0,0,1,0
coon,0,3,0,0
cracker,0,1,0,0
Expand All @@ -41,15 +41,17 @@ dominatrix,0,0,2,0
dong,0,0,1,0
douche,1,1,1,1
dumb,0,0,0,1
dumbass,1,0,0,1
dyke,1,1,1,0
effing,1,0,1,0
ejaculat,0,0,1,0
fascist,0,1,0,0
fag,0,3,0,0
fatty,0,1,0,1
feck,1,0,1,0
felching,0,0,2,0
fellate,0,0,2,0
fellatio,0,0,2,0
felate,0,0,2,0
felatio,0,0,2,0
flange,0,0,2,0
foursome,0,0,2,0
freak,0,0,0,1
Expand All @@ -59,6 +61,8 @@ fuck,1,0,1,0
fudgepacker,0,0,2,0
fuk,1,0,1,0
gangbang,0,2,0,0
gaay,0,1,0,0
genital,0,0,1,0
ghetto,0,1,0,0
handjob,0,0,2,0
hate,0,0,0,1
Expand All @@ -68,26 +72,35 @@ hoar,0,2,2,0
hoe,0,1,1,0
homo,0,1,1,0
honkey,0,1,0,0
honkie,0,1,0,0
hooker,0,0,2,0
horny,0,0,2,0
idiot,0,0,0,1
incest,0,0,2,0
jackass,1,0,0,1
jerk,1,0,0,0
jigaboo,0,2,0,0
jizz,0,0,2,0
killyourself,0,1,0,2
labia,0,0,2,0
loser,0,0,0,1
masterbat,0,0,1,0
masturbat,0,0,1,0
moron,0,0,0,1
motherfuck,1,1,1,0
muff,0,0,2,0
naked,0,0,1,0
nazi,0,1,0,0
nicca,1,2,0,0
nigga,1,2,0,0
nigger,2,3,0,0
nigglet,0,2,0,0
nigguh,1,2,0,0
nigr,2,3,0,0
nigs,1,2,0,0
nude,0,0,2,0
orgasm,0,0,1,0
peepee,1,0,1,0
penis,2,0,2,0
penus,2,0,2,0
phuc,1,0,1,0
Expand All @@ -96,6 +109,7 @@ pieceofgarbage,0,0,0,1
piss,1,0,0,0
poop,1,0,0,0
porn,0,0,2,0
pp,0,0,1,0
pregnant,0,0,1,0
prick,2,0,0,0
prostitut,0,0,2,0
Expand All @@ -110,6 +124,7 @@ retard,0,1,0,1
retart,0,1,0,1
rimjob,0,0,2,0
scrotum,0,0,2,0
scum,0,0,1,1
semen,0,0,2,0
sex,0,0,1,0
shagger,0,0,2,0
Expand All @@ -118,9 +133,12 @@ shit,1,0,0,0
sissy,0,0,0,1
skank,0,1,2,0
slut,0,1,1,0
sodomize,0,0,2,0
sodomy,0,0,2,0
spunk,0,0,2,0
stfu,0,0,0,1
stupid,0,0,0,1
suckit,0,0,1,1
suckmy,0,0,2,0
testical,0,0,2,0
testicle,0,0,2,0
Expand All @@ -134,7 +152,9 @@ ugly,0,0,0,1
vagina,0,0,2,0
vulva,0,0,2,0
wank,0,0,2,0
whitetrash,0,1,0,0
whore,0,2,2,0
wigga,1,2,0,0
wigger,1,2,0,0
xrated,0,0,1,0
xxx,0,0,1,0
25 changes: 21 additions & 4 deletions moderation.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ import (
type Type uint32

const (
Profane Type = 255 << (iota * 8)
Profane Type = 1 << iota
Offensive
Sexual
Mean
Inappropriate = Profane | Offensive | Sexual
Any Type = 0xffffffff
Spam
Inappropriate = Profane | Offensive | Sexual
Any = Profane | Offensive | Sexual | Spam | Mean
)

var (
Expand Down Expand Up @@ -105,6 +106,10 @@ func Is(text string, types Type) bool {
inappropriateLevel := 0
var lastMatchable byte

// For spam detection purposes
var upperCount int
var repetitionCount int

for _, textRune := range text {
// Unhandled runes (not printable, not representable as byte, etc.)
if textRune < minMatchable || maxMatchable < textRune {
Expand All @@ -128,6 +133,7 @@ func Is(text string, types Type) bool {

switch {
case textByte >= 'A' && textByte <= 'Z':
upperCount++
textByte += 'a' - 'A'
fallthrough
case textByte >= 'a' && textByte <= 'z':
Expand Down Expand Up @@ -155,6 +161,8 @@ func Is(text string, types Type) bool {

if matchable {
if textByte == lastMatchable {
repetitionCount++

// this character doesn't count so cancel the increments to min
lastSepMin--
lastReplacementMin--
Expand Down Expand Up @@ -198,8 +206,12 @@ func Is(text string, types Type) bool {

if next.Word() {
if next.Depth() > 4 || (next.Depth() > 3 && next.Start() != 's') || (next.Depth() >= lastSepMin && next.Depth() <= lastSepMax) {
match := next.Data() & uint32(types)
match := next.Data()
for i := 0; i < 4; i++ {
if types&Type(1<<i) == 0 {
continue
}

level := int(int8(match >> (i * 8)))

// False positives that contain replacements are not matched
Expand All @@ -221,5 +233,10 @@ func Is(text string, types Type) bool {
}
}

if types&Spam != 0 && len(text) > 5 {
spamPercent := (100 / 2) * (upperCount + repetitionCount) / len(text)
inappropriateLevel += spamPercent / 30
}

return inappropriateLevel >= InappropriateThreshold
}
Loading

0 comments on commit f9703b8

Please sign in to comment.