Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
Update profanity list, add experimental Censor func, reduce allocations.
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed Feb 26, 2021
1 parent ac636ec commit 6551981
Show file tree
Hide file tree
Showing 11 changed files with 3,228 additions and 813 deletions.
43 changes: 43 additions & 0 deletions censor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package moderation

import "golang.org/x/exp/utf8string"

// CensorReplacment is the rune that Censor writes in place of each character
// it censors.
var CensorReplacment rune = '*'

// Censor masks the segments of text that Is reports as matching types. The
// first character of each such segment is kept and every remaining character
// of the segment is overwritten with CensorReplacment.
//
// It returns the resulting text together with the number of characters that
// were replaced. Text that is empty, or that Is does not flag as a whole, is
// returned unchanged with a count of zero.
//
// This function is experimental.
func Censor(text string, types Type) (censoredText string, replaced int) {
	// Nothing to do for empty or clean input.
	if text == "" || !Is(text, types) {
		return text, 0
	}

	source := utf8string.NewString(text)
	total := source.RuneCount()
	output := make([]rune, 0, total)

	// pending is the rune index of the first character not yet copied to
	// output.
	pending := 0
	for end := 0; end <= total; end++ {
		if !Is(source.Slice(pending, end), types) {
			continue
		}

		// The window [pending, end) has just started to match. Move the left
		// edge forward until the remaining tail no longer matches; everything
		// before that edge is kept and the tail is masked.
		from := pending
		for from <= end && Is(source.Slice(from, end), types) {
			from++
		}
		if from <= end {
			masked := end - from
			output = append(output, []rune(source.Slice(pending, from))...)
			for n := masked; n > 0; n-- {
				output = append(output, CensorReplacment)
			}
			replaced += masked
		}
		pending = end
	}

	// Copy whatever trails the last matched window.
	output = append(output, []rune(source.Slice(pending, total))...)
	return string(output), replaced
}
10 changes: 7 additions & 3 deletions examples/detection_finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,23 @@ func init() {
}

func main() {
fmt.Println("Original phrase: " + input)
censored, numCensored := moderation.Censor(input, moderation.Inappropriate)
fmt.Printf("Censored phrase: %s (%d characters replaced)\n", censored, numCensored)

shorter := input
for moderation.Is(shorter, 0xffffffff) { // satisfies all bitmasks
for moderation.Is(shorter, moderation.Any) { // satisfies all bitmasks
input = shorter
shorter = shorter[:len(shorter)-1]
}

shorter = input
for moderation.IsInappropriate(shorter) {
for moderation.Is(shorter, moderation.Any) {
input = shorter
shorter = shorter[1:]
}

if moderation.IsInappropriate(input) {
if moderation.Is(input, moderation.Any) {
fmt.Printf("Found inappropriate phrase: %s\n", input)
} else {
fmt.Println("No inappropriate phrase found")
Expand Down
2 changes: 1 addition & 1 deletion generator/dictionary_blacklist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dildo(.*)
ejaculation
fag(.*)
fuck(.*)
hell
hellhole
jackass
livesex
motherfuck(.*)
Expand Down
2 changes: 2 additions & 0 deletions generator/dictionary_extra.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
alt
anna
afk
akshita
Expand All @@ -23,3 +24,4 @@ ttyl
tysongay
wassup
yass
yesturday
28 changes: 24 additions & 4 deletions generator/profanity.csv
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@ bollock,1,0,1,0
bollok,1,0,1,0
boner,0,0,2,0
boob,0,0,2,0
breast,0,0,1,0
btch,0,1,1,0
bugger,1,0,0,0
bullshit,1,0,0,0
butt,1,0,0,0
bra,0,0,1,0
chink,0,2,0,0
clitoris,0,0,2,0
clit,0,0,2,0
cock,1,0,1,0
cok,1,0,1,0
condom,0,0,1,0
Expand All @@ -30,8 +31,13 @@ cracker,0,1,0,0
crap,1,0,0,0
cum,0,0,2,0
cunt,0,1,1,0
damn,1,0,0,0
daygo,0,2,0,0
dego,0,2,0,0
dick,1,0,1,0
dildo,0,0,2,0
dominatrics,0,0,2,0
dominatrix,0,0,2,0
dong,0,0,1,0
douche,1,1,1,1
dumb,0,0,0,1
Expand All @@ -47,6 +53,7 @@ fellatio,0,0,2,0
flange,0,0,2,0
foursome,0,0,2,0
freak,0,0,0,1
frick,1,0,0,0
frig,1,0,1,0
fuck,1,0,1,0
fudgepacker,0,0,2,0
Expand All @@ -55,9 +62,13 @@ gangbang,0,2,0,0
ghetto,0,1,0,0
handjob,0,0,2,0
hate,0,0,0,1
hell,1,0,0,0
heshe,0,0,1,0
hoar,0,2,2,0
hoe,0,1,1,0
homo,0,1,0,0
homo,0,1,1,0
honkey,0,1,0,0
hooker,0,0,2,0
horny,0,0,2,0
idiot,0,0,0,1
incest,0,0,2,0
Expand All @@ -67,17 +78,19 @@ killyourself,0,1,0,2
labia,0,0,2,0
loser,0,0,0,1
masterbat,0,0,1,0
motherfuck,0,1,1,0
motherfuck,1,1,1,0
muff,0,0,2,0
naked,0,0,1,0
nazi,0,1,0,0
nigga,1,2,0,0
nigger,2,3,0,0
nigguh,1,2,0,0
nigr,2,3,0,0
nude,0,0,2,0
orgasm,0,0,1,0
penis,2,0,2,0
phuck,1,0,1,0
penus,2,0,2,0
phuc,1,0,1,0
phuk,1,0,1,0
pieceofgarbage,0,0,0,1
piss,1,0,0,0
Expand All @@ -91,7 +104,10 @@ pussies,0,1,1,0
pusy,0,1,1,0
queer,0,2,0,0
rape,0,0,1,0
rectum,0,0,1,0
recktum,0,0,1,0
retard,0,1,0,1
retart,0,1,0,1
rimjob,0,0,2,0
scrotum,0,0,2,0
semen,0,0,2,0
Expand All @@ -100,11 +116,14 @@ shagger,0,0,2,0
shagging,0,0,2,0
shit,1,0,0,0
sissy,0,0,0,1
skank,0,1,2,0
slut,0,1,1,0
spunk,0,0,2,0
stfu,0,0,0,1
stupid,0,0,0,1
suckmy,0,0,2,0
testical,0,0,2,0
testicle,0,0,2,0
threesome,0,0,2,0
tit,0,0,2,0
tohell,1,1,0,0
Expand All @@ -113,6 +132,7 @@ turd,1,0,0,0
twat,0,0,2,0
ugly,0,0,0,1
vagina,0,0,2,0
vulva,0,0,2,0
wank,0,0,2,0
whore,0,2,2,0
wigga,1,2,0,0
Expand Down
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ module github.com/finnbear/moderation

go 1.15

require golang.org/x/text v0.3.4
require (
golang.org/x/exp v0.0.0-20210220032938-85be41e4509f
golang.org/x/text v0.3.4
)
31 changes: 31 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,34 @@
dmitri.shuralyov.com/gpu/mtl v0.0.0-20201218220906-28db891af037/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/exp v0.0.0-20190731235908-ec7cb31e5a56/go.mod h1:JhuoJpWY28nO4Vef9tZUw9qufEGTyX1+7lmHxV5q5G4=
golang.org/x/exp v0.0.0-20210220032938-85be41e4509f h1:GrkO5AtFUU9U/1f5ctbIBXtBGeSJbWwIYfIsTcFMaX4=
golang.org/x/exp v0.0.0-20210220032938-85be41e4509f/go.mod h1:I6l2HNBLBZEcrOoCpyKLdY2lHoRZ8lI4x60KMCQDft4=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE=
golang.org/x/mobile v0.0.0-20201217150744-e6ae53a27f4f/go.mod h1:skQtrUTUwhdJvXM/2KKJzY8pDgNr9I/FOMqDVRPBUS4=
golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
golang.org/x/mod v0.1.1-0.20191209134235-331c550502dd/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.1-0.20200828183125-ce943fd02449/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.4 h1:0YWbFKbhXG/wIiuHDSKpS0Iy7FSA+u45VtBMfQcFTTc=
golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200117012304-6edc0a871e69/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
28 changes: 22 additions & 6 deletions moderation.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ const (
Offensive
Sexual
Mean
Inappropriate = Profane | Offensive | Sexual
Any Type = 0xffffffff
)

var (
Expand Down Expand Up @@ -56,6 +58,11 @@ var replacements = [...]string{

var removeAccentsTransform = transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)

// minMatchable and maxMatchable are the inclusive bounds of the rune range
// that Is examines. Is skips any rune outside this range, and the presence of
// such a rune in the input is what triggers sanitization.
const (
minMatchable rune = 0x0020
maxMatchable rune = 0x007E
)

// IsInappropriate returns whether a phrase contains enough inappropriate words
// to meet or exceed InappropriateThreshold
//
Expand All @@ -66,13 +73,22 @@ func IsInappropriate(text string) bool {
return Is(text, Profane|Offensive|Sexual)
}

// IsInappropriate returns whether a phrase contains enough words matching the
// Is returns whether a phrase contains enough words matching the
// types flag to meet or exceed InappropriateThreshold
func Is(text string, types Type) bool {
// Sanitize input
buf := make([]byte, 0, len(text))
_, n, _ := transform.Append(removeAccentsTransform, buf, []byte(text))
text = string(buf[:n])
// Sanitize input, if needed
needsSanitize := false
for _, textRune := range text {
if textRune < minMatchable || maxMatchable < textRune {
needsSanitize = true
break
}
}
if needsSanitize {
buf := make([]byte, 0, len(text))
_, n, _ := transform.Append(removeAccentsTransform, buf, []byte(text))
text = string(buf[:n])
}

// How many characters ago the last separator character was observed,
// expressed as an upper and a lower bound in relation to the current queue
Expand All @@ -91,7 +107,7 @@ func Is(text string, types Type) bool {

for _, textRune := range text {
// Unhandled runes (not printable, not representable as byte, etc.)
if textRune < 0x0020 || textRune > 0x007E {
if textRune < minMatchable || maxMatchable < textRune {
continue
}

Expand Down
2 changes: 2 additions & 0 deletions moderation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ func TestAnalyzeWikipedia(t *testing.T) {
//fmt.Printf("phrase=\"%s\" analysis offensive=%v actual offensive=%v\n", phrase, !offensive, offensive)
}

_, _ = Censor(phrase, Inappropriate)

total++
if offensive {
totalNok++
Expand Down
Loading

0 comments on commit 6551981

Please sign in to comment.