-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgib_detect.js
143 lines (115 loc) · 4.13 KB
/
gib_detect.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
'use strict';
const fs = require('fs');
// Alphabet the model works with: the lowercase ASCII letters plus the space
// character. normalize() discards every other character.
const accepted_chars = 'abcdefghijklmnopqrstuvwxyz ';

// Alphabet size; the transition matrix built by train() is k x k.
const k = accepted_chars.length;

// Maps each accepted character to its row/column index in the matrix.
const pos = {};
accepted_chars.split('').forEach(function (ch, index) {
  pos[ch] = index;
});

// Files used by train(): the training corpus, the sample phrases that should
// score well, the sample phrases that should score badly, and the JSON file
// the trained model is written to. All are relative paths.
const trainFile = 'big.txt';
const goodFile = 'good.txt';
const badFile = 'bad.txt';
const modelFile = 'gib_model.json';
/**
 * Reduce a line to the characters the model knows about.
 *
 * The line is lowercased, split into single characters, and every character
 * that does not occur in `accepted_chars` is dropped.
 *
 * @param {string} line - Raw input text.
 * @returns {string[]} The surviving characters, one per element, in their
 *   original order.
 */
function normalize(line) {
  const kept = [];
  for (const ch of line.toLowerCase().split('')) {
    if (accepted_chars.includes(ch)) {
      kept.push(ch);
    }
  }
  return kept;
}
/**
 * Train the character-bigram model and save it to `modelFile`.
 *
 * Steps:
 *  1. Count character-to-character transitions in `trainFile`, line by line
 *     (transitions never span a line break).
 *  2. Convert each row of counts into log probabilities.
 *  3. Score every usable line of `goodFile` and `badFile` with
 *     averageTransitionProbability().
 *  4. Write `{ matrix, threshold }` as JSON to `modelFile`, where the
 *     threshold lies halfway between the lowest good score and the highest
 *     bad score.
 *
 * @returns {boolean} true when the model was written; false (and nothing is
 *   written) when either sample file has no usable line or when the good and
 *   bad scores overlap.
 * @throws {Error} Whatever `fs` throws if a file cannot be read or written.
 */
function train() {
  // Assume we have seen 10 of each character pair. This acts as a kind of
  // prior or smoothing factor. This way, if we see a character transition
  // live that we've never observed in the past, we won't assume the entire
  // string has 0 probability.
  const log_prob_matrix = Array.from({ length: k }, () => new Array(k).fill(10));

  // Count transitions from big text file, taken
  // from http://norvig.com/spell-correct.html
  const lines = fs.readFileSync(trainFile).toString('utf8').split('\n');
  for (const line of lines) {
    const chars = normalize(line);
    for (let i = 1; i < chars.length; i++) {
      log_prob_matrix[pos[chars[i - 1]]][pos[chars[i]]] += 1;
    }
  }

  // Normalize the counts so that they become log probabilities.
  // We use log probabilities rather than straight probabilities to avoid
  // numeric underflow issues with long texts.
  // This contains a justification:
  // http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
  for (const row of log_prob_matrix) {
    const total = row.reduce((sum, count) => sum + count, 0);
    for (let j = 0; j < row.length; j++) {
      row[j] = Math.log(row[j] / total);
    }
  }

  // Score each line of a sample file. Lines with fewer than two accepted
  // characters contain no transition at all (typically the empty string left
  // behind by a trailing newline); their score says nothing about the model
  // and would otherwise dominate the min/max below, so they are skipped.
  const scoreSamples = (file) =>
    fs.readFileSync(file).toString('utf8').split('\n')
      .filter((line) => normalize(line).length > 1)
      .map((line) => averageTransitionProbability(line, log_prob_matrix));

  // Find the probability of generating a few arbitrarily chosen good and
  // bad phrases.
  const good_probs = scoreSamples(goodFile);
  const bad_probs = scoreSamples(badFile);

  // Without samples on both sides there is nothing to calibrate against
  // (min/max of an empty list are +/-Infinity and the threshold would be NaN).
  if (good_probs.length === 0 || bad_probs.length === 0) {
    return false;
  }

  // Assert that we actually are capable of detecting the junk.
  const min_good_probs = Math.min(...good_probs);
  const max_bad_probs = Math.max(...bad_probs);
  if (min_good_probs <= max_bad_probs) {
    return false;
  }

  // And pick a threshold halfway between the worst good and best bad inputs.
  const threshold = (min_good_probs + max_bad_probs) / 2;
  console.log('good', good_probs);
  console.log('bad', bad_probs);
  console.log('th', threshold);

  // Save the matrix together with the threshold.
  fs.writeFileSync(modelFile, JSON.stringify({
    'matrix': log_prob_matrix,
    'threshold': threshold
  }));
  return true;
}
/**
 * Return the average transition probability of `line` under the model.
 *
 * The line is passed through normalize(); for each pair of adjacent surviving
 * characters the matching entry of `log_prob_matrix` is summed, the sum is
 * divided by the number of transitions, and the result is exponentiated. That
 * is the geometric mean of the per-transition probabilities.
 *
 * @param {string} line - Text to score.
 * @param {number[][]} log_prob_matrix - Log probabilities indexed as
 *   `[pos[from]][pos[to]]`.
 * @returns {number} The geometric mean of the transition probabilities, or 1
 *   when the line contains no transition (fewer than two accepted characters).
 */
function averageTransitionProbability(line, log_prob_matrix) {
  const chars = normalize(line);
  const transition_ct = chars.length - 1;
  if (transition_ct < 1) {
    // Empty sum in log space: exp(0) === 1.
    return 1;
  }

  // The accumulator lives in log space, so it must start at 0 (= log 1).
  // Starting anywhere else multiplies every result by exp(start / n), which
  // skews short strings far more than long ones.
  // NOTE: a threshold stored by a model that was trained with a non-zero
  // start was calibrated on those inflated scores; retrain to refresh it.
  let log_prob = 0;
  for (let i = 1; i < chars.length; i++) {
    log_prob += log_prob_matrix[pos[chars[i - 1]]][pos[chars[i]]];
  }
  return Math.exp(log_prob / transition_ct);
}
/**
 * Load the trained model from `modelFile`, training it first when needed.
 *
 * Any failure to read or parse the file (missing, unreadable, invalid JSON)
 * triggers train(); the file is then read again.
 *
 * @returns {{matrix: number[][], threshold: number}} The parsed model.
 * @throws {Error} When train() returns false, i.e. it wrote no model, or when
 *   the freshly written model still cannot be read or parsed.
 */
function loadModel() {
  try {
    return JSON.parse(fs.readFileSync(modelFile).toString('utf8'));
  } catch (e) {
    // Deliberately broad: whatever went wrong with the stored model, the
    // remedy is the same: rebuild it from the training data.
    if (!train()) {
      // train() returns false without writing anything; reading the file
      // again would only fail with an unrelated ENOENT or SyntaxError.
      throw new Error(`gib_detect: train() returned false, so no model was written to ${modelFile}`);
    }
    return JSON.parse(fs.readFileSync(modelFile).toString('utf8'));
  }
}

// Model used by gib_dect(); loaded (or trained) once at module load time.
const model_data = loadModel();
/**
 * Score a line against the loaded model and compare it with the model's
 * threshold.
 *
 * Note the direction of the result: train() places the threshold between the
 * scores of its good and bad samples, so `true` means the line scored above
 * the threshold, like the good samples, not that it was flagged as gibberish.
 *
 * @param {string} line - Text to classify.
 * @returns {boolean} true when the line's average transition probability is
 *   strictly greater than `model_data.threshold`.
 */
function gib_dect(line) {
  const { matrix, threshold } = model_data;
  const score = averageTransitionProbability(line, matrix);
  return score > threshold;
}
// Public API of this module: the detector function is its only export.
exports.gib_dect = gib_dect;