index.js
const tagger = require('wink-pos-tagger')();
const lemmatize = require('wink-lemmatizer');
const preprocessor = require('text-preprocessor');
const stopwords = require('./stopwords/en');
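// Add a custom lexicon entry so the tagger treats "ooh" as an interjection (UH).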
tagger.updateLexicon({
  ooh: ['UH']
});
/**
 * Extracts a list of lemmatized words from a text, filtered by stop words. Non-word
 * tokens, tokens shorter than 3 characters, and tokens containing non-alphabetic
 * characters are removed.
 *
 * @param {String} text input text
 * @param {String[]|Boolean} filter list of custom stopwords that replaces the default
 *   list; pass `false` to skip stopword filtering entirely.
 * @returns {Object[]} tagged tokens, each with an added `vocabulary` property
 */
function extract(text, filter) {
  const normalizedText = preprocessor(text)
    .defaults()
    // .removeURLs()
    .expandContractions()
    .toString();
  // console.log(normalizedText);
  const tokens = tagger.tagSentence(normalizedText).filter(token => {
    return token.tag == 'word' &&
      token.normal.length > 2 &&
      /^[a-z]+$/.test(token.normal);
  }).map(token => {
    // if (token.pos === 'NN' && /ing$/.test(token.normal) && token.lemma.length >= 6) {
    //   token.pos = 'VBG';
    //   token.lemma = lemmatize.verb(token.normal);
    // }
    token.vocabulary = token.normal;
    switch (token.pos) {
      // https://github.com/finnlp/en-pos#readme
      // 'cars' to 'car'
      case 'NNS':
      case 'NNPS':
        if (token.normal.substr(-1, 1) == 's') {
          token.vocabulary = token.lemma;
        }
        if (token.vocabulary.substr(-3, 3) == 'ies') {
          token.vocabulary = token.vocabulary.slice(0, -3) + 'y';
        }
        break;
      // 'runs' to 'run'
      case 'VBZ':
        token.vocabulary = token.lemma;
        if (token.vocabulary.substr(-3, 3) == 'ies') {
          token.vocabulary = token.vocabulary.slice(0, -3) + 'y';
        }
        break;
      // 'running' to 'run'
      case 'VBG':
        token.vocabulary = token.lemma;
        if (/ing$/.test(token.vocabulary)) {
          token.vocabulary = token.vocabulary.slice(0, -3);
        }
        break;
      // 'wanted' to 'want'
      case 'VBD':
      case 'VBN':
        if (token.normal.substr(-2, 2) == 'ed') {
          token.vocabulary = token.lemma;
        }
        break;
      // 'limited' to 'limit'
      case 'JJ':
        if (token.normal.substr(-2, 2) == 'ed') {
          token.vocabulary = lemmatize.verb(token.normal);
        }
        break;
      default:
        break;
    }
    return token;
  });
  return filter === false
    ? tokens
    : tokens.filter(token => (filter ? filter : stopwords).indexOf(token.vocabulary) == -1);
}
module.exports = extract;
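// Usage sketch (illustrative only; the sample sentence, variable names, and expected
// values below are assumptions, not taken from this repository):
//
//   const extract = require('./index');
//
//   // Default stopword list:
//   const tokens = extract('The cars were running and John wanted candies.');
//   console.log(tokens.map(token => token.vocabulary));
//   // roughly: ['car', 'run', 'john', 'want', 'candy'] (exact output depends on the tagger)
//
//   // Custom stopword list, or `false` to disable stopword filtering:
//   extract('some text', ['some', 'custom', 'stopwords']);
//   extract('some text', false);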