-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalyse.go
58 lines (47 loc) · 1.33 KB
/
analyse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
package weasel
import (
"github.com/surgebase/porter2"
"strings"
"encoding/gob"
)
var (
	// Porter2Stemmer is the Analyser that stems text using the Porter2
	// implementation at https://github.com/surgebase/porter2.
	Porter2Stemmer porter2Stemmer

	// LowercaseFilter is the Analyser that transforms text to lowercase.
	LowercaseFilter lowercaseFilter

	// StopEn is the Analyser that filters English stop words, using the
	// default Elasticsearch stop word list.
	StopEn stopEn

	// stopwordsEn is the English stop word list consulted by StopEn.
	// Every entry is lowercase.
	stopwordsEn = []string{
		"a", "an", "and", "are", "as", "at",
		"be", "but", "by",
		"for",
		"if", "in", "into", "is", "it",
		"no", "not",
		"of", "on", "or",
		"such",
		"that", "the", "their", "then", "there", "these", "they", "this", "to",
		"was", "will", "with",
	}
)
// Analyser applies some function to text at index time.
//
// Analyse receives a piece of text and returns the transformed text together
// with an error. The analysers declared alongside this interface
// (Porter2Stemmer, LowercaseFilter and StopEn) always return a nil error;
// StopEn signals a filtered-out stop word by returning the empty string.
type Analyser interface {
// Analyse transforms text and returns the result.
Analyse(text string) (string, error)
}
// Stateless analyser implementations. Each one is an empty struct whose only
// purpose is to carry an Analyse method.
type (
	// porter2Stemmer is the type behind Porter2Stemmer.
	porter2Stemmer struct{}

	// lowercaseFilter is the type behind LowercaseFilter.
	lowercaseFilter struct{}

	// stopEn is the type behind StopEn.
	stopEn struct{}
)
// Analyse filters English stop words. If text is exactly equal to an entry of
// stopwordsEn (the comparison is case-sensitive), the empty string is
// returned; otherwise text is returned unchanged. The error is always nil.
func (stopEn) Analyse(text string) (string, error) {
	for i := 0; i < len(stopwordsEn); i++ {
		if stopwordsEn[i] != text {
			continue
		}
		// text is a stop word: drop it.
		return "", nil
	}
	return text, nil
}
// Analyse returns text converted to lowercase via strings.ToLower. The error
// is always nil.
func (lowercaseFilter) Analyse(text string) (string, error) {
	lowered := strings.ToLower(text)
	return lowered, nil
}
// Analyse returns the result of porter2.Stem applied to text. The error is
// always nil.
func (porter2Stemmer) Analyse(text string) (string, error) {
	stem := porter2.Stem(text)
	return stem, nil
}
// RegisterAnalysers registers the concrete types of Porter2Stemmer,
// LowercaseFilter and StopEn with encoding/gob, in that order.
//
// NOTE(review): presumably this is needed so that these analysers can be
// gob-encoded and decoded when held in interface-typed values; confirm
// against the callers.
func RegisterAnalysers() {
	builtins := []interface{}{Porter2Stemmer, LowercaseFilter, StopEn}
	for _, analyser := range builtins {
		gob.Register(analyser)
	}
}