-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyse.go
89 lines (79 loc) · 2.45 KB
/
analyse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
/*
Copyright 2020-2021 Ihangji, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package linguist
import (
"bytes"
"log"
"math"
"github.com/ihangji/linguist/data"
"github.com/ihangji/linguist/tokenizer"
"github.com/jbrukh/bayesian"
)
var (
	// classifier holds the Bayesian classifier loaded by getClassifier.
	// It is nil until getClassifier has loaded it.
	classifier *bayesian.Classifier

	// classifierInitialized is set to true by getClassifier once classifier
	// has been loaded without error.
	classifierInitialized bool
)
// getClassifier returns the package-level bayesian.Classifier, loading it from
// the embedded "classifier" asset the first time it is called.
//
// The classifier has been trained on programming language samples from
// github.com/github/linguist after running the generator.
//
// See also cmd/generate-classifier
//
// A failure to read or decode the asset is reported through log.Panicln.
func getClassifier() *bayesian.Classifier {
	// NOTE(tso): this could probably go into an init() function instead,
	// but this lazy loading approach works, and it's conceivable that
	// Analyse() might not be invoked in an actual runtime anyway.
	if classifierInitialized {
		return classifier
	}

	raw, err := data.Asset("classifier")
	if err != nil {
		log.Panicln(err)
	}

	// Assign straight into the package-level variable; the initialized flag
	// is only flipped after the decode error has been checked.
	classifier, err = bayesian.NewClassifierFromReader(bytes.NewReader(raw))
	if err != nil {
		log.Panicln(err)
	}
	classifierInitialized = true

	return classifier
}
// Analyse returns the name of a programming language, or the empty string if
// one could not be determined.
//
// The contents are tokenized and scored using Naive Bayesian Classification.
// When hints is empty, the class at the index reported by LogScores is
// returned. Otherwise only classes whose names appear in hints are considered
// and the one with the highest score is returned; if no class matches a hint,
// the result is the empty string.
//
// It is recommended to use LanguageByContents() instead of this function
// directly.
//
// Obtain hints from LanguageHints()
//
// NOTE(tso): May yield inaccurate results
func Analyse(contents []byte, hints []string) (language string) {
	tokens := tokenizer.Tokenize(contents)
	c := getClassifier()
	// The third return value of LogScores is not used by this function.
	logScores, top, _ := c.LogScores(tokens)

	// Without hints there is nothing to filter on.
	if len(hints) == 0 {
		return string(c.Classes[top])
	}

	// Build a set of the hinted language names for membership checks.
	allowed := make(map[string]struct{}, len(hints))
	for i := range hints {
		allowed[hints[i]] = struct{}{}
	}

	winner, winnerScore := "", math.Inf(-1)
	for i, s := range logScores {
		name := string(c.Classes[i])
		if _, hinted := allowed[name]; !hinted {
			continue
		}
		// ">=" means that on equal scores the class visited last wins.
		if s >= winnerScore {
			winner, winnerScore = name, s
		}
	}
	return winner
}