-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrobot.go
117 lines (109 loc) · 2.7 KB
/
robot.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// Package cognibot Copyright 2016 Cognifly and Contributors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cognibot
import (
"bufio"
"io/ioutil"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/cognifly/cognilog"
)
// Robot holds the robots.txt exclusion rules parsed for a single host.
type Robot struct {
	RootURL      *url.URL            // crawl root of the host this robots.txt belongs to (request URL minus "robots.txt")
	FullAllow    bool                // true when robots.txt is a blanket "allow: /" for a single user-agent group, or the file is absent (4xx)
	FullDisallow bool                // true when robots.txt is a blanket "disallow: /" for a single user-agent group
	Groups       map[string][]string // rules keyed by the normalized (lowercased, whitespace-stripped) "user-agent:<name>" line
	CrawDelay    time.Duration       // delay between requests to this host; NOTE(review): field name is missing the "l" in "Crawl" but is exported, so renaming would break callers
}
// wsPattern matches a single whitespace character. RE2's \s is exactly
// [\t\n\f\r ], matching what the original per-call compile matched.
// Compiled once at package init instead of on every call.
var wsPattern = regexp.MustCompile(`\s`)

// trimSpaces returns s with every whitespace character removed.
// (The original replaced each whitespace rune with a space, split on
// spaces, and concatenated the pieces — equivalent to deleting the
// matches directly, but with a regexp compile and a quadratic string
// build on every call.)
func trimSpaces(s string) string {
	return wsPattern.ReplaceAllString(s, "")
}
// isAllowAll reports whether the parsed groups amount to a blanket
// "everything is allowed" policy: exactly one user-agent group whose
// single rule normalizes to "allow:/".
func isAllowAll(grp map[string][]string) bool {
	if len(grp) != 1 {
		return false
	}
	for _, rules := range grp {
		if len(rules) == 1 && trimSpaces(rules[0]) == "allow:/" {
			return true
		}
	}
	return false
}
// isDisallowAll reports whether the parsed groups amount to a blanket
// "everything is forbidden" policy: exactly one user-agent group whose
// single rule normalizes to "disallow:/".
func isDisallowAll(grp map[string][]string) bool {
	if len(grp) != 1 {
		return false
	}
	for _, rules := range grp {
		if len(rules) == 1 && trimSpaces(rules[0]) == "disallow:/" {
			return true
		}
	}
	return false
}
// MakeBot parses the http.Response of a robots.txt fetch and returns a
// Robot describing the host's crawl permissions.
//
// Status handling:
//   - 4xx: the file is missing/forbidden — assume no restrictions (FullAllow).
//   - 200: parse the body into user-agent rule groups and detect blanket
//     allow/disallow policies.
//   - anything else (e.g. 3xx, 5xx): return a Robot with only RootURL set,
//     leaving FullAllow and FullDisallow false.
func MakeBot(res *http.Response) *Robot {
	robot := new(Robot)
	// The request URL looks like "http://host/robots.txt"; trimming the
	// file name leaves the crawl root for this host.
	rootURL := NewCmd(strings.TrimSuffix(res.Request.URL.String(), "robots.txt")).URL()

	// Treat all 4xx errors the same way: assume there are no restrictions.
	// BUG FIX: the original condition was `res.StatusCode > 500`, which can
	// never hold together with the intended 4xx range, so 404 responses
	// fell through and produced a Robot with FullAllow == false.
	if res.StatusCode >= 400 && res.StatusCode < 500 {
		robot.RootURL = rootURL
		robot.FullAllow = true
		robot.CrawDelay = DefaultCrawlDelay
		return robot
	}

	if res.StatusCode == 200 {
		byt, err := ioutil.ReadAll(res.Body)
		if err != nil {
			cognilog.LogINFO("red", "Body read error", err)
		}
		scanner := bufio.NewScanner(strings.NewReader(string(byt)))
		groups := make(map[string][]string)
		var key string
		var inGroup bool
		for scanner.Scan() {
			txt := strings.ToLower(scanner.Text())
			if txt == "" {
				continue
			}
			// A "user-agent:" line opens a new group; subsequent rule
			// lines attach to the most recent group key.
			if strings.HasPrefix(txt, "user-agent:") {
				key = trimSpaces(txt)
				inGroup = true
				continue
			}
			if inGroup && key != "" {
				groups[key] = append(groups[key], trimSpaces(txt))
			}
		}
		if err := scanner.Err(); err != nil {
			cognilog.Log("red", err)
		}
		if isAllowAll(groups) {
			robot.FullAllow = true
		} else if isDisallowAll(groups) {
			robot.FullDisallow = true
		}
		robot.RootURL = rootURL
		robot.Groups = groups
		robot.CrawDelay = DefaultCrawlDelay
		return robot
	}

	// Other statuses: no rule groups, no blanket permission either way.
	robot.RootURL = rootURL
	return robot
}