crawler.go
package goccer

import (
	"crypto/tls"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	"golang.org/x/net/html"
)
// TODO: Abstract the type below to allow for different kinds of
// crawlers (e.g. db, fs); see the interface sketch after the struct.
type crawler struct {
	client *http.Client
	seed   string
}
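
// One possible shape for that abstraction (a hedged sketch under the TODO
// above, not part of the current implementation; the name "Crawler" is
// hypothetical): an interface that http-, db-, and fs-backed crawlers
// could all satisfy.
//
//	type Crawler interface {
//		Crawl(seed string) ([]string, error)
//	}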
// newCrawler returns a crawler whose HTTP client times out after 15
// seconds. Note that InsecureSkipVerify disables TLS certificate
// verification, so hosts with invalid certs will also be fetched.
func newCrawler() *crawler {
	return &crawler{
		client: &http.Client{
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{
					InsecureSkipVerify: true,
				},
			},
			Timeout: time.Second * 15,
		},
	}
}
// Crawl fetches the given seed and returns the URLs from c.parseHTML
func (c *crawler) Crawl(seed string) ([]string, error) {
	if strings.TrimSpace(seed) == "" {
		return nil, nil
	}
	c.seed = seed

	resp, err := c.client.Get(c.seed)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	return c.parseHTML(resp.Body), nil
}
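
// Minimal usage sketch (hedged: newCrawler is unexported, so this would
// run from inside package goccer; "https://example.com" is a placeholder):
//
//	c := newCrawler()
//	urls, err := c.Crawl("https://example.com")
//	if err != nil {
//		// handle the request error
//	}
//	for _, u := range urls {
//		fmt.Println(u)
//	}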
// parseHTML takes an io.Reader (http.Response.Body), extracts every
// <a> (anchor) tag, and returns the hrefs rebuilt as full URLs
func (c *crawler) parseHTML(body io.Reader) []string {
	if body == nil {
		return nil
	}

	var parsed []string
	checked := make(map[string]struct{})

	// https://pkg.go.dev/golang.org/x/net/html
	page := html.NewTokenizer(body)
	for {
		tokenType := page.Next()
		if tokenType == html.ErrorToken {
			// io.EOF or a malformed document: return what we have so far
			return parsed
		}

		token := page.Token()
		// Example token.DataAtom possibilities: h1, p, code, a, ul, li, ...
		// We only care about 'a' though
		if tokenType == html.StartTagToken && token.DataAtom.String() == "a" {
			for _, attr := range token.Attr {
				// attr can be a valid URL or a route,
				// e.g. "https://github.com/afkworks/spec-kn" || "books" || "#cite_note-77"
				if attr.Key == "href" && attr.Val != "" {
					rebuilt := c.rebuildURL(attr.Val)
					if _, exists := checked[rebuilt]; !exists {
						parsed = append(parsed, rebuilt)
						checked[rebuilt] = struct{}{}
					}
				}
			}
		}
	}
}
// rebuildURL turns an href attribute into a full URL based on c.seed
func (c *crawler) rebuildURL(href string) string {
	// href is already an absolute URL
	if strings.HasPrefix(href, "http") {
		return href
	}
	// href is a self-reference such as '/', '//', or '#'
	if len(href) < 3 {
		return c.seed
	}
	// otherwise, rebuild using c.seed and href
	switch {
	case strings.HasPrefix(href, "//"):
		// protocol-relative URL
		return fmt.Sprintf("http:%s", href)
	case strings.HasPrefix(href, "/"), strings.HasPrefix(href, "#"):
		return fmt.Sprintf("%s%s", c.seed, href)
	default:
		return fmt.Sprintf("%s/%s", c.seed, href)
	}
}
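
// For illustration, with a hypothetical c.seed of "https://example.com",
// rebuildURL maps hrefs as follows:
//
//	"https://other.com/x"  -> "https://other.com/x"      (already absolute)
//	"/"                    -> "https://example.com"      (self-reference)
//	"//cdn.example.com/a"  -> "http://cdn.example.com/a" (protocol-relative)
//	"/docs"                -> "https://example.com/docs"
//	"#cite_note-77"        -> "https://example.com#cite_note-77"
//	"books"                -> "https://example.com/books"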