scrapper.go
package scrapper

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"strings"

	"github.com/andybalholm/cascadia"
	"golang.org/x/net/html"

	"github.com/danigomez/scrapper/util"
)
// mergeMap copies every entry of src into dst, namespacing each key with the
// URL it was scraped from so results from different pages do not collide.
func mergeMap(dst ScrapResult, src ScrapResult, currentUrl string) {
	for k, v := range src {
		dst[k+"@"+currentUrl] = v
	}
}
// ScrapResult maps a tag name (suffixed with "@<url>" by mergeMap) to the
// HTML nodes that matched its selector.
type ScrapResult map[string][]*html.Node

// ScrapDescriptor describes one scraping step. An empty routeSelector means
// "scrape the current page with tagSelectorMap"; otherwise routeSelector is a
// CSS selector whose matched links are fetched and scraped. next points to the
// following step, forming a linked list of steps.
type ScrapDescriptor struct {
	routeSelector  string
	tagSelectorMap map[string]string
	next           *ScrapDescriptor
}
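// For example, a minimal two-step chain could be built like this (a sketch;
// the selectors "a.article" and "h1" are illustrative placeholders only):
//
//	leaf := ScrapDescriptor{tagSelectorMap: map[string]string{"title": "h1"}}
//	root := ScrapDescriptor{
//		routeSelector:  "a.article",
//		tagSelectorMap: map[string]string{"title": "h1"},
//		next:           &leaf,
//	}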
// TODO Add the ability to define a RoadMap, i.e. to specify, via selectors,
// which URLs of the domain should be traversed, and for each of those URLs to
// define new selectors in turn to extract information.
type Scrapper struct {
	domain      string             // Domain where scraping starts
	descriptors []*ScrapDescriptor // One linked list of steps per entry point
}
// NewDescriptor builds a ScrapDescriptor from its three fields.
func NewDescriptor(routeSelector string, tagSelectorMap map[string]string, next *ScrapDescriptor) ScrapDescriptor {
	return ScrapDescriptor{routeSelector, tagSelectorMap, next}
}

// NewScrapper builds a Scrapper for the given domain and descriptor chains.
func NewScrapper(domain string, descriptors []*ScrapDescriptor) Scrapper {
	return Scrapper{domain, descriptors}
}
// DoScrap fetches the domain and walks every descriptor chain, merging all
// selector matches into a single ScrapResult.
func (s Scrapper) DoScrap() (ret ScrapResult) {
	ret = make(ScrapResult)
	response, err := http.Get(s.domain)
	if err != nil {
		log.Printf("error: could not get resource %s: %v", s.domain, err)
		return
	}
	body, err := ioutil.ReadAll(response.Body)
	response.Body.Close()
	if err != nil {
		log.Printf("error: could not read body of %s: %v", s.domain, err)
		return
	}
	// Iterate over each descriptor chain
	for _, descriptor := range s.descriptors {
		// For each descriptor, walk the linked list until we reach its end
		for descriptor != nil {
			if descriptor.routeSelector == "" {
				// No route to follow: scrape the current page directly
				mergeMap(ret, s.scrap(string(body), descriptor.tagSelectorMap), s.domain)
			} else {
				// Resolve the links matched by routeSelector, then scrape each one
				aux := s.scrap(string(body), map[string]string{"route": descriptor.routeSelector})
				for _, node := range aux["route"] {
					href := strings.Replace(util.GetValFromKey(node, "href"), "//", "http://", 1)
					if href == "" {
						fmt.Printf("There is no href for selector %s\n", descriptor.routeSelector)
						continue
					}
					fmt.Printf("Scrapping url %s\n", href)
					response, err = http.Get(href)
					if err != nil {
						log.Printf("error: could not get resource %s: %v", href, err)
						continue
					}
					body, err = ioutil.ReadAll(response.Body)
					response.Body.Close()
					if err != nil {
						log.Printf("error: could not read body of %s: %v", href, err)
						continue
					}
					mergeMap(ret, s.scrap(string(body), descriptor.tagSelectorMap), href)
				}
			}
			descriptor = descriptor.next
		}
	}
	return
}
// scrap parses rawHtml and, for each (tag, selector) pair, stores the nodes
// matched by the compiled CSS selector under the tag's key.
func (s Scrapper) scrap(rawHtml string, tagSelectorMap map[string]string) (ret ScrapResult) {
	// Initialize a new map
	ret = make(ScrapResult)
	// Convert the HTML string to its node representation
	node, err := html.Parse(strings.NewReader(rawHtml))
	if err != nil {
		log.Printf("error: could not parse html: %v", err)
		return
	}
	for tag, selector := range tagSelectorMap {
		fmt.Printf("Processing (%s, %s)\n", tag, selector)
		compiled, err := cascadia.Compile(selector)
		if err != nil {
			log.Printf("error: could not compile selector %s: %v", selector, err)
			continue
		}
		ret[tag] = compiled.MatchAll(node)
	}
	return
}
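// A minimal end-to-end usage sketch from a client package (the URL and
// selectors below are hypothetical placeholders, not part of this package):
//
//	detail := scrapper.NewDescriptor("", map[string]string{"title": "h1"}, nil)
//	entry := scrapper.NewDescriptor("a.article", map[string]string{"title": "h1"}, &detail)
//	s := scrapper.NewScrapper("http://example.com", []*scrapper.ScrapDescriptor{&entry})
//	for key, nodes := range s.DoScrap() {
//		fmt.Printf("%s matched %d nodes\n", key, len(nodes))
//	}
//
// Each key of the result carries the page it came from, e.g.
// "title@http://example.com/some-article".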