@@ -2,6 +2,7 @@ package goscraper
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"net/http"
@@ -18,10 +19,16 @@ var (
 	fragmentRegexp = regexp.MustCompile("#!(.*)")
 )
 
+type ScraperOptions struct {
+	MaxDocumentLength int64
+	UserAgent         string
+}
+
 type Scraper struct {
 	Url                *url.URL
 	EscapedFragmentUrl *url.URL
 	MaxRedirect        int
+	Options            ScraperOptions
 }
 
 type Document struct {
@@ -38,12 +45,12 @@ type DocumentPreview struct {
 	Link string
 }
 
-func Scrape(uri string, maxRedirect int) (*Document, error) {
+func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) {
 	u, err := url.Parse(uri)
 	if err != nil {
 		return nil, err
 	}
-	return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape()
+	return (&Scraper{Url: u, MaxRedirect: maxRedirect, Options: options}).Scrape()
 }
 
 func (scraper *Scraper) Scrape() (*Document, error) {
@@ -109,6 +116,16 @@ func (scraper *Scraper) toFragmentUrl() error {
 }
 
 func (scraper *Scraper) getDocument() (*Document, error) {
+	addUserAgent := func(req *http.Request) *http.Request {
+		userAgent := "GoScraper"
+		if len(scraper.Options.UserAgent) != 0 {
+			userAgent = scraper.Options.UserAgent
+		}
+		req.Header.Add("User-Agent", userAgent)
+
+		return req
+	}
+
 	scraper.MaxRedirect -= 1
 	if strings.Contains(scraper.Url.String(), "#!") {
 		scraper.toFragmentUrl()
@@ -117,11 +134,31 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 		scraper.EscapedFragmentUrl = scraper.Url
 	}
 
+	if scraper.Options.MaxDocumentLength > 0 {
+		// Check the Content-Length header first, if the server reports one; the body is capped by MaxBytesReader below regardless.
+		req, err := http.NewRequest("HEAD", scraper.getUrl(), nil)
+		if err != nil {
+			return nil, err
+		}
+		req = addUserAgent(req)
+
+		resp, err := http.DefaultClient.Do(req)
+		if resp != nil {
+			defer resp.Body.Close()
+		}
+		if err != nil {
+			return nil, err
+		}
+		if resp.ContentLength > scraper.Options.MaxDocumentLength {
+			return nil, errors.New("Content-Length exceeds the configured limit")
+		}
+	}
+
 	req, err := http.NewRequest("GET", scraper.getUrl(), nil)
 	if err != nil {
 		return nil, err
 	}
-	req.Header.Add("User-Agent", "GoScraper")
+	req = addUserAgent(req)
 
 	resp, err := http.DefaultClient.Do(req)
 	if resp != nil {
@@ -135,6 +172,11 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 		scraper.EscapedFragmentUrl = nil
 		scraper.Url = resp.Request.URL
 	}
+
+	if scraper.Options.MaxDocumentLength > 0 {
+		resp.Body = http.MaxBytesReader(nil, resp.Body, scraper.Options.MaxDocumentLength)
+	}
+
 	b, err := convertUTF8(resp.Body, resp.Header.Get("content-type"))
 	if err != nil {
 		return nil, err
@@ -197,7 +239,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 				if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" {
 					canonical = true
 				}
-				if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
+				if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
 					hasIcon = true
 				}
 				if cleanStr(attr.Key) == "href" {
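For context, here is a minimal sketch of how a caller might use the new options. Only `Scrape`, `ScraperOptions`, `MaxDocumentLength`, and `UserAgent` come from the diff above; the import path, the example URL, the user-agent string, and the chosen size limit are assumptions for illustration.

```go
package main

import (
	"fmt"
	"log"

	// Import path is an assumption; adjust it to wherever this package lives.
	"github.com/badoux/goscraper"
)

func main() {
	// MaxDocumentLength rejects pages whose Content-Length exceeds the limit
	// and caps the downloaded body; UserAgent overrides the default "GoScraper".
	opts := goscraper.ScraperOptions{
		MaxDocumentLength: 1 << 20, // ~1 MiB
		UserAgent:         "MyCrawler/1.0",
	}

	// Scrape now takes the options as a third argument, after the redirect limit.
	doc, err := goscraper.Scrape("https://example.com", 5, opts)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%+v\n", doc)
}
```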