
Commit dbd38f5

Add MaxDocumentLength and custom UserAgent support
Parent: 36995ce

1 file changed: +46 -4 lines changed


goscraper.go (+46 -4)

@@ -2,6 +2,7 @@ package goscraper
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"net/http"
@@ -18,10 +19,16 @@ var (
 	fragmentRegexp = regexp.MustCompile("#!(.*)")
 )
 
+type ScraperOptions struct {
+	MaxDocumentLength int64
+	UserAgent         string
+}
+
 type Scraper struct {
 	Url                *url.URL
 	EscapedFragmentUrl *url.URL
 	MaxRedirect        int
+	Options            ScraperOptions
 }
 
 type Document struct {
@@ -38,12 +45,12 @@ type DocumentPreview struct {
 	Link string
 }
 
-func Scrape(uri string, maxRedirect int) (*Document, error) {
+func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) {
 	u, err := url.Parse(uri)
 	if err != nil {
 		return nil, err
 	}
-	return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape()
+	return (&Scraper{Url: u, MaxRedirect: maxRedirect, Options: options}).Scrape()
}
 
 func (scraper *Scraper) Scrape() (*Document, error) {
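
With these hunks the exported entry point gains a third argument. A minimal caller sketch follows; the import path github.com/badoux/goscraper and the doc.Preview.Link field are assumed from the upstream goscraper package, and the URL, size limit, redirect count, and agent string are placeholders:

package main

import (
	"fmt"
	"log"

	"github.com/badoux/goscraper" // assumed upstream import path
)

func main() {
	opts := goscraper.ScraperOptions{
		MaxDocumentLength: 1 << 20,     // limit bodies to 1 MiB; 0 disables the check
		UserAgent:         "MyBot/1.0", // sent instead of the default "GoScraper"
	}
	// Follow at most 5 redirects; the options apply to every request made.
	doc, err := goscraper.Scrape("https://example.com", 5, opts)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(doc.Preview.Link) // Preview field assumed from upstream goscraper
}

Passing ScraperOptions by value keeps the zero value usable: an empty struct preserves the old behaviour, since the size check only runs when MaxDocumentLength > 0 and an empty UserAgent falls back to "GoScraper".
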
@@ -109,6 +116,16 @@ func (scraper *Scraper) toFragmentUrl() error {
 }
 
 func (scraper *Scraper) getDocument() (*Document, error) {
+	addUserAgent := func(req *http.Request) *http.Request {
+		userAgent := "GoScraper"
+		if len(scraper.Options.UserAgent) != 0 {
+			userAgent = scraper.Options.UserAgent
+		}
+		req.Header.Add("User-Agent", userAgent)
+
+		return req
+	}
+
 	scraper.MaxRedirect -= 1
 	if strings.Contains(scraper.Url.String(), "#!") {
 		scraper.toFragmentUrl()
@@ -117,11 +134,31 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 		scraper.EscapedFragmentUrl = scraper.Url
 	}
 
+	if scraper.Options.MaxDocumentLength > 0 {
+		// Check the Content-Length header first (if present); if it is absent, the body size is limited below instead.
+		req, err := http.NewRequest("HEAD", scraper.getUrl(), nil)
+		if err != nil {
+			return nil, err
+		}
+		req = addUserAgent(req)
+
+		resp, err := http.DefaultClient.Do(req)
+		if resp != nil {
+			defer resp.Body.Close()
+		}
+		if err != nil {
+			return nil, err
+		}
+		if resp.ContentLength > scraper.Options.MaxDocumentLength {
+			return nil, errors.New("Content-Length exceeds limit")
+		}
+	}
+
 	req, err := http.NewRequest("GET", scraper.getUrl(), nil)
 	if err != nil {
 		return nil, err
 	}
-	req.Header.Add("User-Agent", "GoScraper")
+	req = addUserAgent(req)
 
 	resp, err := http.DefaultClient.Do(req)
 	if resp != nil {
@@ -135,6 +172,11 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 		scraper.EscapedFragmentUrl = nil
 		scraper.Url = resp.Request.URL
 	}
+
+	if scraper.Options.MaxDocumentLength > 0 {
+		resp.Body = http.MaxBytesReader(nil, resp.Body, scraper.Options.MaxDocumentLength)
+	}
+
 	b, err := convertUTF8(resp.Body, resp.Header.Get("content-type"))
 	if err != nil {
 		return nil, err
@@ -197,7 +239,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 		if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" {
 			canonical = true
 		}
-		if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
+		if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
 			hasIcon = true
 		}
 		if cleanStr(attr.Key) == "href" {
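
The HEAD pre-check only helps when the server reports a length: net/http sets resp.ContentLength to -1 when the header is absent, so the comparison fails and the http.MaxBytesReader wrapper becomes the effective guard. A standalone sketch of that reader's behaviour (not part of the commit; the sample data and limit are arbitrary):

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// Stand-in for an HTTP response body that is larger than the limit.
	body := io.NopCloser(strings.NewReader("0123456789"))

	// Same call shape as the commit; a nil ResponseWriter is safe here.
	limited := http.MaxBytesReader(nil, body, 4)

	data, err := io.ReadAll(limited)
	fmt.Printf("read %q, err: %v\n", data, err)
	// Prints something like: read "0123", err: http: request body too large
}

Because a read past the limit returns an error instead of io.EOF, the convertUTF8 call in getDocument should surface oversized documents as errors rather than silently truncating them.
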
