
Commit dbd38f5

Add MaxDocumentLength and custom UserAgent support
Parent: 36995ce

1 file changed: +46 -4 lines changed


goscraper.go (+46 -4)

@@ -2,6 +2,7 @@ package goscraper
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"net/http"
@@ -18,10 +19,16 @@ var (
 	fragmentRegexp = regexp.MustCompile("#!(.*)")
 )
 
+type ScraperOptions struct {
+	MaxDocumentLength int64
+	UserAgent         string
+}
+
 type Scraper struct {
 	Url                *url.URL
 	EscapedFragmentUrl *url.URL
 	MaxRedirect        int
+	Options            ScraperOptions
 }
 
 type Document struct {
@@ -38,12 +45,12 @@ type DocumentPreview struct {
 	Link string
 }
 
-func Scrape(uri string, maxRedirect int) (*Document, error) {
+func Scrape(uri string, maxRedirect int, options ScraperOptions) (*Document, error) {
 	u, err := url.Parse(uri)
 	if err != nil {
 		return nil, err
 	}
-	return (&Scraper{Url: u, MaxRedirect: maxRedirect}).Scrape()
+	return (&Scraper{Url: u, MaxRedirect: maxRedirect, Options: options}).Scrape()
}
 
 func (scraper *Scraper) Scrape() (*Document, error) {
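
With these hunks the exported entry point gains a third argument. A minimal caller sketch follows; the import path github.com/badoux/goscraper and the doc.Preview.Link field are assumed from the upstream goscraper package, and the URL, size limit, redirect count, and agent string are placeholders:

package main

import (
	"fmt"
	"log"

	"github.com/badoux/goscraper" // assumed upstream import path
)

func main() {
	opts := goscraper.ScraperOptions{
		MaxDocumentLength: 1 << 20,     // limit bodies to 1 MiB; 0 disables the check
		UserAgent:         "MyBot/1.0", // sent instead of the default "GoScraper"
	}
	// Follow at most 5 redirects; the options apply to every request made.
	doc, err := goscraper.Scrape("https://example.com", 5, opts)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(doc.Preview.Link) // Preview field assumed from upstream goscraper
}

Passing ScraperOptions by value keeps the zero value usable: an empty struct preserves the old behaviour, since the size check only runs when MaxDocumentLength > 0 and an empty UserAgent falls back to "GoScraper".
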
@@ -109,6 +116,16 @@ func (scraper *Scraper) toFragmentUrl() error {
 }
 
 func (scraper *Scraper) getDocument() (*Document, error) {
+	addUserAgent := func(req *http.Request) *http.Request {
+		userAgent := "GoScraper"
+		if len(scraper.Options.UserAgent) != 0 {
+			userAgent = scraper.Options.UserAgent
+		}
+		req.Header.Add("User-Agent", userAgent)
+
+		return req
+	}
+
 	scraper.MaxRedirect -= 1
 	if strings.Contains(scraper.Url.String(), "#!") {
 		scraper.toFragmentUrl()
@@ -117,11 +134,31 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 		scraper.EscapedFragmentUrl = scraper.Url
 	}
 
+	if scraper.Options.MaxDocumentLength > 0 {
+		// Check the Content-Length header first (if present); if it is absent, the body size is limited below instead.
+		req, err := http.NewRequest("HEAD", scraper.getUrl(), nil)
+		if err != nil {
+			return nil, err
+		}
+		req = addUserAgent(req)
+
+		resp, err := http.DefaultClient.Do(req)
+		if resp != nil {
+			defer resp.Body.Close()
+		}
+		if err != nil {
+			return nil, err
+		}
+		if resp.ContentLength > scraper.Options.MaxDocumentLength {
+			return nil, errors.New("Content-Length exceeds limit")
+		}
+	}
+
 	req, err := http.NewRequest("GET", scraper.getUrl(), nil)
 	if err != nil {
 		return nil, err
 	}
-	req.Header.Add("User-Agent", "GoScraper")
+	req = addUserAgent(req)
 
 	resp, err := http.DefaultClient.Do(req)
 	if resp != nil {
@@ -135,6 +172,11 @@ func (scraper *Scraper) getDocument() (*Document, error) {
 		scraper.EscapedFragmentUrl = nil
 		scraper.Url = resp.Request.URL
 	}
+
+	if scraper.Options.MaxDocumentLength > 0 {
+		resp.Body = http.MaxBytesReader(nil, resp.Body, scraper.Options.MaxDocumentLength)
+	}
+
 	b, err := convertUTF8(resp.Body, resp.Header.Get("content-type"))
 	if err != nil {
 		return nil, err
@@ -197,7 +239,7 @@ func (scraper *Scraper) parseDocument(doc *Document) error {
 		if cleanStr(attr.Key) == "rel" && cleanStr(attr.Val) == "canonical" {
 			canonical = true
 		}
-		if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
+		if cleanStr(attr.Key) == "rel" && strings.Contains(cleanStr(attr.Val), "icon") {
 			hasIcon = true
 		}
 		if cleanStr(attr.Key) == "href" {
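
The HEAD pre-check only helps when the server reports a length: net/http sets resp.ContentLength to -1 when the header is absent, so the comparison fails and the http.MaxBytesReader wrapper becomes the effective guard. A standalone sketch of that reader's behaviour (not part of the commit; the sample data and limit are arbitrary):

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// Stand-in for an HTTP response body that is larger than the limit.
	body := io.NopCloser(strings.NewReader("0123456789"))

	// Same call shape as the commit; a nil ResponseWriter is safe here.
	limited := http.MaxBytesReader(nil, body, 4)

	data, err := io.ReadAll(limited)
	fmt.Printf("read %q, err: %v\n", data, err)
	// Prints something like: read "0123", err: http: request body too large
}

Because a read past the limit returns an error instead of io.EOF, the convertUTF8 call in getDocument should surface oversized documents as errors rather than silently truncating them.
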
