Skip to content

Commit

Permalink
add: GetURLsFromJSON error logging
Browse files (browse the repository at this point in the history)
  • Loading branch information
CorentinB committed Aug 21, 2024
1 parent 2893638 commit 75ce3ca
Showing 1 changed file with 19 additions and 6 deletions.
25 changes: 19 additions & 6 deletions internal/pkg/crawl/assets.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
+var URL = utils.URLToString(item.URL)

// Execute plugins on the response
if strings.Contains(base.Host, "cloudflarestream.com") {
Expand All @@ -31,8 +32,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) {
dataItem, exists := item.Attr("data-item")
if exists {
-URLsFromJSON, _ := extractor.GetURLsFromJSON(dataItem)
-rawAssets = append(rawAssets, URLsFromJSON...)
+URLsFromJSON, err := extractor.GetURLsFromJSON(dataItem)
+if err != nil {
+c.Log.Error("unable to extract URLs from JSON in data-item attribute", "error", err, "url", URL)
+} else {
+rawAssets = append(rawAssets, URLsFromJSON...)
+}
}
})

Expand Down Expand Up @@ -137,8 +142,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
scriptType, exists := item.Attr("type")
if exists {
if scriptType == "application/json" {
-URLsFromJSON, _ := extractor.GetURLsFromJSON(item.Text())
-rawAssets = append(rawAssets, URLsFromJSON...)
+URLsFromJSON, err := extractor.GetURLsFromJSON(item.Text())
+if err != nil {
+c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
+} else {
+rawAssets = append(rawAssets, URLsFromJSON...)
+}
}
}

Expand Down Expand Up @@ -185,8 +194,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
}

if len(jsonContent[1]) > payloadEndPosition {
-URLsFromJSON, _ := extractor.GetURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
-rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
+URLsFromJSON, err := extractor.GetURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
+if err != nil {
+c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
+} else {
+rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
+}
}
}
}
Expand Down

0 comments on commit 75ce3ca

Please sign in to comment.