Merge pull request #137 from internetarchive/reddit
Add custom code for Reddit archiving
CorentinB authored Aug 21, 2024
2 parents be08a01 + 75ce3ca commit cb3b9db
Showing 7 changed files with 278 additions and 90 deletions.
26 changes: 20 additions & 6 deletions internal/pkg/crawl/assets.go
@@ -6,13 +6,15 @@ import (
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
)

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
var URL = utils.URLToString(item.URL)

// Execute plugins on the response
if strings.Contains(base.Host, "cloudflarestream.com") {
@@ -30,8 +32,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) {
dataItem, exists := item.Attr("data-item")
if exists {
URLsFromJSON, _ := getURLsFromJSON(dataItem)
rawAssets = append(rawAssets, URLsFromJSON...)
URLsFromJSON, err := extractor.GetURLsFromJSON(dataItem)
if err != nil {
c.Log.Error("unable to extract URLs from JSON in data-item attribute", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
}
})

@@ -136,8 +142,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
scriptType, exists := item.Attr("type")
if exists {
if scriptType == "application/json" {
URLsFromJSON, _ := getURLsFromJSON(item.Text())
rawAssets = append(rawAssets, URLsFromJSON...)
URLsFromJSON, err := extractor.GetURLsFromJSON(item.Text())
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
}
}

@@ -184,8 +194,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
}

if len(jsonContent[1]) > payloadEndPosition {
URLsFromJSON, _ := getURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
URLsFromJSON, err := extractor.GetURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
}
}
}
}
100 changes: 17 additions & 83 deletions internal/pkg/crawl/capture.go
@@ -1,7 +1,6 @@
package crawl

import (
"encoding/json"
"errors"
"io"
"net/http"
@@ -12,7 +11,7 @@ import (
"time"

"github.com/PuerkitoBio/goquery"
"github.com/clbanning/mxj/v2"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn"
@@ -224,6 +223,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
var (
resp *http.Response
waitGroup sync.WaitGroup
assets []*url.URL
)

defer func(i *queue.Item) {
@@ -390,53 +390,20 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

// If the response is a JSON document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "json") {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading JSON body")
return err
}

outlinksFromJSON, err := getURLsFromJSON(string(jsonBody))
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting URLs from JSON")
return err
}

waitGroup.Add(1)
go c.queueOutlinks(utils.MakeAbsolute(item.URL, utils.StringSliceToURLSlice(outlinksFromJSON)), item, &waitGroup)

return err
}

// If the response is an XML document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
xmlBody, err := io.ReadAll(resp.Body)
assets, err = extractor.XML(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading XML body")
return err
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
}

mv, err := mxj.NewMapXml(xmlBody)
} else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
assets, err = extractor.JSON(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing XML body")
return err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
discovered = append(discovered, value.(string))
}
}
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON")
}
}

// If the response isn't a text/*, we do not scrape it.
// We also aren't going to scrape if assets and outlinks are turned off.
if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
// Enforce reading all data from the response for WARC writing
} else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
// If the response isn't a text/*, we do not scrape it.
// We also aren't going to scrape if assets and outlinks are turned off.
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
@@ -526,11 +493,13 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

// Extract and capture assets
assets, err := c.extractAssets(base, item, doc)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
return err
// Extract and capture assets (only if we didn't use an extractor that produces assets)
if len(assets) == 0 {
assets, err = c.extractAssets(base, item, doc)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
return err
}
}

// If we didn't find any assets, let's stop here
@@ -649,38 +618,3 @@ func (c *Crawl) Capture(item *queue.Item) error {
swg.Wait()
return err
}

func getURLsFromJSON(jsonString string) ([]string, error) {
var data interface{}
err := json.Unmarshal([]byte(jsonString), &data)
if err != nil {
return nil, err
}

links := make([]string, 0)
findURLs(data, &links)

return links, nil
}

func findURLs(data interface{}, links *[]string) {
switch v := data.(type) {
case string:
if isValidURL(v) {
*links = append(*links, v)
}
case []interface{}:
for _, element := range v {
findURLs(element, links)
}
case map[string]interface{}:
for _, value := range v {
findURLs(value, links)
}
}
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
}
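
For readability, here is a rough sketch of the resulting Content-Type dispatch in Capture after this change, reconstructed from the interleaved hunk above (not the verbatim file); the names resp, assets, err, item, and c come from the surrounding function:

    // Reconstructed sketch: XML and JSON responses now go through the shared
    // extractor package, and any URLs they yield are treated as assets.
    if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
        assets, err = extractor.XML(resp)
        if err != nil {
            c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
        }
    } else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
        assets, err = extractor.JSON(resp)
        if err != nil {
            c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON")
        }
    } else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
        // Non-text responses (or crawls with assets and outlinks disabled) are not
        // scraped, but the body is still read in full so it gets written to the WARC.
        _, err := io.Copy(io.Discard, resp.Body)
        if err != nil {
            c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
        }
    }

Later in the function, c.extractAssets only runs when this dispatch produced no assets (the len(assets) == 0 guard shown further down).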
64 changes: 64 additions & 0 deletions internal/pkg/crawl/extractor/json.go
@@ -0,0 +1,64 @@
package extractor

import (
"encoding/json"
"io"
"net/http"
"net/url"
)

func JSON(resp *http.Response) (URLs []*url.URL, err error) {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

rawURLs, err := GetURLsFromJSON(string(jsonBody))
if err != nil {
return nil, err
}

for _, rawURL := range rawURLs {
URL, err := url.Parse(rawURL)
if err == nil {
URLs = append(URLs, URL)
}
}

return URLs, err
}

func GetURLsFromJSON(jsonString string) ([]string, error) {
var data interface{}
err := json.Unmarshal([]byte(jsonString), &data)
if err != nil {
return nil, err
}

links := make([]string, 0)
findURLs(data, &links)

return links, nil
}

func findURLs(data interface{}, links *[]string) {
switch v := data.(type) {
case string:
if isValidURL(v) {
*links = append(*links, v)
}
case []interface{}:
for _, element := range v {
findURLs(element, links)
}
case map[string]interface{}:
for _, value := range v {
findURLs(value, links)
}
}
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
}
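
A minimal usage sketch for the new JSON extractor (illustrative only: the package lives under internal/, so it only compiles from within the Zeno module, and the Reddit-style media URL is a made-up example):

    package main

    import (
        "fmt"
        "io"
        "log"
        "net/http"
        "strings"

        "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
    )

    func main() {
        // Wrap a JSON payload in an http.Response, as the crawler would receive it.
        resp := &http.Response{
            Body: io.NopCloser(strings.NewReader(`{"media": {"fallback_url": "https://v.redd.it/abc123/DASH_720.mp4"}}`)),
        }

        // JSON reads the body, walks every string value, and returns the ones
        // that parse as absolute URLs.
        URLs, err := extractor.JSON(resp)
        if err != nil {
            log.Fatal(err)
        }

        for _, u := range URLs {
            fmt.Println(u.String()) // https://v.redd.it/abc123/DASH_720.mp4
        }
    }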
91 changes: 91 additions & 0 deletions internal/pkg/crawl/extractor/json_test.go
@@ -0,0 +1,91 @@
package extractor

import (
"bytes"
"io"
"net/http"
"net/url"
"reflect"
"sort"
"testing"
)

func TestJSON(t *testing.T) {
tests := []struct {
name string
jsonBody string
wantURLs []*url.URL
wantErr bool
}{
{
name: "Valid JSON with URLs",
jsonBody: `{"url": "https://example.com", "nested": {"link": "http://test.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
},
wantErr: false,
},
{
name: "Invalid JSON",
jsonBody: `{"url": "https://example.com"`,
wantURLs: nil,
wantErr: true,
},
{
name: "JSON with no URLs",
jsonBody: `{"key": "value", "number": 42}`,
wantURLs: nil,
wantErr: false,
},
{
name: "JSON with URLs in various fields",
jsonBody: `{"someField": "https://example.com", "otherField": "http://test.com", "nested": {"deepLink": "https://deep.example.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
{Scheme: "https", Host: "deep.example.com"},
},
wantErr: false,
},
{
name: "JSON with array of URLs",
jsonBody: `{"links": ["https://example1.com", "https://example2.com"]}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example1.com"},
{Scheme: "https", Host: "example2.com"},
},
wantErr: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
resp := &http.Response{
Body: io.NopCloser(bytes.NewBufferString(tt.jsonBody)),
}

gotURLs, err := JSON(resp)

if (err != nil) != tt.wantErr {
t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr)
return
}

// Sort both slices before comparison
sortURLs(gotURLs)
sortURLs(tt.wantURLs)

if !reflect.DeepEqual(gotURLs, tt.wantURLs) {
t.Errorf("JSON() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
}
})
}
}

// Helper function to sort URL slices
func sortURLs(urls []*url.URL) {
sort.Slice(urls, func(i, j int) bool {
return urls[i].String() < urls[j].String()
})
}
35 changes: 35 additions & 0 deletions internal/pkg/crawl/extractor/xml.go
@@ -0,0 +1,35 @@
package extractor

import (
"io"
"net/http"
"net/url"
"strings"

"github.com/clbanning/mxj/v2"
)

func XML(resp *http.Response) (URLs []*url.URL, err error) {
xmlBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
return nil, err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
URL, err := url.Parse(value.(string))
if err == nil {
URLs = append(URLs, URL)
}
}
}
}

return URLs, nil
}
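
And an equivalent sketch for the XML extractor (same caveats as the JSON example above; the sitemap-style body is just an illustration):

    package main

    import (
        "fmt"
        "io"
        "log"
        "net/http"
        "strings"

        "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
    )

    func main() {
        body := `<urlset><url><loc>https://example.com/page-1</loc></url><url><loc>https://example.com/page-2</loc></url></urlset>`
        resp := &http.Response{Body: io.NopCloser(strings.NewReader(body))}

        // XML flattens the document with mxj and keeps every leaf value that
        // starts with "http" and parses as a URL.
        URLs, err := extractor.XML(resp)
        if err != nil {
            log.Fatal(err)
        }
        for _, u := range URLs {
            fmt.Println(u.String())
        }
    }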