Merge pull request #137 from internetarchive/reddit
Add custom code for Reddit archiving
CorentinB authored Aug 21, 2024
2 parents be08a01 + 75ce3ca commit cb3b9db
Showing 7 changed files with 278 additions and 90 deletions.
26 changes: 20 additions & 6 deletions internal/pkg/crawl/assets.go
@@ -6,13 +6,15 @@ import (
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
)

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
var URL = utils.URLToString(item.URL)

// Execute plugins on the response
if strings.Contains(base.Host, "cloudflarestream.com") {
@@ -30,8 +32,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) {
dataItem, exists := item.Attr("data-item")
if exists {
URLsFromJSON, _ := getURLsFromJSON(dataItem)
rawAssets = append(rawAssets, URLsFromJSON...)
URLsFromJSON, err := extractor.GetURLsFromJSON(dataItem)
if err != nil {
c.Log.Error("unable to extract URLs from JSON in data-item attribute", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
}
})

@@ -136,8 +142,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
scriptType, exists := item.Attr("type")
if exists {
if scriptType == "application/json" {
URLsFromJSON, _ := getURLsFromJSON(item.Text())
rawAssets = append(rawAssets, URLsFromJSON...)
URLsFromJSON, err := extractor.GetURLsFromJSON(item.Text())
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
}
}

@@ -184,8 +194,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
}

if len(jsonContent[1]) > payloadEndPosition {
URLsFromJSON, _ := getURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
URLsFromJSON, err := extractor.GetURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
}
}
}
}
100 changes: 17 additions & 83 deletions internal/pkg/crawl/capture.go
@@ -1,7 +1,6 @@
package crawl

import (
"encoding/json"
"errors"
"io"
"net/http"
@@ -12,7 +11,7 @@ import (
"time"

"github.com/PuerkitoBio/goquery"
"github.com/clbanning/mxj/v2"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn"
@@ -224,6 +223,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
var (
resp *http.Response
waitGroup sync.WaitGroup
assets []*url.URL
)

defer func(i *queue.Item) {
@@ -390,53 +390,20 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

// If the response is a JSON document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "json") {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading JSON body")
return err
}

outlinksFromJSON, err := getURLsFromJSON(string(jsonBody))
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting URLs from JSON")
return err
}

waitGroup.Add(1)
go c.queueOutlinks(utils.MakeAbsolute(item.URL, utils.StringSliceToURLSlice(outlinksFromJSON)), item, &waitGroup)

return err
}

// If the response is an XML document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
xmlBody, err := io.ReadAll(resp.Body)
assets, err = extractor.XML(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading XML body")
return err
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
}

mv, err := mxj.NewMapXml(xmlBody)
} else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
assets, err = extractor.JSON(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing XML body")
return err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
discovered = append(discovered, value.(string))
}
}
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON")
}
}

// If the response isn't a text/*, we do not scrape it.
// We also aren't going to scrape if assets and outlinks are turned off.
if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
// Enforce reading all data from the response for WARC writing
} else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
// If the response isn't a text/*, we do not scrape it.
// We also aren't going to scrape if assets and outlinks are turned off.
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
@@ -526,11 +493,13 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

// Extract and capture assets
assets, err := c.extractAssets(base, item, doc)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
return err
// Extract and capture assets (only if we didn't use an extractor that produces assets)
if len(assets) == 0 {
assets, err = c.extractAssets(base, item, doc)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
return err
}
}

// If we didn't find any assets, let's stop here
@@ -649,38 +618,3 @@ func (c *Crawl) Capture(item *queue.Item) error {
swg.Wait()
return err
}

func getURLsFromJSON(jsonString string) ([]string, error) {
var data interface{}
err := json.Unmarshal([]byte(jsonString), &data)
if err != nil {
return nil, err
}

links := make([]string, 0)
findURLs(data, &links)

return links, nil
}

func findURLs(data interface{}, links *[]string) {
switch v := data.(type) {
case string:
if isValidURL(v) {
*links = append(*links, v)
}
case []interface{}:
for _, element := range v {
findURLs(element, links)
}
case map[string]interface{}:
for _, value := range v {
findURLs(value, links)
}
}
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
}
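
For readability, here is a rough sketch of the resulting Content-Type dispatch in Capture after this change, reconstructed from the interleaved hunk above (not the verbatim file); the names resp, assets, err, item, and c come from the surrounding function:

    // Reconstructed sketch: XML and JSON responses now go through the shared
    // extractor package, and any URLs they yield are treated as assets.
    if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
        assets, err = extractor.XML(resp)
        if err != nil {
            c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
        }
    } else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
        assets, err = extractor.JSON(resp)
        if err != nil {
            c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON")
        }
    } else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
        // Non-text responses (or crawls with assets and outlinks disabled) are not
        // scraped, but the body is still read in full so it gets written to the WARC.
        _, err := io.Copy(io.Discard, resp.Body)
        if err != nil {
            c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
        }
    }

Later in the function, c.extractAssets only runs when this dispatch produced no assets (the len(assets) == 0 guard shown further down).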
64 changes: 64 additions & 0 deletions internal/pkg/crawl/extractor/json.go
@@ -0,0 +1,64 @@
package extractor

import (
"encoding/json"
"io"
"net/http"
"net/url"
)

func JSON(resp *http.Response) (URLs []*url.URL, err error) {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

rawURLs, err := GetURLsFromJSON(string(jsonBody))
if err != nil {
return nil, err
}

for _, rawURL := range rawURLs {
URL, err := url.Parse(rawURL)
if err == nil {
URLs = append(URLs, URL)
}
}

return URLs, err
}

func GetURLsFromJSON(jsonString string) ([]string, error) {
var data interface{}
err := json.Unmarshal([]byte(jsonString), &data)
if err != nil {
return nil, err
}

links := make([]string, 0)
findURLs(data, &links)

return links, nil
}

func findURLs(data interface{}, links *[]string) {
switch v := data.(type) {
case string:
if isValidURL(v) {
*links = append(*links, v)
}
case []interface{}:
for _, element := range v {
findURLs(element, links)
}
case map[string]interface{}:
for _, value := range v {
findURLs(value, links)
}
}
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
}
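
A minimal usage sketch for the new JSON extractor (illustrative only: the package lives under internal/, so it only compiles from within the Zeno module, and the Reddit-style media URL is a made-up example):

    package main

    import (
        "fmt"
        "io"
        "log"
        "net/http"
        "strings"

        "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
    )

    func main() {
        // Wrap a JSON payload in an http.Response, as the crawler would receive it.
        resp := &http.Response{
            Body: io.NopCloser(strings.NewReader(`{"media": {"fallback_url": "https://v.redd.it/abc123/DASH_720.mp4"}}`)),
        }

        // JSON reads the body, walks every string value, and returns the ones
        // that parse as absolute URLs.
        URLs, err := extractor.JSON(resp)
        if err != nil {
            log.Fatal(err)
        }

        for _, u := range URLs {
            fmt.Println(u.String()) // https://v.redd.it/abc123/DASH_720.mp4
        }
    }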
91 changes: 91 additions & 0 deletions internal/pkg/crawl/extractor/json_test.go
@@ -0,0 +1,91 @@
package extractor

import (
"bytes"
"io"
"net/http"
"net/url"
"reflect"
"sort"
"testing"
)

func TestJSON(t *testing.T) {
tests := []struct {
name string
jsonBody string
wantURLs []*url.URL
wantErr bool
}{
{
name: "Valid JSON with URLs",
jsonBody: `{"url": "https://example.com", "nested": {"link": "http://test.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
},
wantErr: false,
},
{
name: "Invalid JSON",
jsonBody: `{"url": "https://example.com"`,
wantURLs: nil,
wantErr: true,
},
{
name: "JSON with no URLs",
jsonBody: `{"key": "value", "number": 42}`,
wantURLs: nil,
wantErr: false,
},
{
name: "JSON with URLs in various fields",
jsonBody: `{"someField": "https://example.com", "otherField": "http://test.com", "nested": {"deepLink": "https://deep.example.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
{Scheme: "https", Host: "deep.example.com"},
},
wantErr: false,
},
{
name: "JSON with array of URLs",
jsonBody: `{"links": ["https://example1.com", "https://example2.com"]}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example1.com"},
{Scheme: "https", Host: "example2.com"},
},
wantErr: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
resp := &http.Response{
Body: io.NopCloser(bytes.NewBufferString(tt.jsonBody)),
}

gotURLs, err := JSON(resp)

if (err != nil) != tt.wantErr {
t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr)
return
}

// Sort both slices before comparison
sortURLs(gotURLs)
sortURLs(tt.wantURLs)

if !reflect.DeepEqual(gotURLs, tt.wantURLs) {
t.Errorf("JSON() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
}
})
}
}

// Helper function to sort URL slices
func sortURLs(urls []*url.URL) {
sort.Slice(urls, func(i, j int) bool {
return urls[i].String() < urls[j].String()
})
}
35 changes: 35 additions & 0 deletions internal/pkg/crawl/extractor/xml.go
@@ -0,0 +1,35 @@
package extractor

import (
"io"
"net/http"
"net/url"
"strings"

"github.com/clbanning/mxj/v2"
)

func XML(resp *http.Response) (URLs []*url.URL, err error) {
xmlBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
return nil, err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
URL, err := url.Parse(value.(string))
if err == nil {
URLs = append(URLs, URL)
}
}
}
}

return URLs, nil
}
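
And an equivalent sketch for the XML extractor (same caveats as the JSON example above; the sitemap-style body is just an illustration):

    package main

    import (
        "fmt"
        "io"
        "log"
        "net/http"
        "strings"

        "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
    )

    func main() {
        body := `<urlset><url><loc>https://example.com/page-1</loc></url><url><loc>https://example.com/page-2</loc></url></urlset>`
        resp := &http.Response{Body: io.NopCloser(strings.NewReader(body))}

        // XML flattens the document with mxj and keeps every leaf value that
        // starts with "http" and parses as a URL.
        URLs, err := extractor.XML(resp)
        if err != nil {
            log.Fatal(err)
        }
        for _, u := range URLs {
            fmt.Println(u.String())
        }
    }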