-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #137 from internetarchive/reddit
Add custom code for Reddit archiving
- Loading branch information
Showing 7 changed files with 278 additions and 90 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
package extractor | ||
|
||
import ( | ||
"encoding/json" | ||
"io" | ||
"net/http" | ||
"net/url" | ||
) | ||
|
||
func JSON(resp *http.Response) (URLs []*url.URL, err error) { | ||
jsonBody, err := io.ReadAll(resp.Body) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
rawURLs, err := GetURLsFromJSON(string(jsonBody)) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
for _, rawURL := range rawURLs { | ||
URL, err := url.Parse(rawURL) | ||
if err == nil { | ||
URLs = append(URLs, URL) | ||
} | ||
} | ||
|
||
return URLs, err | ||
} | ||
|
||
func GetURLsFromJSON(jsonString string) ([]string, error) { | ||
var data interface{} | ||
err := json.Unmarshal([]byte(jsonString), &data) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
links := make([]string, 0) | ||
findURLs(data, &links) | ||
|
||
return links, nil | ||
} | ||
|
||
func findURLs(data interface{}, links *[]string) { | ||
switch v := data.(type) { | ||
case string: | ||
if isValidURL(v) { | ||
*links = append(*links, v) | ||
} | ||
case []interface{}: | ||
for _, element := range v { | ||
findURLs(element, links) | ||
} | ||
case map[string]interface{}: | ||
for _, value := range v { | ||
findURLs(value, links) | ||
} | ||
} | ||
} | ||
|
||
// isValidURL reports whether str parses as a URL that carries both a scheme
// and a host. Strings that url.Parse rejects are reported as invalid.
func isValidURL(str string) bool {
	parsed, err := url.Parse(str)
	if err != nil {
		return false
	}
	if parsed.Scheme == "" {
		return false
	}
	return parsed.Host != ""
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
package extractor | ||
|
||
import ( | ||
"bytes" | ||
"io" | ||
"net/http" | ||
"net/url" | ||
"reflect" | ||
"sort" | ||
"testing" | ||
) | ||
|
||
func TestJSON(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
jsonBody string | ||
wantURLs []*url.URL | ||
wantErr bool | ||
}{ | ||
{ | ||
name: "Valid JSON with URLs", | ||
jsonBody: `{"url": "https://example.com", "nested": {"link": "http://test.com"}}`, | ||
wantURLs: []*url.URL{ | ||
{Scheme: "https", Host: "example.com"}, | ||
{Scheme: "http", Host: "test.com"}, | ||
}, | ||
wantErr: false, | ||
}, | ||
{ | ||
name: "Invalid JSON", | ||
jsonBody: `{"url": "https://example.com"`, | ||
wantURLs: nil, | ||
wantErr: true, | ||
}, | ||
{ | ||
name: "JSON with no URLs", | ||
jsonBody: `{"key": "value", "number": 42}`, | ||
wantURLs: nil, | ||
wantErr: false, | ||
}, | ||
{ | ||
name: "JSON with URLs in various fields", | ||
jsonBody: `{"someField": "https://example.com", "otherField": "http://test.com", "nested": {"deepLink": "https://deep.example.com"}}`, | ||
wantURLs: []*url.URL{ | ||
{Scheme: "https", Host: "example.com"}, | ||
{Scheme: "http", Host: "test.com"}, | ||
{Scheme: "https", Host: "deep.example.com"}, | ||
}, | ||
wantErr: false, | ||
}, | ||
{ | ||
name: "JSON with array of URLs", | ||
jsonBody: `{"links": ["https://example1.com", "https://example2.com"]}`, | ||
wantURLs: []*url.URL{ | ||
{Scheme: "https", Host: "example1.com"}, | ||
{Scheme: "https", Host: "example2.com"}, | ||
}, | ||
wantErr: false, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
resp := &http.Response{ | ||
Body: io.NopCloser(bytes.NewBufferString(tt.jsonBody)), | ||
} | ||
|
||
gotURLs, err := JSON(resp) | ||
|
||
if (err != nil) != tt.wantErr { | ||
t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr) | ||
return | ||
} | ||
|
||
// Sort both slices before comparison | ||
sortURLs(gotURLs) | ||
sortURLs(tt.wantURLs) | ||
|
||
if !reflect.DeepEqual(gotURLs, tt.wantURLs) { | ||
t.Errorf("JSON() gotURLs = %v, want %v", gotURLs, tt.wantURLs) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
// sortURLs sorts urls in place in ascending order of each URL's String()
// form, so that two URL slices can be compared independently of their
// original order.
func sortURLs(urls []*url.URL) {
	byString := func(a, b int) bool {
		left, right := urls[a].String(), urls[b].String()
		return left < right
	}
	sort.Slice(urls, byString)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package extractor | ||
|
||
import ( | ||
"io" | ||
"net/http" | ||
"net/url" | ||
"strings" | ||
|
||
"github.com/clbanning/mxj/v2" | ||
) | ||
|
||
func XML(resp *http.Response) (URLs []*url.URL, err error) { | ||
xmlBody, err := io.ReadAll(resp.Body) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
mv, err := mxj.NewMapXml(xmlBody) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
for _, value := range mv.LeafValues() { | ||
if _, ok := value.(string); ok { | ||
if strings.HasPrefix(value.(string), "http") { | ||
URL, err := url.Parse(value.(string)) | ||
if err == nil { | ||
URLs = append(URLs, URL) | ||
} | ||
} | ||
} | ||
} | ||
|
||
return URLs, nil | ||
} |
Oops, something went wrong.