-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Basic GetLinks() functionality added
- Loading branch information
Showing
6 changed files
with
320 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
package katsuragi | ||
|
||
import ( | ||
"fmt" | ||
"net/url" | ||
"strings" | ||
|
||
"golang.org/x/net/html" | ||
) | ||
|
||
func (f *Fetcher) GetLinks(props GetLinksProps) ([]string, error) { | ||
|
||
// Set default category to "all" | ||
if props.Category == "" { | ||
props.Category = "all" | ||
} | ||
|
||
htmlres, err := retrieveHTML(props.Url, f) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
var links []string | ||
var traverse func(*html.Node) | ||
|
||
traverse = func(n *html.Node) { | ||
if n.Type == html.ElementNode && n.Data == "a" { | ||
attrMap := extractAttributes(n.Attr) | ||
if href, found := attrMap["href"]; found { | ||
isValid := validateUrl(href) | ||
newhref := ensureAbsoluteURL(href, props.Url); | ||
if props.Category == "all" { | ||
|
||
if !contains(links, newhref) && isValid { | ||
links = append(links, newhref) | ||
} | ||
} else if props.Category == "internal" { | ||
isInternal := IsInternalURL(newhref, props.Url) | ||
if isInternal && isValid { | ||
if !contains(links, newhref) { | ||
links = append(links, newhref) | ||
} | ||
} | ||
} else if props.Category == "external" { | ||
isInternal := IsInternalURL(newhref, props.Url) | ||
if !isInternal && isValid { | ||
if !contains(links, newhref) { | ||
links = append(links, newhref) | ||
} | ||
} | ||
} | ||
} | ||
} | ||
for c := n.FirstChild; c != nil; c = c.NextSibling { | ||
traverse(c) | ||
} | ||
|
||
} | ||
traverse(htmlres) | ||
if len(links) == 0 { | ||
return nil, fmt.Errorf("GetLinks failed to find any links in HTML") | ||
} | ||
return links, nil | ||
} | ||
|
||
// IsInternalURL reports whether href points at the same host as urlStr.
//
// href is resolved against urlStr first, so relative references such as
// "/about" or "page.html" are treated as internal. Hosts are compared
// case-insensitively and include the port when one is present.
//
// It returns false when either argument fails to parse, when urlStr has no
// host to compare against, or when the resolved link has no host (for
// example "mailto:" links).
//
// The decision is based on the parsed host only. A substring match between
// the two strings is deliberately not used, because a link that merely embeds
// the base URL (in a query string, or as a prefix of a longer host name)
// would otherwise be misclassified as internal.
func IsInternalURL(href, urlStr string) bool {
	// Parse the found link.
	link, err := url.Parse(href)
	if err != nil {
		return false
	}
	// Parse the original URL.
	base, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	// Without a base host there is nothing meaningful to compare against.
	if base.Host == "" {
		return false
	}

	// Resolving makes relative references inherit the base host, while
	// absolute links keep their own.
	resolved := base.ResolveReference(link)
	if resolved.Host == "" {
		return false
	}

	// Check if the URL is from the same domain.
	return strings.EqualFold(resolved.Host, base.Host)
}
|
||
|
||
// validateUrl reports whether urlStr is accepted by url.ParseRequestURI,
// i.e. whether it is an absolute URL or an absolute path. Empty strings,
// scheme-less relative paths such as "a/b" and strings with a malformed
// scheme are rejected.
//
// The function has no side effects: it does not log or print the parse error.
func validateUrl(urlStr string) bool {
	_, err := url.ParseRequestURI(urlStr)
	return err == nil
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
package katsuragi | ||
|
||
import ( | ||
"fmt" | ||
"net/http" | ||
"net/http/httptest" | ||
"strings" | ||
"testing" | ||
) | ||
|
||
func TestGetLinks(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
category string | ||
url string | ||
responseBody func(serverURL string) string // Function to generate response body dynamically | ||
expectedErr string | ||
expectedLinks []string | ||
}{ | ||
{ | ||
name: "all", | ||
category: "all", | ||
responseBody: func(serverURL string) string { | ||
return fmt.Sprintf(`<html><body> | ||
<a href="%s/internal1">Internal 1</a> | ||
<a href="%s/internal2">Internal 2</a> | ||
<a href="http://external.com">External</a> | ||
</body></html>`, serverURL, serverURL) | ||
}, | ||
expectedErr: "", | ||
expectedLinks: []string{"<serverURL>/internal1", "<serverURL>/internal2", "http://external.com"}, | ||
}, | ||
{ | ||
name: "internal", | ||
category: "internal", | ||
responseBody: func(serverURL string) string { | ||
return fmt.Sprintf(`<html><body> | ||
<a href="%s/internal1">Internal 1</a> | ||
<a href="%s/internal2">Internal 2</a> | ||
</body></html>`, serverURL, serverURL) | ||
}, | ||
expectedErr: "", | ||
expectedLinks: []string{"<serverURL>/internal1", "<serverURL>/internal2"}, | ||
}, | ||
{ | ||
name: "external", | ||
category: "external", | ||
responseBody: func(serverURL string) string { | ||
return fmt.Sprintf(`<html><body> | ||
<a href="%s/internal1">Internal 1</a> | ||
<a href="http://external.com">External</a> | ||
</body></html>`, serverURL) | ||
}, | ||
expectedErr: "", | ||
expectedLinks: []string{"http://external.com"}, | ||
}, | ||
{ | ||
name: "no category", | ||
category: "", | ||
responseBody: func(serverURL string) string { | ||
return fmt.Sprintf(`<html><body> | ||
<a href="%s/internal1">Internal 1</a> | ||
<a href="%s/internal2">Internal 2</a> | ||
<a href="http://external.com">External</a> | ||
</body></html>`, serverURL, serverURL) | ||
}, | ||
expectedErr: "", | ||
expectedLinks: []string{"<serverURL>/internal1", "<serverURL>/internal2", "http://external.com"}, | ||
}, | ||
// bad url | ||
{ | ||
name: "bad url", | ||
category: "all", | ||
url: "http:/", | ||
responseBody: func(serverURL string) string { | ||
return "" | ||
}, | ||
expectedErr: "Get \"http:/\": http: no Host in request URL", | ||
expectedLinks: []string{}, | ||
}, | ||
// broken link in html | ||
{ | ||
name: "good and invalid links in html", | ||
category: "all", | ||
responseBody: func(serverURL string) string { | ||
return fmt.Sprintf(`<html><body> | ||
<a href=":/|%s/internal1">Internal 1</a> | ||
<a href=":htpexternal.com">External</a> | ||
<a href="http://external2.com">External</a> | ||
<a href="/test">External</a> | ||
</body></html>`, serverURL) | ||
}, | ||
expectedErr: "", | ||
expectedLinks: []string{"http://external2.com", "<serverURL>/test"}, | ||
}, | ||
// no links in html | ||
{ | ||
name: "no links in html", | ||
category: "all", | ||
responseBody: func(serverURL string) string { | ||
return "<html><body></body></html>" | ||
}, | ||
expectedErr: "GetLinks failed to find any links in HTML", | ||
expectedLinks: []string{}, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
var server *httptest.Server | ||
server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { | ||
fmt.Fprint(w, tt.responseBody(server.URL)) | ||
})) | ||
defer server.Close() | ||
|
||
// Replace "<serverURL>" in expectedLinks with the actual server.URL before assertions | ||
for i, link := range tt.expectedLinks { | ||
tt.expectedLinks[i] = strings.Replace(link, "<serverURL>", server.URL, -1) | ||
} | ||
|
||
fetcher := NewFetcher(&FetcherProps{Timeout: 3000, CacheCap: 10}) | ||
|
||
var links []string | ||
var err error | ||
|
||
if tt.url != "" { | ||
links, err = fetcher.GetLinks(GetLinksProps{Url: tt.url, Category: tt.category}) | ||
} else { | ||
links, err = fetcher.GetLinks(GetLinksProps{Url: server.URL, Category: tt.category}) | ||
} | ||
|
||
fmt.Println("Server URL: ", server.URL) | ||
|
||
// Test assertions follow | ||
if err != nil && tt.expectedErr == "" { | ||
t.Errorf("Expected no error, got %v", err) | ||
} | ||
if err == nil && tt.expectedErr != "" { | ||
t.Errorf("Expected error %v, got none", tt.expectedErr) | ||
} | ||
// compare errors | ||
if err != nil && tt.expectedErr != "" { | ||
if err.Error() != tt.expectedErr { | ||
t.Errorf("Expected error %q, got %q", tt.expectedErr, err.Error()) | ||
} | ||
} | ||
if len(links) != len(tt.expectedLinks) { | ||
t.Errorf("Expected %d links, got %d. Links: %s", len(tt.expectedLinks), len(links), links) | ||
} | ||
|
||
// compare expected links with actual links | ||
fmt.Println("Result: ", links) | ||
if len(links) > 0 { | ||
for i, link := range links { | ||
if link != tt.expectedLinks[i] { | ||
t.Errorf("Expected link %s, got %s", tt.expectedLinks[i], link) | ||
} | ||
} | ||
} | ||
|
||
}) | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.