Skip to content

Commit

Permalink
Basic GetLinks() functionality added
Browse files Browse the repository at this point in the history
  • Loading branch information
devnyxie committed Jul 22, 2024
1 parent bbbb772 commit 226d719
Show file tree
Hide file tree
Showing 6 changed files with 320 additions and 38 deletions.
91 changes: 91 additions & 0 deletions GetLinks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package katsuragi

import (
"fmt"
"net/url"
"strings"

"golang.org/x/net/html"
)

// GetLinks retrieves the page at props.Url and returns the links found in the
// href attributes of its <a> elements, in document order and without
// duplicates.
//
// props.Category selects which links are kept: "all" (also used when the field
// is empty), "internal" or "external". Internal/external is decided by
// IsInternalURL against props.Url. Any other category is rejected before the
// page is fetched, so the caller gets a precise error instead of the generic
// "no links" one.
//
// An href that fails validateUrl is skipped. The remaining hrefs are passed
// through ensureAbsoluteURL together with props.Url before they are classified
// and de-duplicated.
//
// An error is returned when the category is unsupported, when the page cannot
// be retrieved, or when no link passes the filters.
func (f *Fetcher) GetLinks(props GetLinksProps) ([]string, error) {
	category := props.Category
	if category == "" {
		// Default category is "all".
		category = "all"
	}
	switch category {
	case "all", "internal", "external":
	default:
		return nil, fmt.Errorf("GetLinks: unsupported category %q", category)
	}

	htmlres, err := retrieveHTML(props.Url, f)
	if err != nil {
		return nil, err
	}

	var links []string

	// wanted reports whether an absolute link belongs to the requested category.
	wanted := func(link string) bool {
		switch category {
		case "internal":
			return IsInternalURL(link, props.Url)
		case "external":
			return !IsInternalURL(link, props.Url)
		default: // "all"
			return true
		}
	}

	var traverse func(*html.Node)
	traverse = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			attrMap := extractAttributes(n.Attr)
			// Validation runs on the raw href, so an invalid href is never resolved.
			if href, found := attrMap["href"]; found && validateUrl(href) {
				newhref := ensureAbsoluteURL(href, props.Url)
				if wanted(newhref) && !contains(links, newhref) {
					links = append(links, newhref)
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			traverse(c)
		}
	}
	traverse(htmlres)

	if len(links) == 0 {
		return nil, fmt.Errorf("GetLinks failed to find any links in HTML")
	}
	return links, nil
}

// IsInternalURL reports whether href points at the same host as urlStr.
//
// Both arguments are parsed with url.Parse; if either fails to parse the
// result is false. Hosts (including any port) are compared case-insensitively.
//
// The decision is based on the host only. A plain substring match between the
// two strings is deliberately not used, because an external link that merely
// embeds the site URL (for example in a query string) would be classified as
// internal, and an empty urlStr would match every link.
//
// When urlStr carries no scheme (e.g. "example.com/path"), url.Parse yields an
// empty host, so the value is re-parsed as a network-path reference
// ("//example.com/path") to recover the host.
func IsInternalURL(href, urlStr string) bool {
	link, err := url.Parse(href)
	if err != nil {
		return false
	}
	base, err := url.Parse(urlStr)
	if err != nil {
		return false
	}

	baseHost := base.Host
	if baseHost == "" && urlStr != "" {
		// Best effort: a failure here simply leaves baseHost empty.
		if alt, altErr := url.Parse("//" + urlStr); altErr == nil {
			baseHost = alt.Host
		}
	}

	return strings.EqualFold(link.Host, baseHost)
}


// validateUrl reports whether urlStr is accepted by url.ParseRequestURI, i.e.
// it is an absolute URI or an absolute path. Empty strings and relative
// references without a leading slash are rejected.
//
// The function has no side effects; in particular it does not write the parse
// error to stdout.
func validateUrl(urlStr string) bool {
	_, err := url.ParseRequestURI(urlStr)
	return err == nil
}
164 changes: 164 additions & 0 deletions GetLinks_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
package katsuragi

import (
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
)

// TestGetLinks exercises Fetcher.GetLinks against a local httptest server.
//
// Each case supplies a function that renders the HTML body from the server's
// URL, the category to request, and the links or error expected back. The
// placeholder "<serverURL>" in expectedLinks is replaced with the real server
// URL at run time. A case may set url to bypass the test server entirely.
func TestGetLinks(t *testing.T) {
	tests := []struct {
		name          string
		category      string
		url           string
		responseBody  func(serverURL string) string // Function to generate response body dynamically
		expectedErr   string
		expectedLinks []string
	}{
		{
			name:     "all",
			category: "all",
			responseBody: func(serverURL string) string {
				return fmt.Sprintf(`<html><body>
<a href="%s/internal1">Internal 1</a>
<a href="%s/internal2">Internal 2</a>
<a href="http://external.com">External</a>
</body></html>`, serverURL, serverURL)
			},
			expectedErr:   "",
			expectedLinks: []string{"<serverURL>/internal1", "<serverURL>/internal2", "http://external.com"},
		},
		{
			name:     "internal",
			category: "internal",
			responseBody: func(serverURL string) string {
				return fmt.Sprintf(`<html><body>
<a href="%s/internal1">Internal 1</a>
<a href="%s/internal2">Internal 2</a>
</body></html>`, serverURL, serverURL)
			},
			expectedErr:   "",
			expectedLinks: []string{"<serverURL>/internal1", "<serverURL>/internal2"},
		},
		{
			name:     "external",
			category: "external",
			responseBody: func(serverURL string) string {
				return fmt.Sprintf(`<html><body>
<a href="%s/internal1">Internal 1</a>
<a href="http://external.com">External</a>
</body></html>`, serverURL)
			},
			expectedErr:   "",
			expectedLinks: []string{"http://external.com"},
		},
		{
			name:     "no category",
			category: "",
			responseBody: func(serverURL string) string {
				return fmt.Sprintf(`<html><body>
<a href="%s/internal1">Internal 1</a>
<a href="%s/internal2">Internal 2</a>
<a href="http://external.com">External</a>
</body></html>`, serverURL, serverURL)
			},
			expectedErr:   "",
			expectedLinks: []string{"<serverURL>/internal1", "<serverURL>/internal2", "http://external.com"},
		},
		// The page URL itself is malformed, so retrieval must fail.
		{
			name:     "bad url",
			category: "all",
			url:      "http:/",
			responseBody: func(serverURL string) string {
				return ""
			},
			expectedErr:   "Get \"http:/\": http: no Host in request URL",
			expectedLinks: []string{},
		},
		// Invalid hrefs are skipped while the valid ones are still returned.
		{
			name:     "good and invalid links in html",
			category: "all",
			responseBody: func(serverURL string) string {
				return fmt.Sprintf(`<html><body>
<a href=":/|%s/internal1">Internal 1</a>
<a href=":htpexternal.com">External</a>
<a href="http://external2.com">External</a>
<a href="/test">External</a>
</body></html>`, serverURL)
			},
			expectedErr:   "",
			expectedLinks: []string{"http://external2.com", "<serverURL>/test"},
		},
		// A page without any <a> element yields an error.
		{
			name:     "no links in html",
			category: "all",
			responseBody: func(serverURL string) string {
				return "<html><body></body></html>"
			},
			expectedErr:   "GetLinks failed to find any links in HTML",
			expectedLinks: []string{},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// server is declared first so the handler closure can read its URL.
			var server *httptest.Server
			server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				fmt.Fprint(w, tt.responseBody(server.URL))
			}))
			defer server.Close()

			// Resolve "<serverURL>" into a fresh slice so the shared test table
			// is never mutated.
			want := make([]string, len(tt.expectedLinks))
			for i, link := range tt.expectedLinks {
				want[i] = strings.ReplaceAll(link, "<serverURL>", server.URL)
			}

			fetcher := NewFetcher(&FetcherProps{Timeout: 3000, CacheCap: 10})

			target := server.URL
			if tt.url != "" {
				target = tt.url
			}
			links, err := fetcher.GetLinks(GetLinksProps{Url: target, Category: tt.category})

			t.Logf("Server URL: %s", server.URL)
			t.Logf("Result: %v", links)

			switch {
			case err != nil && tt.expectedErr == "":
				t.Errorf("Expected no error, got %v", err)
			case err == nil && tt.expectedErr != "":
				t.Errorf("Expected error %v, got none", tt.expectedErr)
			case err != nil && err.Error() != tt.expectedErr:
				t.Errorf("Expected error %q, got %q", tt.expectedErr, err.Error())
			}

			// Stop here on a length mismatch: the element-wise comparison below
			// indexes want by position and would panic on a longer result.
			if len(links) != len(want) {
				t.Fatalf("Expected %d links, got %d. Links: %s", len(want), len(links), links)
			}

			for i, link := range links {
				if link != want[i] {
					t.Errorf("Expected link %s, got %s", want[i], link)
				}
			}
		})
	}
}

19 changes: 19 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ A Go toolkit for web content processing, analysis, and SEO optimization, offerin
<!-- START doctoc generated TOC please keep comment here to allow auto update -->
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->

**Table of Contents**

- [Features](#features)
Expand Down Expand Up @@ -100,6 +101,24 @@ The GetFavicons() function currently supports the following favicon meta tags:
...
```

## Links/Backlinks

The GetLinks() function searches for all `<a>` tags in the HTML document and returns a slice of links.

Options:

- `Url` (required): The URL of the website to fetch.
- `Category` (optional): The category of links to fetch. Possible values are `internal`, `external`, and `all`. Default is `all`.

```go
// Get website's links
links, err := fetcher.GetLinks(GetLinksProps{
Url: "https://www.example.com",
Category: "external",
})
// [https://www.youtube.com/example, https://www.facebook.com/example]
```

# Local Development

## Testing
Expand Down
13 changes: 9 additions & 4 deletions types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,21 @@ type Fetcher struct {
props FetcherProps
}

var defaultProps = FetcherProps{
var defaultFetcherProps = FetcherProps{
Timeout: 3000 * time.Millisecond,
CacheCap: 10,
}

func NewFetcher(props *FetcherProps) *Fetcher {
if props == nil {
props = &defaultProps
props = &defaultFetcherProps
} else {
// Set default values for unspecified fields
if props.Timeout == 0 {
props.Timeout = defaultProps.Timeout
props.Timeout = defaultFetcherProps.Timeout
}
if props.CacheCap == 0 {
props.CacheCap = defaultProps.CacheCap
props.CacheCap = defaultFetcherProps.CacheCap
}
}

Expand All @@ -47,6 +47,11 @@ func NewFetcher(props *FetcherProps) *Fetcher {
}
}

// GetLinksProps holds the options passed to Fetcher.GetLinks.
type GetLinksProps struct {
// Url is the address of the page whose links are collected. GetLinks also
// passes it to ensureAbsoluteURL and IsInternalURL alongside each href.
Url string
// Category selects which links are returned: "all", "internal" or
// "external". GetLinks treats an empty value as "all".
Category string
}

type cacheEntry struct {
url string
response *html.Node
Expand Down
Loading

0 comments on commit 226d719

Please sign in to comment.