From 3b850508863bd703984f7b6f4dc3df7e78392f9c Mon Sep 17 00:00:00 2001 From: Paul Vollmer Date: Fri, 6 Sep 2019 11:59:08 +0200 Subject: [PATCH 1/3] renamed flag url to source --- main.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.go b/main.go index 8c6b7f3..eada04f 100644 --- a/main.go +++ b/main.go @@ -27,7 +27,7 @@ func usage() { func main() { flagVersion := flag.Bool("v", false, "Print the version and exit") - flagURL := flag.String("url", "", "The website url") + flagSource := flag.String("source", "", "The filepath or website url") flagSelector := flag.String("selector", "", "The table css selector") flagCSV := flag.String("csv", "", "The csv filename. if empty, print csv to stdout") flag.Usage = usage @@ -38,8 +38,8 @@ func main() { os.Exit(0) } - if *flagURL == "" { - fmt.Println("Flag -url cannot be empty") + if *flagSource == "" { + fmt.Println("Flag -source cannot be empty") os.Exit(1) } @@ -50,7 +50,7 @@ func main() { var err error scraper := htmltable2csv.Scraper{} - scraper.URL = *flagURL + scraper.Source = *flagSource scraper.Selector = *flagSelector _, err = scraper.Scrape() if err != nil { From 01fa3ef540ebfc050083b9a2308541a3f0238d9d Mon Sep 17 00:00:00 2001 From: Paul Vollmer Date: Fri, 6 Sep 2019 11:59:33 +0200 Subject: [PATCH 2/3] scraper added file read and renamed URL to Source --- scraper/fixture/test1.html | 22 +++++++++ scraper/scraper.go | 44 +++++++++++++----- scraper/scraper_test.go | 94 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 12 deletions(-) create mode 100644 scraper/fixture/test1.html create mode 100644 scraper/scraper_test.go diff --git a/scraper/fixture/test1.html b/scraper/fixture/test1.html new file mode 100644 index 0000000..3a36c77 --- /dev/null +++ b/scraper/fixture/test1.html @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + +
keyvalue
foo1
bar2
baz3
diff --git a/scraper/scraper.go b/scraper/scraper.go index bf2ae46..3d17525 100644 --- a/scraper/scraper.go +++ b/scraper/scraper.go @@ -5,14 +5,15 @@ import ( "fmt" "io" "net/http" + "net/url" "os" "github.com/PuerkitoBio/goquery" ) -// Scraper store the URL, Selector and collected Data +// Scraper store the Source, Selector and collected Data type Scraper struct { - URL string + Source string Selector string Data [][]string } @@ -20,18 +21,37 @@ type Scraper struct { // Scrape download and parse the table data func (s *Scraper) Scrape() ([][]string, error) { var data = make([][]string, 0) - res, err := http.Get(s.URL) - if err != nil { - return data, err - } - defer res.Body.Close() - if res.StatusCode != 200 { - return data, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status) - } - doc, err := goquery.NewDocumentFromReader(res.Body) + + var doc goquery.Document + + _, err := url.ParseRequestURI(s.Source) if err != nil { - return data, err + f, err := os.Open(s.Source) + if err != nil { + return data, err + } + defer f.Close() + tmp, err := goquery.NewDocumentFromReader(f) + if err != nil { + return data, err + } + doc = *tmp + } else { + res, err := http.Get(s.Source) + if err != nil { + return data, err + } + defer res.Body.Close() + if res.StatusCode != 200 { + return data, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status) + } + tmp, err := goquery.NewDocumentFromReader(res.Body) + if err != nil { + return data, err + } + doc = *tmp } + // Find the table doc.Find(s.Selector).Each(func(i int, table *goquery.Selection) { dataRow := make([]string, 0) diff --git a/scraper/scraper_test.go b/scraper/scraper_test.go new file mode 100644 index 0000000..60e02e9 --- /dev/null +++ b/scraper/scraper_test.go @@ -0,0 +1,94 @@ +package htmltable2csv + +import ( + "net/http" + "net/http/httptest" + "testing" +) + +func TestScraper(t *testing.T) { + t.Run("source file", func(t *testing.T) { + scraper := Scraper{} + scraper.Source = 
"./fixture/test1.html" + scraper.Selector = "table > tbody > tr" + data, err := scraper.Scrape() + if err != nil { + t.Error(err) + } + dataEqual(t, data) + }) + + t.Run("source url", func(t *testing.T) { + // Start a local HTTP server + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.Write([]byte(` + + + + + + + + + + + + + + + + + + + + +
keyvalue
foo1
bar2
baz3
`)) + })) + defer server.Close() + + scraper := Scraper{} + scraper.Source = server.URL + scraper.Selector = "table > tbody > tr" + data, err := scraper.Scrape() + if err != nil { + t.Error(err) + } + dataEqual(t, data) + }) +} + +func dataEqual(t *testing.T, data [][]string) { + if len(data) != 3 { + t.Fatal("data not equal") + } + + if len(data[0]) != 2 { + t.Fatal("data[0] not equal") + } + if data[0][0] != "foo" { + t.Error("data[0][0] not equal") + } + if data[0][1] != "1" { + t.Error("data[0][1] not equal") + } + + if len(data[1]) != 2 { + t.Fatal("data[1] not equal") + } + if data[1][0] != "bar" { + t.Error("data[1][0] not equal") + } + if data[1][1] != "2" { + t.Error("data[1][1] not equal") + } + + if len(data[2]) != 2 { + t.Fatal("data[2] not equal") + } + if data[2][0] != "baz" { + t.Error("data[2][0] not equal") + } + if data[2][1] != "3" { + t.Error("data[2][1] not equal") + } +} From cd4cda3cd04f831fd843de4721a00bc35930c394 Mon Sep 17 00:00:00 2001 From: Paul Vollmer Date: Fri, 6 Sep 2019 14:05:09 +0200 Subject: [PATCH 3/3] bump version and added release-dry command to makefile --- .goreleaser.yml | 1 + Makefile | 11 +++++++---- package.json | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.goreleaser.yml b/.goreleaser.yml index 0657938..fb01137 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -4,6 +4,7 @@ builds: goos: - darwin - linux + - windows goarch: - amd64 - 386 diff --git a/Makefile b/Makefile index 0c36d6b..2a54c11 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -VERSION=0.1.1 +VERSION=0.2.0 all: lint test @@ -11,15 +11,18 @@ lint: test: build @./htmltable2csv -v - @./htmltable2csv -url "https://www.w3schools.com/html/html_tables.asp" -selector "#customers > tbody > tr" -csv data.csv + @./htmltable2csv -source "./scraper/fixture/test1.html" -selector "table > tbody > tr" -csv data_file.csv + @./htmltable2csv -source "https://www.w3schools.com/html/html_tables.asp" -selector "#customers > tbody > tr" -csv 
data_url.csv test-all: - @go test all + @go test ./... @make test release: git tag -a v${VERSION} -m "Version ${VERSION}" git push origin v${VERSION} goreleaser +release-dry: + goreleaser --skip-publish --skip-validate --snapshot -.PHONY: all lint build test test-all release +.PHONY: all lint build test test-all release release-dry diff --git a/package.json b/package.json index 93d7644..4d8fef8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "htmltable2csv", - "version": "0.1.0", + "version": "0.2.0", "description": "htmltable2csv is a tool to parse a html table and store the data as csv. It can be written to a file or print out to stdout", "scripts": { "test": "make test"