Skip to content

Commit

Permalink
added trim flag
Browse files Browse the repository at this point in the history
  • Loading branch information
paulvollmer committed Sep 6, 2019
1 parent 3f26a4a commit 4fb76d4
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 8 deletions.
11 changes: 5 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION=0.2.0
VERSION=0.2.1

all: lint test

Expand All @@ -9,15 +9,14 @@ lint:
@go fmt ./...
@golint ./...

test: build
test: test-src test-cli
test-src:
@go test ./...
test-cli: build
@./htmltable2csv -v
@./htmltable2csv -source "./scraper/fixture/test1.html" -selector "table > tbody > tr" -csv data_file.csv
@./htmltable2csv -source "https://www.w3schools.com/html/html_tables.asp" -selector "#customers > tbody > tr" -csv data_url.csv

test-all:
@go test ./...
@make test

release:
git tag -a v${VERSION} -m "Version ${VERSION}"
git push origin v${VERSION}
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Flags:
The table css selector
-source string
The filepath or website url
-trim
Trim the whitespace for each table column
-v Print the version and exit
Examples:
Expand Down
2 changes: 2 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ func main() {
flagVersion := flag.Bool("v", false, "Print the version and exit")
flagSource := flag.String("source", "", "The filepath or website url")
flagSelector := flag.String("selector", "", "The table css selector")
flagTrim := flag.Bool("trim", false, "Trim the whitespace for each table column")
flagCSV := flag.String("csv", "", "The csv filename. if empty, print csv to stdout")
flag.Usage = usage
flag.Parse()
Expand All @@ -52,6 +53,7 @@ func main() {
scraper := htmltable2csv.Scraper{}
scraper.Source = *flagSource
scraper.Selector = *flagSelector
scraper.Trim = *flagTrim
_, err = scraper.Scrape()
if err != nil {
fmt.Println(err)
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "htmltable2csv",
"version": "0.2.0",
"version": "0.2.1",
"description": "htmltable2csv is a tool to parse a html table and store the data as csv. It can be written to a file or print out to stdout",
"scripts": {
"test": "make test"
Expand Down
50 changes: 50 additions & 0 deletions scraper/fixture/test2.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
<table>
<thead>
<tr>
<td>key</td>
<td>value</td>
</tr>
</thead>
<tbody>
<tr>
<td>
foo
</td>
<td>
1
</td>
</tr>
<tr>
<td>
bar
</td>
<td>
2




</td>
</tr>
<tr>
<td>
baz



</td>
<td>


3







</td>
</tr>
</tbody>
</table>
8 changes: 7 additions & 1 deletion scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"
"net/url"
"os"
"strings"

"github.com/PuerkitoBio/goquery"
)
Expand All @@ -16,6 +17,7 @@ type Scraper struct {
Source string
Selector string
Data [][]string
Trim bool
}

// Scrape loads the source (a file path or a URL) and parses the table data.
Expand Down Expand Up @@ -56,7 +58,11 @@ func (s *Scraper) Scrape() ([][]string, error) {
doc.Find(s.Selector).Each(func(i int, table *goquery.Selection) {
dataRow := make([]string, 0)
table.Find("td").Each(func(j int, td *goquery.Selection) {
dataRow = append(dataRow, td.Text())
text := td.Text()
if s.Trim {
text = strings.TrimSpace(text)
}
dataRow = append(dataRow, text)
})
data = append(data, dataRow)
})
Expand Down
12 changes: 12 additions & 0 deletions scraper/scraper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ func TestScraper(t *testing.T) {
dataEqual(t, data)
})

// With Trim enabled, the whitespace-padded cells of fixture/test2.html are
// expected to produce the data that dataEqual checks for.
t.Run("source file and trim", func(t *testing.T) {
	scraper := Scraper{}
	scraper.Source = "./fixture/test2.html"
	scraper.Selector = "table > tbody > tr"
	scraper.Trim = true
	data, err := scraper.Scrape()
	if err != nil {
		// Stop this subtest here: the returned data is not meaningful when
		// Scrape fails, so comparing it would only add misleading failures.
		t.Fatal(err)
	}
	dataEqual(t, data)
})

t.Run("source url", func(t *testing.T) {
// Start a local HTTP server
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
Expand Down

0 comments on commit 4fb76d4

Please sign in to comment.