Skip to content

Commit

Permalink
Refactor (#222)
Browse files (browse the repository at this point in the history)
* Fix option name

* Remove intermediate variables

* Make error handling consistent

* Fix build

* Rename variables

* Refactor

* Rename argument
  • Loading branch information
raviqqe authored May 26, 2022
1 parent 90e7faf commit c697b0e
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 25 deletions.
17 changes: 5 additions & 12 deletions arguments.go
Original file line number Diff line number Diff line change
@@ -3,7 +3,6 @@ package main
import (
"bytes"
"errors"
"fmt"
"regexp"
"strings"

@@ -15,7 +14,7 @@ type arguments struct {
MaxConnections int `short:"c" long:"max-connections" value-name:"<count>" default:"512" description:"Maximum number of HTTP connections"`
MaxConnectionsPerHost int `long:"max-connections-per-host" value-name:"<count>" default:"512" description:"Maximum number of HTTP connections per host"`
RawExcludedPatterns []string `short:"e" long:"exclude" value-name:"<pattern>..." description:"Exclude URLs matched with given regular expressions"`
RawIncludePatterns []string `short:"i" long:"include" value-name:"<pattern>..." description:"Include URLs matched with given regular expressions"`
RawIncludedPatterns []string `short:"i" long:"include" value-name:"<pattern>..." description:"Include URLs matched with given regular expressions"`
FollowRobotsTxt bool `long:"follow-robots-txt" description:"Follow robots.txt when scraping pages"`
FollowSitemapXML bool `long:"follow-sitemap-xml" description:"Scrape only pages listed in sitemap.xml"`
RawHeaders []string `long:"header" value-name:"<header>..." description:"Custom headers"`
@@ -51,27 +50,21 @@ func getArguments(ss []string) (*arguments, error) {

args.URL = ss[0]

res, err := compileRegexps(args.RawExcludedPatterns)
args.ExcludedPatterns, err = compileRegexps(args.RawExcludedPatterns)
if err != nil {
return nil, err
}

args.ExcludedPatterns = res

ris, err := compileRegexps(args.RawIncludePatterns)
args.IncludePatterns, err = compileRegexps(args.RawIncludedPatterns)
if err != nil {
return nil, errors.New(fmt.Sprintf("Parse include patterns: %s", err.Error()))
return nil, err
}

args.IncludePatterns = ris

hs, err := parseHeaders(args.RawHeaders)
args.Headers, err = parseHeaders(args.RawHeaders)
if err != nil {
return nil, err
}

args.Headers = hs

return &args, nil
}

Expand Down
20 changes: 7 additions & 13 deletions link_finder.go
Original file line number Diff line number Diff line change
@@ -34,8 +34,8 @@ type linkFinder struct {
includedPatterns []*regexp.Regexp
}

func newLinkFinder(rs []*regexp.Regexp, includePatterns []*regexp.Regexp) linkFinder {
return linkFinder{excludedPatterns: rs, includedPatterns: includePatterns}
func newLinkFinder(es []*regexp.Regexp, is []*regexp.Regexp) linkFinder {
return linkFinder{excludedPatterns: es, includedPatterns: is}
}

func (f linkFinder) Find(n *html.Node, base *url.URL) map[string]error {
@@ -51,19 +51,13 @@ func (f linkFinder) Find(n *html.Node, base *url.URL) map[string]error {
for _, s := range ss {
s := strings.TrimSpace(s)

if s == "" || f.isLinkExcluded(s) {
continue
}

// only use include pattern when not empty
if len(f.includedPatterns) > 0 && !f.isLinkIncluded(s) {
if s == "" || f.isLinkExcluded(s) || len(f.includedPatterns) > 0 && !f.isLinkIncluded(s) {
continue
}

u, err := url.Parse(s)
if err != nil {
ls[s] = err
continue
} else if _, ok := validSchemes[u.Scheme]; ok {
ls[base.ResolveReference(u).String()] = nil
}
@@ -90,15 +84,15 @@ func (linkFinder) parseLinks(n *html.Node, a string) []string {
}

func (f linkFinder) isLinkExcluded(u string) bool {
return f.isMatch(u, f.excludedPatterns)
return f.matches(u, f.excludedPatterns)
}

func (f linkFinder) isLinkIncluded(u string) bool {
return f.isMatch(u, f.includedPatterns)
return f.matches(u, f.includedPatterns)
}

func (f linkFinder) isMatch(u string, patterns []*regexp.Regexp) bool {
for _, r := range patterns {
func (f linkFinder) matches(u string, rs []*regexp.Regexp) bool {
for _, r := range rs {
if r.MatchString(u) {
return true
}
Expand Down

0 comments on commit c697b0e

Please sign in to comment.