From 90581904909513296fc116d96d1f16e764076571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Mart=C3=AD?= Date: Thu, 18 Jul 2019 19:49:28 +0100 Subject: [PATCH] cmd/xurls: don't use bufio.Scanner to scan the input It was a convenient way to obtain the input in chunks, so that the tool could print urls incrementally without having to read the entirety of the input at once. Unfortunately, we failed to notice that bufio.Scanner has a hard limit on the size of each "token". In our particular case, it meant that any sequence of many thousands of input bytes without any whitespace could make the tool error out. Instead, use bufio.Reader, which grows a buffer to fit the data being read. Go back to reading one line at a time, as it can only stop at one specific byte like '\n', and not many of them like all whitespace characters. Fixes #28. --- cmd/xurls/main.go | 19 ++++++++++++------- generate/schemesgen/main.go | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cmd/xurls/main.go b/cmd/xurls/main.go index 452592f..38c5879 100644 --- a/cmd/xurls/main.go +++ b/cmd/xurls/main.go @@ -7,6 +7,7 @@ import ( "bufio" "flag" "fmt" + "io" "os" "regexp" @@ -41,15 +42,19 @@ func scanPath(re *regexp.Regexp, path string) error { defer f.Close() r = f } - scanner := bufio.NewScanner(r) - scanner.Split(bufio.ScanWords) - for scanner.Scan() { - word := scanner.Text() - for _, match := range re.FindAllString(word, -1) { - fmt.Println(match) + bufr := bufio.NewReader(r) + for { + line, err := bufr.ReadBytes('\n') + for _, match := range re.FindAll(line, -1) { + fmt.Printf("%s\n", match) + } + if err == io.EOF { + break + } else if err != nil { + return err } } - return scanner.Err() + return nil } func main() { diff --git a/generate/schemesgen/main.go b/generate/schemesgen/main.go index baa3342..5bb4a73 100644 --- a/generate/schemesgen/main.go +++ b/generate/schemesgen/main.go @@ -34,7 +34,7 @@ func schemeList() []string { } defer resp.Body.Close() r := 
csv.NewReader(resp.Body) - r.Read() //ignore headers + r.Read() // ignore headers schemes := make([]string, 0) for { record, err := r.Read()