From 9a98175fc5065bab185687e60ed626a14cbd7db1 Mon Sep 17 00:00:00 2001 From: Yann Defretin Date: Fri, 13 Sep 2024 21:22:19 +0200 Subject: [PATCH] Add support for proxies --- README.md | 10 ++++++++-- config/cli.go | 6 ++++++ config/settings.go | 3 ++- main.go | 7 +++++++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c300a87..4bc1aad 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ By default, the `docker run` command above will always `pull` before running to Output of `./moviestills --help`: ``` -Usage: moviestills [--website WEBSITE] [--list] [--parallel PARALLEL] [--delay DELAY] [--async] [--timeout TIMEOUT] [--cache-dir CACHE-DIR] [--data-dir DATA-DIR] [--debug] [--no-colors] [--no-style] [--hash] +Usage: moviestills [--website WEBSITE] [--list] [--parallel PARALLEL] [--delay DELAY] [--async] [--timeout TIMEOUT] [--proxy PROXY] [--cache-dir CACHE-DIR] [--data-dir DATA-DIR] [--hash] [--debug] [--no-colors] [--no-style] Options: --website WEBSITE, -w WEBSITE @@ -99,14 +99,16 @@ Options: --async, -a Enable asynchronus running jobs [default: false, env: ASYNC] --timeout TIMEOUT, -t TIMEOUT Set the default request timeout for the scraper [default: 15s, env: TIMEOUT] + --proxy PROXY, -x PROXY + The proxy URL to use for scraping [env: PROXY] --cache-dir CACHE-DIR, -c CACHE-DIR Where to cache scraped websites pages [default: cache, env: CACHE_DIR] --data-dir DATA-DIR, -f DATA-DIR Where to store movie snapshots [default: data, env: DATA_DIR] + --hash Hash image filenames with md5 [default: false, env: HASH] --debug, -d Set Log Level to Debug to see everything [default: false, env: DEBUG] --no-colors Disable colors from output [default: false, env: NO_COLORS] --no-style Disable styling and colors entirely from output [default: false, env: NO_STYLE] - --hash Hash image filenames with md5 [default: false, env: HASH] --help, -h display this help and exit --version display version and exit ``` @@ -201,6 +203,10 @@ You can 
change the default `data` folder with the `—data-dir` CLI argument or If you use our Docker image to run `moviestills`, don't forget to change the volume path in case you edited the *internal* `data` folder. Again, you should not even bother editing the *internal* `data` folder's path or name anyway as you have volumes to store and get access to these files on the host machine. +#### Proxies + +You can set up a proxy URL to use for scraping using the `--proxy` CLI argument or the `PROXY` environment variable. At the moment, you can set only one proxy but the app might support multiple proxies in a round-robin fashion later. + #### Hash filenames To get some consistency, you can use the MD5 hash function to normalize image filenames. All images will then use 32 hexadecimal digits as filenames. To enable the *hashing*, use the `—hash` CLI argument or the `HASH=true` environment variable. diff --git a/config/cli.go b/config/cli.go index ae45ae0..27ceb39 100644 --- a/config/cli.go +++ b/config/cli.go @@ -6,6 +6,12 @@ func (Options) Description() string { return "this program can scrap various websites to get high quality movie snapshots.\n" } +// Epilogue will be displayed at the end +// of the "help" CLI command. +func (Options) Epilogue() string { + return "For more information visit https://github.com/kinoute/moviestills" +} + // Version of the app can be displayed either // when a user makes use of the --version flag, the // --help flag or when an erroneous flag is passed. 
diff --git a/config/settings.go b/config/settings.go index fc03153..d5361da 100644 --- a/config/settings.go +++ b/config/settings.go @@ -10,10 +10,11 @@ type Options struct { RandomDelay time.Duration `arg:"-r, --delay,env:RANDOM_DELAY" help:"Add some random delay between requests" default:"1s"` Async bool `arg:"-a, --async,env:ASYNC" help:"Enable asynchronus running jobs" default:"false"` TimeOut time.Duration `arg:"-t, --timeout,env:TIMEOUT" help:"Set the default request timeout for the scraper" default:"15s"` + Proxy string `arg:"-x, --proxy,env:PROXY" help:"The proxy URL to use for scraping"` CacheDir string `arg:"-c, --cache-dir,env:CACHE_DIR" help:"Where to cache scraped websites pages" default:"cache"` DataDir string `arg:"-f, --data-dir,env:DATA_DIR" help:"Where to store movie snapshots" default:"data"` + Hash bool `arg:"--hash,env:HASH" help:"Hash image filenames with md5" default:"false"` Debug bool `arg:"-d, --debug,env:DEBUG" help:"Set Log Level to Debug to see everything" default:"false"` NoColors bool `arg:"--no-colors,env:NO_COLORS" help:"Disable colors from output" default:"false"` NoStyle bool `arg:"--no-style,env:NO_STYLE" help:"Disable styling and colors entirely from output" default:"false"` - Hash bool `arg:"--hash,env:HASH" help:"Hash image filenames with md5" default:"false"` } diff --git a/main.go b/main.go index d99b0e3..76690e0 100644 --- a/main.go +++ b/main.go @@ -107,6 +107,13 @@ func main() { colly.CacheDir(options.CacheDir), ) + // Set up a proxy + if options.Proxy != "" { + if err := scraper.SetProxy(options.Proxy); err != nil { + log.Error.Println("Could not set proxy", log.White(options.Proxy), log.Red(err)) + } + } + // Set request timeout scraper.SetRequestTimeout(options.TimeOut)