Skip to content

Commit

Permalink
Add support for excluding paths from link checking (#623)
Browse files Browse the repository at this point in the history
This change deprecates `--exclude-file` as it was ambiguous.
Instead, `--exclude-path` was introduced to support excluding paths
to files and directories that should not be checked.
Furthermore, `.lycheeignore` is now the only way
to exclude URL patterns.
  • Loading branch information
mre committed May 29, 2022
1 parent 451e336 commit 363b95f
Show file tree
Hide file tree
Showing 15 changed files with 207 additions and 31 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,8 @@ OPTIONS:
--basic-auth <basic-auth> Basic authentication support. E.g. `username:password`
-c, --config <config-file> Configuration file to use [default: ./lychee.toml]
--exclude <exclude>... Exclude URLs from checking (supports regex)
--exclude-file <exclude-file>... File or files that contain URLs to be excluded from checking. Regular
expressions supported; one pattern per line. Automatically excludes
patterns from `.lycheeignore` if file exists
--exclude-file <exclude-file>... Deprecated; use `--exclude-path` instead
--exclude-path <exclude-path>... Exclude file path from getting checked
-f, --format <format> Output format of final status report (compact, detailed, json, markdown)
[default: compact]
--github-token <github-token> GitHub API token to use when checking github.com links, to avoid rate
Expand Down
2 changes: 2 additions & 0 deletions examples/collect_links/collect_links.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ async fn main() -> Result<()> {
Url::parse("https://github.com/lycheeverse/lychee").unwrap(),
)),
file_type_hint: None,
excluded_paths: None,
},
Input {
source: InputSource::FsPath(PathBuf::from("fixtures/TEST.md")),
file_type_hint: None,
excluded_paths: None,
},
];

Expand Down
1 change: 1 addition & 0 deletions fixtures/exclude-path/dir1/TEST.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://example.com/excluded_dir
1 change: 1 addition & 0 deletions fixtures/exclude-path/dir2/TEST.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://example.com
1 change: 1 addition & 0 deletions fixtures/exclude-path/dir2/subdir/TEST.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://example.com/excluded_subdir
1 change: 1 addition & 0 deletions lychee-bin/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ csv = "1.1.6"
humantime = "2.1.0"
secrecy = { version = "0.8.0", features = ["serde"] }
supports-color = "1.3.0"
log = "0.4.17"

[dev-dependencies]
assert_cmd = "2.0.4"
Expand Down
2 changes: 1 addition & 1 deletion lychee-bin/src/commands/dump.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::ExitCode;
use super::CommandParams;

/// Dump all detected links to stdout without checking them
pub(crate) async fn dump<'a, S>(params: CommandParams<S>) -> Result<ExitCode>
pub(crate) async fn dump<S>(params: CommandParams<S>) -> Result<ExitCode>
where
S: futures::Stream<Item = Result<Request>>,
{
Expand Down
23 changes: 14 additions & 9 deletions lychee-bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,21 +58,21 @@
#![deny(anonymous_parameters, macro_use_extern_crate, pointer_structural_match)]
#![deny(missing_docs)]

use std::fs::{self, File};
use std::io::{self, BufRead, BufReader, ErrorKind, Write};
use std::sync::Arc;

use anyhow::{Context, Error, Result};
use color::YELLOW;
use commands::CommandParams;
use formatters::response::ResponseFormatter;
use lychee_lib::Collector;
// required for apple silicon
use ring as _;

use anyhow::{Context, Error, Result};
use log::warn;
use openssl_sys as _; // required for vendored-openssl feature
use ring as _;
use std::fs::{self, File};
use std::io::{self, BufRead, BufReader, ErrorKind, Write};
use std::sync::Arc;
use ring as _; // required for apple silicon
use structopt::StructOpt;

use lychee_lib::Collector;

mod cache;
mod client;
mod color;
Expand Down Expand Up @@ -142,6 +142,11 @@ fn load_config() -> Result<LycheeOptions> {
opts.config.exclude.append(&mut read_lines(&lycheeignore)?);
}

// TODO: Remove this warning and the parameter in a future release
if !&opts.config.exclude_file.is_empty() {
warn!("WARNING: `--exclude-file` is deprecated and will soon be removed; use `{}` file to ignore URL patterns instead. To exclude paths of files and directories, use `--exclude-path`.", LYCHEE_IGNORE_FILE);
}

// Load excludes from file
for path in &opts.config.exclude_file {
let file = File::open(path)?;
Expand Down
23 changes: 14 additions & 9 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,6 @@ const STRUCTOPT_HELP_MSG_CACHE: &str = formatcp!(
"Use request cache stored on disk at `{}`",
LYCHEE_CACHE_FILE,
);
const STRUCTOPT_HELP_MSG_IGNORE_FILE: &str = formatcp!(
"File or files that contain URLs to be excluded from checking. Regular
expressions supported; one pattern per line. Automatically excludes
patterns from `{}` if file exists",
LYCHEE_IGNORE_FILE,
);
const TIMEOUT_STR: &str = concatcp!(DEFAULT_TIMEOUT_SECS);
const RETRY_WAIT_TIME_STR: &str = concatcp!(DEFAULT_RETRY_WAIT_TIME_SECS);

Expand Down Expand Up @@ -132,9 +126,14 @@ impl LycheeOptions {
// but we'd get no access to `glob_ignore_case`.
/// Get parsed inputs from options.
///
/// Builds one `Input` per entry in `raw_inputs`, handing each of them the
/// configured `glob_ignore_case` flag and the list of excluded paths.
///
/// # Errors
///
/// Returns an error carrying the context "Cannot parse inputs from arguments"
/// if `Input::new` fails for any of the raw inputs.
pub(crate) fn inputs(&self) -> Result<Vec<Input>> {
// An empty `exclude_path` list is forwarded as `None` rather than as
// `Some` of an empty vector.
let excluded = if self.config.exclude_path.is_empty() {
None
} else {
Some(self.config.exclude_path.clone())
};
self.raw_inputs
.iter()
.map(|s| Input::new(s, None, self.config.glob_ignore_case))
// Each input receives its own clone of the exclusion list.
.map(|s| Input::new(s, None, self.config.glob_ignore_case, excluded.clone()))
// Collecting into a `Result` stops at the first input that fails.
.collect::<Result<_, _>>()
.context("Cannot parse inputs from arguments")
}
Expand Down Expand Up @@ -225,11 +224,16 @@ pub(crate) struct Config {
#[serde(default)]
pub(crate) exclude: Vec<String>,

#[structopt(help = &STRUCTOPT_HELP_MSG_IGNORE_FILE)]
/// Deprecated; use `--exclude-path` instead
#[structopt(long)]
#[serde(default)]
pub(crate) exclude_file: Vec<String>,

/// Exclude file path from getting checked.
#[structopt(long)]
#[serde(default)]
pub(crate) exclude_path: Vec<PathBuf>,

/// Exclude all private IPs from checking.
/// Equivalent to `--exclude-private --exclude-link-local --exclude-loopback`
#[structopt(short = "E", long, verbatim_doc_comment)]
Expand Down Expand Up @@ -369,7 +373,8 @@ impl Config {
scheme: Vec::<String>::new();
include: Vec::<String>::new();
exclude: Vec::<String>::new();
exclude_file: Vec::<String>::new();
exclude_file: Vec::<String>::new(); // deprecated
exclude_path: Vec::<PathBuf>::new();
exclude_all_private: false;
exclude_private: false;
exclude_link_local: false;
Expand Down
23 changes: 23 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -695,4 +695,27 @@ mod cli {

Ok(())
}

#[test]
fn test_excluded_paths() -> Result<()> {
    // Fixture layout: `dir1/` and `dir2/subdir/` are excluded, which leaves
    // only `dir2/TEST.md` (a single link) to be checked.
    let fixture_root = fixtures_path().join("exclude-path");
    let skipped_dir = fixture_root.join("dir1");
    let skipped_subdir = fixture_root.join("dir2").join("subdir");

    let mut cmd = main_command();
    let assertion = cmd
        .arg("--exclude-path")
        .arg(&skipped_dir)
        .arg(&skipped_subdir)
        .arg("--")
        .arg(&fixture_root)
        .assert();

    // Links that live in excluded files must not show up in the totals.
    assertion
        .success()
        .stdout(contains("1 Total"))
        .stdout(contains("1 OK"));

    Ok(())
}
}
2 changes: 1 addition & 1 deletion lychee-lib/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ pub const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"
/// A timeout for only the connect phase of a Client.
const CONNECT_TIMEOUT: u64 = 10;
/// TCP keepalive
/// See `https://tldp.org/HOWTO/TCP-Keepalive-HOWTO/overview.html` for more info
/// See <https://tldp.org/HOWTO/TCP-Keepalive-HOWTO/overview.html> for more info
const TCP_KEEPALIVE: u64 = 60;

/// Builder for [`Client`].
Expand Down
14 changes: 12 additions & 2 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ mod test {
// Treat as plaintext file (no extension)
let file_path = temp_dir.path().join("README");
let _file = File::create(&file_path).unwrap();
let input = Input::new(&file_path.as_path().display().to_string(), None, true)?;
let input = Input::new(&file_path.as_path().display().to_string(), None, true, None)?;
let contents: Vec<_> = input.get_contents(true).await.collect::<Vec<_>>().await;

assert_eq!(contents.len(), 1);
Expand All @@ -125,7 +125,7 @@ mod test {

#[tokio::test]
async fn test_url_without_extension_is_html() -> Result<()> {
let input = Input::new("https://example.com/", None, true)?;
let input = Input::new("https://example.com/", None, true, None)?;
let contents: Vec<_> = input.get_contents(true).await.collect::<Vec<_>>().await;

assert_eq!(contents.len(), 1);
Expand Down Expand Up @@ -156,6 +156,7 @@ mod test {
Input {
source: InputSource::String(TEST_STRING.to_owned()),
file_type_hint: None,
excluded_paths: None,
},
Input {
source: InputSource::RemoteUrl(Box::new(
Expand All @@ -164,17 +165,20 @@ mod test {
.unwrap(),
)),
file_type_hint: None,
excluded_paths: None,
},
Input {
source: InputSource::FsPath(file_path),
file_type_hint: None,
excluded_paths: None,
},
Input {
source: InputSource::FsGlob {
pattern: temp_dir_path.join("glob*").to_str().unwrap().to_owned(),
ignore_case: true,
},
file_type_hint: None,
excluded_paths: None,
},
];

Expand All @@ -199,6 +203,7 @@ mod test {
let input = Input {
source: InputSource::String("This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)".to_string()),
file_type_hint: Some(FileType::Markdown),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;

Expand All @@ -224,6 +229,7 @@ mod test {
.to_string(),
),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;

Expand Down Expand Up @@ -252,6 +258,7 @@ mod test {
.to_string(),
),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;

Expand All @@ -277,6 +284,7 @@ mod test {
.to_string(),
),
file_type_hint: Some(FileType::Markdown),
excluded_paths: None,
};

let links = collect(vec![input], Some(base)).await;
Expand All @@ -299,6 +307,7 @@ mod test {
let input = Input {
source: InputSource::String(input),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
};
let links = collect(vec![input], Some(base)).await;

Expand Down Expand Up @@ -331,6 +340,7 @@ mod test {
let input = Input {
source: InputSource::RemoteUrl(Box::new(server_uri.clone())),
file_type_hint: None,
excluded_paths: None,
};

let links = collect(vec![input], None).await;
Expand Down
70 changes: 68 additions & 2 deletions lychee-lib/src/helpers/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use cached::proc_macro::cached;
use once_cell::sync::Lazy;
use path_clean::PathClean;
use std::env;
use std::fs;
use std::path::{Path, PathBuf};

static CURRENT_DIR: Lazy<PathBuf> =
Expand Down Expand Up @@ -37,6 +38,7 @@ fn dirname(src: &'_ Path) -> Option<&'_ Path> {
}

/// Resolve `dst` that was linked to from within `src`
///
/// Returns Ok(None) in case of an absolute local link without a `base_url`
pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Option<PathBuf>> {
let resolved = match dst {
Expand Down Expand Up @@ -72,15 +74,35 @@ pub(crate) fn resolve(src: &Path, dst: &Path, base: &Option<Base>) -> Result<Opt
Ok(Some(absolute_path(resolved)))
}

// A cumbersome way to concatenate paths without checking their
// existence on disk. See https://github.com/rust-lang/rust/issues/16507
/// Concatenate `dst` onto `base` without touching the file system.
///
/// The two paths are glued together as raw OS strings: no separator is
/// inserted and neither path is checked for existence on disk.
/// See <https://github.com/rust-lang/rust/issues/16507>
fn join(base: PathBuf, dst: &Path) -> PathBuf {
    let mut joined = base.into_os_string();
    joined.push(dst.as_os_str());
    joined.into()
}

/// Check if `child` is a subdirectory/file inside `parent`
///
/// Both paths are canonicalized before they are compared, so `.`/`..`
/// components and symbolic links are resolved first. The comparison is done
/// on whole path components (see [`Path::starts_with`]), which means that
/// `/foo/bar` is not considered to be inside `/foo/b`.
///
/// Note that `contains(parent, parent)` will return `true`
///
/// See <https://stackoverflow.com/questions/30511331>
/// See <https://stackoverflow.com/questions/62939265>
///
/// # Errors
///
/// Returns an error if either `parent` or `child` does not exist
/// or a non-final component in one of the paths is not a directory.
//
// Unfortunately requires real files for `fs::canonicalize`.
pub(crate) fn contains(parent: &Path, child: &Path) -> Result<bool> {
    let parent = fs::canonicalize(parent)?;
    let child = fs::canonicalize(child)?;

    Ok(child.starts_with(parent))
}

#[cfg(test)]
mod test_path {
use super::*;
Expand Down Expand Up @@ -155,4 +177,48 @@ mod test_path {
);
Ok(())
}

#[test]
fn test_contains() {
    // A directory created inside another one must be reported as contained.
    let outer = tempfile::tempdir().unwrap();
    let inner = tempfile::tempdir_in(outer.path()).unwrap();

    let parent = outer.path().to_path_buf();
    let child = inner.path().to_path_buf();

    assert_eq!(contains(&parent, &child), Ok(true));
}

#[test]
fn test_contains_not() {
    // Two independently created directories do not contain each other.
    let first = tempfile::tempdir().unwrap();
    let second = tempfile::tempdir().unwrap();

    let lhs = first.path().to_path_buf();
    let rhs = second.path().to_path_buf();

    assert_eq!(contains(&lhs, &rhs), Ok(false));
}

#[test]
fn test_contains_one_dir_does_not_exist() {
    // A path that is missing on disk must surface as an error, not `false`.
    let existing = tempfile::tempdir().unwrap();
    let missing = PathBuf::from("/does/not/exist");

    let outcome = contains(&existing.path().to_path_buf(), &missing);

    assert!(matches!(
        outcome,
        Err(crate::ErrorKind::ReadStdinInput(_))
    ));
}

// Paths with relative components are supported, e.g.
// parent: `/path/to/parent`
// child: `/path/to/parent/child/..`
#[test]
fn test_contains_one_dir_relative_path() {
    let outer = tempfile::tempdir().unwrap();
    let inner = tempfile::tempdir_in(outer.path()).unwrap();

    let parent = outer.path().to_path_buf();
    // `<child>/..` points back at the parent directory itself.
    let child_via_dotdot = inner.path().join("..");

    assert_eq!(contains(&parent, &child_via_dotdot), Ok(true));
}
}
Loading

0 comments on commit 363b95f

Please sign in to comment.