diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 90cb5e4d629..de635793ee5 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -23,6 +23,7 @@ use std::path::Path; use std::u64; use uucore::display::Quotable; use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError}; +use uucore::parse_size::parse_size_u64; use uucore::uio_error; use uucore::{format_usage, help_about, help_section, help_usage}; @@ -41,6 +42,8 @@ static OPT_SUFFIX_LENGTH: &str = "suffix-length"; static OPT_VERBOSE: &str = "verbose"; static OPT_SEPARATOR: &str = "separator"; static OPT_ELIDE_EMPTY_FILES: &str = "elide-empty-files"; +static OPT_IO_BLKSIZE: &str = "-io-blksize"; + static ARG_INPUT: &str = "input"; static ARG_PREFIX: &str = "prefix"; @@ -366,6 +369,12 @@ pub fn uu_app() -> Command { .action(ArgAction::Append) .help("use SEP instead of newline as the record separator; '\\0' (zero) specifies the NUL character"), ) + .arg( + Arg::new(OPT_IO_BLKSIZE) + .long("io-blksize") + .alias(OPT_IO_BLKSIZE) + .hide(true), + ) .arg( Arg::new(ARG_INPUT) .default_value("-") @@ -400,6 +409,7 @@ struct Settings { /// chunks. If this is `false`, then empty files will not be /// created. elide_empty_files: bool, + io_blksize: Option, } /// An error when parsing settings from command-line arguments. @@ -422,6 +432,9 @@ enum SettingsError { /// r/K/N FilterWithKthChunkNumber, + /// Invalid IO block size + InvalidIOBlockSize(String), + /// The `--filter` option is not supported on Windows. #[cfg(windows)] NotSupported, @@ -452,6 +465,7 @@ impl fmt::Display for SettingsError { Self::FilterWithKthChunkNumber => { write!(f, "--filter does not process a chunk extracted to stdout") } + Self::InvalidIOBlockSize(s) => write!(f, "invalid IO block size: {}", s.quote()), #[cfg(windows)] Self::NotSupported => write!( f, @@ -486,6 +500,23 @@ impl Settings { None => b'\n', }; + let io_blksize: Option = if let Some(s) = matches.get_one::(OPT_IO_BLKSIZE) { + match parse_size_u64(s) { + Ok(n) => { + let n: usize = n + .try_into() + .map_err(|_| SettingsError::InvalidIOBlockSize(s.to_string()))?; + if n > uucore::fs::sane_blksize::MAX { + return Err(SettingsError::InvalidIOBlockSize(s.to_string())); + } + Some(n) + } + _ => return Err(SettingsError::InvalidIOBlockSize(s.to_string())), + } + } else { + None + }; + let result = Self { prefix: matches.get_one::(ARG_PREFIX).unwrap().clone(), suffix, @@ -495,6 +526,7 @@ impl Settings { verbose: matches.value_source(OPT_VERBOSE) == Some(ValueSource::CommandLine), separator, elide_empty_files: matches.get_flag(OPT_ELIDE_EMPTY_FILES), + io_blksize, }; #[cfg(windows)] @@ -597,15 +629,24 @@ fn custom_write_all( /// (i.e. "infinite" input as in `cat /dev/zero | split ...`, `yes | split ...` etc.). /// /// Note: The `buf` might end up with either partial or entire input content. -fn get_input_size(input: &String, reader: &mut R, buf: &mut Vec) -> std::io::Result +fn get_input_size( + input: &String, + reader: &mut R, + buf: &mut Vec, + io_blksize: &Option, +) -> std::io::Result where R: BufRead, { // Set read limit to io_blksize if specified - // Otherwise to OPT_IO_BLKSIZE_MAX - let read_limit: u64 = uucore::fs::get_sanity_limited_blksize_from_path(Path::new(input)) - .try_into() - .unwrap(); + let read_limit: u64 = if let Some(custom_blksize) = io_blksize { + *custom_blksize + } else { + // otherwise try to get it from filesystem, or use default + uucore::fs::get_sanity_limited_blksize_from_path(Path::new(input)) + } + .try_into() + .unwrap(); // Try to read into buffer up to a limit let num_bytes = reader @@ -1243,7 +1284,7 @@ where { // Get the size of the input in bytes let initial_buf = &mut Vec::new(); - let mut num_bytes = get_input_size(&settings.input, reader, initial_buf)?; + let mut num_bytes = get_input_size(&settings.input, reader, initial_buf, &settings.io_blksize)?; let mut reader = initial_buf.chain(reader); // If input file is empty and we would not have determined the Kth chunk @@ -1389,7 +1430,7 @@ where // Get the size of the input in bytes and compute the number // of bytes per chunk. let initial_buf = &mut Vec::new(); - let num_bytes = get_input_size(&settings.input, reader, initial_buf)?; + let num_bytes = get_input_size(&settings.input, reader, initial_buf, &settings.io_blksize)?; let reader = initial_buf.chain(reader); // If input file is empty and we would not have determined the Kth chunk @@ -1587,9 +1628,11 @@ fn split(settings: &Settings) -> UResult<()> { })?; Box::new(r) as Box }; - - let blksize = uucore::fs::get_sanity_limited_blksize_from_path(Path::new(&settings.input)); - let mut reader = BufReader::with_capacity(blksize, r_box); + let mut reader = if let Some(c) = settings.io_blksize { + BufReader::with_capacity(c, r_box) + } else { + BufReader::new(r_box) + }; match settings.strategy { Strategy::Number(NumberType::Bytes(num_chunks)) => { diff --git a/src/uucore/src/lib/features/fs.rs b/src/uucore/src/lib/features/fs.rs index e1a7147e853..525a5108834 100644 --- a/src/uucore/src/lib/features/fs.rs +++ b/src/uucore/src/lib/features/fs.rs @@ -744,7 +744,7 @@ pub fn path_ends_with_terminator(path: &Path) -> bool { .map_or(false, |wide| wide == b'/'.into() || wide == b'\\'.into()) } -mod sane_blksize { +pub mod sane_blksize { pub const DEFAULT: usize = 512; #[cfg(not(target_os = "windows"))] pub const MAX: usize = (u32::MAX / 8 + 1) as usize; diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 8868a55aba3..acb8ab56140 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -1025,7 +1025,8 @@ fn test_number_kth_of_n_round_robin() { #[test] fn test_split_number_with_io_blksize() { let (at, mut ucmd) = at_and_ucmd!(); - ucmd.args(&["-n", "5", "asciilowercase.txt"]).succeeds(); + ucmd.args(&["-n", "5", "asciilowercase.txt", "---io-blksize", "1024"]) + .succeeds(); assert_eq!(at.read("xaa"), "abcdef"); assert_eq!(at.read("xab"), "ghijkl"); assert_eq!(at.read("xac"), "mnopq"); @@ -1038,17 +1039,34 @@ fn test_split_default_with_io_blksize() { let (at, mut ucmd) = at_and_ucmd!(); let name = "split_default_with_io_blksize"; RandomFile::new(&at, name).add_lines(2000); - ucmd.args(&[name]).succeeds(); + ucmd.args(&[name, "---io-blksize", "2M"]).succeeds(); let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 2); assert_eq!(glob.collate(), at.read_bytes(name)); } +#[test] +fn test_split_invalid_io_blksize() { + new_ucmd!() + .args(&["---io-blksize=XYZ", "threebytes.txt"]) + .fails() + .stderr_only("split: invalid IO block size: 'XYZ'\n"); + new_ucmd!() + .args(&["---io-blksize=5000000000", "threebytes.txt"]) + .fails() + .stderr_only("split: invalid IO block size: '5000000000'\n"); + #[cfg(target_pointer_width = "32")] + new_ucmd!() + .args(&["---io-blksize=2146435072", "threebytes.txt"]) + .fails() + .stderr_only("split: invalid IO block size: '2146435072'\n"); +} + #[test] fn test_split_number_oversized_stdin() { new_ucmd!() - .args(&["--number=3"]) + .args(&["--number=3", "---io-blksize=600"]) .pipe_in_fixture("sixhundredfiftyonebytes.txt") .fails() .stderr_only("split: -: cannot determine input size\n");