From 409da2ac8f1c6bee1fc4d969d24072f458509095 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 25 Oct 2023 20:35:35 -0400 Subject: [PATCH] split: refactor filename suffix --- src/uu/split/src/filenames.rs | 374 +++++++++++++++++---- src/uu/split/src/number.rs | 2 +- src/uu/split/src/split.rs | 614 ++-------------------------------- src/uu/split/src/strategy.rs | 379 +++++++++++++++++++++ 4 files changed, 715 insertions(+), 654 deletions(-) create mode 100644 src/uu/split/src/strategy.rs diff --git a/src/uu/split/src/filenames.rs b/src/uu/split/src/filenames.rs index e6a9f19b2b7..e776b274b65 100644 --- a/src/uu/split/src/filenames.rs +++ b/src/uu/split/src/filenames.rs @@ -2,7 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore zaaa zaab +// spell-checker:ignore zaaa zaab stype //! Compute filenames from a given index. //! //! The [`FilenameIterator`] yields filenames for use with ``split``. @@ -16,18 +16,31 @@ //! use crate::filenames::SuffixType; //! //! let prefix = "chunk_".to_string(); -//! let suffix = ".txt".to_string(); -//! let width = 2; -//! let suffix_type = SuffixType::Alphabetic; -//! let it = FilenameIterator::new(prefix, suffix, width, suffix_type); +//! let suffix = Suffix { +//! stype: SuffixType::Alphabetic, +//! length: 2, +//! start: 0, +//! auto_widening: true, +//! additional: ".txt".to_string(), +//! }; +//! let it = FilenameIterator::new(prefix, suffix); //! //! assert_eq!(it.next().unwrap(), "chunk_aa.txt"); //! assert_eq!(it.next().unwrap(), "chunk_ab.txt"); //! assert_eq!(it.next().unwrap(), "chunk_ac.txt"); //! 
``` + use crate::number::DynamicWidthNumber; use crate::number::FixedWidthNumber; use crate::number::Number; +use crate::strategy::Strategy; +use crate::{ + OPT_ADDITIONAL_SUFFIX, OPT_HEX_SUFFIXES, OPT_HEX_SUFFIXES_SHORT, OPT_NUMERIC_SUFFIXES, + OPT_NUMERIC_SUFFIXES_SHORT, OPT_SUFFIX_LENGTH, +}; +use clap::ArgMatches; +use std::fmt; +use uucore::display::Quotable; use uucore::error::{UResult, USimpleError}; /// The format to use for suffixes in the filename for each output chunk. @@ -54,21 +67,200 @@ impl SuffixType { } } +/// Filename suffix parameters +#[derive(Clone)] +pub struct Suffix { + stype: SuffixType, + length: usize, + start: usize, + auto_widening: bool, + additional: String, +} + +/// An error when parsing suffix parameters from command-line arguments. +pub enum SuffixError { + /// Invalid suffix length parameter. + NotParsable(String), + + /// Suffix contains a directory separator, which is not allowed. + ContainsSeparator(String), + + /// Suffix is not large enough to split into specified chunks + TooSmall(usize), +} + +impl fmt::Display for SuffixError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::NotParsable(s) => write!(f, "invalid suffix length: {}", s.quote()), + Self::TooSmall(i) => write!(f, "the suffix length needs to be at least {i}"), + Self::ContainsSeparator(s) => write!( + f, + "invalid suffix {}, contains directory separator", + s.quote() + ), + } + } +} + +impl Suffix { + /// Parse the suffix type, start, length and additional suffix from the command-line arguments + /// as well process suffix length auto-widening and auto-width scenarios + /// + /// Suffix auto-widening: Determine if the output file names suffix is allowed to dynamically auto-widen, + /// i.e. change (increase) suffix length dynamically as more files need to be written into. 
+ /// Suffix length auto-widening rules are (in the order they are applied): + /// - ON by default + /// - OFF when suffix start N is specified via long option with a value + /// `--numeric-suffixes=N` or `--hex-suffixes=N` + /// - OFF when suffix length N is specified, except for N=0 (see edge cases below) + /// `-a N` or `--suffix-length=N` + /// - OFF if suffix length is auto pre-calculated (auto-width) + /// + /// Suffix auto-width: Determine if the output file names suffix length should be automatically pre-calculated + /// based on number of files that need to be written into, having number of files known upfront + /// Suffix length auto pre-calculation rules: + /// - Pre-calculate new suffix length when `-n`/`--number` option (N, K/N, l/N, l/K/N, r/N, r/K/N) + /// is used, where N is number of chunks = number of files to write into + /// and suffix start < N number of files + /// as in `split --numeric-suffixes=1 --number=r/100 file` + /// - Do NOT pre-calculate new suffix length otherwise, i.e. when + /// suffix start >= N number of files + /// as in `split --numeric-suffixes=100 --number=r/100 file` + /// OR when suffix length N is specified, except for N=0 (see edge cases below) + /// `-a N` or `--suffix-length=N` + /// + /// Edge case: + /// - If suffix length is specified as 0 in a command line, + /// first apply auto-width calculations and if still 0 + /// set it to default value.
+ /// Do NOT change auto-widening value + /// + pub fn from(matches: &ArgMatches, strategy: &Strategy) -> Result { + let stype: SuffixType; + + // Defaults + let mut start = 0; + let mut auto_widening = true; + let default_length: usize = 2; + + // Check if the user is specifying one or more than one suffix + // Any combination of suffixes is allowed + // Since all suffixes are setup with 'overrides_with_all()' against themselves and each other, + // last one wins, all others are ignored + match ( + matches.contains_id(OPT_NUMERIC_SUFFIXES), + matches.contains_id(OPT_HEX_SUFFIXES), + matches.get_flag(OPT_NUMERIC_SUFFIXES_SHORT), + matches.get_flag(OPT_HEX_SUFFIXES_SHORT), + ) { + (true, _, _, _) => { + stype = SuffixType::Decimal; + // if option was specified, but without value - this will return None as there is no default value + if let Some(opt) = matches.get_one::(OPT_NUMERIC_SUFFIXES) { + start = opt + .parse::() + .map_err(|_| SuffixError::NotParsable(opt.to_string()))?; + auto_widening = false; + } + } + (_, true, _, _) => { + stype = SuffixType::Hexadecimal; + // if option was specified, but without value - this will return None as there is no default value + if let Some(opt) = matches.get_one::(OPT_HEX_SUFFIXES) { + start = usize::from_str_radix(opt, 16) + .map_err(|_| SuffixError::NotParsable(opt.to_string()))?; + auto_widening = false; + } + } + (_, _, true, _) => stype = SuffixType::Decimal, // short numeric suffix '-d' + (_, _, _, true) => stype = SuffixType::Hexadecimal, // short hex suffix '-x' + _ => stype = SuffixType::Alphabetic, // no numeric/hex suffix, using default alphabetic + } + + // Get suffix length and a flag to indicate if it was specified with command line option + let (mut length, is_length_cmd_opt) = + if let Some(v) = matches.get_one::(OPT_SUFFIX_LENGTH) { + // suffix length was specified in command line + ( + v.parse::() + .map_err(|_| SuffixError::NotParsable(v.to_string()))?, + true, + ) + } else { + // no suffix length option 
was specified in command line + // set to default value + (default_length, false) + }; + + // Disable dynamic auto-widening if suffix length was specified in command line with value > 0 + if is_length_cmd_opt && length > 0 { + auto_widening = false; + } + + // Auto pre-calculate new suffix length (auto-width) if necessary + if let Strategy::Number(ref number_type) = strategy { + let chunks = number_type.num_chunks(); + let required_length = ((start as u64 + chunks) as f64) + .log(stype.radix() as f64) + .ceil() as usize; + + if (start as u64) < chunks && !(is_length_cmd_opt && length > 0) { + // with auto-width ON the auto-widening is OFF + auto_widening = false; + + // do not reduce suffix length with auto-width + if length < required_length { + length = required_length; + } + } + + if length < required_length { + return Err(SuffixError::TooSmall(required_length)); + } + } + + // Check edge case when suffix length == 0 was specified in command line + // Set it to default value + if is_length_cmd_opt && length == 0 { + length = default_length; + } + + let additional = matches + .get_one::(OPT_ADDITIONAL_SUFFIX) + .unwrap() + .to_string(); + if additional.contains('/') { + return Err(SuffixError::ContainsSeparator(additional)); + } + + let result = Self { + stype, + length, + start, + auto_widening, + additional, + }; + + Ok(result) + } +} + /// Compute filenames from a given index. /// /// This iterator yields filenames for use with ``split``. /// /// The `prefix` is prepended to each filename and the -/// `additional_suffix1` is appended to each filename. +/// `suffix.additional` is appended to each filename. /// -/// If `suffix_length` is 0, then the variable portion of the filename +/// If `suffix.auto_widening` is true, then the variable portion of the filename /// that identifies the current chunk will have a dynamically -/// increasing width. 
If `suffix_length` is greater than zero, then -/// the variable portion of the filename will always be exactly that +/// increasing width. If `suffix.auto_widening` is false, then +/// the variable portion of the filename will always be exactly `suffix.length` /// width in characters. In that case, after the iterator yields each /// string of that width, the iterator is exhausted. /// -/// Finally, `suffix_type` controls which type of suffix to produce, +/// Finally, `suffix.stype` controls which type of suffix to produce, /// alphabetic or numeric. /// /// # Examples @@ -81,10 +273,14 @@ impl SuffixType { /// use crate::filenames::SuffixType; /// /// let prefix = "chunk_".to_string(); -/// let suffix = ".txt".to_string(); -/// let width = 2; -/// let suffix_type = SuffixType::Alphabetic; -/// let it = FilenameIterator::new(prefix, suffix, width, suffix_type); +/// let suffix = Suffix { +/// stype: SuffixType::Alphabetic, +/// length: 2, +/// start: 0, +/// auto_widening: true, +/// additional: ".txt".to_string(), +/// }; +/// let it = FilenameIterator::new(prefix, suffix); /// /// assert_eq!(it.next().unwrap(), "chunk_aa.txt"); /// assert_eq!(it.next().unwrap(), "chunk_ab.txt"); @@ -98,37 +294,34 @@ impl SuffixType { /// use crate::filenames::SuffixType; /// /// let prefix = "chunk_".to_string(); -/// let suffix = ".txt".to_string(); -/// let width = 2; -/// let suffix_type = SuffixType::Decimal; -/// let it = FilenameIterator::new(prefix, suffix, width, suffix_type); +/// let suffix = Suffix { +/// stype: SuffixType::Decimal, +/// length: 2, +/// start: 0, +/// auto_widening: true, +/// additional: ".txt".to_string(), +/// }; +/// let it = FilenameIterator::new(prefix, suffix); /// /// assert_eq!(it.next().unwrap(), "chunk_00.txt"); /// assert_eq!(it.next().unwrap(), "chunk_01.txt"); /// assert_eq!(it.next().unwrap(), "chunk_02.txt"); /// ``` pub struct FilenameIterator<'a> { - additional_suffix: &'a str, prefix: &'a str, + additional_suffix: &'a str, number: 
Number, first_iteration: bool, } impl<'a> FilenameIterator<'a> { - pub fn new( - prefix: &'a str, - additional_suffix: &'a str, - suffix_length: usize, - suffix_type: SuffixType, - suffix_start: usize, - suffix_auto_widening: bool, - ) -> UResult> { - let radix = suffix_type.radix(); - let number = if suffix_auto_widening { - Number::DynamicWidth(DynamicWidthNumber::new(radix, suffix_start)) + pub fn new(prefix: &'a str, suffix: &'a Suffix) -> UResult> { + let radix = suffix.stype.radix(); + let number = if suffix.auto_widening { + Number::DynamicWidth(DynamicWidthNumber::new(radix, suffix.start)) } else { Number::FixedWidth( - FixedWidthNumber::new(radix, suffix_length, suffix_start).map_err(|_| { + FixedWidthNumber::new(radix, suffix.length, suffix.start).map_err(|_| { USimpleError::new( 1, "numerical suffix start value is too large for the suffix length", @@ -136,6 +329,7 @@ impl<'a> FilenameIterator<'a> { })?, ) }; + let additional_suffix = suffix.additional.as_str(); Ok(FilenameIterator { prefix, @@ -168,46 +362,62 @@ impl<'a> Iterator for FilenameIterator<'a> { mod tests { use crate::filenames::FilenameIterator; + use crate::filenames::Suffix; use crate::filenames::SuffixType; #[test] fn test_filename_iterator_alphabetic_fixed_width() { - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Alphabetic, 0, false).unwrap(); + let suffix = Suffix { + stype: SuffixType::Alphabetic, + length: 2, + start: 0, + auto_widening: false, + additional: ".txt".to_string(), + }; + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_aa.txt"); assert_eq!(it.next().unwrap(), "chunk_ab.txt"); assert_eq!(it.next().unwrap(), "chunk_ac.txt"); - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Alphabetic, 0, false).unwrap(); + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.nth(26 * 26 - 1).unwrap(), "chunk_zz.txt"); assert_eq!(it.next(), None); } #[test] fn 
test_filename_iterator_numeric_fixed_width() { - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Decimal, 0, false).unwrap(); + let suffix = Suffix { + stype: SuffixType::Decimal, + length: 2, + start: 0, + auto_widening: false, + additional: ".txt".to_string(), + }; + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_00.txt"); assert_eq!(it.next().unwrap(), "chunk_01.txt"); assert_eq!(it.next().unwrap(), "chunk_02.txt"); - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Decimal, 0, false).unwrap(); + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.nth(10 * 10 - 1).unwrap(), "chunk_99.txt"); assert_eq!(it.next(), None); } #[test] fn test_filename_iterator_alphabetic_dynamic_width() { - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Alphabetic, 0, true).unwrap(); + let suffix = Suffix { + stype: SuffixType::Alphabetic, + length: 2, + start: 0, + auto_widening: true, + additional: ".txt".to_string(), + }; + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_aa.txt"); assert_eq!(it.next().unwrap(), "chunk_ab.txt"); assert_eq!(it.next().unwrap(), "chunk_ac.txt"); - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Alphabetic, 0, true).unwrap(); + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.nth(26 * 25 - 1).unwrap(), "chunk_yz.txt"); assert_eq!(it.next().unwrap(), "chunk_zaaa.txt"); assert_eq!(it.next().unwrap(), "chunk_zaab.txt"); @@ -215,54 +425,96 @@ mod tests { #[test] fn test_filename_iterator_numeric_dynamic_width() { - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Decimal, 0, true).unwrap(); + let suffix = Suffix { + stype: SuffixType::Decimal, + length: 2, + start: 0, + auto_widening: true, + additional: ".txt".to_string(), + }; + let mut it = FilenameIterator::new("chunk_", 
&suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_00.txt"); assert_eq!(it.next().unwrap(), "chunk_01.txt"); assert_eq!(it.next().unwrap(), "chunk_02.txt"); - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Decimal, 0, true).unwrap(); + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.nth(10 * 9 - 1).unwrap(), "chunk_89.txt"); assert_eq!(it.next().unwrap(), "chunk_9000.txt"); assert_eq!(it.next().unwrap(), "chunk_9001.txt"); } #[test] - fn test_filename_iterator_numeric_suffix_decimal() { - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Decimal, 5, true).unwrap(); + fn test_filename_iterator_numeric_decimal() { + let suffix = Suffix { + stype: SuffixType::Decimal, + length: 2, + start: 5, + auto_widening: true, + additional: ".txt".to_string(), + }; + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_05.txt"); assert_eq!(it.next().unwrap(), "chunk_06.txt"); assert_eq!(it.next().unwrap(), "chunk_07.txt"); } #[test] - fn test_filename_iterator_numeric_suffix_hex() { - let mut it = - FilenameIterator::new("chunk_", ".txt", 2, SuffixType::Hexadecimal, 9, true).unwrap(); + fn test_filename_iterator_numeric_hex() { + let suffix = Suffix { + stype: SuffixType::Hexadecimal, + length: 2, + start: 9, + auto_widening: true, + additional: ".txt".to_string(), + }; + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_09.txt"); assert_eq!(it.next().unwrap(), "chunk_0a.txt"); assert_eq!(it.next().unwrap(), "chunk_0b.txt"); } #[test] - fn test_filename_iterator_numeric_suffix_err() { - let mut it = - FilenameIterator::new("chunk_", ".txt", 3, SuffixType::Decimal, 999, false).unwrap(); + fn test_filename_iterator_numeric_err() { + let suffix = Suffix { + stype: SuffixType::Decimal, + length: 3, + start: 999, + auto_widening: false, + additional: ".txt".to_string(), + }; + let mut it = 
FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_999.txt"); assert!(it.next().is_none()); - let it = FilenameIterator::new("chunk_", ".txt", 3, SuffixType::Decimal, 1000, false); + let suffix = Suffix { + stype: SuffixType::Decimal, + length: 3, + start: 1000, + auto_widening: false, + additional: ".txt".to_string(), + }; + let it = FilenameIterator::new("chunk_", &suffix); assert!(it.is_err()); - let mut it = - FilenameIterator::new("chunk_", ".txt", 3, SuffixType::Hexadecimal, 0xfff, false) - .unwrap(); + let suffix = Suffix { + stype: SuffixType::Hexadecimal, + length: 3, + start: 0xfff, + auto_widening: false, + additional: ".txt".to_string(), + }; + let mut it = FilenameIterator::new("chunk_", &suffix).unwrap(); assert_eq!(it.next().unwrap(), "chunk_fff.txt"); assert!(it.next().is_none()); - let it = FilenameIterator::new("chunk_", ".txt", 3, SuffixType::Hexadecimal, 0x1000, false); + let suffix = Suffix { + stype: SuffixType::Hexadecimal, + length: 3, + start: 0x1000, + auto_widening: false, + additional: ".txt".to_string(), + }; + let it = FilenameIterator::new("chunk_", &suffix); assert!(it.is_err()); } } diff --git a/src/uu/split/src/number.rs b/src/uu/split/src/number.rs index a01701c80e3..6312d0a3fa6 100644 --- a/src/uu/split/src/number.rs +++ b/src/uu/split/src/number.rs @@ -2,7 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore zaaa zaab +// spell-checker:ignore zaaa zaab feff //! A number in arbitrary radix expressed in a positional notation. //! //! 
Use the [`Number`] enum to represent an arbitrary number in an diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 4282f1433e8..4b5cd920715 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -8,9 +8,10 @@ mod filenames; mod number; mod platform; +mod strategy; -use crate::filenames::FilenameIterator; -use crate::filenames::SuffixType; +use crate::filenames::{FilenameIterator, Suffix, SuffixError}; +use crate::strategy::{NumberType, Strategy, StrategyError}; use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command, ValueHint}; use std::env; use std::ffi::OsString; @@ -22,7 +23,7 @@ use std::path::Path; use std::u64; use uucore::display::Quotable; use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError}; -use uucore::parse_size::{parse_size_u64, parse_size_u64_max, ParseSizeError}; + use uucore::uio_error; use uucore::{format_usage, help_about, help_section, help_usage}; @@ -37,8 +38,6 @@ static OPT_NUMERIC_SUFFIXES_SHORT: &str = "-d"; static OPT_HEX_SUFFIXES: &str = "hex-suffixes"; static OPT_HEX_SUFFIXES_SHORT: &str = "-x"; static OPT_SUFFIX_LENGTH: &str = "suffix-length"; -// If no suffix length is specified, default to "2" characters following GNU split behavior -static OPT_DEFAULT_SUFFIX_LENGTH: &str = "2"; static OPT_VERBOSE: &str = "verbose"; static OPT_SEPARATOR: &str = "separator"; //The ---io and ---io-blksize parameters are consumed and ignored. @@ -357,7 +356,6 @@ pub fn uu_app() -> Command { .long(OPT_SUFFIX_LENGTH) .allow_hyphen_values(true) .value_name("N") - .default_value(OPT_DEFAULT_SUFFIX_LENGTH) .help("generate suffixes of length N (default 2)"), ) .arg( @@ -398,418 +396,13 @@ pub fn uu_app() -> Command { ) } -/// Sub-strategy to use when splitting a file into a specific number of chunks. -#[derive(Debug, PartialEq)] -enum NumberType { - /// Split into a specific number of chunks by byte. 
- Bytes(u64), - - /// Split into a specific number of chunks by byte - /// but output only the *k*th chunk. - KthBytes(u64, u64), - - /// Split into a specific number of chunks by line (approximately). - Lines(u64), - - /// Split into a specific number of chunks by line - /// (approximately), but output only the *k*th chunk. - KthLines(u64, u64), - - /// Assign lines via round-robin to the specified number of output chunks. - RoundRobin(u64), - - /// Assign lines via round-robin to the specified number of output - /// chunks, but output only the *k*th chunk. - KthRoundRobin(u64, u64), -} - -impl NumberType { - /// The number of chunks for this number type. - fn num_chunks(&self) -> u64 { - match self { - Self::Bytes(n) => *n, - Self::KthBytes(_, n) => *n, - Self::Lines(n) => *n, - Self::KthLines(_, n) => *n, - Self::RoundRobin(n) => *n, - Self::KthRoundRobin(_, n) => *n, - } - } -} - -/// An error due to an invalid parameter to the `-n` command-line option. -#[derive(Debug, PartialEq)] -enum NumberTypeError { - /// The number of chunks was invalid. - /// - /// This can happen if the value of `N` in any of the following - /// command-line options is not a positive integer: - /// - /// ```ignore - /// -n N - /// -n K/N - /// -n l/N - /// -n l/K/N - /// -n r/N - /// -n r/K/N - /// ``` - NumberOfChunks(String), - - /// The chunk number was invalid. - /// - /// This can happen if the value of `K` in any of the following - /// command-line options is not a positive integer - /// or if `K` is 0 - /// or if `K` is greater than `N`: - /// - /// ```ignore - /// -n K/N - /// -n l/K/N - /// -n r/K/N - /// ``` - ChunkNumber(String), -} - -impl NumberType { - /// Parse a `NumberType` from a string. - /// - /// The following strings are valid arguments: - /// - /// ```ignore - /// "N" - /// "K/N" - /// "l/N" - /// "l/K/N" - /// "r/N" - /// "r/K/N" - /// ``` - /// - /// The `N` represents the number of chunks and the `K` represents - /// a chunk number. 
- /// - /// # Errors - /// - /// If the string is not one of the valid number types, - /// if `K` is not a nonnegative integer, - /// or if `K` is 0, - /// or if `N` is not a positive integer, - /// or if `K` is greater than `N` - /// then this function returns [`NumberTypeError`]. - fn from(s: &str) -> Result { - fn is_invalid_chunk(chunk_number: u64, num_chunks: u64) -> bool { - chunk_number > num_chunks || chunk_number == 0 - } - let parts: Vec<&str> = s.split('/').collect(); - match &parts[..] { - [n_str] => { - let num_chunks = parse_size_u64(n_str) - .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - if num_chunks > 0 { - Ok(Self::Bytes(num_chunks)) - } else { - Err(NumberTypeError::NumberOfChunks(s.to_string())) - } - } - [k_str, n_str] if !k_str.starts_with('l') && !k_str.starts_with('r') => { - let num_chunks = parse_size_u64(n_str) - .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - let chunk_number = parse_size_u64(k_str) - .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; - if is_invalid_chunk(chunk_number, num_chunks) { - return Err(NumberTypeError::ChunkNumber(k_str.to_string())); - } - Ok(Self::KthBytes(chunk_number, num_chunks)) - } - ["l", n_str] => { - let num_chunks = parse_size_u64(n_str) - .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - Ok(Self::Lines(num_chunks)) - } - ["l", k_str, n_str] => { - let num_chunks = parse_size_u64(n_str) - .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - let chunk_number = parse_size_u64(k_str) - .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; - if is_invalid_chunk(chunk_number, num_chunks) { - return Err(NumberTypeError::ChunkNumber(k_str.to_string())); - } - Ok(Self::KthLines(chunk_number, num_chunks)) - } - ["r", n_str] => { - let num_chunks = parse_size_u64(n_str) - .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - Ok(Self::RoundRobin(num_chunks)) - } - ["r", k_str, n_str] => { - let 
num_chunks = parse_size_u64(n_str) - .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - let chunk_number = parse_size_u64(k_str) - .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; - if is_invalid_chunk(chunk_number, num_chunks) { - return Err(NumberTypeError::ChunkNumber(k_str.to_string())); - } - Ok(Self::KthRoundRobin(chunk_number, num_chunks)) - } - _ => Err(NumberTypeError::NumberOfChunks(s.to_string())), - } - } -} - -/// The strategy for breaking up the input file into chunks. -enum Strategy { - /// Each chunk has the specified number of lines. - Lines(u64), - - /// Each chunk has the specified number of bytes. - Bytes(u64), - - /// Each chunk has as many lines as possible without exceeding the - /// specified number of bytes. - LineBytes(u64), - - /// Split the file into this many chunks. - /// - /// There are several sub-strategies available, as defined by - /// [`NumberType`]. - Number(NumberType), -} - -/// An error when parsing a chunking strategy from command-line arguments. -enum StrategyError { - /// Invalid number of lines. - Lines(ParseSizeError), - - /// Invalid number of bytes. - Bytes(ParseSizeError), - - /// Invalid number type. - NumberType(NumberTypeError), - - /// Multiple chunking strategies were specified (but only one should be). - MultipleWays, -} - -impl fmt::Display for StrategyError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::Lines(e) => write!(f, "invalid number of lines: {e}"), - Self::Bytes(e) => write!(f, "invalid number of bytes: {e}"), - Self::NumberType(NumberTypeError::NumberOfChunks(s)) => { - write!(f, "invalid number of chunks: {s}") - } - Self::NumberType(NumberTypeError::ChunkNumber(s)) => { - write!(f, "invalid chunk number: {s}") - } - Self::MultipleWays => write!(f, "cannot split in more than one way"), - } - } -} - -impl Strategy { - /// Parse a strategy from the command-line arguments. 
- fn from(matches: &ArgMatches, obs_lines: &Option) -> Result { - fn get_and_parse( - matches: &ArgMatches, - option: &str, - strategy: fn(u64) -> Strategy, - error: fn(ParseSizeError) -> StrategyError, - ) -> Result { - let s = matches.get_one::(option).unwrap(); - let n = parse_size_u64_max(s).map_err(error)?; - if n > 0 { - Ok(strategy(n)) - } else { - Err(error(ParseSizeError::ParseFailure(s.to_string()))) - } - } - // Check that the user is not specifying more than one strategy. - // - // Note: right now, this exact behavior cannot be handled by - // overrides_with_all() due to obsolete lines value option - match ( - obs_lines, - matches.value_source(OPT_LINES) == Some(ValueSource::CommandLine), - matches.value_source(OPT_BYTES) == Some(ValueSource::CommandLine), - matches.value_source(OPT_LINE_BYTES) == Some(ValueSource::CommandLine), - matches.value_source(OPT_NUMBER) == Some(ValueSource::CommandLine), - ) { - (Some(v), false, false, false, false) => { - let v = parse_size_u64_max(v).map_err(|_| { - StrategyError::Lines(ParseSizeError::ParseFailure(v.to_string())) - })?; - if v > 0 { - Ok(Self::Lines(v)) - } else { - Err(StrategyError::Lines(ParseSizeError::ParseFailure( - v.to_string(), - ))) - } - } - (None, false, false, false, false) => Ok(Self::Lines(1000)), - (None, true, false, false, false) => { - get_and_parse(matches, OPT_LINES, Self::Lines, StrategyError::Lines) - } - (None, false, true, false, false) => { - get_and_parse(matches, OPT_BYTES, Self::Bytes, StrategyError::Bytes) - } - (None, false, false, true, false) => get_and_parse( - matches, - OPT_LINE_BYTES, - Self::LineBytes, - StrategyError::Bytes, - ), - (None, false, false, false, true) => { - let s = matches.get_one::(OPT_NUMBER).unwrap(); - let number_type = NumberType::from(s).map_err(StrategyError::NumberType)?; - Ok(Self::Number(number_type)) - } - _ => Err(StrategyError::MultipleWays), - } - } -} - -/// Parse the suffix type, start and length from the command-line arguments -/// as 
well suffix length auto-widening and auto-width scenarios -/// -/// Suffix auto-widening: Determine if the output file names suffix is allowed to dynamically auto-widen, -/// i.e. change (increase) suffix length dynamically as more files need to be written into. -/// Suffix length auto-widening rules are (in the order they are applied): -/// - ON by default -/// - OFF when suffix start N is specified via long option with a value -/// `--numeric-suffixes=N` or `--hex-suffixes=N` -/// - OFF when suffix length N is specified, except for N=0 (see edge cases below) -/// `-a N` or `--suffix-length=N` -/// - OFF if suffix length is auto pre-calculated (auto-width) -/// -/// Suffix auto-width: Determine if the the output file names suffix length should be automatically pre-calculated -/// based on number of files that need to written into, having number of files known upfront -/// Suffix length auto pre-calculation rules: -/// - Pre-calculate new suffix length when `-n`/`--number` option (N, K/N, l/N, l/K/N, r/N, r/K/N) -/// is used, where N is number of chunks = number of files to write into -/// and suffix start < N number of files -/// as in `split --numeric-suffixes=1 --number=r/100 file` -/// - Do NOT pre-calculate new suffix length otherwise, i.e. 
when -/// suffix start >= N number of files -/// as in `split --numeric-suffixes=100 --number=r/100 file` -/// OR when suffix length N is specified, except for N=0 (see edge cases below) -/// `-a N` or `--suffix-length=N` -/// -/// Edge case: -/// - If suffix length is specified as 0 AND `-n`/`--number` option used specifying number of files: -/// set auto widening OFF AND auto pre-calculate required suffix length based on number of files needed -/// - If suffix length is specified as 0 in any other situation -/// keep auto widening ON and suffix length to default value -/// -fn suffix_from( - matches: &ArgMatches, - strategy: &Strategy, -) -> Result<(SuffixType, usize, bool, usize), SettingsError> { - let suffix_type: SuffixType; - - // Defaults - let mut suffix_start = 0; - let mut suffix_auto_widening = true; - - // Check if the user is specifying one or more than one suffix - // Any combination of suffixes is allowed - // Since all suffixes are setup with 'overrides_with_all()' against themselves and each other, - // last one wins, all others are ignored - match ( - matches.contains_id(OPT_NUMERIC_SUFFIXES), - matches.contains_id(OPT_HEX_SUFFIXES), - matches.get_flag(OPT_NUMERIC_SUFFIXES_SHORT), - matches.get_flag(OPT_HEX_SUFFIXES_SHORT), - ) { - (true, _, _, _) => { - suffix_type = SuffixType::Decimal; - let opt = matches.get_one::(OPT_NUMERIC_SUFFIXES); // if option was specified, but without value - this will return None as there is no default value - if opt.is_some() { - suffix_start = opt - .unwrap() - .parse::() - .map_err(|_| SettingsError::SuffixNotParsable(opt.unwrap().to_string()))?; - suffix_auto_widening = false; - } - } - (_, true, _, _) => { - suffix_type = SuffixType::Hexadecimal; - let opt = matches.get_one::(OPT_HEX_SUFFIXES); // if option was specified, but without value - this will return None as there is no default value - if opt.is_some() { - suffix_start = usize::from_str_radix(opt.unwrap(), 16) - .map_err(|_| 
SettingsError::SuffixNotParsable(opt.unwrap().to_string()))?; - suffix_auto_widening = false; - } - } - (_, _, true, _) => suffix_type = SuffixType::Decimal, // short numeric suffix '-d' - (_, _, _, true) => suffix_type = SuffixType::Hexadecimal, // short hex suffix '-x' - _ => suffix_type = SuffixType::Alphabetic, // no numeric/hex suffix, using default alphabetic - } - - // Get suffix length (could be coming from command line of default value) - let suffix_length_str = matches.get_one::(OPT_SUFFIX_LENGTH).unwrap(); // safe to unwrap here as there is default value for this option - let mut suffix_length: usize = suffix_length_str - .parse() - .map_err(|_| SettingsError::SuffixNotParsable(suffix_length_str.to_string()))?; - - // Disable dynamic auto-widening if suffix length was specified in command line with value > 0 - if matches.value_source(OPT_SUFFIX_LENGTH) == Some(ValueSource::CommandLine) - && suffix_length > 0 - { - suffix_auto_widening = false; - } - - // Auto pre-calculate new suffix length (auto-width) if necessary - if let Strategy::Number(ref number_type) = strategy { - let chunks = number_type.num_chunks(); - let required_suffix_length = ((suffix_start as u64 + chunks) as f64) - .log(suffix_type.radix() as f64) - .ceil() as usize; - - if (suffix_start as u64) < chunks - && !(matches.value_source(OPT_SUFFIX_LENGTH) == Some(ValueSource::CommandLine) - && suffix_length > 0) - { - // with auto-width ON the auto-widening is OFF - suffix_auto_widening = false; - - // do not reduce suffix length with auto-width - if suffix_length < required_suffix_length { - suffix_length = required_suffix_length; - } - } - - if suffix_length < required_suffix_length { - return Err(SettingsError::SuffixTooSmall(required_suffix_length)); - } - } - - // Check suffix length == 0 edge case - // If it is still 0 at this point, then auto-width pre-calculation did not apply - // So, set it to default value and keep auto-widening ON - if suffix_length == 0 { - suffix_length = 
OPT_DEFAULT_SUFFIX_LENGTH.parse().unwrap(); - } - - Ok(( - suffix_type, - suffix_start, - suffix_auto_widening, - suffix_length, - )) -} - /// Parameters that control how a file gets split. /// /// You can convert an [`ArgMatches`] instance into a [`Settings`] /// instance by calling [`Settings::from`]. struct Settings { prefix: String, - suffix_type: SuffixType, - suffix_length: usize, - suffix_start: usize, - /// Whether or not suffix length should automatically widen - suffix_auto_widening: bool, - additional_suffix: String, + suffix: Suffix, input: String, /// When supplied, a shell command to output to instead of xaa, xab … filter: Option, @@ -834,13 +427,7 @@ enum SettingsError { Strategy(StrategyError), /// Invalid suffix length parameter. - SuffixNotParsable(String), - - /// Suffix contains a directory separator, which is not allowed. - SuffixContainsSeparator(String), - - /// Suffix is not large enough to split into specified chunks - SuffixTooSmall(usize), + Suffix(SuffixError), /// Multi-character (Invalid) separator MultiCharacterSeparator(String), @@ -864,7 +451,8 @@ impl SettingsError { fn requires_usage(&self) -> bool { matches!( self, - Self::Strategy(StrategyError::MultipleWays) | Self::SuffixContainsSeparator(_) + Self::Strategy(StrategyError::MultipleWays) + | Self::Suffix(SuffixError::ContainsSeparator(_)) ) } } @@ -873,19 +461,13 @@ impl fmt::Display for SettingsError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::Strategy(e) => e.fmt(f), - Self::SuffixNotParsable(s) => write!(f, "invalid suffix length: {}", s.quote()), - Self::SuffixTooSmall(i) => write!(f, "the suffix length needs to be at least {i}"), + Self::Suffix(e) => e.fmt(f), Self::MultiCharacterSeparator(s) => { write!(f, "multi-character separator {}", s.quote()) } Self::MultipleSeparatorCharacters => { write!(f, "multiple separator characters specified") } - Self::SuffixContainsSeparator(s) => write!( - f, - "invalid suffix {}, contains directory 
separator", - s.quote() - ), Self::FilterWithKthChunkNumber => { write!(f, "--filter does not process a chunk extracted to stdout") } @@ -901,16 +483,8 @@ impl fmt::Display for SettingsError { impl Settings { /// Parse a strategy from the command-line arguments. fn from(matches: &ArgMatches, obs_lines: &Option) -> Result { - let additional_suffix = matches - .get_one::(OPT_ADDITIONAL_SUFFIX) - .unwrap() - .to_string(); - if additional_suffix.contains('/') { - return Err(SettingsError::SuffixContainsSeparator(additional_suffix)); - } let strategy = Strategy::from(matches, obs_lines).map_err(SettingsError::Strategy)?; - let (suffix_type, suffix_start, suffix_auto_widening, suffix_length) = - suffix_from(matches, &strategy)?; + let suffix = Suffix::from(matches, &strategy).map_err(SettingsError::Suffix)?; // Make sure that separator is only one UTF8 character (if specified) // defaults to '\n' - newline character @@ -932,17 +506,13 @@ impl Settings { }; let result = Self { - suffix_length, - suffix_type, - suffix_start, - suffix_auto_widening, - additional_suffix, - verbose: matches.value_source(OPT_VERBOSE) == Some(ValueSource::CommandLine), - separator, - strategy, - input: matches.get_one::(ARG_INPUT).unwrap().to_owned(), prefix: matches.get_one::(ARG_PREFIX).unwrap().to_owned(), + suffix, + input: matches.get_one::(ARG_INPUT).unwrap().to_owned(), filter: matches.get_one::(OPT_FILTER).map(|s| s.to_owned()), + strategy, + verbose: matches.value_source(OPT_VERBOSE) == Some(ValueSource::CommandLine), + separator, elide_empty_files: matches.get_flag(OPT_ELIDE_EMPTY_FILES), }; @@ -1059,14 +629,7 @@ struct ByteChunkWriter<'a> { impl<'a> ByteChunkWriter<'a> { fn new(chunk_size: u64, settings: &'a Settings) -> UResult> { - let mut filename_iterator = FilenameIterator::new( - &settings.prefix, - &settings.additional_suffix, - settings.suffix_length, - settings.suffix_type, - settings.suffix_start, - settings.suffix_auto_widening, - )?; + let mut filename_iterator = 
FilenameIterator::new(&settings.prefix, &settings.suffix)?; let filename = filename_iterator .next() .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?; @@ -1190,14 +753,7 @@ struct LineChunkWriter<'a> { impl<'a> LineChunkWriter<'a> { fn new(chunk_size: u64, settings: &'a Settings) -> UResult> { - let mut filename_iterator = FilenameIterator::new( - &settings.prefix, - &settings.additional_suffix, - settings.suffix_length, - settings.suffix_type, - settings.suffix_start, - settings.suffix_auto_widening, - )?; + let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?; let filename = filename_iterator .next() .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?; @@ -1304,14 +860,7 @@ struct LineBytesChunkWriter<'a> { impl<'a> LineBytesChunkWriter<'a> { fn new(chunk_size: u64, settings: &'a Settings) -> UResult> { - let mut filename_iterator = FilenameIterator::new( - &settings.prefix, - &settings.additional_suffix, - settings.suffix_length, - settings.suffix_type, - settings.suffix_start, - settings.suffix_auto_widening, - )?; + let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?; let filename = filename_iterator .next() .ok_or_else(|| USimpleError::new(1, "output file suffixes exhausted"))?; @@ -1528,14 +1077,7 @@ where .map_err(|_| USimpleError::new(1, "Number of chunks too big"))?; // This object is responsible for creating the filename for each chunk. - let mut filename_iterator = FilenameIterator::new( - &settings.prefix, - &settings.additional_suffix, - settings.suffix_length, - settings.suffix_type, - settings.suffix_start, - settings.suffix_auto_widening, - )?; + let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?; // Create one writer for each chunk. This will create each // of the underlying files (if not in `--filter` mode). 
@@ -1700,14 +1242,7 @@ where let chunk_size = (num_bytes / num_chunks) as usize; // This object is responsible for creating the filename for each chunk. - let mut filename_iterator = FilenameIterator::new( - &settings.prefix, - &settings.additional_suffix, - settings.suffix_length, - settings.suffix_type, - settings.suffix_start, - settings.suffix_auto_widening, - )?; + let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix)?; // Create one writer for each chunk. This will create each // of the underlying files (if not in `--filter` mode). @@ -1842,15 +1377,8 @@ where R: BufRead, { // This object is responsible for creating the filename for each chunk. - let mut filename_iterator = FilenameIterator::new( - &settings.prefix, - &settings.additional_suffix, - settings.suffix_length, - settings.suffix_type, - settings.suffix_start, - settings.suffix_auto_widening, - ) - .map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?; + let mut filename_iterator = FilenameIterator::new(&settings.prefix, &settings.suffix) + .map_err(|e| io::Error::new(ErrorKind::Other, format!("{e}")))?; // Create one writer for each chunk. This will create each // of the underlying files (if not in `--filter` mode). 
@@ -2027,101 +1555,3 @@ fn split(settings: &Settings) -> UResult<()> { } } } - -#[cfg(test)] -mod tests { - - use crate::NumberType; - use crate::NumberTypeError; - - #[test] - fn test_number_type_from() { - assert_eq!(NumberType::from("123").unwrap(), NumberType::Bytes(123)); - assert_eq!(NumberType::from("l/123").unwrap(), NumberType::Lines(123)); - assert_eq!( - NumberType::from("l/123/456").unwrap(), - NumberType::KthLines(123, 456) - ); - assert_eq!( - NumberType::from("r/123").unwrap(), - NumberType::RoundRobin(123) - ); - assert_eq!( - NumberType::from("r/123/456").unwrap(), - NumberType::KthRoundRobin(123, 456) - ); - } - - #[test] - #[allow(clippy::cognitive_complexity)] - fn test_number_type_from_error() { - assert_eq!( - NumberType::from("xyz").unwrap_err(), - NumberTypeError::NumberOfChunks("xyz".to_string()) - ); - assert_eq!( - NumberType::from("l/xyz").unwrap_err(), - NumberTypeError::NumberOfChunks("xyz".to_string()) - ); - assert_eq!( - NumberType::from("l/123/xyz").unwrap_err(), - NumberTypeError::NumberOfChunks("xyz".to_string()) - ); - assert_eq!( - NumberType::from("l/abc/456").unwrap_err(), - NumberTypeError::ChunkNumber("abc".to_string()) - ); - assert_eq!( - NumberType::from("l/456/123").unwrap_err(), - NumberTypeError::ChunkNumber("456".to_string()) - ); - assert_eq!( - NumberType::from("r/456/123").unwrap_err(), - NumberTypeError::ChunkNumber("456".to_string()) - ); - assert_eq!( - NumberType::from("456/123").unwrap_err(), - NumberTypeError::ChunkNumber("456".to_string()) - ); - // In GNU split, the number of chunks get precedence: - // - // $ split -n l/abc/xyz - // split: invalid number of chunks: ‘xyz’ - // - assert_eq!( - NumberType::from("l/abc/xyz").unwrap_err(), - NumberTypeError::NumberOfChunks("xyz".to_string()) - ); - assert_eq!( - NumberType::from("r/xyz").unwrap_err(), - NumberTypeError::NumberOfChunks("xyz".to_string()) - ); - assert_eq!( - NumberType::from("r/123/xyz").unwrap_err(), - 
NumberTypeError::NumberOfChunks("xyz".to_string()) - ); - assert_eq!( - NumberType::from("r/abc/456").unwrap_err(), - NumberTypeError::ChunkNumber("abc".to_string()) - ); - // In GNU split, the number of chunks get precedence: - // - // $ split -n r/abc/xyz - // split: invalid number of chunks: ‘xyz’ - // - assert_eq!( - NumberType::from("r/abc/xyz").unwrap_err(), - NumberTypeError::NumberOfChunks("xyz".to_string()) - ); - } - - #[test] - fn test_number_type_num_chunks() { - assert_eq!(NumberType::from("123").unwrap().num_chunks(), 123); - assert_eq!(NumberType::from("123/456").unwrap().num_chunks(), 456); - assert_eq!(NumberType::from("l/123").unwrap().num_chunks(), 123); - assert_eq!(NumberType::from("l/123/456").unwrap().num_chunks(), 456); - assert_eq!(NumberType::from("r/123").unwrap().num_chunks(), 123); - assert_eq!(NumberType::from("r/123/456").unwrap().num_chunks(), 456); - } -} diff --git a/src/uu/split/src/strategy.rs b/src/uu/split/src/strategy.rs new file mode 100644 index 00000000000..e85abcee58b --- /dev/null +++ b/src/uu/split/src/strategy.rs @@ -0,0 +1,379 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. +//! Determine the strategy for breaking up the input (file or stdin) into chunks +//! based on the command line options + +use crate::{OPT_BYTES, OPT_LINES, OPT_LINE_BYTES, OPT_NUMBER}; +use clap::{parser::ValueSource, ArgMatches}; +use std::fmt; +use uucore::parse_size::{parse_size_u64, parse_size_u64_max, ParseSizeError}; + +/// Sub-strategy of the [`Strategy::Number`] +/// Splitting a file into a specific number of chunks. +#[derive(Debug, PartialEq)] +pub enum NumberType { + /// Split into a specific number of chunks by byte. + Bytes(u64), + + /// Split into a specific number of chunks by byte + /// but output only the *k*th chunk. 
+ KthBytes(u64, u64), + + /// Split into a specific number of chunks by line (approximately). + Lines(u64), + + /// Split into a specific number of chunks by line + /// (approximately), but output only the *k*th chunk. + KthLines(u64, u64), + + /// Assign lines via round-robin to the specified number of output chunks. + RoundRobin(u64), + + /// Assign lines via round-robin to the specified number of output + /// chunks, but output only the *k*th chunk. + KthRoundRobin(u64, u64), +} + +impl NumberType { + /// The number of chunks for this number type. + pub fn num_chunks(&self) -> u64 { + match self { + Self::Bytes(n) => *n, + Self::KthBytes(_, n) => *n, + Self::Lines(n) => *n, + Self::KthLines(_, n) => *n, + Self::RoundRobin(n) => *n, + Self::KthRoundRobin(_, n) => *n, + } + } +} + +/// An error due to an invalid parameter to the `-n` command-line option. +#[derive(Debug, PartialEq)] +pub enum NumberTypeError { + /// The number of chunks was invalid. + /// + /// This can happen if the value of `N` in any of the following + /// command-line options is not a positive integer: + /// + /// ```ignore + /// -n N + /// -n K/N + /// -n l/N + /// -n l/K/N + /// -n r/N + /// -n r/K/N + /// ``` + NumberOfChunks(String), + + /// The chunk number was invalid. + /// + /// This can happen if the value of `K` in any of the following + /// command-line options is not a positive integer + /// or if `K` is 0 + /// or if `K` is greater than `N`: + /// + /// ```ignore + /// -n K/N + /// -n l/K/N + /// -n r/K/N + /// ``` + ChunkNumber(String), +} + +impl NumberType { + /// Parse a `NumberType` from a string. + /// + /// The following strings are valid arguments: + /// + /// ```ignore + /// "N" + /// "K/N" + /// "l/N" + /// "l/K/N" + /// "r/N" + /// "r/K/N" + /// ``` + /// + /// The `N` represents the number of chunks and the `K` represents + /// a chunk number. 
+ /// + /// # Errors + /// + /// If the string is not one of the valid number types, + /// if `K` is not a nonnegative integer, + /// or if `K` is 0, + /// or if `N` is not a positive integer, + /// or if `K` is greater than `N` + /// then this function returns [`NumberTypeError`]. + fn from(s: &str) -> Result { + fn is_invalid_chunk(chunk_number: u64, num_chunks: u64) -> bool { + chunk_number > num_chunks || chunk_number == 0 + } + let parts: Vec<&str> = s.split('/').collect(); + match &parts[..] { + [n_str] => { + let num_chunks = parse_size_u64(n_str) + .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; + if num_chunks > 0 { + Ok(Self::Bytes(num_chunks)) + } else { + Err(NumberTypeError::NumberOfChunks(s.to_string())) + } + } + [k_str, n_str] if !k_str.starts_with('l') && !k_str.starts_with('r') => { + let num_chunks = parse_size_u64(n_str) + .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; + let chunk_number = parse_size_u64(k_str) + .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; + if is_invalid_chunk(chunk_number, num_chunks) { + return Err(NumberTypeError::ChunkNumber(k_str.to_string())); + } + Ok(Self::KthBytes(chunk_number, num_chunks)) + } + ["l", n_str] => { + let num_chunks = parse_size_u64(n_str) + .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; + Ok(Self::Lines(num_chunks)) + } + ["l", k_str, n_str] => { + let num_chunks = parse_size_u64(n_str) + .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; + let chunk_number = parse_size_u64(k_str) + .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; + if is_invalid_chunk(chunk_number, num_chunks) { + return Err(NumberTypeError::ChunkNumber(k_str.to_string())); + } + Ok(Self::KthLines(chunk_number, num_chunks)) + } + ["r", n_str] => { + let num_chunks = parse_size_u64(n_str) + .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; + Ok(Self::RoundRobin(num_chunks)) + } + ["r", k_str, n_str] => { + let 
num_chunks = parse_size_u64(n_str) + .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; + let chunk_number = parse_size_u64(k_str) + .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; + if is_invalid_chunk(chunk_number, num_chunks) { + return Err(NumberTypeError::ChunkNumber(k_str.to_string())); + } + Ok(Self::KthRoundRobin(chunk_number, num_chunks)) + } + _ => Err(NumberTypeError::NumberOfChunks(s.to_string())), + } + } +} + +/// The strategy for breaking up the input file into chunks. +pub enum Strategy { + /// Each chunk has the specified number of lines. + Lines(u64), + + /// Each chunk has the specified number of bytes. + Bytes(u64), + + /// Each chunk has as many lines as possible without exceeding the + /// specified number of bytes. + LineBytes(u64), + + /// Split the file into this many chunks. + /// + /// There are several sub-strategies available, as defined by + /// [`NumberType`]. + Number(NumberType), +} + +/// An error when parsing a chunking strategy from command-line arguments. +pub enum StrategyError { + /// Invalid number of lines. + Lines(ParseSizeError), + + /// Invalid number of bytes. + Bytes(ParseSizeError), + + /// Invalid number type. + NumberType(NumberTypeError), + + /// Multiple chunking strategies were specified (but only one should be). + MultipleWays, +} + +impl fmt::Display for StrategyError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Lines(e) => write!(f, "invalid number of lines: {e}"), + Self::Bytes(e) => write!(f, "invalid number of bytes: {e}"), + Self::NumberType(NumberTypeError::NumberOfChunks(s)) => { + write!(f, "invalid number of chunks: {s}") + } + Self::NumberType(NumberTypeError::ChunkNumber(s)) => { + write!(f, "invalid chunk number: {s}") + } + Self::MultipleWays => write!(f, "cannot split in more than one way"), + } + } +} + +impl Strategy { + /// Parse a strategy from the command-line arguments. 
+ pub fn from(matches: &ArgMatches, obs_lines: &Option) -> Result { + fn get_and_parse( + matches: &ArgMatches, + option: &str, + strategy: fn(u64) -> Strategy, + error: fn(ParseSizeError) -> StrategyError, + ) -> Result { + let s = matches.get_one::(option).unwrap(); + let n = parse_size_u64_max(s).map_err(error)?; + if n > 0 { + Ok(strategy(n)) + } else { + Err(error(ParseSizeError::ParseFailure(s.to_string()))) + } + } + // Check that the user is not specifying more than one strategy. + // + // Note: right now, this exact behavior cannot be handled by + // overrides_with_all() due to obsolete lines value option + match ( + obs_lines, + matches.value_source(OPT_LINES) == Some(ValueSource::CommandLine), + matches.value_source(OPT_BYTES) == Some(ValueSource::CommandLine), + matches.value_source(OPT_LINE_BYTES) == Some(ValueSource::CommandLine), + matches.value_source(OPT_NUMBER) == Some(ValueSource::CommandLine), + ) { + (Some(v), false, false, false, false) => { + let v = parse_size_u64_max(v).map_err(|_| { + StrategyError::Lines(ParseSizeError::ParseFailure(v.to_string())) + })?; + if v > 0 { + Ok(Self::Lines(v)) + } else { + Err(StrategyError::Lines(ParseSizeError::ParseFailure( + v.to_string(), + ))) + } + } + (None, false, false, false, false) => Ok(Self::Lines(1000)), + (None, true, false, false, false) => { + get_and_parse(matches, OPT_LINES, Self::Lines, StrategyError::Lines) + } + (None, false, true, false, false) => { + get_and_parse(matches, OPT_BYTES, Self::Bytes, StrategyError::Bytes) + } + (None, false, false, true, false) => get_and_parse( + matches, + OPT_LINE_BYTES, + Self::LineBytes, + StrategyError::Bytes, + ), + (None, false, false, false, true) => { + let s = matches.get_one::(OPT_NUMBER).unwrap(); + let number_type = NumberType::from(s).map_err(StrategyError::NumberType)?; + Ok(Self::Number(number_type)) + } + _ => Err(StrategyError::MultipleWays), + } + } +} + +#[cfg(test)] +mod tests { + + use crate::{strategy::NumberType, 
strategy::NumberTypeError}; + + #[test] + fn test_number_type_from() { + assert_eq!(NumberType::from("123").unwrap(), NumberType::Bytes(123)); + assert_eq!(NumberType::from("l/123").unwrap(), NumberType::Lines(123)); + assert_eq!( + NumberType::from("l/123/456").unwrap(), + NumberType::KthLines(123, 456) + ); + assert_eq!( + NumberType::from("r/123").unwrap(), + NumberType::RoundRobin(123) + ); + assert_eq!( + NumberType::from("r/123/456").unwrap(), + NumberType::KthRoundRobin(123, 456) + ); + } + + #[test] + #[allow(clippy::cognitive_complexity)] + fn test_number_type_from_error() { + assert_eq!( + NumberType::from("xyz").unwrap_err(), + NumberTypeError::NumberOfChunks("xyz".to_string()) + ); + assert_eq!( + NumberType::from("l/xyz").unwrap_err(), + NumberTypeError::NumberOfChunks("xyz".to_string()) + ); + assert_eq!( + NumberType::from("l/123/xyz").unwrap_err(), + NumberTypeError::NumberOfChunks("xyz".to_string()) + ); + assert_eq!( + NumberType::from("l/abc/456").unwrap_err(), + NumberTypeError::ChunkNumber("abc".to_string()) + ); + assert_eq!( + NumberType::from("l/456/123").unwrap_err(), + NumberTypeError::ChunkNumber("456".to_string()) + ); + assert_eq!( + NumberType::from("r/456/123").unwrap_err(), + NumberTypeError::ChunkNumber("456".to_string()) + ); + assert_eq!( + NumberType::from("456/123").unwrap_err(), + NumberTypeError::ChunkNumber("456".to_string()) + ); + // In GNU split, the number of chunks get precedence: + // + // $ split -n l/abc/xyz + // split: invalid number of chunks: ‘xyz’ + // + assert_eq!( + NumberType::from("l/abc/xyz").unwrap_err(), + NumberTypeError::NumberOfChunks("xyz".to_string()) + ); + assert_eq!( + NumberType::from("r/xyz").unwrap_err(), + NumberTypeError::NumberOfChunks("xyz".to_string()) + ); + assert_eq!( + NumberType::from("r/123/xyz").unwrap_err(), + NumberTypeError::NumberOfChunks("xyz".to_string()) + ); + assert_eq!( + NumberType::from("r/abc/456").unwrap_err(), + NumberTypeError::ChunkNumber("abc".to_string()) + ); + // 
In GNU split, the number of chunks get precedence: + // + // $ split -n r/abc/xyz + // split: invalid number of chunks: ‘xyz’ + // + assert_eq!( + NumberType::from("r/abc/xyz").unwrap_err(), + NumberTypeError::NumberOfChunks("xyz".to_string()) + ); + } + + #[test] + fn test_number_type_num_chunks() { + assert_eq!(NumberType::from("123").unwrap().num_chunks(), 123); + assert_eq!(NumberType::from("123/456").unwrap().num_chunks(), 456); + assert_eq!(NumberType::from("l/123").unwrap().num_chunks(), 123); + assert_eq!(NumberType::from("l/123/456").unwrap().num_chunks(), 456); + assert_eq!(NumberType::from("r/123").unwrap().num_chunks(), 123); + assert_eq!(NumberType::from("r/123/456").unwrap().num_chunks(), 456); + } +}