From 7d7d4fb04a6bd053c91cc8cdc469ca734000e60d Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 22 Aug 2023 13:09:28 -0400 Subject: [PATCH 01/36] build-gnu.sh: `/usr/bin/timeout` should not be hardcoded to /usr/bin location Fixes #5193 --- util/build-gnu.sh | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index d852ed66fb..d02c8c842e 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -82,7 +82,16 @@ else ./bootstrap --skip-po ./configure --quiet --disable-gcc-warnings #Add timeout to to protect against hangs - sed -i 's|^"\$@|/usr/bin/timeout 600 "\$@|' build-aux/test-driver + # On MacOS there is no system /usr/bin/timeout + # and trying to add it to /usr/bin (with symlink of copy binary) will fail unless system integrity protection is disabled (not ideal) + # ref: https://support.apple.com/en-us/102149 + # On MacOS the Homebrew coreutils could be installed and then "sudo ln -s /opt/homebrew/bin/timeout /usr/local/bin/timeout" + # Set to /usr/local/timeout instead if /usr/bin/timeout is not found + if [ -x /usr/bin/timeout ] ; then + sed -i 's|^"\$@|/usr/bin/timeout 600 "\$@|' build-aux/test-driver + else + sed -i 's|^"\$@|/usr/local/bin/timeout 600 "\$@|' build-aux/test-driver + fi # Change the PATH in the Makefile to test the uutils coreutils instead of the GNU coreutils sed -i "s/^[[:blank:]]*PATH=.*/ PATH='${UU_BUILD_DIR//\//\\/}\$(PATH_SEPARATOR)'\"\$\$PATH\" \\\/" Makefile sed -i 's| tr | /usr/bin/tr |' tests/init.sh @@ -153,13 +162,28 @@ sed -i 's|touch |/usr/bin/touch |' tests/cp/reflink-perm.sh tests/ls/block-size. sed -i 's|ln -|/usr/bin/ln -|' tests/cp/link-deref.sh sed -i 's|cp |/usr/bin/cp |' tests/mv/hard-2.sh sed -i 's|paste |/usr/bin/paste |' tests/misc/od-endian.sh -sed -i 's|timeout |/usr/bin/timeout |' tests/tail-2/follow-stdin.sh +# On MacOS there is no system /usr/bin/timeout +# and trying to add it to /usr/bin (with symlink of copy binary) will fail unless system integrity protection is disabled (not ideal) +# ref: https://support.apple.com/en-us/102149 +# On MacOS the Homebrew coreutils could be installed and then "sudo ln -s /opt/homebrew/bin/timeout /usr/local/bin/timeout" +# Set to /usr/local/timeout instead if /usr/bin/timeout is not found +if [ -x /usr/bin/timeout ] ; then + sed -i 's|timeout |/usr/bin/timeout |' tests/tail-2/follow-stdin.sh +else + sed -i 's|timeout |/usr/local/bin/timeout |' tests/tail-2/follow-stdin.sh +fi + # Add specific timeout to tests that currently hang to limit time spent waiting -sed -i 's|\(^\s*\)seq \$|\1/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh +if [ -x /usr/bin/timeout ] ; then + sed -i 's|\(^\s*\)seq \$|\1/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh +else + sed -i 's|\(^\s*\)seq \$|\1/usr/local/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh +fi -# Remove dup of /usr/bin/ when executed several times +# Remove dup of /usr/bin/ (and /usr/local/bin) when executed several times grep -rlE '/usr/bin/\s?/usr/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/bin/\s?/usr/bin/|/usr/bin/|g' +grep -rlE '/usr/local/bin/\s?/usr/local/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/local/bin/\s?/usr/local/bin/|/usr/local/bin/|g' #### Adjust tests to make them work with Rust/coreutils # in some cases, what we are doing in rust/coreutils is good (or better) From 75cc6e3fdccedf551e297e54715cc65ed6045c7c Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 22 Aug 2023 13:22:24 -0400 Subject: [PATCH 02/36] added some TODO(s) for missing/moved locations --- util/build-gnu.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index d02c8c842e..6c608a0607 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -151,6 +151,9 @@ sed -i -e '/tests\/misc\/seq-precision.sh/ D' \ sed -i '/INT_OFLOW/ D' tests/misc/printf.sh # Use the system coreutils where the test fails due to error in a util that is not the one being tested +# TODO : tests/tail-2/ does not appear to exist +# and have been moved to just tests/tail/ location +# Might need to update the section bvelow to reflect that sed -i 's|stat|/usr/bin/stat|' tests/touch/60-seconds.sh tests/misc/sort-compress-proc.sh sed -i 's|ls -|/usr/bin/ls -|' tests/cp/same-file.sh tests/misc/mknod.sh tests/mv/part-symlink.sh sed -i 's|chmod |/usr/bin/chmod |' tests/du/inacc-dir.sh tests/tail-2/tail-n0f.sh tests/cp/fail-perm.sh tests/mv/i-2.sh tests/misc/shuf.sh @@ -175,6 +178,9 @@ fi # Add specific timeout to tests that currently hang to limit time spent waiting +# TODO : tests/misc/seq-precision.sh tests/misc/seq-long-double.sh do not appear to exist +# and have been moved to tests/seq/ location +# Might need to update the section bvelow to reflect that if [ -x /usr/bin/timeout ] ; then sed -i 's|\(^\s*\)seq \$|\1/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh else @@ -205,6 +211,9 @@ sed -i -e "s|rm: cannot remove 'rel': Permission denied|rm: cannot remove 'rel': # overlay-headers.sh test intends to check for inotify events, # however there's a bug because `---dis` is an alias for: `---disable-inotify` +# TODO : tests/tail-2/ does not appear to exist +# and have been moved to just tests/tail/ location +# Might need to update the section bvelow to reflect that sed -i -e "s|---dis ||g" tests/tail-2/overlay-headers.sh test -f "${UU_BUILD_DIR}/getlimits" || cp src/getlimits "${UU_BUILD_DIR}" From c44c3cd71649252ae333dca0c55ab56d9ca52163 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 22 Aug 2023 15:28:02 -0400 Subject: [PATCH 03/36] fixed spelling --- util/build-gnu.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 6c608a0607..4414b7f12b 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -153,7 +153,7 @@ sed -i '/INT_OFLOW/ D' tests/misc/printf.sh # Use the system coreutils where the test fails due to error in a util that is not the one being tested # TODO : tests/tail-2/ does not appear to exist # and have been moved to just tests/tail/ location -# Might need to update the section bvelow to reflect that +# Might need to update the section below to reflect that sed -i 's|stat|/usr/bin/stat|' tests/touch/60-seconds.sh tests/misc/sort-compress-proc.sh sed -i 's|ls -|/usr/bin/ls -|' tests/cp/same-file.sh tests/misc/mknod.sh tests/mv/part-symlink.sh sed -i 's|chmod |/usr/bin/chmod |' tests/du/inacc-dir.sh tests/tail-2/tail-n0f.sh tests/cp/fail-perm.sh tests/mv/i-2.sh tests/misc/shuf.sh @@ -180,7 +180,7 @@ fi # Add specific timeout to tests that currently hang to limit time spent waiting # TODO : tests/misc/seq-precision.sh tests/misc/seq-long-double.sh do not appear to exist # and have been moved to tests/seq/ location -# Might need to update the section bvelow to reflect that +# Might need to update the section below to reflect that if [ -x /usr/bin/timeout ] ; then sed -i 's|\(^\s*\)seq \$|\1/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh else @@ -213,7 +213,7 @@ sed -i -e "s|rm: cannot remove 'rel': Permission denied|rm: cannot remove 'rel': # however there's a bug because `---dis` is an alias for: `---disable-inotify` # TODO : tests/tail-2/ does not appear to exist # and have been moved to just tests/tail/ location -# Might need to update the section bvelow to reflect that +# Might need to update the section below to reflect that sed -i -e "s|---dis ||g" tests/tail-2/overlay-headers.sh test -f "${UU_BUILD_DIR}/getlimits" || cp src/getlimits "${UU_BUILD_DIR}" From 6468845850a0c847f42fb400ffecbdcf54f8e99c Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Thu, 24 Aug 2023 20:36:52 -0400 Subject: [PATCH 04/36] refactor to check for system timeout once + commented out many moved/deleted test files that make script fail --- util/build-gnu.sh | 71 ++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 4414b7f12b..c6991d15c1 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -18,6 +18,20 @@ path_GNU="$(readlink -fm -- "${path_GNU:-${path_UUTILS}/../gnu}")" ### +# On MacOS there is no system /usr/bin/timeout +# and trying to add it to /usr/bin (with symlink of copy binary) will fail unless system integrity protection is disabled (not ideal) +# ref: https://support.apple.com/en-us/102149 +# On MacOS the Homebrew coreutils could be installed and then "sudo ln -s /opt/homebrew/bin/timeout /usr/local/bin/timeout" +# Set to /usr/local/bin/timeout instead if /usr/bin/timeout is not found +SYSTEM_TIMEOUT="timeout" +if [ -x /usr/bin/timeout ] ; then + SYSTEM_TIMEOUT="/usr/bin/timeout" +elif [ -x /usr/local/bin/timeout ] ; then + SYSTEM_TIMEOUT="/usr/local/bin/timeout" +fi + +### + if test ! -d "${path_GNU}"; then echo "Could not find GNU coreutils (expected at '${path_GNU}')" echo "Run the following to download into the expected path:" @@ -82,16 +96,7 @@ else ./bootstrap --skip-po ./configure --quiet --disable-gcc-warnings #Add timeout to to protect against hangs - # On MacOS there is no system /usr/bin/timeout - # and trying to add it to /usr/bin (with symlink of copy binary) will fail unless system integrity protection is disabled (not ideal) - # ref: https://support.apple.com/en-us/102149 - # On MacOS the Homebrew coreutils could be installed and then "sudo ln -s /opt/homebrew/bin/timeout /usr/local/bin/timeout" - # Set to /usr/local/timeout instead if /usr/bin/timeout is not found - if [ -x /usr/bin/timeout ] ; then - sed -i 's|^"\$@|/usr/bin/timeout 600 "\$@|' build-aux/test-driver - else - sed -i 's|^"\$@|/usr/local/bin/timeout 600 "\$@|' build-aux/test-driver - fi + sed -i 's|^"\$@|'"${SYSTEM_TIMEOUT}"' 600 "\$@|' build-aux/test-driver # Change the PATH in the Makefile to test the uutils coreutils instead of the GNU coreutils sed -i "s/^[[:blank:]]*PATH=.*/ PATH='${UU_BUILD_DIR//\//\\/}\$(PATH_SEPARATOR)'\"\$\$PATH\" \\\/" Makefile sed -i 's| tr | /usr/bin/tr |' tests/init.sh @@ -148,44 +153,32 @@ sed -i -e '/tests\/misc\/seq-precision.sh/ D' \ Makefile # printf doesn't limit the values used in its arg, so this produced ~2GB of output -sed -i '/INT_OFLOW/ D' tests/misc/printf.sh +# Looks like tests/misc/printf.sh does not exist anymore - comment it out for now +#sed -i '/INT_OFLOW/ D' tests/misc/printf.sh # Use the system coreutils where the test fails due to error in a util that is not the one being tested # TODO : tests/tail-2/ does not appear to exist # and have been moved to just tests/tail/ location # Might need to update the section below to reflect that -sed -i 's|stat|/usr/bin/stat|' tests/touch/60-seconds.sh tests/misc/sort-compress-proc.sh +# Also looks like tests/misc/sort-compress-proc.sh and tests/tail-2/tail-n0f.sh and tests/misc/shuf.sh and many others do not exist anymore or moved - comment it out for now +sed -i 's|stat|/usr/bin/stat|' tests/touch/60-seconds.sh #tests/misc/sort-compress-proc.sh sed -i 's|ls -|/usr/bin/ls -|' tests/cp/same-file.sh tests/misc/mknod.sh tests/mv/part-symlink.sh -sed -i 's|chmod |/usr/bin/chmod |' tests/du/inacc-dir.sh tests/tail-2/tail-n0f.sh tests/cp/fail-perm.sh tests/mv/i-2.sh tests/misc/shuf.sh -sed -i 's|sort |/usr/bin/sort |' tests/ls/hyperlink.sh tests/misc/test-N.sh -sed -i 's|split |/usr/bin/split |' tests/misc/factor-parallel.sh -sed -i 's|id -|/usr/bin/id -|' tests/misc/runcon-no-reorder.sh +sed -i 's|chmod |/usr/bin/chmod |' tests/du/inacc-dir.sh tests/cp/fail-perm.sh tests/mv/i-2.sh #tests/misc/shuf.sh #tests/tail-2/tail-n0f.sh +sed -i 's|sort |/usr/bin/sort |' tests/ls/hyperlink.sh #tests/misc/test-N.sh +#sed -i 's|split |/usr/bin/split |' tests/misc/factor-parallel.sh +#sed -i 's|id -|/usr/bin/id -|' tests/misc/runcon-no-reorder.sh # tests/ls/abmon-align.sh - https://github.com/uutils/coreutils/issues/3505 -sed -i 's|touch |/usr/bin/touch |' tests/cp/reflink-perm.sh tests/ls/block-size.sh tests/mv/update.sh tests/misc/ls-time.sh tests/misc/stat-nanoseconds.sh tests/misc/time-style.sh tests/misc/test-N.sh tests/ls/abmon-align.sh +sed -i 's|touch |/usr/bin/touch |' tests/cp/reflink-perm.sh tests/ls/block-size.sh tests/mv/update.sh tests/misc/time-style.sh tests/ls/abmon-align.sh #tests/misc/ls-time.sh tests/misc/stat-nanoseconds.sh tests/misc/test-N.sh sed -i 's|ln -|/usr/bin/ln -|' tests/cp/link-deref.sh sed -i 's|cp |/usr/bin/cp |' tests/mv/hard-2.sh -sed -i 's|paste |/usr/bin/paste |' tests/misc/od-endian.sh -# On MacOS there is no system /usr/bin/timeout -# and trying to add it to /usr/bin (with symlink of copy binary) will fail unless system integrity protection is disabled (not ideal) -# ref: https://support.apple.com/en-us/102149 -# On MacOS the Homebrew coreutils could be installed and then "sudo ln -s /opt/homebrew/bin/timeout /usr/local/bin/timeout" -# Set to /usr/local/timeout instead if /usr/bin/timeout is not found -if [ -x /usr/bin/timeout ] ; then - sed -i 's|timeout |/usr/bin/timeout |' tests/tail-2/follow-stdin.sh -else - sed -i 's|timeout |/usr/local/bin/timeout |' tests/tail-2/follow-stdin.sh -fi - +#sed -i 's|paste |/usr/bin/paste |' tests/misc/od-endian.sh +#sed -i 's|timeout |'"${SYSTEM_TIMEOUT}"' |' tests/tail-2/follow-stdin.sh # Add specific timeout to tests that currently hang to limit time spent waiting # TODO : tests/misc/seq-precision.sh tests/misc/seq-long-double.sh do not appear to exist # and have been moved to tests/seq/ location -# Might need to update the section below to reflect that -if [ -x /usr/bin/timeout ] ; then - sed -i 's|\(^\s*\)seq \$|\1/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh -else - sed -i 's|\(^\s*\)seq \$|\1/usr/local/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh -fi +# Might need to update the section below to reflect that, but comment it out for now +#sed -i 's|\(^\s*\)seq \$|\1'"${SYSTEM_TIMEOUT}"' 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh # Remove dup of /usr/bin/ (and /usr/local/bin) when executed several times grep -rlE '/usr/bin/\s?/usr/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/bin/\s?/usr/bin/|/usr/bin/|g' @@ -214,7 +207,7 @@ sed -i -e "s|rm: cannot remove 'rel': Permission denied|rm: cannot remove 'rel': # TODO : tests/tail-2/ does not appear to exist # and have been moved to just tests/tail/ location # Might need to update the section below to reflect that -sed -i -e "s|---dis ||g" tests/tail-2/overlay-headers.sh +#sed -i -e "s|---dis ||g" tests/tail-2/overlay-headers.sh test -f "${UU_BUILD_DIR}/getlimits" || cp src/getlimits "${UU_BUILD_DIR}" @@ -271,11 +264,13 @@ sed -i -e "s/Try 'mv --help' for more information/For more information, try '--h # GNU doesn't support width > INT_MAX # disable these test cases -sed -i -E "s|^([^#]*2_31.*)$|#\1|g" tests/misc/printf-cov.pl +# TODO: moved or deleted tests/misc/printf-cov.pl +#sed -i -E "s|^([^#]*2_31.*)$|#\1|g" tests/misc/printf-cov.pl sed -i -e "s/du: invalid -t argument/du: invalid --threshold argument/" -e "s/du: option requires an argument/error: a value is required for '--threshold ' but none was supplied/" -e "/Try 'du --help' for more information./d" tests/du/threshold.sh # disable two kind of tests: # "hostid BEFORE --help" doesn't fail for GNU. we fail. we are probably doing better # "hostid BEFORE --help AFTER " same for this -sed -i -e "s/env \$prog \$BEFORE \$opt > out2/env \$prog \$BEFORE \$opt > out2 #/" -e "s/env \$prog \$BEFORE \$opt AFTER > out3/env \$prog \$BEFORE \$opt AFTER > out3 #/" -e "s/compare exp out2/compare exp out2 #/" -e "s/compare exp out3/compare exp out3 #/" tests/misc/help-version-getopt.sh +# TODO moved or deleted tests/misc/help-version-getopt.sh +#sed -i -e "s/env \$prog \$BEFORE \$opt > out2/env \$prog \$BEFORE \$opt > out2 #/" -e "s/env \$prog \$BEFORE \$opt AFTER > out3/env \$prog \$BEFORE \$opt AFTER > out3 #/" -e "s/compare exp out2/compare exp out2 #/" -e "s/compare exp out3/compare exp out3 #/" tests/misc/help-version-getopt.sh From 350b9c3d486449f592b9fd92f349b884926073cb Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Fri, 25 Aug 2023 13:43:48 -0400 Subject: [PATCH 05/36] reverted commented out test files and added more details in TODO(s) --- util/build-gnu.sh | 51 +++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index c6991d15c1..587cf85809 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -152,37 +152,37 @@ sed -i -e '/tests\/misc\/invalid-opt.pl/ D' \ sed -i -e '/tests\/misc\/seq-precision.sh/ D' \ Makefile +# TODO: many files and some directories modified with 'sed' in the sections below either no longer exist or have been moved +# TODO: Might need to review and updated sections below +# TODO: As a result this script will fail when executed normally as 'bash util/build-gnu.hs' due to the 'set -e' set at the beginning +# TODO: The rest of the 'sed' commands after first failure in this scenario will not be executed as bash will exit on first error +# TODO: However, the behaviour might be different when running it via GitHub actions (GnuTests) +# TODO: For now, when running in local a workaround would be to comment out the 'set -e' at the beginning of the file + # printf doesn't limit the values used in its arg, so this produced ~2GB of output -# Looks like tests/misc/printf.sh does not exist anymore - comment it out for now -#sed -i '/INT_OFLOW/ D' tests/misc/printf.sh +# TODO: this is the first one to likely fail as tests/misc/printf.sh does not exist/have been moved +sed -i '/INT_OFLOW/ D' tests/misc/printf.sh +# TODO: all commands below might not be executed # Use the system coreutils where the test fails due to error in a util that is not the one being tested -# TODO : tests/tail-2/ does not appear to exist -# and have been moved to just tests/tail/ location -# Might need to update the section below to reflect that -# Also looks like tests/misc/sort-compress-proc.sh and tests/tail-2/tail-n0f.sh and tests/misc/shuf.sh and many others do not exist anymore or moved - comment it out for now -sed -i 's|stat|/usr/bin/stat|' tests/touch/60-seconds.sh #tests/misc/sort-compress-proc.sh +sed -i 's|stat|/usr/bin/stat|' tests/touch/60-seconds.sh tests/misc/sort-compress-proc.sh sed -i 's|ls -|/usr/bin/ls -|' tests/cp/same-file.sh tests/misc/mknod.sh tests/mv/part-symlink.sh -sed -i 's|chmod |/usr/bin/chmod |' tests/du/inacc-dir.sh tests/cp/fail-perm.sh tests/mv/i-2.sh #tests/misc/shuf.sh #tests/tail-2/tail-n0f.sh -sed -i 's|sort |/usr/bin/sort |' tests/ls/hyperlink.sh #tests/misc/test-N.sh -#sed -i 's|split |/usr/bin/split |' tests/misc/factor-parallel.sh -#sed -i 's|id -|/usr/bin/id -|' tests/misc/runcon-no-reorder.sh +sed -i 's|chmod |/usr/bin/chmod |' tests/du/inacc-dir.sh tests/tail-2/tail-n0f.sh tests/cp/fail-perm.sh tests/mv/i-2.sh tests/misc/shuf.sh +sed -i 's|sort |/usr/bin/sort |' tests/ls/hyperlink.sh tests/misc/test-N.sh +sed -i 's|split |/usr/bin/split |' tests/misc/factor-parallel.sh +sed -i 's|id -|/usr/bin/id -|' tests/misc/runcon-no-reorder.sh # tests/ls/abmon-align.sh - https://github.com/uutils/coreutils/issues/3505 -sed -i 's|touch |/usr/bin/touch |' tests/cp/reflink-perm.sh tests/ls/block-size.sh tests/mv/update.sh tests/misc/time-style.sh tests/ls/abmon-align.sh #tests/misc/ls-time.sh tests/misc/stat-nanoseconds.sh tests/misc/test-N.sh +sed -i 's|touch |/usr/bin/touch |' tests/cp/reflink-perm.sh tests/ls/block-size.sh tests/mv/update.sh tests/misc/ls-time.sh tests/misc/stat-nanoseconds.sh tests/misc/time-style.sh tests/misc/test-N.sh tests/ls/abmon-align.sh sed -i 's|ln -|/usr/bin/ln -|' tests/cp/link-deref.sh sed -i 's|cp |/usr/bin/cp |' tests/mv/hard-2.sh -#sed -i 's|paste |/usr/bin/paste |' tests/misc/od-endian.sh -#sed -i 's|timeout |'"${SYSTEM_TIMEOUT}"' |' tests/tail-2/follow-stdin.sh +sed -i 's|paste |/usr/bin/paste |' tests/misc/od-endian.sh +sed -i 's|timeout |/usr/bin/timeout |' tests/tail-2/follow-stdin.sh # Add specific timeout to tests that currently hang to limit time spent waiting -# TODO : tests/misc/seq-precision.sh tests/misc/seq-long-double.sh do not appear to exist -# and have been moved to tests/seq/ location -# Might need to update the section below to reflect that, but comment it out for now -#sed -i 's|\(^\s*\)seq \$|\1'"${SYSTEM_TIMEOUT}"' 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh +sed -i 's|\(^\s*\)seq \$|\1/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh -# Remove dup of /usr/bin/ (and /usr/local/bin) when executed several times +# Remove dup of /usr/bin/ when executed several times grep -rlE '/usr/bin/\s?/usr/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/bin/\s?/usr/bin/|/usr/bin/|g' -grep -rlE '/usr/local/bin/\s?/usr/local/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/local/bin/\s?/usr/local/bin/|/usr/local/bin/|g' #### Adjust tests to make them work with Rust/coreutils # in some cases, what we are doing in rust/coreutils is good (or better) @@ -204,10 +204,7 @@ sed -i -e "s|rm: cannot remove 'rel': Permission denied|rm: cannot remove 'rel': # overlay-headers.sh test intends to check for inotify events, # however there's a bug because `---dis` is an alias for: `---disable-inotify` -# TODO : tests/tail-2/ does not appear to exist -# and have been moved to just tests/tail/ location -# Might need to update the section below to reflect that -#sed -i -e "s|---dis ||g" tests/tail-2/overlay-headers.sh +sed -i -e "s|---dis ||g" tests/tail-2/overlay-headers.sh test -f "${UU_BUILD_DIR}/getlimits" || cp src/getlimits "${UU_BUILD_DIR}" @@ -264,13 +261,11 @@ sed -i -e "s/Try 'mv --help' for more information/For more information, try '--h # GNU doesn't support width > INT_MAX # disable these test cases -# TODO: moved or deleted tests/misc/printf-cov.pl -#sed -i -E "s|^([^#]*2_31.*)$|#\1|g" tests/misc/printf-cov.pl +sed -i -E "s|^([^#]*2_31.*)$|#\1|g" tests/misc/printf-cov.pl sed -i -e "s/du: invalid -t argument/du: invalid --threshold argument/" -e "s/du: option requires an argument/error: a value is required for '--threshold ' but none was supplied/" -e "/Try 'du --help' for more information./d" tests/du/threshold.sh # disable two kind of tests: # "hostid BEFORE --help" doesn't fail for GNU. we fail. we are probably doing better # "hostid BEFORE --help AFTER " same for this -# TODO moved or deleted tests/misc/help-version-getopt.sh -#sed -i -e "s/env \$prog \$BEFORE \$opt > out2/env \$prog \$BEFORE \$opt > out2 #/" -e "s/env \$prog \$BEFORE \$opt AFTER > out3/env \$prog \$BEFORE \$opt AFTER > out3 #/" -e "s/compare exp out2/compare exp out2 #/" -e "s/compare exp out3/compare exp out3 #/" tests/misc/help-version-getopt.sh +sed -i -e "s/env \$prog \$BEFORE \$opt > out2/env \$prog \$BEFORE \$opt > out2 #/" -e "s/env \$prog \$BEFORE \$opt AFTER > out3/env \$prog \$BEFORE \$opt AFTER > out3 #/" -e "s/compare exp out2/compare exp out2 #/" -e "s/compare exp out3/compare exp out3 #/" tests/misc/help-version-getopt.sh From 84d96f9d028a4dd8c0f98ba6db2ba6dc90c79dcb Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 26 Aug 2023 11:11:46 -0400 Subject: [PATCH 06/36] split: refactor for more common use case --- src/uu/split/src/split.rs | 81 ++++++++++++++++++- tests/by-util/test_split.rs | 152 +++++++++++++++++++++++++++++++++++- 2 files changed, 229 insertions(+), 4 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 71aacfed00..c4b7e6d660 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -51,14 +51,66 @@ const AFTER_HELP: &str = help_section!("after help", "split.md"); #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { + let args = args.collect_lossy(); + + let (args, obs_lines) = handle_obsolete(&args[..]); + let matches = uu_app().try_get_matches_from(args)?; - match Settings::from(&matches) { + + match Settings::from(&matches, &obs_lines) { Ok(settings) => split(&settings), Err(e) if e.requires_usage() => Err(UUsageError::new(1, format!("{e}"))), Err(e) => Err(USimpleError::new(1, format!("{e}"))), } } +/// Extract obsolete shorthand (if any) for specifying lines in following scenarios (and similar) +/// `split -22 file` would mean `split -l 22 file` +/// `split -2de file` would mean `split -l 2 -d -e file` +/// `split -x300e file` would mean `split -x -l 300 -e file` +/// `split -x300e -22 file` would mean `split -x -e -l 22 file` (last obsolete lines option wins) +/// following GNU `split` behavior +fn handle_obsolete(args: &[String]) -> (Vec, Option) { + let mut v: Vec = vec![]; + let mut obs_lines = None; + for arg in args.iter() { + let slice = &arg; + if slice.starts_with('-') && !slice.starts_with("--") { + // start of the short option string + // extract numeric part and filter it out + let mut obs_lines_extracted: Vec = vec![]; + let filtered_slice: Vec = slice + .chars() + .filter(|c| { + if c.is_ascii_digit() { + obs_lines_extracted.push(*c); + false + } else { + true + } + }) + .collect(); + + if filtered_slice.get(1).is_some() { + // there were some short options in front of obsolete lines number + // i.e. '-xd100' or similar + // preserve it + v.push(filtered_slice.iter().collect()) + } + if !obs_lines_extracted.is_empty() { + // obsolete lines value was extracted + obs_lines = Some(obs_lines_extracted.iter().collect()); + } + } else { + // not a short option + // preserve it + v.push(arg.to_owned()); + } + } + println!("{:#?}",v); + (v, obs_lines) +} + pub fn uu_app() -> Command { Command::new(uucore::util_name()) .version(crate_version!()) @@ -357,6 +409,17 @@ impl fmt::Display for StrategyError { } impl Strategy { + /// Parse a strategy from obsolete line option value + fn from_obs(obs_value: &str) -> Result { + let n = parse_size(obs_value).map_err(StrategyError::Lines)?; + if n > 0 { + Ok(Self::Lines(n)) + } else { + Err(StrategyError::Lines(ParseSizeError::ParseFailure( + obs_value.to_string(), + ))) + } + } /// Parse a strategy from the command-line arguments. fn from(matches: &ArgMatches) -> Result { fn get_and_parse( @@ -506,7 +569,7 @@ impl fmt::Display for SettingsError { impl Settings { /// Parse a strategy from the command-line arguments. - fn from(matches: &ArgMatches) -> Result { + fn from(matches: &ArgMatches, obs_lines: &Option) -> Result { let additional_suffix = matches .get_one::(OPT_ADDITIONAL_SUFFIX) .unwrap() @@ -514,7 +577,19 @@ impl Settings { if additional_suffix.contains('/') { return Err(SettingsError::SuffixContainsSeparator(additional_suffix)); } - let strategy = Strategy::from(matches).map_err(SettingsError::Strategy)?; + + // obsolete lines option cannot be used simultaneously with OPT_LINES + let strategy = match obs_lines { + Some(obs_value) => { + if matches.value_source(OPT_LINES) == Some(ValueSource::CommandLine) { + return Err(SettingsError::Strategy(StrategyError::MultipleWays)); + } else { + Strategy::from_obs(obs_value).map_err(SettingsError::Strategy)? + } + } + None => Strategy::from(matches).map_err(SettingsError::Strategy)?, + }; + let (suffix_type, suffix_start) = suffix_type_from(matches)?; let suffix_length_str = matches.get_one::(OPT_SUFFIX_LENGTH).unwrap(); let suffix_length: usize = suffix_length_str diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index c5f32482ed..fe753f0849 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -2,7 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb +// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb dxen use crate::common::util::{AtPath, TestScenario}; use rand::{thread_rng, Rng, SeedableRng}; @@ -320,6 +320,156 @@ fn test_split_lines_number() { .stderr_only("split: invalid number of lines: '2fb'\n"); } +/// Test for obsolete lines option standalone +#[test] +fn test_split_obs_lines_standalone() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "obs-lines-standalone"; + RandomFile::new(&at, name).add_lines(4); + ucmd.args(&["-2", name]).succeeds().no_stderr().no_stdout(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 2); + assert_eq!(glob.collate(), at.read_bytes(name)); +} + +/// Test for invalid obsolete lines option +#[test] +fn test_split_invalid_obs_lines_standalone() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("file"); + + scene + .ucmd() + .args(&["-2fb", "file"]) + .fails() + .code_is(1) + .stderr_only("error: unexpected argument '-f' found\n"); +} + +/// Test for obsolete lines option as part of combined short options +#[test] +fn test_split_obs_lines_within_combined_shorts() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + let name = "obs-lines-within-shorts"; + RandomFile::new(&at, name).add_lines(400); + + scene + .ucmd() + .args(&["-d200xe", name]) + .succeeds() + .no_stderr() + .no_stdout(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 2); + assert_eq!(glob.collate(), at.read_bytes(name)) +} + +/// Test for obsolete lines option starts as part of combined short options +#[test] +fn test_split_obs_lines_starts_combined_shorts() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + let name = "obs-lines-starts-shorts"; + RandomFile::new(&at, name).add_lines(400); + + scene + .ucmd() + .args(&["-x200d", name]) + .succeeds() + .no_stderr() + .no_stdout(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 2); + assert_eq!(glob.collate(), at.read_bytes(name)) +} + +/// Test for using both obsolete lines (standalone) option and short/long lines option simultaneously +#[test] +fn test_split_both_lines_and_obs_lines_standalone() { + // This test will ensure that if both lines option '-l' or '--lines' + // and obsolete lines option '-100' are used + // it fails + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("file"); + + scene + .ucmd() + .args(&["-l", "2", "-2", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: cannot split in more than one way\n"); + scene + .ucmd() + .args(&["--lines", "2", "-2", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: cannot split in more than one way\n"); +} + +/// Test for using more than one obsolete lines option (standalone) +/// last one wins +#[test] +fn test_split_multiple_obs_lines_standalone() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + let name = "multiple-obs-lines"; + RandomFile::new(&at, name).add_lines(400); + + scene + .ucmd() + .args(&["-3000", "-200", name]) + .succeeds() + .no_stderr() + .no_stdout(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 2); + assert_eq!(glob.collate(), at.read_bytes(name)) +} + +/// Test for using more than one obsolete lines option within combined shorts +/// last one wins +#[test] +fn test_split_multiple_obs_lines_within_combined() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + let name = "multiple-obs-lines"; + RandomFile::new(&at, name).add_lines(400); + + scene + .ucmd() + .args(&["-x5000d -e200x", name]) + .succeeds() + .no_stderr() + .no_stdout(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 2); + assert_eq!(glob.collate(), at.read_bytes(name)) +} + +/// Test for using both obsolete lines option within combined shorts with conflicting -n option simultaneously +#[test] +fn test_split_obs_lines_within_combined_with_number() { + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("file"); + + scene + .ucmd() + .args(&["-3dxen", "4", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: cannot split in more than one way\n"); + scene + .ucmd() + .args(&["-dxe30n", "4", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: cannot split in more than one way\n"); +} + #[test] fn test_split_invalid_bytes_size() { new_ucmd!() From 70dd8eb8dcdfffa7b69a22bc9a3162408d912f84 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 26 Aug 2023 13:22:36 -0400 Subject: [PATCH 07/36] split: updates to target correct GNU coreutils release --- util/build-gnu.sh | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 587cf85809..f6be73ff47 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -32,10 +32,16 @@ fi ### +release_tag_GNU="v9.3" + if test ! -d "${path_GNU}"; then echo "Could not find GNU coreutils (expected at '${path_GNU}')" echo "Run the following to download into the expected path:" echo "git clone --recurse-submodules https://github.com/coreutils/coreutils.git \"${path_GNU}\"" + echo "After downloading GNU coreutils to \"${path_GNU}\" run the following commands to cheout latest release tag" + echo "cd \"${path_GNU}\"" + echo "git fetch --all --tags" + echo "git checkout tags/\"${release_tag_GNU}\"" exit 1 fi @@ -152,17 +158,8 @@ sed -i -e '/tests\/misc\/invalid-opt.pl/ D' \ sed -i -e '/tests\/misc\/seq-precision.sh/ D' \ Makefile -# TODO: many files and some directories modified with 'sed' in the sections below either no longer exist or have been moved -# TODO: Might need to review and updated sections below -# TODO: As a result this script will fail when executed normally as 'bash util/build-gnu.hs' due to the 'set -e' set at the beginning -# TODO: The rest of the 'sed' commands after first failure in this scenario will not be executed as bash will exit on first error -# TODO: However, the behaviour might be different when running it via GitHub actions (GnuTests) -# TODO: For now, when running in local a workaround would be to comment out the 'set -e' at the beginning of the file - # printf doesn't limit the values used in its arg, so this produced ~2GB of output -# TODO: this is the first one to likely fail as tests/misc/printf.sh does not exist/have been moved sed -i '/INT_OFLOW/ D' tests/misc/printf.sh -# TODO: all commands below might not be executed # Use the system coreutils where the test fails due to error in a util that is not the one being tested sed -i 's|stat|/usr/bin/stat|' tests/touch/60-seconds.sh tests/misc/sort-compress-proc.sh From eac08f72c2ef97a7d7206fb3f98fc28a447172be Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 26 Aug 2023 13:24:08 -0400 Subject: [PATCH 08/36] split: double quotes --- util/build-gnu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index f6be73ff47..74ec296706 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -41,7 +41,7 @@ if test ! -d "${path_GNU}"; then echo "After downloading GNU coreutils to \"${path_GNU}\" run the following commands to cheout latest release tag" echo "cd \"${path_GNU}\"" echo "git fetch --all --tags" - echo "git checkout tags/\"${release_tag_GNU}\"" + echo "git checkout tags/${release_tag_GNU}" exit 1 fi From 4a4759c43c7cd10c81e53f9e352cd9f8f2a5fb63 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 26 Aug 2023 13:44:34 -0400 Subject: [PATCH 09/36] split: updated to SYSTEM_TIMEOUT in a few more places --- util/build-gnu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 74ec296706..4e196debe5 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -173,10 +173,10 @@ sed -i 's|touch |/usr/bin/touch |' tests/cp/reflink-perm.sh tests/ls/block-size. sed -i 's|ln -|/usr/bin/ln -|' tests/cp/link-deref.sh sed -i 's|cp |/usr/bin/cp |' tests/mv/hard-2.sh sed -i 's|paste |/usr/bin/paste |' tests/misc/od-endian.sh -sed -i 's|timeout |/usr/bin/timeout |' tests/tail-2/follow-stdin.sh +sed -i 's|timeout |'"${SYSTEM_TIMEOUT}"' |' tests/tail-2/follow-stdin.sh # Add specific timeout to tests that currently hang to limit time spent waiting -sed -i 's|\(^\s*\)seq \$|\1/usr/bin/timeout 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh +sed -i 's|\(^\s*\)seq \$|\1'"${SYSTEM_TIMEOUT}"' 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh # Remove dup of /usr/bin/ when executed several times grep -rlE '/usr/bin/\s?/usr/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/bin/\s?/usr/bin/|/usr/bin/|g' From a384973b1a7e40435e165ab7a65a59ab26da7fd8 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 26 Aug 2023 13:51:21 -0400 Subject: [PATCH 10/36] split: remove dup for /usr/local/bin when executed several times --- util/build-gnu.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 4e196debe5..ce15dd3f8e 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -180,6 +180,7 @@ sed -i 's|\(^\s*\)seq \$|\1'"${SYSTEM_TIMEOUT}"' 0.1 seq \$|' tests/misc/seq-pre # Remove dup of /usr/bin/ when executed several times grep -rlE '/usr/bin/\s?/usr/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/bin/\s?/usr/bin/|/usr/bin/|g' +grep -rlE '/usr/local/bin/\s?/usr/local/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/local/bin/\s?/usr/local/bin/|/usr/local/bin/|g' #### Adjust tests to make them work with Rust/coreutils # in some cases, what we are doing in rust/coreutils is good (or better) From 0edba89b55ba51d974c90645079454f128c2ae6b Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Sat, 26 Aug 2023 13:53:36 -0400 Subject: [PATCH 11/36] split: comments --- util/build-gnu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index ce15dd3f8e..57764ed507 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -178,7 +178,7 @@ sed -i 's|timeout |'"${SYSTEM_TIMEOUT}"' |' tests/tail-2/follow-stdin.sh # Add specific timeout to tests that currently hang to limit time spent waiting sed -i 's|\(^\s*\)seq \$|\1'"${SYSTEM_TIMEOUT}"' 0.1 seq \$|' tests/misc/seq-precision.sh tests/misc/seq-long-double.sh -# Remove dup of /usr/bin/ when executed several times +# Remove dup of /usr/bin/ and /usr/local/bin/ when executed several times grep -rlE '/usr/bin/\s?/usr/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/bin/\s?/usr/bin/|/usr/bin/|g' grep -rlE '/usr/local/bin/\s?/usr/local/bin' init.cfg tests/* | xargs --no-run-if-empty sed -Ei 's|/usr/local/bin/\s?/usr/local/bin/|/usr/local/bin/|g' From 2f35989ac34cda92b65328d325bdc615057e2dbe Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Mon, 28 Aug 2023 18:26:48 -0400 Subject: [PATCH 12/36] split: comments --- src/uu/split/src/split.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 77c87e7f5c..698c7818c0 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -108,7 +108,6 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { v.push(arg.to_owned()); } } - // println!("{:#?} , {:#?}", v, obs_lines); (v, obs_lines) } @@ -465,8 +464,7 @@ impl Strategy { // Check that the user is not specifying more than one strategy. // // Note: right now, this exact behavior cannot be handled by - // `ArgGroup` since `ArgGroup` considers a default value `Arg` - // as "defined". + // overrides_with_all() due to obsolete lines value option match ( obs_lines, matches.value_source(OPT_LINES) == Some(ValueSource::CommandLine), From e79753c1cf25db25104e853e52e7f687dd776cfb Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 29 Aug 2023 13:58:26 -0400 Subject: [PATCH 13/36] split: refactor handle_obsolete() function --- src/uu/split/src/split.rs | 71 +++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 698c7818c0..876d04606f 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -72,43 +72,50 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { /// `split -x300e -22 file` would mean `split -x -e -l 22 file` (last obsolete lines option wins) /// following GNU `split` behavior fn handle_obsolete(args: &[String]) -> (Vec, Option) { - let mut v: Vec = vec![]; let mut obs_lines = None; - for arg in args.iter() { - let slice = &arg; - if slice.starts_with('-') && !slice.starts_with("--") { - // start of the short option string - // extract numeric part and filter it out - let mut obs_lines_extracted: Vec = vec![]; - let filtered_slice: Vec = slice - .chars() - .filter(|c| { - if c.is_ascii_digit() { - obs_lines_extracted.push(*c); - false + let filtered_args = args + .iter() + .filter_map(|slice| { + if slice.starts_with('-') && !slice.starts_with("--") { + // start of the short option string + // extract numeric part and filter it out + let mut obs_lines_extracted: Vec = vec![]; + let filtered_slice: Vec = slice + .chars() + .filter(|c| { + if c.is_ascii_digit() { + obs_lines_extracted.push(*c); + false + } else { + true + } + }) + .collect(); + + if obs_lines_extracted.is_empty() { + // no obsolete lines value found/extracted + Some(slice.to_owned()) + } else { + // obsolete lines value was extracted + obs_lines = Some(obs_lines_extracted.iter().collect()); + if filtered_slice.get(1).is_some() { + // there were some short options in front of or after obsolete lines value + // i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value + // would look like '-xd' or '-de' or similar + // preserve it + Some(filtered_slice.iter().collect()) } else { - true + None } - }) - .collect(); - - if filtered_slice.get(1).is_some() { - // there were some short options in front of obsolete lines number - // i.e. '-xd100' or similar + } + } else { + // not a short option // preserve it - v.push(filtered_slice.iter().collect()); - } - if !obs_lines_extracted.is_empty() { - // obsolete lines value was extracted - obs_lines = Some(obs_lines_extracted.iter().collect()); + Some(slice.to_owned()) } - } else { - // not a short option - // preserve it - v.push(arg.to_owned()); - } - } - (v, obs_lines) + }) + .collect(); + (filtered_args, obs_lines) } pub fn uu_app() -> Command { From 15c7170d20c532889ac8f3470c5d59bc12d10f2a Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 29 Aug 2023 15:49:47 -0400 Subject: [PATCH 14/36] split: fix for GNU Tests regression + tests --- src/uu/split/src/split.rs | 15 ++++++--- tests/by-util/test_split.rs | 65 +++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 876d04606f..dae24d36b7 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -76,8 +76,16 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { let filtered_args = args .iter() .filter_map(|slice| { - if slice.starts_with('-') && !slice.starts_with("--") { + if slice.starts_with('-') + && !slice.starts_with("--") + && !slice.starts_with("-a") + && !slice.starts_with("-b") + && !slice.starts_with("-C") + && !slice.starts_with("-l") + && !slice.starts_with("-n") + { // start of the short option string + // that can have obsolete lines option value in it // extract numeric part and filter it out let mut obs_lines_extracted: Vec = vec![]; let filtered_slice: Vec = slice @@ -102,15 +110,14 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { // there were some short options in front of or after obsolete lines value // i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value // would look like '-xd' or '-de' or similar - // preserve it Some(filtered_slice.iter().collect()) } else { None } } } else { - // not a short option - // preserve it + // either not a short option + // or a short option that cannot have obsolete lines value in it Some(slice.to_owned()) } }) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index aabfcbe90d..9fba2177e4 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -170,6 +170,22 @@ fn test_split_str_prefixed_chunks_by_bytes() { assert_eq!(glob.collate(), at.read_bytes(name)); } +// Test short bytes option concatenated with value +#[test] +fn test_split_by_bytes_short_concatenated_with_value() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "split_by_bytes_short_concatenated_with_value"; + RandomFile::new(&at, name).add_bytes(10000); + ucmd.args(&["-b1000", name]).succeeds(); + + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 10); + for filename in glob.collect() { + assert_eq!(glob.directory.metadata(&filename).len(), 1000); + } + assert_eq!(glob.collate(), at.read_bytes(name)); +} + // This is designed to test what happens when the desired part size is not a // multiple of the buffer size and we hopefully don't overshoot the desired part // size. @@ -326,6 +342,19 @@ fn test_split_lines_number() { .stderr_only("split: invalid number of lines: 'file'\n"); } +// Test short lines option with value concatenated +#[test] +fn test_split_lines_short_concatenated_with_value() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "split_num_prefixed_chunks_by_lines"; + RandomFile::new(&at, name).add_lines(10000); + ucmd.args(&["-l1000", name]).succeeds(); + + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 10); + assert_eq!(glob.collate(), at.read_bytes(name)); +} + /// Test for obsolete lines option standalone #[test] fn test_split_obs_lines_standalone() { @@ -692,6 +721,19 @@ fn test_invalid_suffix_length() { .stderr_contains("invalid suffix length: 'xyz'"); } +// Test short suffix length option with value concatenated +#[test] +fn test_split_suffix_length_short_concatenated_with_value() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "split_num_prefixed_chunks_by_lines"; + RandomFile::new(&at, name).add_lines(10000); + ucmd.args(&["-a4", name]).succeeds(); + + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]][[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 10); + assert_eq!(glob.collate(), at.read_bytes(name)); +} + #[test] fn test_include_newlines() { let (at, mut ucmd) = at_and_ucmd!(); @@ -710,6 +752,19 @@ fn test_include_newlines() { assert_eq!(s, "5\n"); } +// Test short number of chunks option concatenated with value +#[test] +fn test_split_number_chunks_short_concatenated_with_value() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["-n3", "threebytes.txt"]) + .succeeds() + .no_stdout() + .no_stderr(); + assert_eq!(at.read("xaa"), "a"); + assert_eq!(at.read("xab"), "b"); + assert_eq!(at.read("xac"), "c"); +} + #[test] fn test_allow_empty_files() { let (at, mut ucmd) = at_and_ucmd!(); @@ -784,6 +839,16 @@ fn test_line_bytes() { assert_eq!(at.read("xad"), "ee\n"); } +#[test] +fn test_line_bytes_concatenated_with_value() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["-C8", "letters.txt"]).succeeds(); + assert_eq!(at.read("xaa"), "aaaaaaaa"); + assert_eq!(at.read("xab"), "a\nbbbb\n"); + assert_eq!(at.read("xac"), "cccc\ndd\n"); + assert_eq!(at.read("xad"), "ee\n"); +} + #[test] fn test_line_bytes_no_final_newline() { let (at, mut ucmd) = at_and_ucmd!(); From 7f905a3b8d6d08c271d71b25dcba3d9559f83959 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 29 Aug 2023 16:35:00 -0400 Subject: [PATCH 15/36] split: edge case for obs lines within combined shorts + test --- src/uu/split/src/split.rs | 9 ++++++++- tests/by-util/test_split.rs | 21 +++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index dae24d36b7..afe635456b 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -88,13 +88,20 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { // that can have obsolete lines option value in it // extract numeric part and filter it out let mut obs_lines_extracted: Vec = vec![]; + let mut obs_lines_end_reached = false; let filtered_slice: Vec = slice .chars() .filter(|c| { - if c.is_ascii_digit() { + // To correctly process scenario like '-x200a4' + // we need to stop extracting digits once alphabetic character is encountered + // after we already have something in obs_lines_extracted + if c.is_ascii_digit() && !obs_lines_end_reached { obs_lines_extracted.push(*c); false } else { + if !obs_lines_extracted.is_empty() { + obs_lines_end_reached = true; + } true } }) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 9fba2177e4..3280fff675 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -170,7 +170,7 @@ fn test_split_str_prefixed_chunks_by_bytes() { assert_eq!(glob.collate(), at.read_bytes(name)); } -// Test short bytes option concatenated with value +/// Test short bytes option concatenated with value #[test] fn test_split_by_bytes_short_concatenated_with_value() { let (at, mut ucmd) = at_and_ucmd!(); @@ -342,7 +342,7 @@ fn test_split_lines_number() { .stderr_only("split: invalid number of lines: 'file'\n"); } -// Test short lines option with value concatenated +/// Test short lines option with value concatenated #[test] fn test_split_lines_short_concatenated_with_value() { let (at, mut ucmd) = at_and_ucmd!(); @@ -401,6 +401,19 @@ fn test_split_obs_lines_within_combined_shorts() { assert_eq!(glob.collate(), at.read_bytes(name)) } +/// Test for obsolete lines option as part of combined short options with tailing suffix length with value +#[test] +fn test_split_obs_lines_within_combined_shorts_tailing_suffix_length() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "obs-lines-combined-shorts-tailing-suffix-length"; + RandomFile::new(&at, name).add_lines(1000); + ucmd.args(&["-d200a4", name]).succeeds(); + + let glob = Glob::new(&at, ".", r"x\d\d\d\d$"); + assert_eq!(glob.count(), 5); + assert_eq!(glob.collate(), at.read_bytes(name)); +} + /// Test for obsolete lines option starts as part of combined short options #[test] fn test_split_obs_lines_starts_combined_shorts() { @@ -721,7 +734,7 @@ fn test_invalid_suffix_length() { .stderr_contains("invalid suffix length: 'xyz'"); } -// Test short suffix length option with value concatenated +/// Test short suffix length option with value concatenated #[test] fn test_split_suffix_length_short_concatenated_with_value() { let (at, mut ucmd) = at_and_ucmd!(); @@ -752,7 +765,7 @@ fn test_include_newlines() { assert_eq!(s, "5\n"); } -// Test short number of chunks option concatenated with value +/// Test short number of chunks option concatenated with value #[test] fn test_split_number_chunks_short_concatenated_with_value() { let (at, mut ucmd) = at_and_ucmd!(); From b2ebe6a1d180fca68b5891ecf542f956e7df8331 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 30 Aug 2023 11:15:11 -0400 Subject: [PATCH 16/36] build-gnu.sh: target GNU release v9.4 --- util/build-gnu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index a6d8de29ef..21a231bc01 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -32,7 +32,7 @@ fi ### -release_tag_GNU="v9.3" +release_tag_GNU="v9.4" if test ! -d "${path_GNU}"; then echo "Could not find GNU coreutils (expected at '${path_GNU}')" From 6f37b4b4cfcbe45725d91d63dadabd1aa9873a2b Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 30 Aug 2023 19:29:57 -0400 Subject: [PATCH 17/36] split: hyphenated values + tests --- src/uu/split/src/split.rs | 50 ++++++++++++++++++++--- tests/by-util/test_split.rs | 81 ++++++++++++++++++++++++++++++++++--- 2 files changed, 120 insertions(+), 11 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index afe635456b..7834dfe682 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -73,11 +73,18 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { /// following GNU `split` behavior fn handle_obsolete(args: &[String]) -> (Vec, Option) { let mut obs_lines = None; + let mut preceding_long_opt_req_value = false; + let mut preceding_short_opt_req_value = false; let filtered_args = args .iter() .filter_map(|slice| { + let filter: Option; + // check if the slice is a true short option (and not hyphen prefixed value of an option) + // and if so, a short option that can contain obsolete lines value if slice.starts_with('-') && !slice.starts_with("--") + && !preceding_long_opt_req_value + && !preceding_short_opt_req_value && !slice.starts_with("-a") && !slice.starts_with("-b") && !slice.starts_with("-C") @@ -109,7 +116,7 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { if obs_lines_extracted.is_empty() { // no obsolete lines value found/extracted - Some(slice.to_owned()) + filter = Some(slice.to_owned()); } else { // obsolete lines value was extracted obs_lines = Some(obs_lines_extracted.iter().collect()); @@ -117,16 +124,41 @@ fn handle_obsolete(args: &[String]) -> (Vec, Option) { // there were some short options in front of or after obsolete lines value // i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value // would look like '-xd' or '-de' or similar - Some(filtered_slice.iter().collect()) + filter = Some(filtered_slice.iter().collect()); } else { - None + filter = None; } } } else { // either not a short option // or a short option that cannot have obsolete lines value in it - Some(slice.to_owned()) + filter = Some(slice.to_owned()); } + // capture if current slice is a preceding long option that requires value and does not use '=' to assign that value + // following slice should be treaded as value for this option + // even if it starts with '-' (which would be treated as hyphen prefixed value) + if slice.starts_with("--") { + preceding_long_opt_req_value = &slice[2..] == OPT_BYTES + || &slice[2..] == OPT_LINE_BYTES + || &slice[2..] == OPT_LINES + || &slice[2..] == OPT_ADDITIONAL_SUFFIX + || &slice[2..] == OPT_FILTER + || &slice[2..] == OPT_NUMBER + || &slice[2..] == OPT_SUFFIX_LENGTH; + } + // capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace) + // following slice should be treaded as value for this option + // even if it starts with '-' (which would be treated as hyphen prefixed value) + preceding_short_opt_req_value = + slice == "-b" || slice == "-C" || slice == "-l" || slice == "-n" || slice == "-a"; + // slice is a value + // reset preceding option flags + if !slice.starts_with('-') { + preceding_short_opt_req_value = false; + preceding_long_opt_req_value = false; + } + // return filter + filter }) .collect(); (filtered_args, obs_lines) @@ -144,6 +176,7 @@ pub fn uu_app() -> Command { Arg::new(OPT_BYTES) .short('b') .long(OPT_BYTES) + .allow_hyphen_values(true) .value_name("SIZE") .help("put SIZE bytes per output file"), ) @@ -151,14 +184,15 @@ pub fn uu_app() -> Command { Arg::new(OPT_LINE_BYTES) .short('C') .long(OPT_LINE_BYTES) + .allow_hyphen_values(true) .value_name("SIZE") - .default_value("2") .help("put at most SIZE bytes of lines per output file"), ) .arg( Arg::new(OPT_LINES) .short('l') .long(OPT_LINES) + .allow_hyphen_values(true) .value_name("NUMBER") .default_value("1000") .help("put NUMBER lines/records per output file"), @@ -167,6 +201,7 @@ pub fn uu_app() -> Command { Arg::new(OPT_NUMBER) .short('n') .long(OPT_NUMBER) + .allow_hyphen_values(true) .value_name("CHUNKS") .help("generate CHUNKS output files; see explanation below"), ) @@ -174,6 +209,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(OPT_ADDITIONAL_SUFFIX) .long(OPT_ADDITIONAL_SUFFIX) + .allow_hyphen_values(true) .value_name("SUFFIX") .default_value("") .help("additional SUFFIX to append to output file names"), @@ -181,6 +217,7 @@ pub fn uu_app() -> Command { .arg( Arg::new(OPT_FILTER) .long(OPT_FILTER) + .allow_hyphen_values(true) .value_name("COMMAND") .value_hint(clap::ValueHint::CommandName) .help( @@ -250,9 +287,10 @@ pub fn uu_app() -> Command { Arg::new(OPT_SUFFIX_LENGTH) .short('a') .long(OPT_SUFFIX_LENGTH) + .allow_hyphen_values(true) .value_name("N") .default_value(OPT_DEFAULT_SUFFIX_LENGTH) - .help("use suffixes of fixed length N. 0 implies dynamic length."), + .help("use suffixes of fixed length N. 0 implies dynamic length, starting with 2"), ) .arg( Arg::new(OPT_VERBOSE) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 3280fff675..466dabda9a 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -254,6 +254,18 @@ fn test_additional_suffix_no_slash() { .usage_error("invalid suffix 'a/b', contains directory separator"); } +#[test] +fn test_split_additional_suffix_hyphen_value() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "split_additional_suffix"; + RandomFile::new(&at, name).add_lines(2000); + ucmd.args(&["--additional-suffix", "-300", name]).succeeds(); + + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]-300$"); + assert_eq!(glob.count(), 2); + assert_eq!(glob.collate(), at.read_bytes(name)); +} + // note: the test_filter* tests below are unix-only // windows support has been waived for now because of the difficulty of getting // the `cmd` call right @@ -436,9 +448,9 @@ fn test_split_obs_lines_starts_combined_shorts() { /// Test for using both obsolete lines (standalone) option and short/long lines option simultaneously #[test] fn test_split_both_lines_and_obs_lines_standalone() { - // This test will ensure that if both lines option '-l' or '--lines' - // and obsolete lines option '-100' are used - // it fails + // This test will ensure that: + // if both lines option '-l' or '--lines' (with value) and obsolete lines option '-100' are used - it fails + // if standalone lines option is used incorrectly and treated as a hyphen prefixed value of other option - it fails let scene = TestScenario::new(util_name!()); let at = &scene.fixtures; at.touch("file"); @@ -455,18 +467,77 @@ fn test_split_both_lines_and_obs_lines_standalone() { .fails() .code_is(1) .stderr_contains("split: cannot split in more than one way\n"); +} + +/// Test for using obsolete lines option incorrectly, so it is treated as a hyphen prefixed value of other option +#[test] +fn test_split_obs_lines_as_other_option_value() { + // This test will ensure that: + // if obsolete lines option is used incorrectly and treated as a hyphen prefixed value of other option - it fails + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("file"); + scene .ucmd() .args(&["--lines", "-200", "file"]) .fails() .code_is(1) - .stderr_contains("split: cannot split in more than one way\n"); + .stderr_contains("split: invalid number of lines: '-200'\n"); scene .ucmd() .args(&["-l", "-200", "file"]) .fails() .code_is(1) - .stderr_contains("split: cannot split in more than one way\n"); + .stderr_contains("split: invalid number of lines: '-200'\n"); + scene + .ucmd() + .args(&["-a", "-200", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid suffix length: '-200'\n"); + scene + .ucmd() + .args(&["--suffix-length", "-d200e", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid suffix length: '-d200e'\n"); + scene + .ucmd() + .args(&["-C", "-200", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid number of bytes: '-200'\n"); + scene + .ucmd() + .args(&["--line-bytes", "-x200a4", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid number of bytes: '-x200a4'\n"); + scene + .ucmd() + .args(&["-b", "-200", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid number of bytes: '-200'\n"); + scene + .ucmd() + .args(&["--bytes", "-200xd", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid number of bytes: '-200xd'\n"); + scene + .ucmd() + .args(&["-n", "-200", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid number of chunks: -200\n"); + scene + .ucmd() + .args(&["--number", "-e200", "file"]) + .fails() + .code_is(1) + .stderr_contains("split: invalid number of chunks: -e200\n"); } /// Test for using more than one obsolete lines option (standalone) From 843540d05fafe427ed34d7b8b546357d327d3df5 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Thu, 31 Aug 2023 09:21:59 +0200 Subject: [PATCH 18/36] fix a typo --- util/build-gnu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 21a231bc01..157265f61b 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -38,7 +38,7 @@ if test ! -d "${path_GNU}"; then echo "Could not find GNU coreutils (expected at '${path_GNU}')" echo "Run the following to download into the expected path:" echo "git clone --recurse-submodules https://github.com/coreutils/coreutils.git \"${path_GNU}\"" - echo "After downloading GNU coreutils to \"${path_GNU}\" run the following commands to cheout latest release tag" + echo "After downloading GNU coreutils to \"${path_GNU}\" run the following commands to checkout latest release tag" echo "cd \"${path_GNU}\"" echo "git fetch --all --tags" echo "git checkout tags/${release_tag_GNU}" From 5bfe9b19ef775adf40c3e915a731611dbc4a6256 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Thu, 31 Aug 2023 14:46:56 -0400 Subject: [PATCH 19/36] split: avoid using `collect_lossy` + test for invalid UTF8 arguments --- src/uu/split/src/split.rs | 167 +++++++++++++++++++----------------- tests/by-util/test_split.rs | 45 ++++++++++ 2 files changed, 134 insertions(+), 78 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 7834dfe682..68692d03c0 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -13,6 +13,7 @@ use crate::filenames::FilenameIterator; use crate::filenames::SuffixType; use clap::{crate_version, parser::ValueSource, Arg, ArgAction, ArgMatches, Command}; use std::env; +use std::ffi::OsString; use std::fmt; use std::fs::{metadata, File}; use std::io; @@ -52,9 +53,8 @@ const AFTER_HELP: &str = help_section!("after help", "split.md"); #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let args = args.collect_lossy(); - - let (args, obs_lines) = handle_obsolete(&args[..]); + + let (args, obs_lines) = handle_obsolete(args); let matches = uu_app().try_get_matches_from(args)?; @@ -71,91 +71,102 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { /// `split -x300e file` would mean `split -x -l 300 -e file` /// `split -x300e -22 file` would mean `split -x -e -l 22 file` (last obsolete lines option wins) /// following GNU `split` behavior -fn handle_obsolete(args: &[String]) -> (Vec, Option) { +fn handle_obsolete(args: impl uucore::Args) -> (Vec, Option) { let mut obs_lines = None; let mut preceding_long_opt_req_value = false; let mut preceding_short_opt_req_value = false; let filtered_args = args - .iter() - .filter_map(|slice| { - let filter: Option; - // check if the slice is a true short option (and not hyphen prefixed value of an option) - // and if so, a short option that can contain obsolete lines value - if slice.starts_with('-') - && !slice.starts_with("--") - && !preceding_long_opt_req_value - && !preceding_short_opt_req_value - && !slice.starts_with("-a") - && !slice.starts_with("-b") - && !slice.starts_with("-C") - && !slice.starts_with("-l") - && !slice.starts_with("-n") - { - // start of the short option string - // that can have obsolete lines option value in it - // extract numeric part and filter it out - let mut obs_lines_extracted: Vec = vec![]; - let mut obs_lines_end_reached = false; - let filtered_slice: Vec = slice - .chars() - .filter(|c| { - // To correctly process scenario like '-x200a4' - // we need to stop extracting digits once alphabetic character is encountered - // after we already have something in obs_lines_extracted - if c.is_ascii_digit() && !obs_lines_end_reached { - obs_lines_extracted.push(*c); - false - } else { - if !obs_lines_extracted.is_empty() { - obs_lines_end_reached = true; + .filter_map(|os_slice| { + let filter: Option; + if let Some(slice) = os_slice.to_str() { + // check if the slice is a true short option (and not hyphen prefixed value of an option) + // and if so, a short option that can contain obsolete lines value + if slice.starts_with('-') + && !slice.starts_with("--") + && !preceding_long_opt_req_value + && !preceding_short_opt_req_value + && !slice.starts_with("-a") + && !slice.starts_with("-b") + && !slice.starts_with("-C") + && !slice.starts_with("-l") + && !slice.starts_with("-n") + { + // start of the short option string + // that can have obsolete lines option value in it + // extract numeric part and filter it out + let mut obs_lines_extracted: Vec = vec![]; + let mut obs_lines_end_reached = false; + let filtered_slice: Vec = slice + .chars() + .filter(|c| { + // To correctly process scenario like '-x200a4' + // we need to stop extracting digits once alphabetic character is encountered + // after we already have something in obs_lines_extracted + if c.is_ascii_digit() && !obs_lines_end_reached { + obs_lines_extracted.push(*c); + false + } else { + if !obs_lines_extracted.is_empty() { + obs_lines_end_reached = true; + } + true } - true - } - }) - .collect(); + }) + .collect(); - if obs_lines_extracted.is_empty() { - // no obsolete lines value found/extracted - filter = Some(slice.to_owned()); - } else { - // obsolete lines value was extracted - obs_lines = Some(obs_lines_extracted.iter().collect()); - if filtered_slice.get(1).is_some() { - // there were some short options in front of or after obsolete lines value - // i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value - // would look like '-xd' or '-de' or similar - filter = Some(filtered_slice.iter().collect()); + if obs_lines_extracted.is_empty() { + // no obsolete lines value found/extracted + filter = Some(OsString::from(slice)); } else { - filter = None; + // obsolete lines value was extracted + obs_lines = Some(obs_lines_extracted.iter().collect()); + if filtered_slice.get(1).is_some() { + // there were some short options in front of or after obsolete lines value + // i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value + // would look like '-xd' or '-de' or similar + let filtered_slice: String = filtered_slice.iter().collect(); + filter = Some(OsString::from(filtered_slice)); + } else { + filter = None; + } } + } else { + // either not a short option + // or a short option that cannot have obsolete lines value in it + filter = Some(OsString::from(slice)); + } + // capture if current slice is a preceding long option that requires value and does not use '=' to assign that value + // following slice should be treaded as value for this option + // even if it starts with '-' (which would be treated as hyphen prefixed value) + if slice.starts_with("--") { + preceding_long_opt_req_value = &slice[2..] == OPT_BYTES + || &slice[2..] == OPT_LINE_BYTES + || &slice[2..] == OPT_LINES + || &slice[2..] == OPT_ADDITIONAL_SUFFIX + || &slice[2..] == OPT_FILTER + || &slice[2..] == OPT_NUMBER + || &slice[2..] == OPT_SUFFIX_LENGTH; + } + // capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace) + // following slice should be treaded as value for this option + // even if it starts with '-' (which would be treated as hyphen prefixed value) + preceding_short_opt_req_value = slice == "-b" + || slice == "-C" + || slice == "-l" + || slice == "-n" + || slice == "-a"; + // slice is a value + // reset preceding option flags + if !slice.starts_with('-') { + preceding_short_opt_req_value = false; + preceding_long_opt_req_value = false; } } else { - // either not a short option - // or a short option that cannot have obsolete lines value in it - filter = Some(slice.to_owned()); - } - // capture if current slice is a preceding long option that requires value and does not use '=' to assign that value - // following slice should be treaded as value for this option - // even if it starts with '-' (which would be treated as hyphen prefixed value) - if slice.starts_with("--") { - preceding_long_opt_req_value = &slice[2..] == OPT_BYTES - || &slice[2..] == OPT_LINE_BYTES - || &slice[2..] == OPT_LINES - || &slice[2..] == OPT_ADDITIONAL_SUFFIX - || &slice[2..] == OPT_FILTER - || &slice[2..] == OPT_NUMBER - || &slice[2..] == OPT_SUFFIX_LENGTH; - } - // capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace) - // following slice should be treaded as value for this option - // even if it starts with '-' (which would be treated as hyphen prefixed value) - preceding_short_opt_req_value = - slice == "-b" || slice == "-C" || slice == "-l" || slice == "-n" || slice == "-a"; - // slice is a value - // reset preceding option flags - if !slice.starts_with('-') { - preceding_short_opt_req_value = false; - preceding_long_opt_req_value = false; + // Cannot cleanly convert os_slice to UTF-8 + // Do not process and return as-is + // This will cause failure later on, but we should not handle it here + // and let clap panic on invalid UTF-8 argument + filter = Some(os_slice); } // return filter filter diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 466dabda9a..d24d2cf54d 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -9,6 +9,7 @@ use rand::{thread_rng, Rng, SeedableRng}; use regex::Regex; #[cfg(not(windows))] use std::env; +use std::ffi::OsStr; use std::path::Path; use std::{ fs::{read_dir, File}, @@ -1287,3 +1288,47 @@ fn test_split_invalid_input() { .no_stdout() .stderr_contains("split: invalid number of chunks: 0"); } + +/// Test if there are invalid (non UTF-8) in the arguments - unix +/// clap is expected to fail/panic +#[cfg(unix)] +#[test] +fn test_split_non_utf8_argument_unix() { + use std::os::unix::ffi::OsStrExt; + + let (at, mut ucmd) = at_and_ucmd!(); + let name = "test_split_non_utf8_argument"; + let opt = OsStr::from_bytes("--additional-suffix".as_bytes()); + RandomFile::new(&at, name).add_lines(2000); + // Here, the values 0x66 and 0x6f correspond to 'f' and 'o' + // respectively. The value 0x80 is a lone continuation byte, invalid + // in a UTF-8 sequence. + let opt_value = [0x66, 0x6f, 0x80, 0x6f]; + let opt_value = OsStr::from_bytes(&opt_value[..]); + let name = OsStr::from_bytes(name.as_bytes()); + ucmd.args(&[opt, opt_value, name]) + .fails() + .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); +} + +/// Test if there are invalid (non UTF-8) in the arguments - windows +/// clap is expected to fail/panic +#[cfg(windows)] +#[test] +fn test_split_non_utf8_argument_windows() { + use std::os::windows::prelude::*; + + let (at, mut ucmd) = at_and_ucmd!(); + let name = "test_split_non_utf8_argument"; + let opt = OsStr::from_bytes("--additional-suffix".as_bytes()); + RandomFile::new(&at, name).add_lines(2000); + // Here the values 0x0066 and 0x006f correspond to 'f' and 'o' + // respectively. The value 0xD800 is a lone surrogate half, invalid + // in a UTF-16 sequence. + let opt_value = [0x0066, 0x006f, 0xD800, 0x006f]; + let opt_value = OsString::from_wide(&opt_value[..]); + let name = OsStr::from_bytes(name.as_bytes()); + ucmd.args(&[opt, opt_value, name]) + .fails() + .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); +} From 271a108fa9e1ea75ed8c6e42c2a207a6da935c81 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Thu, 31 Aug 2023 15:37:42 -0400 Subject: [PATCH 20/36] split: formatting --- src/uu/split/src/split.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 68692d03c0..d76bfb2de7 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -53,7 +53,6 @@ const AFTER_HELP: &str = help_section!("after help", "split.md"); #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { - let (args, obs_lines) = handle_obsolete(args); let matches = uu_app().try_get_matches_from(args)?; From d2812cbbc387b7dc3ad8f5f31863aad09ce1e6ad Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Thu, 31 Aug 2023 16:04:44 -0400 Subject: [PATCH 21/36] split: disable windows test for invalid UTF8 --- tests/by-util/test_split.rs | 46 +++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index d24d2cf54d..aef9ea040e 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -9,7 +9,6 @@ use rand::{thread_rng, Rng, SeedableRng}; use regex::Regex; #[cfg(not(windows))] use std::env; -use std::ffi::OsStr; use std::path::Path; use std::{ fs::{read_dir, File}, @@ -1294,6 +1293,7 @@ fn test_split_invalid_input() { #[cfg(unix)] #[test] fn test_split_non_utf8_argument_unix() { + use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; let (at, mut ucmd) = at_and_ucmd!(); @@ -1311,24 +1311,26 @@ fn test_split_non_utf8_argument_unix() { .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); } -/// Test if there are invalid (non UTF-8) in the arguments - windows -/// clap is expected to fail/panic -#[cfg(windows)] -#[test] -fn test_split_non_utf8_argument_windows() { - use std::os::windows::prelude::*; - - let (at, mut ucmd) = at_and_ucmd!(); - let name = "test_split_non_utf8_argument"; - let opt = OsStr::from_bytes("--additional-suffix".as_bytes()); - RandomFile::new(&at, name).add_lines(2000); - // Here the values 0x0066 and 0x006f correspond to 'f' and 'o' - // respectively. The value 0xD800 is a lone surrogate half, invalid - // in a UTF-16 sequence. - let opt_value = [0x0066, 0x006f, 0xD800, 0x006f]; - let opt_value = OsString::from_wide(&opt_value[..]); - let name = OsStr::from_bytes(name.as_bytes()); - ucmd.args(&[opt, opt_value, name]) - .fails() - .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); -} +// Test if there are invalid (non UTF-8) in the arguments - windows +// clap is expected to fail/panic +// comment it out for now +// #[cfg(windows)] +// #[test] +// fn test_split_non_utf8_argument_windows() { +// use std::ffi::OsString; +// use std::os::windows::prelude::*; + +// let (at, mut ucmd) = at_and_ucmd!(); +// let name = "test_split_non_utf8_argument"; +// let opt = OsStr::from_bytes("--additional-suffix".as_bytes()); +// RandomFile::new(&at, name).add_lines(2000); +// // Here the values 0x0066 and 0x006f correspond to 'f' and 'o' +// // respectively. The value 0xD800 is a lone surrogate half, invalid +// // in a UTF-16 sequence. +// let opt_value = [0x0066, 0x006f, 0xD800, 0x006f]; +// let opt_value = OsString::from_wide(&opt_value[..]); +// let name = OsStr::from_bytes(name.as_bytes()); +// ucmd.args(&[opt, opt_value, name]) +// .fails() +// .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); +// } From e597189be7555164fc7f57df0b03f1ae788901ea Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Thu, 31 Aug 2023 20:48:44 -0400 Subject: [PATCH 22/36] split: fixed windows test for invalid unicode args --- tests/by-util/test_split.rs | 47 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index aef9ea040e..965f2733e2 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -1290,8 +1290,8 @@ fn test_split_invalid_input() { /// Test if there are invalid (non UTF-8) in the arguments - unix /// clap is expected to fail/panic -#[cfg(unix)] #[test] +#[cfg(unix)] fn test_split_non_utf8_argument_unix() { use std::ffi::OsStr; use std::os::unix::ffi::OsStrExt; @@ -1311,26 +1311,25 @@ fn test_split_non_utf8_argument_unix() { .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); } -// Test if there are invalid (non UTF-8) in the arguments - windows -// clap is expected to fail/panic -// comment it out for now -// #[cfg(windows)] -// #[test] -// fn test_split_non_utf8_argument_windows() { -// use std::ffi::OsString; -// use std::os::windows::prelude::*; - -// let (at, mut ucmd) = at_and_ucmd!(); -// let name = "test_split_non_utf8_argument"; -// let opt = OsStr::from_bytes("--additional-suffix".as_bytes()); -// RandomFile::new(&at, name).add_lines(2000); -// // Here the values 0x0066 and 0x006f correspond to 'f' and 'o' -// // respectively. The value 0xD800 is a lone surrogate half, invalid -// // in a UTF-16 sequence. -// let opt_value = [0x0066, 0x006f, 0xD800, 0x006f]; -// let opt_value = OsString::from_wide(&opt_value[..]); -// let name = OsStr::from_bytes(name.as_bytes()); -// ucmd.args(&[opt, opt_value, name]) -// .fails() -// .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); -// } +/// Test if there are invalid (non UTF-8) in the arguments - windows +/// clap is expected to fail/panic +#[test] +#[cfg(windows)] +fn test_split_non_utf8_argument_windows() { + use std::ffi::OsString; + use std::os::windows::ffi::OsStringExt; + + let (at, mut ucmd) = at_and_ucmd!(); + let name = "test_split_non_utf8_argument"; + let opt = OsString::from("--additional-suffix"); + RandomFile::new(&at, name).add_lines(2000); + // Here the values 0x0066 and 0x006f correspond to 'f' and 'o' + // respectively. The value 0xD800 is a lone surrogate half, invalid + // in a UTF-16 sequence. + let opt_value = [0x0066, 0x006f, 0xD800, 0x006f]; + let opt_value = OsString::from_wide(&opt_value[..]); + let name = OsString::from(name); + ucmd.args(&[opt, opt_value, name]) + .fails() + .stderr_contains("error: invalid UTF-8 was detected in one or more arguments"); +} From 636c2bb7ae9444551d900d50829eb87414a23f8f Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Mon, 4 Sep 2023 12:05:26 -0400 Subject: [PATCH 23/36] uucore: parse_size_max and split --- src/uu/split/src/split.rs | 28 ++++++++-------------- src/uucore/src/lib/parser/parse_size.rs | 32 +++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index d76bfb2de7..e39f6e93ff 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -21,7 +21,7 @@ use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Write}; use std::path::Path; use uucore::display::Quotable; use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError}; -use uucore::parse_size::{parse_size, ParseSizeError}; +use uucore::parse_size::{parse_size_max, ParseSizeError}; use uucore::uio_error; use uucore::{format_usage, help_about, help_section, help_usage}; @@ -419,8 +419,7 @@ impl NumberType { let parts: Vec<&str> = s.split('/').collect(); match &parts[..] { [n_str] => { - let num_chunks = n_str - .parse() + let num_chunks = parse_size_max(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; if num_chunks > 0 { Ok(Self::Bytes(num_chunks)) @@ -429,32 +428,26 @@ impl NumberType { } } ["l", n_str] => { - let num_chunks = n_str - .parse() + let num_chunks = parse_size_max(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; Ok(Self::Lines(num_chunks)) } ["l", k_str, n_str] => { - let num_chunks = n_str - .parse() + let num_chunks = parse_size_max(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - let chunk_number = k_str - .parse() + let chunk_number = parse_size_max(k_str) .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; Ok(Self::KthLines(chunk_number, num_chunks)) } ["r", n_str] => { - let num_chunks = n_str - .parse() + let num_chunks = parse_size_max(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; Ok(Self::RoundRobin(num_chunks)) } ["r", k_str, n_str] => { - let num_chunks = n_str - .parse() + let num_chunks = parse_size_max(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - let chunk_number = k_str - .parse() + let chunk_number = parse_size_max(k_str) .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; Ok(Self::KthRoundRobin(chunk_number, num_chunks)) } @@ -523,7 +516,7 @@ impl Strategy { error: fn(ParseSizeError) -> StrategyError, ) -> Result { let s = matches.get_one::(option).unwrap(); - let n = parse_size(s).map_err(error)?; + let n = parse_size_max(s).map_err(error)?; if n > 0 { Ok(strategy(n)) } else { @@ -542,7 +535,7 @@ impl Strategy { matches.value_source(OPT_NUMBER) == Some(ValueSource::CommandLine), ) { (Some(v), false, false, false, false) => { - let v = parse_size(v).map_err(|_| { + let v = parse_size_max(v).map_err(|_| { StrategyError::Lines(ParseSizeError::ParseFailure(v.to_string())) })?; Ok(Self::Lines(v)) @@ -687,7 +680,6 @@ impl Settings { if additional_suffix.contains('/') { return Err(SettingsError::SuffixContainsSeparator(additional_suffix)); } - let strategy = Strategy::from(matches, obs_lines).map_err(SettingsError::Strategy)?; let (suffix_type, suffix_start) = suffix_type_from(matches)?; let suffix_length_str = matches.get_one::(OPT_SUFFIX_LENGTH).unwrap(); diff --git a/src/uucore/src/lib/parser/parse_size.rs b/src/uucore/src/lib/parser/parse_size.rs index 5f64afcd8a..63039e6093 100644 --- a/src/uucore/src/lib/parser/parse_size.rs +++ b/src/uucore/src/lib/parser/parse_size.rs @@ -7,6 +7,7 @@ use std::error::Error; use std::fmt; +use std::num::IntErrorKind; use crate::display::Quotable; @@ -201,8 +202,10 @@ impl<'parser> Parser<'parser> { radix: u32, original_size: &str, ) -> Result { - u64::from_str_radix(numeric_string, radix) - .map_err(|_| ParseSizeError::ParseFailure(original_size.to_string())) + u64::from_str_radix(numeric_string, radix).map_err(|e| match e.kind() { + IntErrorKind::PosOverflow => ParseSizeError::size_too_big(original_size), + _ => ParseSizeError::ParseFailure(original_size.to_string()), + }) } } @@ -232,6 +235,23 @@ pub fn parse_size(size: &str) -> Result { Parser::default().parse(size) } +/// Same as `parse_size()`, except returns `u64::MAX` on overflow +/// GNU lib/coreutils include similar functionality +/// and GNU test suite checks this behavior for some utils +pub fn parse_size_max(size: &str) -> Result { + let result = Parser::default().parse(size); + match result { + Ok(_) => result, + Err(error) => { + if let ParseSizeError::SizeTooBig(_) = error { + Ok(u64::MAX) + } else { + Err(error) + } + } + } +} + #[derive(Debug, PartialEq, Eq)] pub enum ParseSizeError { InvalidSuffix(String), // Suffix @@ -392,6 +412,14 @@ mod tests { ); } + #[test] + #[cfg(not(target_pointer_width = "128"))] + fn overflow_to_max_x64() { + assert_eq!(Ok(u64::MAX), parse_size_max("18446744073709551616")); + assert_eq!(Ok(u64::MAX), parse_size_max("10000000000000000000000")); + assert_eq!(Ok(u64::MAX), parse_size_max("1Y")); + } + #[test] fn invalid_suffix() { let test_strings = ["5mib", "1eb", "1H"]; From 2ae1d8d1ccd7c4fff24b51b8d914cda2be9de006 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 5 Sep 2023 17:13:30 -0400 Subject: [PATCH 24/36] split: missing functionality for --number option --- src/uu/split/src/split.rs | 218 ++++++++++++++++++++++-- src/uucore/src/lib/parser/parse_size.rs | 2 +- 2 files changed, 204 insertions(+), 16 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index e39f6e93ff..23520f7092 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -19,9 +19,10 @@ use std::fs::{metadata, File}; use std::io; use std::io::{stdin, BufRead, BufReader, BufWriter, ErrorKind, Read, Write}; use std::path::Path; +use std::u64; use uucore::display::Quotable; use uucore::error::{FromIo, UIoError, UResult, USimpleError, UUsageError}; -use uucore::parse_size::{parse_size_max, ParseSizeError}; +use uucore::parse_size::{parse_size, parse_size_max, ParseSizeError}; use uucore::uio_error; use uucore::{format_usage, help_about, help_section, help_usage}; @@ -337,6 +338,10 @@ enum NumberType { /// Split into a specific number of chunks by byte. Bytes(u64), + /// Split into a specific number of chunks by byte + /// but output only the *k*th chunk. + KthBytes(u64, u64), + /// Split into a specific number of chunks by line (approximately). Lines(u64), @@ -349,7 +354,7 @@ enum NumberType { /// Assign lines via round-robin to the specified number of output /// chunks, but output only the *k*th chunk. - KthRoundRobin(u64, u64), + KthRoundRobin(u64, u64), // not yet implemented? } impl NumberType { @@ -357,6 +362,7 @@ impl NumberType { fn num_chunks(&self) -> u64 { match self { Self::Bytes(n) => *n, + Self::KthBytes(_, n) => *n, Self::Lines(n) => *n, Self::KthLines(_, n) => *n, Self::RoundRobin(n) => *n, @@ -375,6 +381,7 @@ enum NumberTypeError { /// /// ```ignore /// -n N + /// -n K/N /// -n l/N /// -n l/K/N /// -n r/N @@ -385,9 +392,12 @@ enum NumberTypeError { /// The chunk number was invalid. /// /// This can happen if the value of `K` in any of the following - /// command-line options is not a positive integer: + /// command-line options is not a positive integer + /// or if `K` is 0 + /// or if `K` is greater than `N`: /// /// ```ignore + /// -n K/N /// -n l/K/N /// -n r/K/N /// ``` @@ -401,6 +411,7 @@ impl NumberType { /// /// ```ignore /// "N" + /// "K/N" /// "l/N" /// "l/K/N" /// "r/N" @@ -412,14 +423,17 @@ impl NumberType { /// /// # Errors /// - /// If the string is not one of the valid number types, if `K` is - /// not a nonnegative integer, or if `N` is not a positive - /// integer, then this function returns [`NumberTypeError`]. + /// If the string is not one of the valid number types, + /// if `K` is not a nonnegative integer, + /// or if `K` is 0, + /// or if `N` is not a positive integer, + /// or if `K` is greater than `N` + /// then this function returns [`NumberTypeError`]. fn from(s: &str) -> Result { let parts: Vec<&str> = s.split('/').collect(); match &parts[..] { [n_str] => { - let num_chunks = parse_size_max(n_str) + let num_chunks = parse_size(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; if num_chunks > 0 { Ok(Self::Bytes(num_chunks)) @@ -427,28 +441,44 @@ impl NumberType { Err(NumberTypeError::NumberOfChunks(s.to_string())) } } + [k_str, n_str] if !k_str.starts_with('l') && !k_str.starts_with('r') => { + let num_chunks = parse_size(n_str) + .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; + let chunk_number = parse_size(k_str) + .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; + if chunk_number > num_chunks || chunk_number == 0 { + return Err(NumberTypeError::ChunkNumber(k_str.to_string())); + } + Ok(Self::KthBytes(chunk_number, num_chunks)) + } ["l", n_str] => { - let num_chunks = parse_size_max(n_str) + let num_chunks = parse_size(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; Ok(Self::Lines(num_chunks)) } ["l", k_str, n_str] => { - let num_chunks = parse_size_max(n_str) + let num_chunks = parse_size(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - let chunk_number = parse_size_max(k_str) + let chunk_number = parse_size(k_str) .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; + if chunk_number > num_chunks || chunk_number == 0 { + return Err(NumberTypeError::ChunkNumber(k_str.to_string())); + } Ok(Self::KthLines(chunk_number, num_chunks)) } ["r", n_str] => { - let num_chunks = parse_size_max(n_str) + let num_chunks = parse_size(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; Ok(Self::RoundRobin(num_chunks)) } ["r", k_str, n_str] => { - let num_chunks = parse_size_max(n_str) + let num_chunks = parse_size(n_str) .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; - let chunk_number = parse_size_max(k_str) + let chunk_number = parse_size(k_str) .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; + if chunk_number > num_chunks || chunk_number == 0 { + return Err(NumberTypeError::ChunkNumber(k_str.to_string())); + } Ok(Self::KthRoundRobin(chunk_number, num_chunks)) } _ => Err(NumberTypeError::NumberOfChunks(s.to_string())), @@ -538,7 +568,13 @@ impl Strategy { let v = parse_size_max(v).map_err(|_| { StrategyError::Lines(ParseSizeError::ParseFailure(v.to_string())) })?; - Ok(Self::Lines(v)) + if v > 0 { + Ok(Self::Lines(v)) + } else { + Err(StrategyError::Lines(ParseSizeError::ParseFailure( + v.to_string(), + ))) + } } (None, false, false, false, false) => Ok(Self::Lines(1000)), (None, true, false, false, false) => { @@ -1263,6 +1299,93 @@ where } } +/// Print the k-th chunk of a file to stdout, splitting by byte. +/// +/// This function is like [`split_into_n_chunks_by_byte`], but instead +/// of writing each chunk to its own file, it only writes to stdout +/// the contents of the chunk identified by `chunk_number` +/// +/// # Errors +/// +/// This function returns an error if there is a problem reading from +/// `reader` or writing to stdout. +fn kth_chunks_by_byte( + settings: &Settings, + reader: &mut R, + chunk_number: u64, + num_chunks: u64, +) -> UResult<()> +where + R: BufRead, +{ + // Get the size of the input file in bytes and compute the number + // of bytes per chunk. + // + // If the requested number of chunks exceeds the number of bytes + // in the file - just write empty byte string to stdout + // NOTE: the `elide_empty_files` parameter is ignored here + // as we do not generate any files + // and instead writing to stdout + let metadata = metadata(&settings.input).map_err(|_| { + USimpleError::new(1, format!("{}: cannot determine file size", settings.input)) + })?; + + let num_bytes = metadata.len(); + // If input file is empty and we would have written zero chunks of output, + // then terminate immediately. + // This happens on `split -e -n 3 /dev/null`, for example. + if num_bytes == 0 { + return Ok(()); + } + + // Write to stdout instead of to a file. + let stdout = std::io::stdout(); + let mut writer = stdout.lock(); + + let chunk_size = (num_bytes / (num_chunks)).max(1); + let mut num_bytes: usize = num_bytes.try_into().unwrap(); + + let mut i = 1; + loop { + let buf: &mut Vec = &mut vec![]; + if num_bytes > 0 { + // Read `chunk_size` bytes from the reader into `buf` + // except the last. + // + // The last chunk gets all remaining bytes so that if the number + // of bytes in the input file was not evenly divisible by + // `num_chunks`, we don't leave any bytes behind. + let limit = { + if i == num_chunks { + num_bytes.try_into().unwrap() + } else { + chunk_size + } + }; + let n_bytes_read = reader.by_ref().take(limit).read_to_end(buf); + match n_bytes_read { + Ok(n_bytes) => { + num_bytes -= n_bytes; + } + Err(error) => { + return Err(USimpleError::new( + 1, + format!("{}: cannot read from input : {}", settings.input, error), + )); + } + } + if i == chunk_number { + writer.write_all(buf)?; + break; + } + i += 1; + } else { + break; + } + } + Ok(()) +} + /// Split a file into a specific number of chunks by line. /// /// This function always creates one output file for each chunk, even @@ -1438,6 +1561,50 @@ where Ok(()) } +/// Print the k-th chunk of a file, splitting by line, but +/// assign lines via round-robin to the specified number of output +/// chunks, but output only the *k*th chunk. +/// +/// This function is like [`kth_chunk_by_line`], as it only writes to stdout and +/// prints out only *k*th chunk +/// It is also like [`split_into_n_chunks_by_line_round_robin`], as it is assigning chunks +/// using round robin distribution +/// +/// # Errors +/// +/// This function returns an error if there is a problem reading from +/// `reader` or writing to one of the output files. +/// +/// # See also +/// +/// * [`split_into_n_chunks_by_line_round_robin`], which splits its input in the +/// same way, but writes each chunk to its own file. +fn kth_chunk_by_line_round_robin( + _settings: &Settings, + reader: &mut R, + chunk_number: u64, + num_chunks: u64, +) -> UResult<()> +where + R: BufRead, +{ + // Write to stdout instead of to a file. + let stdout = std::io::stdout(); + let mut writer = stdout.lock(); + + let num_chunks: usize = num_chunks.try_into().unwrap(); + let chunk_number: usize = chunk_number.try_into().unwrap(); + for (i, line_result) in reader.lines().enumerate() { + let line = line_result?; + let bytes = line.as_bytes(); + if (i % num_chunks) == chunk_number { + writer.write_all(bytes)?; + writer.write_all(b"\n")?; + } + } + Ok(()) +} + fn split(settings: &Settings) -> UResult<()> { let mut reader = BufReader::new(if settings.input == "-" { Box::new(stdin()) as Box @@ -1455,6 +1622,9 @@ fn split(settings: &Settings) -> UResult<()> { Strategy::Number(NumberType::Bytes(num_chunks)) => { split_into_n_chunks_by_byte(settings, &mut reader, num_chunks) } + Strategy::Number(NumberType::KthBytes(chunk_number, num_chunks)) => { + kth_chunks_by_byte(settings, &mut reader, chunk_number, num_chunks) + } Strategy::Number(NumberType::Lines(num_chunks)) => { split_into_n_chunks_by_line(settings, &mut reader, num_chunks) } @@ -1467,7 +1637,12 @@ fn split(settings: &Settings) -> UResult<()> { Strategy::Number(NumberType::RoundRobin(num_chunks)) => { split_into_n_chunks_by_line_round_robin(settings, &mut reader, num_chunks) } - Strategy::Number(_) => Err(USimpleError::new(1, "-n mode not yet fully implemented")), + Strategy::Number(NumberType::KthRoundRobin(chunk_number, num_chunks)) => { + // The chunk number is given as a 1-indexed number, but it + // is a little easier to deal with a 0-indexed number. + let chunk_number = chunk_number - 1; + kth_chunk_by_line_round_robin(settings, &mut reader, chunk_number, num_chunks) + } Strategy::Lines(chunk_size) => { let mut writer = LineChunkWriter::new(chunk_size, settings)?; match std::io::copy(&mut reader, &mut writer) { @@ -1570,6 +1745,18 @@ mod tests { NumberType::from("l/abc/456").unwrap_err(), NumberTypeError::ChunkNumber("abc".to_string()) ); + assert_eq!( + NumberType::from("l/456/123").unwrap_err(), + NumberTypeError::ChunkNumber("456".to_string()) + ); + assert_eq!( + NumberType::from("r/456/123").unwrap_err(), + NumberTypeError::ChunkNumber("456".to_string()) + ); + assert_eq!( + NumberType::from("456/123").unwrap_err(), + NumberTypeError::ChunkNumber("456".to_string()) + ); // In GNU split, the number of chunks get precedence: // // $ split -n l/abc/xyz @@ -1605,6 +1792,7 @@ mod tests { #[test] fn test_number_type_num_chunks() { assert_eq!(NumberType::from("123").unwrap().num_chunks(), 123); + assert_eq!(NumberType::from("123/456").unwrap().num_chunks(), 456); assert_eq!(NumberType::from("l/123").unwrap().num_chunks(), 123); assert_eq!(NumberType::from("l/123/456").unwrap().num_chunks(), 456); assert_eq!(NumberType::from("r/123").unwrap().num_chunks(), 123); diff --git a/src/uucore/src/lib/parser/parse_size.rs b/src/uucore/src/lib/parser/parse_size.rs index 63039e6093..4d9968bb73 100644 --- a/src/uucore/src/lib/parser/parse_size.rs +++ b/src/uucore/src/lib/parser/parse_size.rs @@ -236,7 +236,7 @@ pub fn parse_size(size: &str) -> Result { } /// Same as `parse_size()`, except returns `u64::MAX` on overflow -/// GNU lib/coreutils include similar functionality +/// GNU lib/coreutils include similar functionality /// and GNU test suite checks this behavior for some utils pub fn parse_size_max(size: &str) -> Result { let result = Parser::default().parse(size); From 2a0b4f88370f1ad787937d174b0ca8e6c2e4bf10 Mon Sep 17 00:00:00 2001 From: Yury Zhytkou <54360928+zhitkoff@users.noreply.github.com> Date: Tue, 5 Sep 2023 18:14:14 -0400 Subject: [PATCH 25/36] Update build-gnu.sh --- util/build-gnu.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/util/build-gnu.sh b/util/build-gnu.sh index b57b4fe769..2d949bbb33 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -19,7 +19,6 @@ path_GNU="$(readlink -fm -- "${path_GNU:-${path_UUTILS}/../gnu}")" ### # On MacOS there is no system /usr/bin/timeout - # and trying to add it to /usr/bin (with symlink of copy binary) will fail unless system integrity protection is disabled (not ideal) # ref: https://support.apple.com/en-us/102149 # On MacOS the Homebrew coreutils could be installed and then "sudo ln -s /opt/homebrew/bin/timeout /usr/local/bin/timeout" From a0a9ee6491ca3402e9fdc3af5b7b3f5def3b8268 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 5 Sep 2023 18:38:22 -0400 Subject: [PATCH 26/36] split: fixing tests for parse_size_max() --- tests/by-util/test_split.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 053b6f8bf1..2d6474511f 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -608,14 +608,6 @@ fn test_split_invalid_bytes_size() { .fails() .code_is(1) .stderr_only("split: invalid number of bytes: '1024R'\n"); - #[cfg(not(target_pointer_width = "128"))] - new_ucmd!() - .args(&["-b", "1Y"]) - .fails() - .code_is(1) - .stderr_only( - "split: invalid number of bytes: '1Y': Value too large for defined data type\n", - ); #[cfg(target_pointer_width = "32")] { let sizes = ["1000G", "10T"]; @@ -625,6 +617,18 @@ fn test_split_invalid_bytes_size() { } } +#[test] +fn test_split_overflow_bytes_size() { + #[cfg(not(target_pointer_width = "128"))] + let (at, mut ucmd) = at_and_ucmd!(); + let name = "test_split_overflow_bytes_size"; + RandomFile::new(&at, name).add_bytes(1000); + ucmd.args(&["-b", "1Y", name]).succeeds(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 1); + assert_eq!(glob.collate(), at.read_bytes(name)); +} + #[test] fn test_split_chunks_num_chunks_oversized_32() { #[cfg(target_pointer_width = "32")] From eaae32ec3bfbf842933b706001368314ee221851 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Tue, 5 Sep 2023 20:12:25 -0400 Subject: [PATCH 27/36] split: comments --- src/uu/split/src/split.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 23520f7092..6fb05dabb5 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -354,7 +354,7 @@ enum NumberType { /// Assign lines via round-robin to the specified number of output /// chunks, but output only the *k*th chunk. - KthRoundRobin(u64, u64), // not yet implemented? + KthRoundRobin(u64, u64), } impl NumberType { From d8a16a235149ab60099c93c889437314095426fb Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 12:42:49 -0400 Subject: [PATCH 28/36] split: tests --- tests/by-util/test_split.rs | 99 ++++++++++++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 2d6474511f..da4a4475be 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -377,6 +377,22 @@ fn test_split_obs_lines_standalone() { let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 2); assert_eq!(glob.collate(), at.read_bytes(name)); + ucmd.args(&["-99999999999999999991", name]).succeeds().no_stderr().no_stdout(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 2); + assert_eq!(glob.collate(), at.read_bytes(name)); +} + +/// Test for obsolete lines option standalone overflow +#[test] +fn test_split_obs_lines_standalone_overflow() { + let (at, mut ucmd) = at_and_ucmd!(); + let name = "obs-lines-standalone"; + RandomFile::new(&at, name).add_lines(4); + ucmd.args(&["-99999999999999999991", name]).succeeds().no_stderr().no_stdout(); + let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); + assert_eq!(glob.count(), 1); + assert_eq!(glob.collate(), at.read_bytes(name)); } /// Test for obsolete lines option as part of invalid combined short options @@ -756,7 +772,7 @@ creating file 'xaf' } #[test] -fn test_number() { +fn test_number_n() { let (at, mut ucmd) = at_and_ucmd!(); let file_read = |f| { let mut s = String::new(); @@ -771,6 +787,80 @@ fn test_number() { assert_eq!(file_read("xae"), "uvwxyz\n"); } +#[test] +fn test_number_kth_of_n() { + new_ucmd!() + .args(&["--number=3/5", "asciilowercase.txt"]) + .succeeds() + .stdout_only("klmno"); + new_ucmd!() + .args(&["-e", "--number=99/100", "asciilowercase.txt"]) + .succeeds() + .stdout_only(""); + new_ucmd!() + .args(&[ + "--number=r/9223372036854775807/18446744073709551615", + "asciilowercase.txt", + ]) + .succeeds() + .stdout_only(""); + new_ucmd!() + .args(&["--number=0/5", "asciilowercase.txt"]) + .fails() + .stderr_contains("split: invalid chunk number: 0"); + new_ucmd!() + .args(&["--number=10/5", "asciilowercase.txt"]) + .fails() + .stderr_contains("split: invalid chunk number: 10"); + new_ucmd!() + .args(&[ + "--number=9223372036854775807/18446744073709551616", + "asciilowercase.txt", + ]) + .fails() + .stderr_contains("split: invalid number of chunks: 18446744073709551616"); +} + +#[test] +fn test_number_kth_of_n_round_robin() { + new_ucmd!() + .args(&["--number", "r/2/3", "fivelines.txt"]) + .succeeds() + .stdout_only("2\n5\n"); + new_ucmd!() + .args(&["--number", "r/1/4", "fivelines.txt"]) + .succeeds() + .stdout_only("1\n5\n"); + new_ucmd!() + .args(&["-e", "--number", "r/7/7", "fivelines.txt"]) + .succeeds() + .stdout_only(""); + new_ucmd!() + .args(&[ + "--number", + "r/9223372036854775807/18446744073709551615", + "fivelines.txt", + ]) + .succeeds() + .stdout_only(""); + new_ucmd!() + .args(&[ + "--number", + "r/9223372036854775807/18446744073709551616", + "fivelines.txt", + ]) + .fails() + .stderr_contains("split: invalid number of chunks: 18446744073709551616"); + new_ucmd!() + .args(&["--number", "r/0/3", "fivelines.txt"]) + .fails() + .stderr_contains("split: invalid chunk number: 0"); + new_ucmd!() + .args(&["--number", "r/10/3", "fivelines.txt"]) + .fails() + .stderr_contains("split: invalid chunk number: 10"); +} + #[test] fn test_split_number_with_io_blksize() { let (at, mut ucmd) = at_and_ucmd!(); @@ -927,6 +1017,13 @@ fn test_line_bytes() { assert_eq!(at.read("xad"), "ee\n"); } +#[test] +fn test_line_bytes_overflow() { + let (at, mut ucmd) = at_and_ucmd!(); + ucmd.args(&["-C", "18446744073709551616", "letters.txt"]).succeeds(); + assert_eq!(at.read("xaa"), "aaaaaaaaa\nbbbb\ncccc\ndd\nee\n"); +} + #[test] fn test_line_bytes_concatenated_with_value() { let (at, mut ucmd) = at_and_ucmd!(); From e378454a265b103b98e940d5ed9b691985433f54 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 13:15:35 -0400 Subject: [PATCH 29/36] split: formatting --- tests/by-util/test_split.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index da4a4475be..3ea74606ca 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -377,7 +377,10 @@ fn test_split_obs_lines_standalone() { let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 2); assert_eq!(glob.collate(), at.read_bytes(name)); - ucmd.args(&["-99999999999999999991", name]).succeeds().no_stderr().no_stdout(); + ucmd.args(&["-99999999999999999991", name]) + .succeeds() + .no_stderr() + .no_stdout(); let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 2); assert_eq!(glob.collate(), at.read_bytes(name)); @@ -389,7 +392,10 @@ fn test_split_obs_lines_standalone_overflow() { let (at, mut ucmd) = at_and_ucmd!(); let name = "obs-lines-standalone"; RandomFile::new(&at, name).add_lines(4); - ucmd.args(&["-99999999999999999991", name]).succeeds().no_stderr().no_stdout(); + ucmd.args(&["-99999999999999999991", name]) + .succeeds() + .no_stderr() + .no_stdout(); let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 1); assert_eq!(glob.collate(), at.read_bytes(name)); @@ -1020,7 +1026,8 @@ fn test_line_bytes() { #[test] fn test_line_bytes_overflow() { let (at, mut ucmd) = at_and_ucmd!(); - ucmd.args(&["-C", "18446744073709551616", "letters.txt"]).succeeds(); + ucmd.args(&["-C", "18446744073709551616", "letters.txt"]) + .succeeds(); assert_eq!(at.read("xaa"), "aaaaaaaaa\nbbbb\ncccc\ndd\nee\n"); } From 4fd598e4d513d3fec00876e3c897c7bd6024508f Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 13:20:58 -0400 Subject: [PATCH 30/36] split: tests --- tests/by-util/test_split.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 3ea74606ca..c3e8445a62 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -377,13 +377,6 @@ fn test_split_obs_lines_standalone() { let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); assert_eq!(glob.count(), 2); assert_eq!(glob.collate(), at.read_bytes(name)); - ucmd.args(&["-99999999999999999991", name]) - .succeeds() - .no_stderr() - .no_stdout(); - let glob = Glob::new(&at, ".", r"x[[:alpha:]][[:alpha:]]$"); - assert_eq!(glob.count(), 2); - assert_eq!(glob.collate(), at.read_bytes(name)); } /// Test for obsolete lines option standalone overflow From 1669a92694985c8e2ad3df661fe457731698d172 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 14:02:08 -0400 Subject: [PATCH 31/36] split: tests overflow --- tests/by-util/test_split.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index c3e8445a62..849fdc7cfe 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -2,7 +2,7 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb dxen +// spell-checker:ignore xzaaa sixhundredfiftyonebytes ninetyonebytes threebytes asciilowercase fghij klmno pqrst uvwxyz fivelines twohundredfortyonebytes onehundredlines nbbbb dxen ncccc use crate::common::util::{AtPath, TestScenario}; use rand::{thread_rng, Rng, SeedableRng}; @@ -796,6 +796,7 @@ fn test_number_kth_of_n() { .args(&["-e", "--number=99/100", "asciilowercase.txt"]) .succeeds() .stdout_only(""); + #[cfg(target_pointer_width = "64")] new_ucmd!() .args(&[ "--number=r/9223372036854775807/18446744073709551615", @@ -811,6 +812,7 @@ fn test_number_kth_of_n() { .args(&["--number=10/5", "asciilowercase.txt"]) .fails() .stderr_contains("split: invalid chunk number: 10"); + #[cfg(target_pointer_width = "64")] new_ucmd!() .args(&[ "--number=9223372036854775807/18446744073709551616", @@ -834,6 +836,7 @@ fn test_number_kth_of_n_round_robin() { .args(&["-e", "--number", "r/7/7", "fivelines.txt"]) .succeeds() .stdout_only(""); + #[cfg(target_pointer_width = "64")] new_ucmd!() .args(&[ "--number", @@ -842,6 +845,7 @@ fn test_number_kth_of_n_round_robin() { ]) .succeeds() .stdout_only(""); + #[cfg(target_pointer_width = "64")] new_ucmd!() .args(&[ "--number", @@ -1018,6 +1022,7 @@ fn test_line_bytes() { #[test] fn test_line_bytes_overflow() { + #[cfg(target_pointer_width = "64")] let (at, mut ucmd) = at_and_ucmd!(); ucmd.args(&["-C", "18446744073709551616", "letters.txt"]) .succeeds(); From fbf5ac4329b83c8c6e6ffb3f87ab7c2fea09e263 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 14:16:21 -0400 Subject: [PATCH 32/36] split: tests 32bit --- tests/by-util/test_split.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 849fdc7cfe..247b3cc578 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -1021,8 +1021,8 @@ fn test_line_bytes() { } #[test] +#[cfg(target_pointer_width = "64")] fn test_line_bytes_overflow() { - #[cfg(target_pointer_width = "64")] let (at, mut ucmd) = at_and_ucmd!(); ucmd.args(&["-C", "18446744073709551616", "letters.txt"]) .succeeds(); From e40e88702270930470d99301118e03eaba37ca6d Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 18:43:20 -0400 Subject: [PATCH 33/36] split: some refactoring for handle_obsolete() --- src/uu/split/src/split.rs | 241 +++++++++++++++++++++++--------------- 1 file changed, 146 insertions(+), 95 deletions(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 6fb05dabb5..65032dfd9e 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -75,104 +75,152 @@ fn handle_obsolete(args: impl uucore::Args) -> (Vec, Option) { let mut obs_lines = None; let mut preceding_long_opt_req_value = false; let mut preceding_short_opt_req_value = false; + let filtered_args = args .filter_map(|os_slice| { - let filter: Option; - if let Some(slice) = os_slice.to_str() { - // check if the slice is a true short option (and not hyphen prefixed value of an option) - // and if so, a short option that can contain obsolete lines value - if slice.starts_with('-') - && !slice.starts_with("--") - && !preceding_long_opt_req_value - && !preceding_short_opt_req_value - && !slice.starts_with("-a") - && !slice.starts_with("-b") - && !slice.starts_with("-C") - && !slice.starts_with("-l") - && !slice.starts_with("-n") - { - // start of the short option string - // that can have obsolete lines option value in it - // extract numeric part and filter it out - let mut obs_lines_extracted: Vec = vec![]; - let mut obs_lines_end_reached = false; - let filtered_slice: Vec = slice - .chars() - .filter(|c| { - // To correctly process scenario like '-x200a4' - // we need to stop extracting digits once alphabetic character is encountered - // after we already have something in obs_lines_extracted - if c.is_ascii_digit() && !obs_lines_end_reached { - obs_lines_extracted.push(*c); - false - } else { - if !obs_lines_extracted.is_empty() { - obs_lines_end_reached = true; - } - true - } - }) - .collect(); - - if obs_lines_extracted.is_empty() { - // no obsolete lines value found/extracted - filter = Some(OsString::from(slice)); - } else { - // obsolete lines value was extracted - obs_lines = Some(obs_lines_extracted.iter().collect()); - if filtered_slice.get(1).is_some() { - // there were some short options in front of or after obsolete lines value - // i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value - // would look like '-xd' or '-de' or similar - let filtered_slice: String = filtered_slice.iter().collect(); - filter = Some(OsString::from(filtered_slice)); - } else { - filter = None; - } - } - } else { - // either not a short option - // or a short option that cannot have obsolete lines value in it - filter = Some(OsString::from(slice)); - } - // capture if current slice is a preceding long option that requires value and does not use '=' to assign that value - // following slice should be treaded as value for this option - // even if it starts with '-' (which would be treated as hyphen prefixed value) - if slice.starts_with("--") { - preceding_long_opt_req_value = &slice[2..] == OPT_BYTES - || &slice[2..] == OPT_LINE_BYTES - || &slice[2..] == OPT_LINES - || &slice[2..] == OPT_ADDITIONAL_SUFFIX - || &slice[2..] == OPT_FILTER - || &slice[2..] == OPT_NUMBER - || &slice[2..] == OPT_SUFFIX_LENGTH; - } - // capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace) - // following slice should be treaded as value for this option - // even if it starts with '-' (which would be treated as hyphen prefixed value) - preceding_short_opt_req_value = slice == "-b" - || slice == "-C" - || slice == "-l" - || slice == "-n" - || slice == "-a"; - // slice is a value - // reset preceding option flags - if !slice.starts_with('-') { - preceding_short_opt_req_value = false; - preceding_long_opt_req_value = false; - } + filter_args( + os_slice, + &mut obs_lines, + &mut preceding_long_opt_req_value, + &mut preceding_short_opt_req_value, + ) + }) + .collect(); + + (filtered_args, obs_lines) +} + +/// Helper function to [`handle_obsolete`] +/// Filters out obsolete lines option from args +fn filter_args( + os_slice: OsString, + obs_lines: &mut Option, + preceding_long_opt_req_value: &mut bool, + preceding_short_opt_req_value: &mut bool, +) -> Option { + let filter: Option; + if let Some(slice) = os_slice.to_str() { + if should_extract_obs_lines( + slice, + preceding_long_opt_req_value, + preceding_short_opt_req_value, + ) { + // start of the short option string + // that can have obsolete lines option value in it + filter = handle_extract_obs_lines(slice, obs_lines); + } else { + // either not a short option + // or a short option that cannot have obsolete lines value in it + filter = Some(OsString::from(slice)); + } + handle_preceding_options( + slice, + preceding_long_opt_req_value, + preceding_short_opt_req_value, + ); + } else { + // Cannot cleanly convert os_slice to UTF-8 + // Do not process and return as-is + // This will cause failure later on, but we should not handle it here + // and let clap panic on invalid UTF-8 argument + filter = Some(os_slice); + } + filter +} + +/// Helper function to [`filter_args`] +/// Checks if the slice is a true short option (and not hyphen prefixed value of an option) +/// and if so, a short option that can contain obsolete lines value +fn should_extract_obs_lines( + slice: &str, + preceding_long_opt_req_value: &bool, + preceding_short_opt_req_value: &bool, +) -> bool { + slice.starts_with('-') + && !slice.starts_with("--") + && !preceding_long_opt_req_value + && !preceding_short_opt_req_value + && !slice.starts_with("-a") + && !slice.starts_with("-b") + && !slice.starts_with("-C") + && !slice.starts_with("-l") + && !slice.starts_with("-n") +} + +/// Helper function to [`filter_args`] +/// Extracts obsolete lines numeric part from argument slice +/// and filters it out +fn handle_extract_obs_lines(slice: &str, obs_lines: &mut Option) -> Option { + let mut obs_lines_extracted: Vec = vec![]; + let mut obs_lines_end_reached = false; + let filtered_slice: Vec = slice + .chars() + .filter(|c| { + // To correctly process scenario like '-x200a4' + // we need to stop extracting digits once alphabetic character is encountered + // after we already have something in obs_lines_extracted + if c.is_ascii_digit() && !obs_lines_end_reached { + obs_lines_extracted.push(*c); + false } else { - // Cannot cleanly convert os_slice to UTF-8 - // Do not process and return as-is - // This will cause failure later on, but we should not handle it here - // and let clap panic on invalid UTF-8 argument - filter = Some(os_slice); + if !obs_lines_extracted.is_empty() { + obs_lines_end_reached = true; + } + true } - // return filter - filter }) .collect(); - (filtered_args, obs_lines) + + if obs_lines_extracted.is_empty() { + // no obsolete lines value found/extracted + Some(OsString::from(slice)) + } else { + // obsolete lines value was extracted + let extracted: String = obs_lines_extracted.iter().collect(); + *obs_lines = Some(extracted); + if filtered_slice.get(1).is_some() { + // there were some short options in front of or after obsolete lines value + // i.e. '-xd100' or '-100de' or similar, which after extraction of obsolete lines value + // would look like '-xd' or '-de' or similar + let filtered_slice: String = filtered_slice.iter().collect(); + Some(OsString::from(filtered_slice)) + } else { + None + } + } +} + +/// Helper function to [`handle_extract_obs_lines`] +/// Captures if current slice is a preceding option +/// that requires value +fn handle_preceding_options( + slice: &str, + preceding_long_opt_req_value: &mut bool, + preceding_short_opt_req_value: &mut bool, +) { + // capture if current slice is a preceding long option that requires value and does not use '=' to assign that value + // following slice should be treaded as value for this option + // even if it starts with '-' (which would be treated as hyphen prefixed value) + if slice.starts_with("--") { + *preceding_long_opt_req_value = &slice[2..] == OPT_BYTES + || &slice[2..] == OPT_LINE_BYTES + || &slice[2..] == OPT_LINES + || &slice[2..] == OPT_ADDITIONAL_SUFFIX + || &slice[2..] == OPT_FILTER + || &slice[2..] == OPT_NUMBER + || &slice[2..] == OPT_SUFFIX_LENGTH; + } + // capture if current slice is a preceding short option that requires value and does not have value in the same slice (value separated by whitespace) + // following slice should be treaded as value for this option + // even if it starts with '-' (which would be treated as hyphen prefixed value) + *preceding_short_opt_req_value = + slice == "-b" || slice == "-C" || slice == "-l" || slice == "-n" || slice == "-a"; + // slice is a value + // reset preceding option flags + if !slice.starts_with('-') { + *preceding_short_opt_req_value = false; + *preceding_long_opt_req_value = false; + } } pub fn uu_app() -> Command { @@ -430,6 +478,9 @@ impl NumberType { /// or if `K` is greater than `N` /// then this function returns [`NumberTypeError`]. fn from(s: &str) -> Result { + fn is_invalid_chunk(chunk_number: u64, num_chunks: u64) -> bool { + chunk_number > num_chunks || chunk_number == 0 + } let parts: Vec<&str> = s.split('/').collect(); match &parts[..] { [n_str] => { @@ -446,7 +497,7 @@ impl NumberType { .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; let chunk_number = parse_size(k_str) .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; - if chunk_number > num_chunks || chunk_number == 0 { + if is_invalid_chunk(chunk_number, num_chunks) { return Err(NumberTypeError::ChunkNumber(k_str.to_string())); } Ok(Self::KthBytes(chunk_number, num_chunks)) @@ -461,7 +512,7 @@ impl NumberType { .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; let chunk_number = parse_size(k_str) .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; - if chunk_number > num_chunks || chunk_number == 0 { + if is_invalid_chunk(chunk_number, num_chunks) { return Err(NumberTypeError::ChunkNumber(k_str.to_string())); } Ok(Self::KthLines(chunk_number, num_chunks)) @@ -476,7 +527,7 @@ impl NumberType { .map_err(|_| NumberTypeError::NumberOfChunks(n_str.to_string()))?; let chunk_number = parse_size(k_str) .map_err(|_| NumberTypeError::ChunkNumber(k_str.to_string()))?; - if chunk_number > num_chunks || chunk_number == 0 { + if is_invalid_chunk(chunk_number, num_chunks) { return Err(NumberTypeError::ChunkNumber(k_str.to_string())); } Ok(Self::KthRoundRobin(chunk_number, num_chunks)) From 3be284e0d94eb0c8d61e7ab684a9ff1bec5a4196 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 19:49:26 -0400 Subject: [PATCH 34/36] split: more test coverage --- src/uu/split/src/split.rs | 2 +- tests/by-util/test_split.rs | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs index 65032dfd9e..7762789ac6 100644 --- a/src/uu/split/src/split.rs +++ b/src/uu/split/src/split.rs @@ -1295,7 +1295,7 @@ where // If we would have written zero chunks of output, then terminate // immediately. This happens on `split -e -n 3 /dev/null`, for // example. - if num_chunks == 0 { + if num_chunks == 0 || num_bytes == 0 { return Ok(()); } diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index 247b3cc578..bbac3c7392 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -340,6 +340,18 @@ fn test_split_lines_number() { .succeeds() .no_stderr() .no_stdout(); + scene + .ucmd() + .args(&["--lines", "0", "file"]) + .fails() + .code_is(1) + .stderr_only("split: invalid number of lines: 0\n"); + scene + .ucmd() + .args(&["-0", "file"]) + .fails() + .code_is(1) + .stderr_only("split: invalid number of lines: 0\n"); scene .ucmd() .args(&["--lines", "2fb", "file"]) @@ -669,6 +681,15 @@ fn test_split_stdin_num_chunks() { .stderr_only("split: -: cannot determine file size\n"); } +#[test] +fn test_split_stdin_num_kth_chunk() { + new_ucmd!() + .args(&["--number=1/2"]) + .fails() + .code_is(1) + .stderr_only("split: -: cannot determine file size\n"); +} + fn file_read(at: &AtPath, filename: &str) -> String { let mut s = String::new(); at.open(filename).read_to_string(&mut s).unwrap(); @@ -784,6 +805,10 @@ fn test_number_n() { assert_eq!(file_read("xac"), "klmno"); assert_eq!(file_read("xad"), "pqrst"); assert_eq!(file_read("xae"), "uvwxyz\n"); + new_ucmd!() + .args(&["--number=100", "/dev/null"]) + .succeeds() + .stdout_only(""); } #[test] @@ -792,10 +817,18 @@ fn test_number_kth_of_n() { .args(&["--number=3/5", "asciilowercase.txt"]) .succeeds() .stdout_only("klmno"); + new_ucmd!() + .args(&["--number=5/5", "asciilowercase.txt"]) + .succeeds() + .stdout_only("uvwxyz\n"); new_ucmd!() .args(&["-e", "--number=99/100", "asciilowercase.txt"]) .succeeds() .stdout_only(""); + new_ucmd!() + .args(&["--number=3/10", "/dev/null"]) + .succeeds() + .stdout_only(""); #[cfg(target_pointer_width = "64")] new_ucmd!() .args(&[ From 8883f016d4ffd9929949130ab724a20b1c3cb132 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 20:09:26 -0400 Subject: [PATCH 35/36] split: fix windows tests --- tests/by-util/test_split.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index bbac3c7392..c204a6238d 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -805,6 +805,7 @@ fn test_number_n() { assert_eq!(file_read("xac"), "klmno"); assert_eq!(file_read("xad"), "pqrst"); assert_eq!(file_read("xae"), "uvwxyz\n"); + #[cfg(unix)] new_ucmd!() .args(&["--number=100", "/dev/null"]) .succeeds() @@ -825,6 +826,7 @@ fn test_number_kth_of_n() { .args(&["-e", "--number=99/100", "asciilowercase.txt"]) .succeeds() .stdout_only(""); + #[cfg(unix)] new_ucmd!() .args(&["--number=3/10", "/dev/null"]) .succeeds() From 3f065eed8a265e24ea8790a8680dd62410c5f024 Mon Sep 17 00:00:00 2001 From: zhitkoff Date: Wed, 6 Sep 2023 21:04:01 -0400 Subject: [PATCH 36/36] split: fixing test for 32bit --- tests/by-util/test_split.rs | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs index c204a6238d..5eba5071f6 100644 --- a/tests/by-util/test_split.rs +++ b/tests/by-util/test_split.rs @@ -657,19 +657,17 @@ fn test_split_overflow_bytes_size() { } #[test] +#[cfg(target_pointer_width = "32")] fn test_split_chunks_num_chunks_oversized_32() { - #[cfg(target_pointer_width = "32")] - { - let scene = TestScenario::new(util_name!()); - let at = &scene.fixtures; - at.touch("file"); - scene - .ucmd() - .args(&["--number", "5000000000", "file"]) - .fails() - .code_is(1) - .stderr_only("split: Number of chunks too big\n"); - } + let scene = TestScenario::new(util_name!()); + let at = &scene.fixtures; + at.touch("file"); + scene + .ucmd() + .args(&["--number", "5000000000", "sixhundredfiftyonebytes.txt"]) + .fails() + .code_is(1) + .stderr_only("split: Number of chunks too big\n"); } #[test]