From 565a51973a4f64f522e66c8af87bad339c966f31 Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Thu, 29 Jul 2021 02:50:59 +0100 Subject: [PATCH 1/2] Update Windows arg parsing tests This updates the tests to be consistent with argv in modern C/C++ applications. --- library/std/src/sys/windows/args/tests.rs | 64 +++++++++++++++++------ 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/library/std/src/sys/windows/args/tests.rs b/library/std/src/sys/windows/args/tests.rs index 756a4361ea3de..82c32d08c5ea8 100644 --- a/library/std/src/sys/windows/args/tests.rs +++ b/library/std/src/sys/windows/args/tests.rs @@ -5,9 +5,9 @@ fn chk(string: &str, parts: &[&str]) { let mut wide: Vec = OsString::from(string).encode_wide().collect(); wide.push(0); let parsed = - unsafe { parse_lp_cmd_line(wide.as_ptr() as *const u16, || OsString::from("TEST.EXE")) }; + unsafe { parse_lp_cmd_line(WStrUnits::new(wide.as_ptr()), || OsString::from("TEST.EXE")) }; let expected: Vec = parts.iter().map(|k| OsString::from(k)).collect(); - assert_eq!(parsed.as_slice(), expected.as_slice()); + assert_eq!(parsed.as_slice(), expected.as_slice(), "{:?}", string); } #[test] @@ -27,35 +27,65 @@ fn single_words() { #[test] fn official_examples() { chk(r#"EXE "abc" d e"#, &["EXE", "abc", "d", "e"]); - chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r#"a\\\b"#, "de fg", "h"]); + chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", "h"]); chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]); - chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r#"a\\b c"#, "d", "e"]); + chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]); } #[test] fn whitespace_behavior() { - chk(r#" test"#, &["", "test"]); - chk(r#" test"#, &["", "test"]); - chk(r#" test test2"#, &["", "test", "test2"]); - chk(r#" test test2"#, &["", "test", "test2"]); - chk(r#"test test2 "#, &["test", "test2"]); - chk(r#"test test2 "#, &["test", "test2"]); - chk(r#"test "#, &["test"]); + chk(" test", &["", "test"]); + chk(" test", &["", "test"]); + chk(" test test2", &["", "test", "test2"]); + chk(" test test2", &["", "test", "test2"]); + chk("test test2 ", &["test", "test2"]); + chk("test test2 ", &["test", "test2"]); + chk("test ", &["test"]); } #[test] fn genius_quotes() { chk(r#"EXE "" """#, &["EXE", "", ""]); - chk(r#"EXE "" """"#, &["EXE", "", "\""]); + chk(r#"EXE "" """"#, &["EXE", "", r#"""#]); chk( r#"EXE "this is """all""" in the same argument""#, - &["EXE", "this is \"all\" in the same argument"], + &["EXE", r#"this is "all" in the same argument"#], ); - chk(r#"EXE "a"""#, &["EXE", "a\""]); - chk(r#"EXE "a"" a"#, &["EXE", "a\"", "a"]); + chk(r#"EXE "a"""#, &["EXE", r#"a""#]); + chk(r#"EXE "a"" a"#, &["EXE", r#"a" a"#]); // quotes cannot be escaped in command names chk(r#""EXE" check"#, &["EXE", "check"]); chk(r#""EXE check""#, &["EXE check"]); - chk(r#""EXE """for""" check"#, &["EXE ", r#"for""#, "check"]); - chk(r#""EXE \"for\" check"#, &[r#"EXE \"#, r#"for""#, "check"]); + chk(r#""EXE """for""" check"#, &["EXE for check"]); + chk(r#""EXE \"for\" check"#, &[r"EXE \for\ check"]); + chk(r#""EXE \" for \" check"#, &[r"EXE \", "for", r#"""#, "check"]); + chk(r#"E"X"E test"#, &["EXE", "test"]); + chk(r#"EX""E test"#, &["EXE", "test"]); +} + +// from https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX +#[test] +fn post_2008() { + chk("EXE CallMeIshmael", &["EXE", "CallMeIshmael"]); + chk(r#"EXE "Call Me Ishmael""#, &["EXE", "Call Me Ishmael"]); + chk(r#"EXE Cal"l Me I"shmael"#, &["EXE", "Call Me Ishmael"]); + chk(r#"EXE CallMe\"Ishmael"#, &["EXE", r#"CallMe"Ishmael"#]); + chk(r#"EXE "CallMe\"Ishmael""#, &["EXE", r#"CallMe"Ishmael"#]); + chk(r#"EXE "Call Me Ishmael\\""#, &["EXE", r"Call Me Ishmael\"]); + chk(r#"EXE "CallMe\\\"Ishmael""#, &["EXE", r#"CallMe\"Ishmael"#]); + chk(r#"EXE a\\\b"#, &["EXE", r"a\\\b"]); + chk(r#"EXE "a\\\b""#, &["EXE", r"a\\\b"]); + chk(r#"EXE "\"Call Me Ishmael\"""#, &["EXE", r#""Call Me Ishmael""#]); + chk(r#"EXE "C:\TEST A\\""#, &["EXE", r"C:\TEST A\"]); + chk(r#"EXE "\"C:\TEST A\\\"""#, &["EXE", r#""C:\TEST A\""#]); + chk(r#"EXE "a b c" d e"#, &["EXE", "a b c", "d", "e"]); + chk(r#"EXE "ab\"c" "\\" d"#, &["EXE", r#"ab"c"#, r"\", "d"]); + chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", "h"]); + chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]); + chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]); + // Double Double Quotes + chk(r#"EXE "a b c"""#, &["EXE", r#"a b c""#]); + chk(r#"EXE """CallMeIshmael""" b c"#, &["EXE", r#""CallMeIshmael""#, "b", "c"]); + chk(r#"EXE """Call Me Ishmael""""#, &["EXE", r#""Call Me Ishmael""#]); + chk(r#"EXE """"Call Me Ishmael"" b c"#, &["EXE", r#""Call"#, "Me", "Ishmael", "b", "c"]); } From e26dda564219341e25589ff745b16258ad424b78 Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Fri, 6 Aug 2021 00:15:22 +0100 Subject: [PATCH 2/2] Implement modern Windows arg parsing As derived from extensive testing of `argv` in a C/C++ application. Co-Authored-By: Jane Lusby --- library/std/src/lib.rs | 1 + library/std/src/sys/windows/args.rs | 257 ++++++++++++++++------------ library/std/src/sys/windows/c.rs | 2 +- 3 files changed, 154 insertions(+), 106 deletions(-) diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 1af6157ca68bf..4a9405905dc32 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -253,6 +253,7 @@ #![feature(const_ip)] #![feature(const_ipv4)] #![feature(const_ipv6)] +#![feature(const_option)] #![feature(const_raw_ptr_deref)] #![feature(const_socketaddr)] #![feature(container_error_extra)] diff --git a/library/std/src/sys/windows/args.rs b/library/std/src/sys/windows/args.rs index f1264130faf7a..3919025b08006 100644 --- a/library/std/src/sys/windows/args.rs +++ b/library/std/src/sys/windows/args.rs @@ -1,13 +1,18 @@ -#![allow(dead_code)] // runtime init functions not used during testing +//! The Windows command line is just a string +//! +//! +//! This module implements the parsing necessary to turn that string into a list of arguments. #[cfg(test)] mod tests; use crate::ffi::OsString; use crate::fmt; +use crate::marker::PhantomData; +use crate::num::NonZeroU16; use crate::os::windows::prelude::*; use crate::path::PathBuf; -use crate::slice; +use crate::ptr::NonNull; use crate::sys::c; use crate::sys::windows::os::current_exe; use crate::vec; @@ -15,9 +20,11 @@ use crate::vec; use core::iter; pub fn args() -> Args { + // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16 + // string so it's safe for `WStrUnits` to use. unsafe { let lp_cmd_line = c::GetCommandLineW(); - let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || { + let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || { current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new()) }); @@ -28,129 +35,120 @@ pub fn args() -> Args { /// Implements the Windows command-line argument parsing algorithm. /// /// Microsoft's documentation for the Windows CLI argument format can be found at -/// . +/// /// -/// Windows includes a function to do this in shell32.dll, -/// but linking with that DLL causes the process to be registered as a GUI application. +/// A more in-depth explanation is here: +/// +/// +/// Windows includes a function to do command line parsing in shell32.dll. +/// However, this is not used for two reasons: +/// +/// 1. Linking with that DLL causes the process to be registered as a GUI application. /// GUI applications add a bunch of overhead, even if no windows are drawn. See /// . /// -/// This function was tested for equivalence to the shell32.dll implementation in -/// Windows 10 Pro v1803, using an exhaustive test suite available at -/// or -/// . -unsafe fn parse_lp_cmd_line OsString>( - lp_cmd_line: *const u16, +/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above. +/// +/// This function was tested for equivalence to the C/C++ parsing rules using an +/// extensive test suite available at +/// . +fn parse_lp_cmd_line<'a, F: Fn() -> OsString>( + lp_cmd_line: Option>, exe_name: F, ) -> Vec { - const BACKSLASH: u16 = '\\' as u16; - const QUOTE: u16 = '"' as u16; - const TAB: u16 = '\t' as u16; - const SPACE: u16 = ' ' as u16; + const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap(); + const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap(); + const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap(); + const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap(); + let mut ret_val = Vec::new(); - if lp_cmd_line.is_null() || *lp_cmd_line == 0 { + // If the cmd line pointer is null or it points to an empty string then + // return the name of the executable as argv[0]. + if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() { ret_val.push(exe_name()); return ret_val; } - let mut cmd_line = { - let mut end = 0; - while *lp_cmd_line.offset(end) != 0 { - end += 1; - } - slice::from_raw_parts(lp_cmd_line, end as usize) - }; + let mut code_units = lp_cmd_line.unwrap(); + // The executable name at the beginning is special. - cmd_line = match cmd_line[0] { - // The executable name ends at the next quote mark, - // no matter what. - QUOTE => { - let args = { - let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE); - if let Some(exe) = cut.next() { - ret_val.push(OsString::from_wide(exe)); - } - cut.next() - }; - if let Some(args) = args { - args - } else { - return ret_val; - } - } - // Implement quirk: when they say whitespace here, - // they include the entire ASCII control plane: - // "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW - // will consider the first argument to be an empty string. Excess whitespace at the - // end of lpCmdLine is ignored." - 0..=SPACE => { - ret_val.push(OsString::new()); - &cmd_line[1..] - } - // The executable name ends at the next whitespace, - // no matter what. - _ => { - let args = { - let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE); - if let Some(exe) = cut.next() { - ret_val.push(OsString::from_wide(exe)); - } - cut.next() - }; - if let Some(args) = args { - args - } else { - return ret_val; - } + let mut in_quotes = false; + let mut cur = Vec::new(); + for w in &mut code_units { + match w { + // A quote mark always toggles `in_quotes` no matter what because + // there are no escape characters when parsing the executable name. + QUOTE => in_quotes = !in_quotes, + // If not `in_quotes` then whitespace ends argv[0]. + SPACE | TAB if !in_quotes => break, + // In all other cases the code unit is taken literally. + _ => cur.push(w.get()), } - }; + } + // Skip whitespace. + code_units.advance_while(|w| w == SPACE || w == TAB); + ret_val.push(OsString::from_wide(&cur)); + + // Parse the arguments according to these rules: + // * All code units are taken literally except space, tab, quote and backslash. + // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are + // treated as a single separator. + // * A space or tab `in_quotes` is taken literally. + // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally. + // * A quote can be escaped if preceded by an odd number of backslashes. + // * If any number of backslashes is immediately followed by a quote then the number of + // backslashes is halved (rounding down). + // * Backslashes not followed by a quote are all taken literally. + // * If `in_quotes` then a quote can also be escaped using another quote + // (i.e. two consecutive quotes become one literal quote). let mut cur = Vec::new(); let mut in_quotes = false; - let mut was_in_quotes = false; - let mut backslash_count: usize = 0; - for &c in cmd_line { - match c { - // backslash - BACKSLASH => { - backslash_count += 1; - was_in_quotes = false; + while let Some(w) = code_units.next() { + match w { + // If not `in_quotes`, a space or tab ends the argument. + SPACE | TAB if !in_quotes => { + ret_val.push(OsString::from_wide(&cur[..])); + cur.truncate(0); + + // Skip whitespace. + code_units.advance_while(|w| w == SPACE || w == TAB); } - QUOTE if backslash_count % 2 == 0 => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2)); - backslash_count = 0; - if was_in_quotes { - cur.push('"' as u16); - was_in_quotes = false; + // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote. + BACKSLASH => { + let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1; + if code_units.peek() == Some(QUOTE) { + cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2)); + // The quote is escaped if there are an odd number of backslashes. + if backslash_count % 2 == 1 { + code_units.next(); + cur.push(QUOTE.get()); + } } else { - was_in_quotes = in_quotes; - in_quotes = !in_quotes; + // If there is no quote on the end then there is no escaping. + cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count)); } } - QUOTE if backslash_count % 2 != 0 => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2)); - backslash_count = 0; - was_in_quotes = false; - cur.push(b'"' as u16); - } - SPACE | TAB if !in_quotes => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - if !cur.is_empty() || was_in_quotes { - ret_val.push(OsString::from_wide(&cur[..])); - cur.truncate(0); + // If `in_quotes` and not backslash escaped (see above) then a quote either + // unsets `in_quote` or is escaped by another quote. + QUOTE if in_quotes => match code_units.peek() { + // Two consecutive quotes when `in_quotes` produces one literal quote. + Some(QUOTE) => { + cur.push(QUOTE.get()); + code_units.next(); } - backslash_count = 0; - was_in_quotes = false; - } - _ => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - backslash_count = 0; - was_in_quotes = false; - cur.push(c); - } + // Otherwise set `in_quotes`. + Some(_) => in_quotes = false, + // The end of the command line. + // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set. + None => break, + }, + // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`. + QUOTE => in_quotes = true, + // Everything else is always taken literally. + _ => cur.push(w.get()), } } - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - // include empty quoted strings at the end of the arguments list - if !cur.is_empty() || was_in_quotes || in_quotes { + // Push the final argument, if any. + if !cur.is_empty() || in_quotes { ret_val.push(OsString::from_wide(&cur[..])); } ret_val @@ -187,3 +185,52 @@ impl ExactSizeIterator for Args { self.parsed_args_list.len() } } + +/// A safe iterator over a LPWSTR +/// (aka a pointer to a series of UTF-16 code units terminated by a NULL). +struct WStrUnits<'a> { + // The pointer must never be null... + lpwstr: NonNull, + // ...and the memory it points to must be valid for this lifetime. + lifetime: PhantomData<&'a [u16]>, +} +impl WStrUnits<'_> { + /// Create the iterator. Returns `None` if `lpwstr` is null. + /// + /// SAFETY: `lpwstr` must point to a null-terminated wide string that lives + /// at least as long as the lifetime of this struct. + unsafe fn new(lpwstr: *const u16) -> Option { + Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData }) + } + fn peek(&self) -> Option { + // SAFETY: It's always safe to read the current item because we don't + // ever move out of the array's bounds. + unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) } + } + /// Advance the iterator while `predicate` returns true. + /// Returns the number of items it advanced by. + fn advance_while bool>(&mut self, mut predicate: P) -> usize { + let mut counter = 0; + while let Some(w) = self.peek() { + if !predicate(w) { + break; + } + counter += 1; + self.next(); + } + counter + } +} +impl Iterator for WStrUnits<'_> { + // This can never return zero as that marks the end of the string. + type Item = NonZeroU16; + fn next(&mut self) -> Option { + // SAFETY: If NULL is reached we immediately return. + // Therefore it's safe to advance the pointer after that. + unsafe { + let next = self.peek()?; + self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1)); + Some(next) + } + } +} diff --git a/library/std/src/sys/windows/c.rs b/library/std/src/sys/windows/c.rs index 63f9be7b7e350..023186c7a9067 100644 --- a/library/std/src/sys/windows/c.rs +++ b/library/std/src/sys/windows/c.rs @@ -781,7 +781,7 @@ extern "system" { pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL; pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL; pub fn SetLastError(dwErrCode: DWORD); - pub fn GetCommandLineW() -> *mut LPCWSTR; + pub fn GetCommandLineW() -> LPWSTR; pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD; pub fn GetCurrentProcess() -> HANDLE; pub fn GetCurrentThread() -> HANDLE;