From 73b65746e81a31c03cbd3751966eb399073f2d9a Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Thu, 20 Apr 2023 08:58:14 +0100 Subject: [PATCH 1/2] Fix Unreadable non-UTF-8 output on localized MSVC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #35785 by converting non UTF-8 linker output to Unicode using the OEM code page. Before: ```text = note: Non-UTF-8 output: LINK : fatal error LNK1181: cannot open input file \'m\x84rchenhaft.obj\'\r\n ``` After: ```text = note: LINK : fatal error LNK1181: cannot open input file 'märchenhaft.obj' ``` The difference is more dramatic if using a non-ascii language pack for Visual Studio. --- Cargo.lock | 1 + compiler/rustc_codegen_ssa/Cargo.toml | 4 ++ compiler/rustc_codegen_ssa/src/back/link.rs | 55 ++++++++++++++++++- .../msvc-non-utf8-output.rs | 6 ++ .../msvc-non-utf8-output.stderr | 7 +++ 5 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 tests/ui/native-library-link-flags/msvc-non-utf8-output.rs create mode 100644 tests/ui/native-library-link-flags/msvc-non-utf8-output.stderr diff --git a/Cargo.lock b/Cargo.lock index 06a2a36f4552b..cd319574f7977 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3250,6 +3250,7 @@ dependencies = [ "tempfile", "thorin-dwp", "tracing", + "windows 0.46.0", ] [[package]] diff --git a/compiler/rustc_codegen_ssa/Cargo.toml b/compiler/rustc_codegen_ssa/Cargo.toml index a421535c9b46f..4f73b731f5a2a 100644 --- a/compiler/rustc_codegen_ssa/Cargo.toml +++ b/compiler/rustc_codegen_ssa/Cargo.toml @@ -49,3 +49,7 @@ libc = "0.2.50" version = "0.30.1" default-features = false features = ["read_core", "elf", "macho", "pe", "unaligned", "archive", "write"] + +[target.'cfg(windows)'.dependencies.windows] +version = "0.46.0" +features = ["Win32_Globalization"] diff --git a/compiler/rustc_codegen_ssa/src/back/link.rs b/compiler/rustc_codegen_ssa/src/back/link.rs index 02e21e74fadc8..feab57e98208f 100644 --- 
a/compiler/rustc_codegen_ssa/src/back/link.rs +++ b/compiler/rustc_codegen_ssa/src/back/link.rs @@ -857,7 +857,7 @@ fn link_natively<'a>( if !prog.status.success() { let mut output = prog.stderr.clone(); output.extend_from_slice(&prog.stdout); - let escaped_output = escape_string(&output); + let escaped_output = escape_linker_output(&output, flavor); // FIXME: Add UI tests for this error. let err = errors::LinkingFailed { linker_path: &linker_path, @@ -1049,6 +1049,59 @@ fn escape_string(s: &[u8]) -> String { } } +#[cfg(not(windows))] +fn escape_linker_output(s: &[u8], _flavour: LinkerFlavor) -> String { + escape_string(s) +} + +/// If the output of the msvc linker is not UTF-8 and the host is Windows, +/// then try to convert the string from the OEM encoding. +#[cfg(windows)] +fn escape_linker_output(s: &[u8], flavour: LinkerFlavor) -> String { + // This only applies to the actual MSVC linker. + if flavour != LinkerFlavor::Msvc(Lld::No) { + return escape_string(s); + } + match str::from_utf8(s) { + Ok(s) => return s.to_owned(), + Err(_) if s.len() <= i32::MAX as usize => { + use windows::Win32::Globalization::{ + GetLocaleInfoEx, MultiByteToWideChar, CP_OEMCP, LOCALE_IUSEUTF8LEGACYOEMCP, + LOCALE_NAME_SYSTEM_DEFAULT, LOCALE_RETURN_NUMBER, MB_ERR_INVALID_CHARS, + }; + // Get the legacy system OEM code page. + let code_page = unsafe { + let mut cp: u32 = 0; + // We're using the `LOCALE_RETURN_NUMBER` flag to return a u32. + // But the API requires us to pass the data as though it's a [u16] string. + let len = std::mem::size_of::<u32>() / std::mem::size_of::<u16>(); + let data = std::slice::from_raw_parts_mut(&mut cp as *mut u32 as *mut u16, len); + let len_written = GetLocaleInfoEx( + LOCALE_NAME_SYSTEM_DEFAULT, + LOCALE_IUSEUTF8LEGACYOEMCP | LOCALE_RETURN_NUMBER, + Some(data), + ); + if len_written as usize == len { cp } else { CP_OEMCP } + }; + // Error if the string is not valid for the expected code page. 
+ let flags = MB_ERR_INVALID_CHARS; + // Call MultiByteToWideChar twice. + // First to calculate the length then to convert the string. + let mut len = unsafe { MultiByteToWideChar(code_page, flags, s, None) }; + if len > 0 { + let mut utf16 = vec![0; len as usize]; + len = unsafe { MultiByteToWideChar(code_page, flags, s, Some(&mut utf16)) }; + if len > 0 { + return String::from_utf16_lossy(&utf16[..len as usize]); + } + } + } + _ => {} + }; + // The string is not UTF-8 and isn't valid for the OEM code page + format!("Non-UTF-8 output: {}", s.escape_ascii()) +} + fn add_sanitizer_libraries(sess: &Session, crate_type: CrateType, linker: &mut dyn Linker) { // On macOS the runtimes are distributed as dylibs which should be linked to // both executables and dynamic shared objects. Everywhere else the runtimes diff --git a/tests/ui/native-library-link-flags/msvc-non-utf8-output.rs b/tests/ui/native-library-link-flags/msvc-non-utf8-output.rs new file mode 100644 index 0000000000000..3fb2842d694cc --- /dev/null +++ b/tests/ui/native-library-link-flags/msvc-non-utf8-output.rs @@ -0,0 +1,6 @@ +// build-fail +// compile-flags:-C link-arg=märchenhaft +// only-msvc +// error-pattern:= note: LINK : fatal error LNK1181: +// normalize-stderr-test "(\s*\|\n)\s*= note: .*\n" -> "$1" +pub fn main() {} diff --git a/tests/ui/native-library-link-flags/msvc-non-utf8-output.stderr b/tests/ui/native-library-link-flags/msvc-non-utf8-output.stderr new file mode 100644 index 0000000000000..f843aad782c30 --- /dev/null +++ b/tests/ui/native-library-link-flags/msvc-non-utf8-output.stderr @@ -0,0 +1,7 @@ +error: linking with `link.exe` failed: exit code: 1181 + | + = note: LINK : fatal error LNK1181: cannot open input file 'märchenhaft.obj' + + +error: aborting due to previous error + From 9b9d39e43f3d8723b36f7e4b9ecafa36203fde45 Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Thu, 27 Apr 2023 09:27:23 +0100 Subject: [PATCH 2/2] Abstract `MultiByteToWideChar` --- 
compiler/rustc_codegen_ssa/src/back/link.rs | 90 +++++++++++++-------- 1 file changed, 57 insertions(+), 33 deletions(-) diff --git a/compiler/rustc_codegen_ssa/src/back/link.rs b/compiler/rustc_codegen_ssa/src/back/link.rs index feab57e98208f..fe21986884f06 100644 --- a/compiler/rustc_codegen_ssa/src/back/link.rs +++ b/compiler/rustc_codegen_ssa/src/back/link.rs @@ -1064,42 +1064,66 @@ fn escape_linker_output(s: &[u8], flavour: LinkerFlavor) -> String { } match str::from_utf8(s) { Ok(s) => return s.to_owned(), - Err(_) if s.len() <= i32::MAX as usize => { - use windows::Win32::Globalization::{ - GetLocaleInfoEx, MultiByteToWideChar, CP_OEMCP, LOCALE_IUSEUTF8LEGACYOEMCP, - LOCALE_NAME_SYSTEM_DEFAULT, LOCALE_RETURN_NUMBER, MB_ERR_INVALID_CHARS, - }; - // Get the legacy system OEM code page. - let code_page = unsafe { - let mut cp: u32 = 0; - // We're using the `LOCALE_RETURN_NUMBER` flag to return a u32. - // But the API requires us to pass the data as though it's a [u16] string. - let len = std::mem::size_of::<u32>() / std::mem::size_of::<u16>(); - let data = std::slice::from_raw_parts_mut(&mut cp as *mut u32 as *mut u16, len); - let len_written = GetLocaleInfoEx( - LOCALE_NAME_SYSTEM_DEFAULT, - LOCALE_IUSEUTF8LEGACYOEMCP | LOCALE_RETURN_NUMBER, - Some(data), - ); - if len_written as usize == len { cp } else { CP_OEMCP } - }; - // Error if the string is not valid for the expected code page. - let flags = MB_ERR_INVALID_CHARS; - // Call MultiByteToWideChar twice. - // First to calculate the length then to convert the string. - let mut len = unsafe { MultiByteToWideChar(code_page, flags, s, None) }; + Err(_) => match win::locale_byte_str_to_string(s, win::oem_code_page()) { + Some(s) => s, + // The string is not UTF-8 and isn't valid for the OEM code page + None => format!("Non-UTF-8 output: {}", s.escape_ascii()), + }, + } +} + +/// Wrappers around the Windows API. 
+#[cfg(windows)] +mod win { + use windows::Win32::Globalization::{ + GetLocaleInfoEx, MultiByteToWideChar, CP_OEMCP, LOCALE_IUSEUTF8LEGACYOEMCP, + LOCALE_NAME_SYSTEM_DEFAULT, LOCALE_RETURN_NUMBER, MB_ERR_INVALID_CHARS, + }; + + /// Get the Windows system OEM code page. This is most notably the code page + /// used for link.exe's output. + pub fn oem_code_page() -> u32 { + unsafe { + let mut cp: u32 = 0; + // We're using the `LOCALE_RETURN_NUMBER` flag to return a u32. + // But the API requires us to pass the data as though it's a [u16] string. + let len = std::mem::size_of::<u32>() / std::mem::size_of::<u16>(); + let data = std::slice::from_raw_parts_mut(&mut cp as *mut u32 as *mut u16, len); + let len_written = GetLocaleInfoEx( + LOCALE_NAME_SYSTEM_DEFAULT, + LOCALE_IUSEUTF8LEGACYOEMCP | LOCALE_RETURN_NUMBER, + Some(data), + ); + if len_written as usize == len { cp } else { CP_OEMCP } + } + } + /// Try to convert a multi-byte string to a UTF-8 string using the given code page + /// The string does not need to be null terminated. + /// + /// This is implemented as a wrapper around `MultiByteToWideChar`. + /// See <https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar> + /// + /// It will fail if the multi-byte string is longer than `i32::MAX` or if it contains + /// any invalid bytes for the expected encoding. + pub fn locale_byte_str_to_string(s: &[u8], code_page: u32) -> Option<String> { + // `MultiByteToWideChar` requires a length to be a "positive integer". + if s.len() > isize::MAX as usize { + return None; + } + // Error if the string is not valid for the expected code page. + let flags = MB_ERR_INVALID_CHARS; + // Call MultiByteToWideChar twice. + // First to calculate the length then to convert the string. 
+ let mut len = unsafe { MultiByteToWideChar(code_page, flags, s, None) }; + if len > 0 { + let mut utf16 = vec![0; len as usize]; + len = unsafe { MultiByteToWideChar(code_page, flags, s, Some(&mut utf16)) }; if len > 0 { - let mut utf16 = vec![0; len as usize]; - len = unsafe { MultiByteToWideChar(code_page, flags, s, Some(&mut utf16)) }; - if len > 0 { - return String::from_utf16_lossy(&utf16[..len as usize]); - } + return utf16.get(..len as usize).map(String::from_utf16_lossy); } } - _ => {} - }; - // The string is not UTF-8 and isn't valid for the OEM code page - format!("Non-UTF-8 output: {}", s.escape_ascii()) + None + } } fn add_sanitizer_libraries(sess: &Session, crate_type: CrateType, linker: &mut dyn Linker) {