diff --git a/Cargo.lock b/Cargo.lock index b46de69fa80..af721b2f6cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1650,6 +1650,7 @@ name = "gix-filter" version = "0.0.0" dependencies = [ "bstr", + "encoding_rs", "gix-hash 0.11.3", "gix-object 0.31.0", "gix-trace 0.1.1", diff --git a/gix-filter/Cargo.toml b/gix-filter/Cargo.toml index 3e95ead7180..44123379c3d 100644 --- a/gix-filter/Cargo.toml +++ b/gix-filter/Cargo.toml @@ -16,5 +16,6 @@ gix-hash = { version = "^0.11.3", path = "../gix-hash" } gix-trace = { version = "^0.1.1", path = "../gix-trace" } gix-object = { version = "^0.31.0", path = "../gix-object" } +encoding_rs = "0.8.32" bstr = { version = "1.5.0", default-features = false, features = ["std"] } thiserror = "1.0.38" diff --git a/gix-filter/src/ident.rs b/gix-filter/src/ident.rs index 611824a2943..ab390e11c2b 100644 --- a/gix-filter/src/ident.rs +++ b/gix-filter/src/ident.rs @@ -1,9 +1,10 @@ -use bstr::{BStr, ByteSlice, ByteVec}; -use std::borrow::Cow; +use crate::clear_and_set_capacity; +use bstr::{ByteSlice, ByteVec}; use std::ops::Range; -/// Undo identifiers like `$Id:$` to `$Id$`. Newlines between dollars are ignored. -pub fn undo(mut input: Cow<'_, BStr>) -> Cow<'_, BStr> { +/// Undo identifiers like `$Id:$` to `$Id$` in `src` and write to `buf`. Newlines between dollars are ignored. +/// Return `true` if `buf` was written or `false` if `src` was left unaltered (as there was nothing to do). +pub fn undo(src: &[u8], buf: &mut Vec) -> bool { fn find_range(input: &[u8]) -> Option> { let mut ofs = 0; loop { @@ -21,37 +22,54 @@ pub fn undo(mut input: Cow<'_, BStr>) -> Cow<'_, BStr> { } let mut ofs = 0; - while let Some(range) = find_range(&input[ofs..]) { - input - .to_mut() - .replace_range((range.start + ofs)..(range.end + ofs), b"$Id$"); - ofs += range.start + 4; + let mut initialized = false; + while let Some(range) = find_range(&src[ofs..]) { + if !initialized { + clear_and_set_capacity(buf, src.len()); + initialized = true; + } + buf.push_str(&src[ofs..][..range.start]); + buf.push_str(b"$Id$"); + ofs += range.end; + } + if initialized { + buf.push_str(&src[ofs..]); } - input + initialized } -/// Substitute all occurrences of `$Id$` with `$Id: $` if present and return the changed buffer, with `object_hash` -/// being used accordingly. +/// Substitute all occurrences of `$Id$` with `$Id: $` if present in `src` and write all changes to `buf`, +/// with `object_hash` being used accordingly. Return `true` if `buf` was written to or `false` if no change was made +/// (as there was nothing to do). /// /// ### Deviation /// /// `Git` also tries to cleanup 'stray' substituted `$Id: $`, but we don't do that, sticking exactly to what ought to be done. /// The respective code is up to 16 years old and one might assume that `git` by now handles checking and checkout filters correctly. -pub fn apply(mut input: Cow<'_, BStr>, object_hash: gix_hash::Kind) -> Cow<'_, BStr> { - let mut buf: [u8; b": $".len() + gix_hash::Kind::longest().len_in_hex()] = std::array::from_fn(|_| 0); +pub fn apply(src: &[u8], object_hash: gix_hash::Kind, buf: &mut Vec) -> bool { + const HASH_LEN: usize = ": ".len() + gix_hash::Kind::longest().len_in_hex(); let mut id = None; let mut ofs = 0; - while let Some(pos) = input[ofs..].find(b"$Id$") { - let id = id.get_or_insert_with(|| gix_object::compute_hash(object_hash, gix_object::Kind::Blob, &input)); + while let Some(pos) = src[ofs..].find(b"$Id$") { + let id = match id { + None => { + let new_id = gix_object::compute_hash(object_hash, gix_object::Kind::Blob, src); + id = new_id.into(); + clear_and_set_capacity(buf, src.len() + HASH_LEN); // pre-allocate for one ID + new_id + } + Some(id) => id.to_owned(), + }; - buf[..2].copy_from_slice(b": "); - let _ = id.hex_to_buf(&mut buf[2..][..object_hash.len_in_hex()]); - let replaced_id = &mut buf[..2 + object_hash.len_in_hex() + 1]; - *replaced_id.last_mut().expect("present") = b'$'; - input - .to_mut() - .replace_range((ofs + pos + 3)..(ofs + pos + 4), &*replaced_id); - ofs += pos + 3 + replaced_id.len(); + buf.push_str(&src[ofs..][..pos + 3]); + buf.push_str(b": "); + id.write_hex_to(&mut *buf).expect("writes to memory always work"); + buf.push(b'$'); + + ofs += pos + 4; + } + if id.is_some() { + buf.push_str(&src[ofs..]); } - input + id.is_some() } diff --git a/gix-filter/src/lib.rs b/gix-filter/src/lib.rs index b2211625430..543e32a8f95 100644 --- a/gix-filter/src/lib.rs +++ b/gix-filter/src/lib.rs @@ -8,8 +8,7 @@ //! //! This crate implements the building blocks in terms of applying and undoing filters, along with logic to decide whether //! or not to apply such a filter. -#![deny(rust_2018_idioms, missing_docs)] -#![forbid(unsafe_code)] +#![deny(rust_2018_idioms, missing_docs, unsafe_code)] /// pub mod ident; @@ -17,6 +16,9 @@ pub mod ident; /// utilities related to handling line endings in buffers pub mod eol; +/// Utilities for handling worktree encodings. +pub mod worktree; + fn clear_and_set_capacity(buf: &mut Vec, cap: usize) { buf.clear(); if buf.capacity() < cap { diff --git a/gix-filter/src/worktree/encode_to_git.rs b/gix-filter/src/worktree/encode_to_git.rs new file mode 100644 index 00000000000..da1bbf71bf8 --- /dev/null +++ b/gix-filter/src/worktree/encode_to_git.rs @@ -0,0 +1,90 @@ +/// Whether or not to perform round-trip checks. +#[derive(Debug, Copy, Clone)] +pub enum RoundTrip { + /// Assure that we can losslessly convert the UTF-8 result back to the original encoding. + Validate, + /// Do not check if the encoding is round-trippable. + Ignore, +} + +/// The error returned by [`encode_to_git()][super::encode_to_git()]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error("Cannot convert input of {input_len} bytes to UTF-8 without overflowing")] + Overflow { input_len: usize }, + #[error("The input was malformed and could not be decoded as '{encoding}'")] + Malformed { encoding: &'static str }, + #[error("Encoding from '{src_encoding}' to '{dest_encoding}' and back is not the same")] + RoundTrip { + src_encoding: &'static str, + dest_encoding: &'static str, + }, +} + +pub(crate) mod function { + use super::{Error, RoundTrip}; + use crate::clear_and_set_capacity; + use encoding_rs::DecoderResult; + + /// Decode `src` according to `src_encoding` to `UTF-8` for storage in git and place it in `buf`. + /// Note that the encoding is always applied, there is no conditional even if `src_encoding` already is `UTF-8`. + pub fn encode_to_git( + src: &[u8], + src_encoding: &'static encoding_rs::Encoding, + buf: &mut Vec, + round_trip: RoundTrip, + ) -> Result<(), Error> { + let mut decoder = src_encoding.new_decoder_with_bom_removal(); + let buf_len = decoder + .max_utf8_buffer_length_without_replacement(src.len()) + .ok_or(Error::Overflow { input_len: src.len() })?; + clear_and_set_capacity(buf, buf_len); + // SAFETY: `clear_and_set_capacity` assure that we have the given `buf_len` allocated, so setting its length is only making available + // what is allocated. Later we will truncate to the amount of actually written bytes. + #[allow(unsafe_code)] + unsafe { + buf.set_len(buf_len); + } + let (res, read, written) = decoder.decode_to_utf8_without_replacement(src, buf, true); + match res { + DecoderResult::InputEmpty => { + assert!( + buf_len >= written, + "encoding_rs estimates the maximum amount of bytes written correctly" + ); + assert_eq!(read, src.len(), "input buffer should be fully consumed"); + // SAFETY: we trust that `encoding_rs` reports this number correctly, and truncate everything else. + #[allow(unsafe_code)] + unsafe { + buf.set_len(written); + } + } + DecoderResult::OutputFull => { + unreachable!("we assure that the output buffer is big enough as per the encoder's estimate") + } + DecoderResult::Malformed(_, _) => { + return Err(Error::Malformed { + encoding: src_encoding.name(), + }) + } + } + + match round_trip { + RoundTrip::Validate => { + // SAFETY: we trust `encoding_rs` to output valid UTF-8 only if we ask it to. + #[allow(unsafe_code)] + let str = unsafe { std::str::from_utf8_unchecked(buf) }; + let (should_equal_src, _actual_encoding, _had_errors) = src_encoding.encode(str); + if should_equal_src != src { + return Err(Error::RoundTrip { + src_encoding: src_encoding.name(), + dest_encoding: "UTF-8", + }); + } + } + RoundTrip::Ignore => {} + } + Ok(()) + } +} diff --git a/gix-filter/src/worktree/encode_to_worktree.rs b/gix-filter/src/worktree/encode_to_worktree.rs new file mode 100644 index 00000000000..0a534193781 --- /dev/null +++ b/gix-filter/src/worktree/encode_to_worktree.rs @@ -0,0 +1,69 @@ +/// The error returned by [`encode_to_worktree()][super::encode_to_worktree()]. +#[derive(Debug, thiserror::Error)] +#[allow(missing_docs)] +pub enum Error { + #[error("Cannot convert input of {input_len} UTF-8 bytes to target encoding without overflowing")] + Overflow { input_len: usize }, + #[error("Input was not UTF-8 encoded")] + InputAsUtf8(#[from] std::str::Utf8Error), + #[error("The character '{character}' could not be mapped to the {worktree_encoding}")] + Unmappable { + character: char, + worktree_encoding: &'static str, + }, +} + +pub(crate) mod function { + use super::Error; + use crate::clear_and_set_capacity; + use encoding_rs::EncoderResult; + + /// Encode `src_utf8`, which is assumed to be UTF-8 encoded, according to `worktree_encoding` for placement in the working directory, + /// and write it to `buf`, possibly resizing it. + /// Note that the encoding is always applied, there is no conditional even if `worktree_encoding` and the `src` encoding are the same. + pub fn encode_to_worktree( + src_utf8: &[u8], + worktree_encoding: &'static encoding_rs::Encoding, + buf: &mut Vec, + ) -> Result<(), Error> { + let mut encoder = worktree_encoding.new_encoder(); + let buf_len = encoder + .max_buffer_length_from_utf8_if_no_unmappables(src_utf8.len()) + .ok_or(Error::Overflow { + input_len: src_utf8.len(), + })?; + clear_and_set_capacity(buf, buf_len); + // SAFETY: `clear_and_set_capacity` assure that we have the given `buf_len` allocated, so setting its length is only making available + // what is allocated. Later we will truncate to the amount of actually written bytes. + #[allow(unsafe_code)] + unsafe { + buf.set_len(buf_len); + } + let src = std::str::from_utf8(src_utf8)?; + let (res, read, written) = encoder.encode_from_utf8_without_replacement(src, buf, true); + match res { + EncoderResult::InputEmpty => { + assert!( + buf_len >= written, + "encoding_rs estimates the maximum amount of bytes written correctly" + ); + assert_eq!(read, src_utf8.len(), "input buffer should be fully consumed"); + // SAFETY: we trust that `encoding_rs` reports this number correctly, and truncate everything else. + #[allow(unsafe_code)] + unsafe { + buf.set_len(written); + } + } + EncoderResult::OutputFull => { + unreachable!("we assure that the output buffer is big enough as per the encoder's estimate") + } + EncoderResult::Unmappable(c) => { + return Err(Error::Unmappable { + worktree_encoding: worktree_encoding.name(), + character: c, + }) + } + } + Ok(()) + } +} diff --git a/gix-filter/src/worktree/encoding.rs b/gix-filter/src/worktree/encoding.rs new file mode 100644 index 00000000000..0b75adc96aa --- /dev/null +++ b/gix-filter/src/worktree/encoding.rs @@ -0,0 +1,31 @@ +use bstr::BStr; +use encoding_rs::Encoding; + +/// +pub mod for_label { + use bstr::BString; + + /// The error returned by [for_label()][super::for_label()]. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("An encoding named '{name}' is not known")] + Unknown { name: BString }, + } +} + +/// Try to produce a new `Encoding` for `label` or report an error if it is not known. +/// +/// ### Deviation +/// +/// * There is no special handling of UTF-16LE/BE with checks if data contains a BOM or not, like `git` as we don't expect to have +/// data available here. +/// * Special `-BOM` suffixed versions of `UTF-16` encodings are not supported. +pub fn for_label<'a>(label: impl Into<&'a BStr>) -> Result<&'static Encoding, for_label::Error> { + let mut label = label.into(); + if label == "latin-1" { + label = "ISO-8859-1".into(); + } + let enc = Encoding::for_label(label.as_ref()).ok_or_else(|| for_label::Error::Unknown { name: label.into() })?; + Ok(enc) +} diff --git a/gix-filter/src/worktree/mod.rs b/gix-filter/src/worktree/mod.rs new file mode 100644 index 00000000000..3b13ea49ede --- /dev/null +++ b/gix-filter/src/worktree/mod.rs @@ -0,0 +1,16 @@ +//! Worktree encodings are powered by the `encoding_rs` crate, which has a narrower focus than the `iconv` library. Thus this implementation +//! is inherently more limited but will handle the common cases. +//! +//! Note that for encoding to legacy formats, [additional normalization steps](https://docs.rs/encoding_rs/0.8.32/encoding_rs/#preparing-text-for-the-encoders) +//! can be taken, which we do not yet take unless there is specific examples or problems to solve. + +/// +pub mod encoding; + +/// +pub mod encode_to_git; +pub use encode_to_git::function::encode_to_git; + +/// +pub mod encode_to_worktree; +pub use encode_to_worktree::function::encode_to_worktree; diff --git a/gix-filter/tests/filter.rs b/gix-filter/tests/filter.rs index fffceb2a79f..f693813132a 100644 --- a/gix-filter/tests/filter.rs +++ b/gix-filter/tests/filter.rs @@ -1,4 +1,5 @@ mod eol; mod ident; +mod worktree; pub type Result = std::result::Result>; diff --git a/gix-filter/tests/ident/mod.rs b/gix-filter/tests/ident/mod.rs index 58727bbbc43..ad63cf70dd8 100644 --- a/gix-filter/tests/ident/mod.rs +++ b/gix-filter/tests/ident/mod.rs @@ -1,107 +1,111 @@ -use bstr::BStr; -use std::borrow::Cow; - -fn cowstr(input: &str) -> Cow<'_, BStr> { - Cow::Borrowed(input.into()) -} mod undo { - use crate::ident::cowstr; - use std::borrow::Cow; + use bstr::{ByteSlice, B}; #[test] fn no_id_changes_nothing() { - let cow = gix_filter::ident::undo(cowstr("hello")); - assert!(matches!(cow, Cow::Borrowed(_)), "the buffer is not touched"); - assert_eq!(cow.as_ref(), "hello"); + let mut buf = Vec::new(); + let changed = gix_filter::ident::undo(B("hello"), &mut buf); + assert!(!changed, "the buffer is not touched"); + assert_eq!(buf.len(), 0); } #[test] fn empty() { - assert!(matches!(gix_filter::ident::undo(cowstr("")), Cow::Borrowed(_))); + let mut buf = Vec::new(); + assert!( + !gix_filter::ident::undo(B(""), &mut buf), + "nothing to be done in empty buffer" + ); } #[test] fn nothing_if_newline_between_dollars() { - assert!(matches!(gix_filter::ident::undo(cowstr(" $Id: \n$")), Cow::Borrowed(_))); + let mut buf = Vec::new(); + assert!(!gix_filter::ident::undo(B(" $Id: \n$"), &mut buf)); + assert_eq!(buf.len(), 0); } #[test] fn nothing_if_it_is_not_id() { + let mut buf = Vec::new(); assert!( - matches!(gix_filter::ident::undo(cowstr(" $id: something$")), Cow::Borrowed(_)), + !gix_filter::ident::undo(B(" $id: something$"), &mut buf), "it's matching case-sensitively" ); + assert_eq!(buf.len(), 0); } #[test] fn anything_between_dollar_id_dollar() { - assert_eq!( - gix_filter::ident::undo(cowstr(" $Id: something$\nhello")).as_ref(), - " $Id$\nhello" - ); + let mut buf = Vec::new(); + assert!(gix_filter::ident::undo(B(" $Id: something$\nhello"), &mut buf)); + assert_eq!(buf.as_bstr(), " $Id$\nhello"); } #[test] fn multiple() { - assert_eq!( - gix_filter::ident::undo(cowstr( - "$Id: a\n$ $Id: something$\nhello$Id: hex$\nlast $Id:other$\n$Id: \n$" - )) - .as_ref(), - "$Id: a\n$ $Id$\nhello$Id$\nlast $Id$\n$Id: \n$", - ); - assert_eq!( - gix_filter::ident::undo(cowstr("$Id: a\n$$Id:$$Id: hex$\n$Id:other$$Id: $end")).as_ref(), - "$Id: a\n$$Id$$Id$\n$Id$$Id$end", - ); + let mut buf = Vec::new(); + assert!(gix_filter::ident::undo( + B("$Id: a\n$ $Id: something$\nhello$Id: hex$\nlast $Id:other$\n$Id: \n$"), + &mut buf + )); + assert_eq!(buf.as_bstr(), "$Id: a\n$ $Id$\nhello$Id$\nlast $Id$\n$Id: \n$"); + + assert!(gix_filter::ident::undo( + B("$Id: a\n$$Id:$$Id: hex$\n$Id:other$$Id: $end"), + &mut buf + )); + assert_eq!(buf.as_bstr(), "$Id: a\n$$Id$$Id$\n$Id$$Id$end"); } } mod apply { - use crate::ident::cowstr; + use bstr::{ByteSlice, B}; use gix_filter::ident; - use std::borrow::Cow; #[test] fn no_change() { + let mut buf = Vec::new(); for input_no_match in [ "", "nothing", "$ID$ case sensitive matching", "$Id: expanded is ignored$", ] { - let res = ident::apply(cowstr(input_no_match), gix_hash::Kind::Sha1); - assert!( - matches!(res, Cow::Borrowed(_)), - "no substitution happens, so no mutable version of the Cow is created" - ); - assert_eq!(res.as_ref(), input_no_match, "there definitely is no change"); + let changed = ident::apply(input_no_match.as_bytes(), gix_hash::Kind::Sha1, &mut buf); + assert!(!changed, "no substitution happens, nothing to do"); + assert_eq!(buf.len(), 0); } } #[test] fn simple() { - assert_eq!( - ident::apply(cowstr("$Id$"), gix_hash::Kind::Sha1).as_ref(), - "$Id: b3f5ebfb5843bc43ceecff6d4f26bb37c615beb1$" + let mut buf = Vec::new(); + assert!( + ident::apply(B("$Id$"), gix_hash::Kind::Sha1, &mut buf), + "a change happens" ); + assert_eq!(buf.as_bstr(), "$Id: b3f5ebfb5843bc43ceecff6d4f26bb37c615beb1$"); + assert!(ident::apply(B("$Id$ $Id$ foo"), gix_hash::Kind::Sha1, &mut buf)); assert_eq!( - ident::apply(cowstr("$Id$ $Id$"), gix_hash::Kind::Sha1).as_ref(), - "$Id: f6f3176060328ef7030a8b8eeda57fbf0587b2f9$ $Id: f6f3176060328ef7030a8b8eeda57fbf0587b2f9$" + buf.as_bstr(), + "$Id: e230cff7a9624f59eaa28bfb97602c3a03651a49$ $Id: e230cff7a9624f59eaa28bfb97602c3a03651a49$ foo" ); } #[test] fn round_trips() { + let mut buf = Vec::new(); for input in [ "hi\n$Id$\nho\n\t$Id$$Id$$Id$", "$Id$", "$Id$ and one more $Id$ and done", ] { - let res = ident::apply(cowstr(input), gix_hash::Kind::Sha1); - assert_ne!(res.as_ref(), input, "the input was rewritten"); - assert_eq!(ident::undo(res).as_ref(), input, "the filter can be undone perfectly"); + let changed = ident::apply(B(input), gix_hash::Kind::Sha1, &mut buf); + assert!(changed, "the input was rewritten"); + assert!(ident::undo(&buf.clone(), &mut buf), "undo does something as well"); + assert_eq!(buf.as_bstr(), input, "the filter can be undone perfectly"); } } } diff --git a/gix-filter/tests/worktree/mod.rs b/gix-filter/tests/worktree/mod.rs new file mode 100644 index 00000000000..1eb1a8e3f11 --- /dev/null +++ b/gix-filter/tests/worktree/mod.rs @@ -0,0 +1,101 @@ +mod encoding { + mod for_label { + use gix_filter::worktree; + + #[test] + fn unknown() { + assert_eq!( + worktree::encoding::for_label("FOO").unwrap_err().to_string(), + "An encoding named 'FOO' is not known" + ); + } + + #[test] + fn utf32_is_not_supported() { + for enc in ["UTF-32BE", "UTF-32LE", "UTF-32", "UTF-32LE-BOM", "UTF-32BE-BOM"] { + assert!( + matches!( + worktree::encoding::for_label(enc).unwrap_err(), + worktree::encoding::for_label::Error::Unknown { .. } + ), + "it's not needed for the web and this crate is meant for use in firefox" + ); + } + } + + #[test] + fn various_spellings_of_utf_8_are_supported() { + for enc in ["UTF8", "UTF-8", "utf-8", "utf8"] { + let enc = worktree::encoding::for_label(enc).unwrap(); + assert_eq!(enc.name(), "UTF-8"); + } + } + + #[test] + fn various_utf_16_without_bom_suffix_are_supported() { + for label in ["UTF-16BE", "UTF-16LE"] { + let enc = worktree::encoding::for_label(label).unwrap(); + assert_eq!(enc.name(), label); + } + } + + #[test] + fn various_utf_16_with_bom_suffix_are_unsupported() { + for label in ["UTF-16BE-BOM", "UTF-16LE-BOM"] { + assert!( + matches!( + worktree::encoding::for_label(label).unwrap_err(), + worktree::encoding::for_label::Error::Unknown { .. } + ), + "git supports these and has special handling, but we have not for now. Git has no tests for that either." + ); + } + } + + #[test] + fn latin_1_is_supported_with_fallback() { + let enc = worktree::encoding::for_label("latin-1").unwrap(); + assert_eq!( + enc.name(), + "windows-1252", + "the encoding crate has its own fallback for ISO-8859-1 which we try to use" + ); + } + } +} + +mod encode_to_git { + use bstr::ByteSlice; + use gix_filter::worktree; + use gix_filter::worktree::encode_to_git::RoundTrip; + + #[test] + fn simple() -> crate::Result { + let input = &b"hello"[..]; + for round_trip in [RoundTrip::Ignore, RoundTrip::Validate] { + let mut buf = Vec::new(); + worktree::encode_to_git(input, encoding_rs::UTF_8, &mut buf, round_trip)?; + assert_eq!(buf.as_bstr(), input) + } + Ok(()) + } +} + +mod encode_to_worktree { + use bstr::ByteSlice; + use gix_filter::worktree; + use gix_filter::worktree::encode_to_git::RoundTrip; + + #[test] + fn shift_jis() -> crate::Result { + let input = "ハローワールド"; + let mut buf = Vec::new(); + worktree::encode_to_worktree(input.as_bytes(), encoding_rs::SHIFT_JIS, &mut buf)?; + + let mut re_encoded = Vec::new(); + worktree::encode_to_git(&buf, encoding_rs::SHIFT_JIS, &mut re_encoded, RoundTrip::Validate)?; + + assert_eq!(re_encoded.as_bstr(), input, "this should be round-trippable too"); + Ok(()) + } +}