Merge branch 'worktree-encoding'

Byron · Jun 27, 2023 · 5af2cf3 · 5af2cf3
2 parents 3fd5e16 + d1fed3e
commit 5af2cf3
Show file tree

Hide file tree

Showing 11 changed files with 406 additions and 72 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/gix-filter/Cargo.toml b/gix-filter/Cargo.toml
@@ -16,5 +16,6 @@ gix-hash = { version = "^0.11.3", path = "../gix-hash" }
 gix-trace = { version = "^0.1.1", path = "../gix-trace" }
 gix-object = { version = "^0.31.0", path = "../gix-object" }
 
+encoding_rs = "0.8.32"
 bstr = { version = "1.5.0", default-features = false, features = ["std"] }
 thiserror = "1.0.38"
diff --git a/gix-filter/src/ident.rs b/gix-filter/src/ident.rs
@@ -1,9 +1,10 @@
-use bstr::{BStr, ByteSlice, ByteVec};
-use std::borrow::Cow;
+use crate::clear_and_set_capacity;
+use bstr::{ByteSlice, ByteVec};
 use std::ops::Range;
 
-/// Undo identifiers like `$Id:<hexsha>$` to `$Id$`. Newlines between dollars are ignored.
-pub fn undo(mut input: Cow<'_, BStr>) -> Cow<'_, BStr> {
+/// Undo identifiers like `$Id:<hexsha>$` to `$Id$` in `src` and write to `buf`. Newlines between dollars are ignored.
+/// Return `true` if `buf` was written or `false` if `src` was left unaltered (as there was nothing to do).
+pub fn undo(src: &[u8], buf: &mut Vec<u8>) -> bool {
     fn find_range(input: &[u8]) -> Option<Range<usize>> {
         let mut ofs = 0;
         loop {
@@ -21,37 +22,54 @@ pub fn undo(mut input: Cow<'_, BStr>) -> Cow<'_, BStr> {
     }
 
     let mut ofs = 0;
-    while let Some(range) = find_range(&input[ofs..]) {
-        input
-            .to_mut()
-            .replace_range((range.start + ofs)..(range.end + ofs), b"$Id$");
-        ofs += range.start + 4;
+    let mut initialized = false;
+    while let Some(range) = find_range(&src[ofs..]) {
+        if !initialized {
+            clear_and_set_capacity(buf, src.len());
+            initialized = true;
+        }
+        buf.push_str(&src[ofs..][..range.start]);
+        buf.push_str(b"$Id$");
+        ofs += range.end;
+    }
+    if initialized {
+        buf.push_str(&src[ofs..]);
     }
-    input
+    initialized
 }
 
-/// Substitute all occurrences of `$Id$` with `$Id: <hexsha-of-input>$` if present and return the changed buffer, with `object_hash`
-/// being used accordingly.
+/// Substitute all occurrences of `$Id$` with `$Id: <hexsha-of-input>$` if present in `src` and write all changes to `buf`,
+/// with `object_hash` being used accordingly. Return `true` if `buf` was written to or `false` if no change was made
+/// (as there was nothing to do).
 ///
 /// ### Deviation
 ///
 /// `Git` also tries to cleanup 'stray' substituted `$Id: <hex>$`, but we don't do that, sticking exactly to what ought to be done.
 /// The respective code is up to 16 years old and one might assume that `git` by now handles checking and checkout filters correctly.
-pub fn apply(mut input: Cow<'_, BStr>, object_hash: gix_hash::Kind) -> Cow<'_, BStr> {
-    let mut buf: [u8; b": $".len() + gix_hash::Kind::longest().len_in_hex()] = std::array::from_fn(|_| 0);
+pub fn apply(src: &[u8], object_hash: gix_hash::Kind, buf: &mut Vec<u8>) -> bool {
+    const HASH_LEN: usize = ": ".len() + gix_hash::Kind::longest().len_in_hex();
     let mut id = None;
     let mut ofs = 0;
-    while let Some(pos) = input[ofs..].find(b"$Id$") {
-        let id = id.get_or_insert_with(|| gix_object::compute_hash(object_hash, gix_object::Kind::Blob, &input));
+    while let Some(pos) = src[ofs..].find(b"$Id$") {
+        let id = match id {
+            None => {
+                let new_id = gix_object::compute_hash(object_hash, gix_object::Kind::Blob, src);
+                id = new_id.into();
+                clear_and_set_capacity(buf, src.len() + HASH_LEN); // pre-allocate for one ID
+                new_id
+            }
+            Some(id) => id.to_owned(),
+        };
 
-        buf[..2].copy_from_slice(b": ");
-        let _ = id.hex_to_buf(&mut buf[2..][..object_hash.len_in_hex()]);
-        let replaced_id = &mut buf[..2 + object_hash.len_in_hex() + 1];
-        *replaced_id.last_mut().expect("present") = b'$';
-        input
-            .to_mut()
-            .replace_range((ofs + pos + 3)..(ofs + pos + 4), &*replaced_id);
-        ofs += pos + 3 + replaced_id.len();
+        buf.push_str(&src[ofs..][..pos + 3]);
+        buf.push_str(b": ");
+        id.write_hex_to(&mut *buf).expect("writes to memory always work");
+        buf.push(b'$');
+
+        ofs += pos + 4;
+    }
+    if id.is_some() {
+        buf.push_str(&src[ofs..]);
     }
-    input
+    id.is_some()
 }
diff --git a/gix-filter/src/lib.rs b/gix-filter/src/lib.rs
@@ -8,15 +8,17 @@
 //!
 //! This crate implements the building blocks in terms of applying and undoing filters, along with logic to decide whether
 //! or not to apply such a filter.
-#![deny(rust_2018_idioms, missing_docs)]
-#![forbid(unsafe_code)]
+#![deny(rust_2018_idioms, missing_docs, unsafe_code)]
 
 ///
 pub mod ident;
 
 /// utilities related to handling line endings in buffers
 pub mod eol;
 
+/// Utilities for handling worktree encodings.
+pub mod worktree;
+
 fn clear_and_set_capacity(buf: &mut Vec<u8>, cap: usize) {
     buf.clear();
     if buf.capacity() < cap {

diff --git a/gix-filter/src/worktree/encode_to_git.rs b/gix-filter/src/worktree/encode_to_git.rs
@@ -0,0 +1,90 @@
+/// Whether or not `encode_to_git()` verifies that its UTF-8 output can be converted back to the original input.
+#[derive(Debug, Copy, Clone)]
+pub enum RoundTrip {
+    /// Encode the UTF-8 result back into the source encoding and fail if it differs from the original input.
+    Validate,
+    /// Do not check if the encoding is round-trippable.
+    Ignore,
+}
+
+/// The error returned by [`encode_to_git()`][super::encode_to_git()].
+#[derive(Debug, thiserror::Error)]
+#[allow(missing_docs)]
+pub enum Error {
+    #[error("Cannot convert input of {input_len} bytes to UTF-8 without overflowing")]
+    Overflow { input_len: usize },
+    #[error("The input was malformed and could not be decoded as '{encoding}'")]
+    Malformed { encoding: &'static str },
+    #[error("Encoding from '{src_encoding}' to '{dest_encoding}' and back is not the same")]
+    RoundTrip {
+        src_encoding: &'static str,
+        dest_encoding: &'static str,
+    },
+}
+
+pub(crate) mod function {
+    use super::{Error, RoundTrip};
+    use crate::clear_and_set_capacity;
+    use encoding_rs::DecoderResult;
+
+    /// Decode `src` according to `src_encoding` to `UTF-8` for storage in git and place it in `buf`.
+    /// Note that the encoding is always applied, there is no conditional even if `src_encoding` already is `UTF-8`.
+    ///
+    /// With [`RoundTrip::Validate`], the decoded text is encoded back with `src_encoding` and must equal `src`.
+    /// On error, `buf` holds at most the bytes decoded up to that point and never exposes uninitialized memory.
+    pub fn encode_to_git(
+        src: &[u8],
+        src_encoding: &'static encoding_rs::Encoding,
+        buf: &mut Vec<u8>,
+        round_trip: RoundTrip,
+    ) -> Result<(), Error> {
+        let mut decoder = src_encoding.new_decoder_with_bom_removal();
+        let buf_len = decoder
+            .max_utf8_buffer_length_without_replacement(src.len())
+            .ok_or(Error::Overflow { input_len: src.len() })?;
+        clear_and_set_capacity(buf, buf_len);
+        // Zero-fill rather than calling `set_len()` on spare capacity: `Vec::set_len()` requires the exposed elements
+        // to be initialized, and the decoder is handed the buffer as a plain `&mut [u8]`.
+        buf.resize(buf_len, 0);
+        let (res, read, written) = decoder.decode_to_utf8_without_replacement(src, buf, true);
+        // Cut off the unused zeroed tail for every outcome, so failures do not leave filler bytes behind either.
+        buf.truncate(written);
+        match res {
+            DecoderResult::InputEmpty => {
+                assert!(
+                    buf_len >= written,
+                    "encoding_rs estimates the maximum amount of bytes written correctly"
+                );
+                assert_eq!(read, src.len(), "input buffer should be fully consumed");
+            }
+            DecoderResult::OutputFull => {
+                unreachable!("we assure that the output buffer is big enough as per the encoder's estimate")
+            }
+            DecoderResult::Malformed(_, _) => {
+                return Err(Error::Malformed {
+                    encoding: src_encoding.name(),
+                })
+            }
+        }

+        match round_trip {
+            RoundTrip::Validate => {
+                // SAFETY: `buf` now contains exactly the bytes `encoding_rs` wrote, and we trust it to output
+                //          valid UTF-8 only if we ask it to.
+                #[allow(unsafe_code)]
+                let str = unsafe { std::str::from_utf8_unchecked(buf) };
+                // NOTE(review): per the `encoding_rs` docs, `encode()` outputs UTF-8 for UTF-16LE/BE, so the
+                // comparison below presumably always fails for those source encodings - confirm and handle if needed.
+                let (should_equal_src, _actual_encoding, _had_errors) = src_encoding.encode(str);
+                if should_equal_src != src {
+                    return Err(Error::RoundTrip {
+                        src_encoding: src_encoding.name(),
+                        dest_encoding: "UTF-8",
+                    });
+                }
+            }
+            RoundTrip::Ignore => {}
+        }
+        Ok(())
+    }
+}
diff --git a/gix-filter/src/worktree/encode_to_worktree.rs b/gix-filter/src/worktree/encode_to_worktree.rs
@@ -0,0 +1,69 @@
+/// The error returned by [`encode_to_worktree()`][super::encode_to_worktree()].
+#[derive(Debug, thiserror::Error)]
+#[allow(missing_docs)]
+pub enum Error {
+    #[error("Cannot convert input of {input_len} UTF-8 bytes to target encoding without overflowing")]
+    Overflow { input_len: usize },
+    #[error("Input was not UTF-8 encoded")]
+    InputAsUtf8(#[from] std::str::Utf8Error),
+    #[error("The character '{character}' could not be mapped to the {worktree_encoding}")]
+    Unmappable {
+        character: char,
+        worktree_encoding: &'static str,
+    },
+}
+
+pub(crate) mod function {
+    use super::Error;
+    use crate::clear_and_set_capacity;
+    use encoding_rs::EncoderResult;
+
+    /// Encode `src_utf8`, which is assumed to be UTF-8 encoded, according to `worktree_encoding` for placement in the working directory,
+    /// and write it to `buf`, possibly resizing it.
+    /// Note that the encoding is always applied, there is no conditional even if `worktree_encoding` and the `src` encoding are the same.
+    ///
+    /// If `src_utf8` is not valid UTF-8, `buf` is left untouched. If a character cannot be mapped, `buf` holds only the
+    /// bytes encoded before it. In no case does `buf` expose uninitialized memory.
+    pub fn encode_to_worktree(
+        src_utf8: &[u8],
+        worktree_encoding: &'static encoding_rs::Encoding,
+        buf: &mut Vec<u8>,
+    ) -> Result<(), Error> {
+        let mut encoder = worktree_encoding.new_encoder();
+        let buf_len = encoder
+            .max_buffer_length_from_utf8_if_no_unmappables(src_utf8.len())
+            .ok_or(Error::Overflow {
+                input_len: src_utf8.len(),
+            })?;
+        // Validate the input before touching `buf`, so that a failure here neither allocates nor changes the
+        // caller's buffer.
+        let src = std::str::from_utf8(src_utf8)?;
+        clear_and_set_capacity(buf, buf_len);
+        // Zero-fill rather than calling `set_len()` on spare capacity: `Vec::set_len()` requires the exposed elements
+        // to be initialized, and the encoder is handed the buffer as a plain `&mut [u8]`.
+        buf.resize(buf_len, 0);
+        let (res, read, written) = encoder.encode_from_utf8_without_replacement(src, buf, true);
+        // Cut off the unused zeroed tail for every outcome, so that failures do not leave filler bytes
+        // behind either.
+        buf.truncate(written);
+        match res {
+            EncoderResult::InputEmpty => {
+                assert!(
+                    buf_len >= written,
+                    "encoding_rs estimates the maximum amount of bytes written correctly"
+                );
+                assert_eq!(read, src_utf8.len(), "input buffer should be fully consumed");
+            }
+            EncoderResult::OutputFull => {
+                unreachable!("we assure that the output buffer is big enough as per the encoder's estimate")
+            }
+            EncoderResult::Unmappable(c) => {
+                return Err(Error::Unmappable {
+                    worktree_encoding: worktree_encoding.name(),
+                    character: c,
+                })
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/gix-filter/src/worktree/encoding.rs b/gix-filter/src/worktree/encoding.rs
@@ -0,0 +1,31 @@
+use bstr::BStr;
+use encoding_rs::Encoding;
+
+/// The error type produced by `for_label()`.
+pub mod for_label {
+    use bstr::BString;
+
+    /// The error returned by [`for_label()`][super::for_label()].
+    #[derive(Debug, thiserror::Error)]
+    #[allow(missing_docs)]
+    pub enum Error {
+        #[error("An encoding named '{name}' is not known")]
+        Unknown { name: BString },
+    }
+}
+
+/// Try to produce a new `Encoding` for `label` or report an error if it is not known.
+/// The `latin-1` alias is matched ASCII case-insensitively, like every other label, and looked up as `ISO-8859-1`.
+///
+/// ### Deviation
+///
+/// * Unlike `git`, there is no special handling of UTF-16LE/BE that checks whether the data contains a BOM, as we don't
+///   expect to have data available here.
+/// * Special `-BOM` suffixed versions of `UTF-16` encodings are not supported.
+pub fn for_label<'a>(label: impl Into<&'a BStr>) -> Result<&'static Encoding, for_label::Error> {
+    let mut label = label.into();
+    if label.eq_ignore_ascii_case(b"latin-1") {
+        label = "ISO-8859-1".into();
+    }
+    Encoding::for_label(label.as_ref()).ok_or_else(|| for_label::Error::Unknown { name: label.into() })
+}
diff --git a/gix-filter/src/worktree/mod.rs b/gix-filter/src/worktree/mod.rs
@@ -0,0 +1,16 @@
+//! Worktree encodings are powered by the `encoding_rs` crate, which has a narrower focus than the `iconv` library. Thus this implementation
+//! is inherently more limited but will handle the common cases.
+//!  
+//! Note that for encoding to legacy formats, [additional normalization steps](https://docs.rs/encoding_rs/0.8.32/encoding_rs/#preparing-text-for-the-encoders)
+//! can be taken, which we do not yet take unless there are specific examples or problems to solve.
+
+///
+pub mod encoding;
+
+///
+pub mod encode_to_git;
+pub use encode_to_git::function::encode_to_git;
+
+///
+pub mod encode_to_worktree;
+pub use encode_to_worktree::function::encode_to_worktree;
diff --git a/gix-filter/tests/filter.rs b/gix-filter/tests/filter.rs
@@ -1,4 +1,5 @@
 mod eol;
 mod ident;
+mod worktree;
 
 pub type Result<T = ()> = std::result::Result<T, Box<dyn std::error::Error>>;