Skip to content

Commit

Permalink
Merge branch 'worktree-encoding'
Browse files Browse the repository at this point in the history
  • Loading branch information
Byron committed Jun 27, 2023
2 parents 3fd5e16 + d1fed3e commit 5af2cf3
Show file tree
Hide file tree
Showing 11 changed files with 406 additions and 72 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions gix-filter/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,6 @@ gix-hash = { version = "^0.11.3", path = "../gix-hash" }
gix-trace = { version = "^0.1.1", path = "../gix-trace" }
gix-object = { version = "^0.31.0", path = "../gix-object" }

encoding_rs = "0.8.32"
bstr = { version = "1.5.0", default-features = false, features = ["std"] }
thiserror = "1.0.38"
68 changes: 43 additions & 25 deletions gix-filter/src/ident.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use bstr::{BStr, ByteSlice, ByteVec};
use std::borrow::Cow;
use crate::clear_and_set_capacity;
use bstr::{ByteSlice, ByteVec};
use std::ops::Range;

/// Undo identifiers like `$Id:<hexsha>$` to `$Id$`. Newlines between dollars are ignored.
pub fn undo(mut input: Cow<'_, BStr>) -> Cow<'_, BStr> {
/// Undo identifiers like `$Id:<hexsha>$` to `$Id$` in `src` and write to `buf`. Newlines between dollars are ignored.
/// Return `true` if `buf` was written or `false` if `src` was left unaltered (as there was nothing to do).
pub fn undo(src: &[u8], buf: &mut Vec<u8>) -> bool {
fn find_range(input: &[u8]) -> Option<Range<usize>> {
let mut ofs = 0;
loop {
Expand All @@ -21,37 +22,54 @@ pub fn undo(mut input: Cow<'_, BStr>) -> Cow<'_, BStr> {
}

let mut ofs = 0;
while let Some(range) = find_range(&input[ofs..]) {
input
.to_mut()
.replace_range((range.start + ofs)..(range.end + ofs), b"$Id$");
ofs += range.start + 4;
let mut initialized = false;
while let Some(range) = find_range(&src[ofs..]) {
if !initialized {
clear_and_set_capacity(buf, src.len());
initialized = true;
}
buf.push_str(&src[ofs..][..range.start]);
buf.push_str(b"$Id$");
ofs += range.end;
}
if initialized {
buf.push_str(&src[ofs..]);
}
input
initialized
}

/// Substitute all occurrences of `$Id$` with `$Id: <hexsha-of-input>$` if present and return the changed buffer, with `object_hash`
/// being used accordingly.
/// Substitute all occurrences of `$Id$` with `$Id: <hexsha-of-input>$` if present in `src` and write all changes to `buf`,
/// with `object_hash` being used accordingly. Return `true` if `buf` was written to or `false` if no change was made
/// (as there was nothing to do).
///
/// ### Deviation
///
/// `Git` also tries to cleanup 'stray' substituted `$Id: <hex>$`, but we don't do that, sticking exactly to what ought to be done.
/// The respective code is up to 16 years old and one might assume that `git` by now handles checking and checkout filters correctly.
pub fn apply(mut input: Cow<'_, BStr>, object_hash: gix_hash::Kind) -> Cow<'_, BStr> {
let mut buf: [u8; b": $".len() + gix_hash::Kind::longest().len_in_hex()] = std::array::from_fn(|_| 0);
pub fn apply(src: &[u8], object_hash: gix_hash::Kind, buf: &mut Vec<u8>) -> bool {
const HASH_LEN: usize = ": ".len() + gix_hash::Kind::longest().len_in_hex();
let mut id = None;
let mut ofs = 0;
while let Some(pos) = input[ofs..].find(b"$Id$") {
let id = id.get_or_insert_with(|| gix_object::compute_hash(object_hash, gix_object::Kind::Blob, &input));
while let Some(pos) = src[ofs..].find(b"$Id$") {
let id = match id {
None => {
let new_id = gix_object::compute_hash(object_hash, gix_object::Kind::Blob, src);
id = new_id.into();
clear_and_set_capacity(buf, src.len() + HASH_LEN); // pre-allocate for one ID
new_id
}
Some(id) => id.to_owned(),
};

buf[..2].copy_from_slice(b": ");
let _ = id.hex_to_buf(&mut buf[2..][..object_hash.len_in_hex()]);
let replaced_id = &mut buf[..2 + object_hash.len_in_hex() + 1];
*replaced_id.last_mut().expect("present") = b'$';
input
.to_mut()
.replace_range((ofs + pos + 3)..(ofs + pos + 4), &*replaced_id);
ofs += pos + 3 + replaced_id.len();
buf.push_str(&src[ofs..][..pos + 3]);
buf.push_str(b": ");
id.write_hex_to(&mut *buf).expect("writes to memory always work");
buf.push(b'$');

ofs += pos + 4;
}
if id.is_some() {
buf.push_str(&src[ofs..]);
}
input
id.is_some()
}
6 changes: 4 additions & 2 deletions gix-filter/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@
//!
//! This crate implements the building blocks in terms of applying and undoing filters, along with logic to decide whether
//! or not to apply such a filter.
#![deny(rust_2018_idioms, missing_docs)]
#![forbid(unsafe_code)]
#![deny(rust_2018_idioms, missing_docs, unsafe_code)]

///
pub mod ident;

/// utilities related to handling line endings in buffers
pub mod eol;

/// Utilities for handling worktree encodings.
pub mod worktree;

fn clear_and_set_capacity(buf: &mut Vec<u8>, cap: usize) {
buf.clear();
if buf.capacity() < cap {
Expand Down
90 changes: 90 additions & 0 deletions gix-filter/src/worktree/encode_to_git.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/// Whether or not to perform round-trip checks after decoding worktree content to UTF-8.
#[derive(Debug, Copy, Clone)]
pub enum RoundTrip {
/// Assure that we can losslessly convert the UTF-8 result back to the original encoding,
/// by re-encoding the decoded output and comparing it to the original input.
Validate,
/// Do not check if the encoding is round-trippable.
Ignore,
}

/// The error returned by [`encode_to_git()`][super::encode_to_git()].
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
// The decoder could not provide an upper bound for the size of the UTF-8 output.
#[error("Cannot convert input of {input_len} bytes to UTF-8 without overflowing")]
Overflow { input_len: usize },
// The decoder reported a malformed byte sequence in the input.
#[error("The input was malformed and could not be decoded as '{encoding}'")]
Malformed { encoding: &'static str },
// Re-encoding the decoded UTF-8 did not reproduce the original input (only with `RoundTrip::Validate`).
#[error("Encoding from '{src_encoding}' to '{dest_encoding}' and back is not the same")]
RoundTrip {
src_encoding: &'static str,
dest_encoding: &'static str,
},
}

pub(crate) mod function {
    use super::{Error, RoundTrip};
    use crate::clear_and_set_capacity;
    use encoding_rs::DecoderResult;

    /// Decode `src` according to `src_encoding` to `UTF-8` for storage in git and place it in `buf`.
    /// Note that the encoding is always applied, there is no conditional even if `src_encoding` already is `UTF-8`.
    ///
    /// If `round_trip` is [`RoundTrip::Validate`], the decoded output is encoded back with `src_encoding`
    /// and compared to `src` to assure the conversion is lossless.
    ///
    /// # Errors
    ///
    /// * [`Error::Overflow`] if the decoder cannot provide an upper bound for the output size.
    /// * [`Error::Malformed`] if `src` is not valid in `src_encoding`.
    /// * [`Error::RoundTrip`] if validation was requested and re-encoding does not reproduce `src`.
    ///
    /// On error the content of `buf` is unspecified, but it never exposes uninitialized memory.
    pub fn encode_to_git(
        src: &[u8],
        src_encoding: &'static encoding_rs::Encoding,
        buf: &mut Vec<u8>,
        round_trip: RoundTrip,
    ) -> Result<(), Error> {
        let mut decoder = src_encoding.new_decoder_with_bom_removal();
        let buf_len = decoder
            .max_utf8_buffer_length_without_replacement(src.len())
            .ok_or(Error::Overflow { input_len: src.len() })?;
        clear_and_set_capacity(buf, buf_len);
        // Zero-fill instead of `set_len()` on spare capacity: the decoder receives a `&mut [u8]`, which must
        // never refer to uninitialized memory, and early returns below must not leak such memory to the caller.
        buf.resize(buf_len, 0);
        let (res, read, written) = decoder.decode_to_utf8_without_replacement(src, buf, true);
        match res {
            DecoderResult::InputEmpty => {
                assert!(
                    buf_len >= written,
                    "encoding_rs estimates the maximum amount of bytes written correctly"
                );
                assert_eq!(read, src.len(), "input buffer should be fully consumed");
                // Keep only what the decoder actually produced.
                buf.truncate(written);
            }
            DecoderResult::OutputFull => {
                unreachable!("we assure that the output buffer is big enough as per the encoder's estimate")
            }
            DecoderResult::Malformed(_, _) => {
                return Err(Error::Malformed {
                    encoding: src_encoding.name(),
                })
            }
        }

        match round_trip {
            RoundTrip::Validate => {
                // SAFETY: we trust `encoding_rs` to output valid UTF-8 only if we ask it to.
                #[allow(unsafe_code)]
                let str = unsafe { std::str::from_utf8_unchecked(buf) };
                let (should_equal_src, _actual_encoding, _had_errors) = src_encoding.encode(str);
                if should_equal_src != src {
                    return Err(Error::RoundTrip {
                        src_encoding: src_encoding.name(),
                        dest_encoding: "UTF-8",
                    });
                }
            }
            RoundTrip::Ignore => {}
        }
        Ok(())
    }
}
69 changes: 69 additions & 0 deletions gix-filter/src/worktree/encode_to_worktree.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/// The error returned by [`encode_to_worktree()`][super::encode_to_worktree()].
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
// The encoder could not provide an upper bound for the size of the encoded output.
#[error("Cannot convert input of {input_len} UTF-8 bytes to target encoding without overflowing")]
Overflow { input_len: usize },
// The input bytes failed UTF-8 validation before encoding.
#[error("Input was not UTF-8 encoded")]
InputAsUtf8(#[from] std::str::Utf8Error),
// The encoder found a character that has no representation in the worktree encoding.
#[error("The character '{character}' could not be mapped to the {worktree_encoding}")]
Unmappable {
character: char,
worktree_encoding: &'static str,
},
}

pub(crate) mod function {
    use super::Error;
    use crate::clear_and_set_capacity;
    use encoding_rs::EncoderResult;

    /// Encode `src_utf8`, which is assumed to be UTF-8 encoded, according to `worktree_encoding` for placement in the working directory,
    /// and write it to `buf`, possibly resizing it.
    /// Note that the encoding is always applied, there is no conditional even if `worktree_encoding` and the `src` encoding are the same.
    ///
    /// # Errors
    ///
    /// * [`Error::Overflow`] if the encoder cannot provide an upper bound for the output size.
    /// * [`Error::InputAsUtf8`] if `src_utf8` is not valid UTF-8.
    /// * [`Error::Unmappable`] if a character cannot be represented in `worktree_encoding`.
    ///
    /// On error the content of `buf` is unspecified, but it never exposes uninitialized memory.
    pub fn encode_to_worktree(
        src_utf8: &[u8],
        worktree_encoding: &'static encoding_rs::Encoding,
        buf: &mut Vec<u8>,
    ) -> Result<(), Error> {
        let mut encoder = worktree_encoding.new_encoder();
        let buf_len = encoder
            .max_buffer_length_from_utf8_if_no_unmappables(src_utf8.len())
            .ok_or(Error::Overflow {
                input_len: src_utf8.len(),
            })?;
        // Validate the input before touching `buf` so that invalid input neither allocates nor leaves
        // `buf` in a half-prepared state.
        let src = std::str::from_utf8(src_utf8)?;
        clear_and_set_capacity(buf, buf_len);
        // Zero-fill instead of `set_len()` on spare capacity: the encoder receives a `&mut [u8]`, which must
        // never refer to uninitialized memory, and early returns below must not leak such memory to the caller.
        buf.resize(buf_len, 0);
        let (res, read, written) = encoder.encode_from_utf8_without_replacement(src, buf, true);
        match res {
            EncoderResult::InputEmpty => {
                assert!(
                    buf_len >= written,
                    "encoding_rs estimates the maximum amount of bytes written correctly"
                );
                assert_eq!(read, src_utf8.len(), "input buffer should be fully consumed");
                // Keep only what the encoder actually produced.
                buf.truncate(written);
            }
            EncoderResult::OutputFull => {
                unreachable!("we assure that the output buffer is big enough as per the encoder's estimate")
            }
            EncoderResult::Unmappable(c) => {
                return Err(Error::Unmappable {
                    worktree_encoding: worktree_encoding.name(),
                    character: c,
                })
            }
        }
        Ok(())
    }
}
31 changes: 31 additions & 0 deletions gix-filter/src/worktree/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use bstr::BStr;
use encoding_rs::Encoding;

/// Error types for the `for_label()` function.
pub mod for_label {
use bstr::BString;

/// The error returned by [for_label()][super::for_label()].
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
// The lookup of the given label did not yield an encoding; `name` is the label that was looked up.
#[error("An encoding named '{name}' is not known")]
Unknown { name: BString },
}
}

/// Try to produce a new `Encoding` for `label` or report an error if it is not known.
///
/// ### Deviation
///
/// * There is no special handling of UTF-16LE/BE with checks if data contains a BOM or not, like `git` as we don't expect to have
/// data available here.
/// * Special `-BOM` suffixed versions of `UTF-16` encodings are not supported.
pub fn for_label<'a>(label: impl Into<&'a BStr>) -> Result<&'static Encoding, for_label::Error> {
let mut label = label.into();
if label == "latin-1" {
label = "ISO-8859-1".into();
}
let enc = Encoding::for_label(label.as_ref()).ok_or_else(|| for_label::Error::Unknown { name: label.into() })?;
Ok(enc)
}
16 changes: 16 additions & 0 deletions gix-filter/src/worktree/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
//! Worktree encodings are powered by the `encoding_rs` crate, which has a narrower focus than the `iconv` library. Thus this implementation
//! is inherently more limited but will handle the common cases.
//!
//! Note that for encoding to legacy formats, [additional normalization steps](https://docs.rs/encoding_rs/0.8.32/encoding_rs/#preparing-text-for-the-encoders)
//! can be taken, which we do not yet take unless there are specific examples or problems to solve.

///
pub mod encoding;

///
pub mod encode_to_git;
pub use encode_to_git::function::encode_to_git;

///
pub mod encode_to_worktree;
pub use encode_to_worktree::function::encode_to_worktree;
1 change: 1 addition & 0 deletions gix-filter/tests/filter.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
mod eol;
mod ident;
mod worktree;

pub type Result<T = ()> = std::result::Result<T, Box<dyn std::error::Error>>;
Loading

0 comments on commit 5af2cf3

Please sign in to comment.