From be30309ea0581324f89c1ce2c6f6aa7af85cb0f9 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 20 Mar 2024 13:44:36 -0400
Subject: [PATCH] deps: update simdutf to 5.0.0

PR-URL: https://github.com/nodejs/node/pull/52138
Reviewed-By: Yagiz Nizipli <yagiz.nizipli@sentry.io>
Reviewed-By: Michael Dawson <midawson@redhat.com>
Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com>
Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com>
---
 deps/simdutf/simdutf.cpp | 4532 ++++++++++++++++++++++++++++++++++++--
 deps/simdutf/simdutf.h   |  185 +-
 2 files changed, 4522 insertions(+), 195 deletions(-)

diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp
index d9b854a8cc1a3a..8452ff3896c4da 100644
--- a/deps/simdutf/simdutf.cpp
+++ b/deps/simdutf/simdutf.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-01-29 10:40:15 -0500. Do not edit! */
+/* auto-generated on 2024-03-18 10:58:28 -0400. Do not edit! */
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
 /* begin file src/implementation.cpp */
@@ -140,7 +140,10 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
-
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; // base64 entry points added by the simdutf 5.0.0 update (arm64 kernel)
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept; // reports through `result` rather than a bare count
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept; // takes only a length, no input buffer
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept; // the one base64 method not tagged simdutf_warn_unused
 };
 
 } // namespace arm64
@@ -179,6 +182,20 @@ simdutf_really_inline int count_ones(uint64_t input_num) {
    return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+#ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
+  return __builtin_ctzll(input_num);
+#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+  // Regular Visual Studio path: scan upward from the least significant
+  // bit for the first set bit and hand back its index.
+  unsigned long bit_index;
+  _BitScanForward64(&bit_index, input_num);
+  return static_cast<int>(bit_index);
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif // SIMDUTF_NEED_TRAILING_ZEROES
+
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
@@ -1340,6 +1357,10 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; // base64 entry points added by the simdutf 5.0.0 update (icelake kernel)
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept; // reports through `result` rather than a bare count
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept; // takes only a length, no input buffer
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept; // the one base64 method not tagged simdutf_warn_unused
 };
 
 } // namespace icelake
@@ -1385,6 +1406,16 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 }
 #endif
 
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) { // counts the trailing zero bits of input_num (icelake kernel)
+#if SIMDUTF_REGULAR_VISUAL_STUDIO
+  return (int)_tzcnt_u64(input_num); // Visual Studio path: TZCNT intrinsic, narrowed to int
+#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+  return __builtin_ctzll(input_num);
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif // SIMDUTF_NEED_TRAILING_ZEROES
+
 } // unnamed namespace
 } // namespace icelake
 } // namespace simdutf
@@ -1547,6 +1578,10 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; // base64 entry points added by the simdutf 5.0.0 update (haswell kernel)
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept; // reports through `result` rather than a bare count
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept; // takes only a length, no input buffer
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept; // the one base64 method not tagged simdutf_warn_unused
 };
 
 } // namespace haswell
@@ -1656,6 +1691,16 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 }
 #endif
 
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_inline int trailing_zeroes(uint64_t input_num) { // counts the trailing zero bits of input_num; NOTE(review): simdutf_inline here, simdutf_really_inline in the other kernels - confirm intentional
+#if SIMDUTF_REGULAR_VISUAL_STUDIO
+  return (int)_tzcnt_u64(input_num); // Visual Studio path: TZCNT intrinsic, narrowed to int
+#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+  return __builtin_ctzll(input_num);
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif // SIMDUTF_NEED_TRAILING_ZEROES
+
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
@@ -2478,6 +2523,10 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; // base64 entry points added by the simdutf 5.0.0 update (westmere kernel)
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept; // reports through `result` rather than a bare count
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept; // takes only a length, no input buffer
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept; // the one base64 method not tagged simdutf_warn_unused
 };
 
 } // namespace westmere
@@ -2564,6 +2613,18 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 }
 #endif
 
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+#if !SIMDUTF_REGULAR_VISUAL_STUDIO
+  return __builtin_ctzll(input_num);
+#else // SIMDUTF_REGULAR_VISUAL_STUDIO
+  unsigned long bit_index; // filled in by the bit scan below
+  _BitScanForward64(&bit_index, input_num);
+  return static_cast<int>(bit_index);
+#endif // SIMDUTF_REGULAR_VISUAL_STUDIO
+}
+#endif // SIMDUTF_NEED_TRAILING_ZEROES
+
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
@@ -3397,6 +3458,10 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; // base64 entry points added by the simdutf 5.0.0 update (ppc64 kernel)
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept; // reports through `result` rather than a bare count
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept; // takes only a length, no input buffer
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept; // the one base64 method not tagged simdutf_warn_unused
 };
 
 } // namespace ppc64
@@ -3954,6 +4019,278 @@ template <typename T> struct simd8x64 {
 
 #endif // SIMDUTF_PPC64_H
 /* end file src/simdutf/ppc64.h */
+/* begin file src/simdutf/rvv.h */
+#ifndef SIMDUTF_RVV_H
+#define SIMDUTF_RVV_H
+
+#ifdef SIMDUTF_FALLBACK_H
+#error "rvv.h must be included before fallback.h"
+#endif
+
+
+#define SIMDUTF_CAN_ALWAYS_RUN_RVV SIMDUTF_IS_RVV // alias: "always runnable" is exactly SIMDUTF_IS_RVV
+
+#ifndef SIMDUTF_IMPLEMENTATION_RVV // an earlier definition (e.g. from the build) wins
+#define SIMDUTF_IMPLEMENTATION_RVV (SIMDUTF_CAN_ALWAYS_RUN_RVV || (SIMDUTF_IS_RISCV64 && SIMDUTF_HAS_RVV_INTRINSICS && SIMDUTF_HAS_RVV_TARGET_REGION))
+#endif
+
+#if SIMDUTF_IMPLEMENTATION_RVV // stays open until the matching #endif near the end of rvv.h
+
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+#define SIMDUTF_TARGET_RVV // expands to nothing: no per-region target switch in this configuration
+#else
+#define SIMDUTF_TARGET_RVV SIMDUTF_TARGET_REGION("arch=+v") // otherwise the rvv code is wrapped in an "arch=+v" target region
+#endif
+#if !SIMDUTF_IS_ZVBB && SIMDUTF_HAS_ZVBB_INTRINSICS // Zvbb region only defined when SIMDUTF_IS_ZVBB is off but the intrinsics exist
+#define SIMDUTF_TARGET_ZVBB SIMDUTF_TARGET_REGION("arch=+v,+zvbb")
+#endif
+
+namespace simdutf {
+namespace rvv {
+} // namespace rvv
+} // namespace simdutf
+
+/* begin file src/simdutf/rvv/implementation.h */
+#ifndef SIMDUTF_RVV_IMPLEMENTATION_H
+#define SIMDUTF_RVV_IMPLEMENTATION_H
+
+
+namespace simdutf {
+namespace rvv {
+
+namespace {
+using namespace simdutf;
+} // namespace
+
+class implementation final : public simdutf::implementation { // RISC-V Vector kernel, registered under the name "rvv"
+public:
+  simdutf_really_inline implementation()
+      : simdutf::implementation("rvv", "RISC-V Vector Extension",
+                                 internal::instruction_set::RVV)
+      , _supports_zvbb(internal::detect_supported_architectures() & internal::instruction_set::ZVBB) // Zvbb probed once, at construction
+  {}
+  simdutf_warn_unused int detect_encodings(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final; // validation family
+  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len, char *utf8_output) const noexcept final; // conversion family
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t *buf, size_t len, char16_t *output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t *buf, size_t len) const noexcept; // from here on the declarations carry no `final` specifier
+  simdutf_warn_unused size_t count_utf16be(const char16_t *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf8(const char *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf8(const char *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept;
+  simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept;
+  simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept;
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, size_t len) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; // base64 entry points, same signatures as the other kernels
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept; // not tagged simdutf_warn_unused
+private:
+  const bool _supports_zvbb; // value captured by the constructor; only read in the SIMDUTF_HAS_ZVBB_INTRINSICS branch below
+
+#if SIMDUTF_IS_ZVBB
+  bool supports_zvbb() const { return true; } // constant answer in this configuration, the cached flag is ignored
+#elif SIMDUTF_HAS_ZVBB_INTRINSICS
+  bool supports_zvbb() const { return _supports_zvbb; } // answer comes from the construction-time probe
+#else
+  bool supports_zvbb() const { return false; } // no Zvbb intrinsics available: always false
+#endif
+};
+
+} // namespace rvv
+} // namespace simdutf
+
+#endif // SIMDUTF_RVV_IMPLEMENTATION_H
+/* end file src/simdutf/rvv/implementation.h */
+/* begin file src/simdutf/rvv/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "rvv"
+// #define SIMDUTF_IMPLEMENTATION rvv
+
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// nothing needed.
+#else // open the rvv target region; rvv/end.h emits the matching SIMDUTF_UNTARGET_REGION
+SIMDUTF_TARGET_RVV
+#endif // SIMDUTF_CAN_ALWAYS_RUN_RVV
+/* end file src/simdutf/rvv/begin.h */
+/* begin file src/simdutf/rvv/intrinsics.h */
+#ifndef SIMDUTF_RVV_INTRINSICS_H
+#define SIMDUTF_RVV_INTRINSICS_H
+
+
+#include <riscv_vector.h>
+
+#if __riscv_v_intrinsic >= 1000000 ||  __GNUC__ >= 14 // vcreate variant; GCC's version macro is __GNUC__ (__GCC__ is never defined, so that test was always false)
+#define simdutf_vrgather_u8m1x2(tbl, idx) __riscv_vcreate_v_u8m1_u8m2( \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), __riscv_vsetvlmax_e8m1())) // expression macro: no trailing ';', matching the #else variant
+
+#define simdutf_vrgather_u8m1x4(tbl, idx) __riscv_vcreate_v_u8m1_u8m4( \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1), __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), __riscv_vsetvlmax_e8m1()), \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), __riscv_vsetvlmax_e8m1())) // expression macro: no trailing ';', matching the #else variant
+#else
+// This has worse codegen on gcc
+#define simdutf_vrgather_u8m1x2(tbl, idx) \
+        __riscv_vset_v_u8m1_u8m2(__riscv_vlmul_ext_v_u8m1_u8m2( \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())), 1, \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m2_u8m1(idx, 1), __riscv_vsetvlmax_e8m1()))
+
+#define simdutf_vrgather_u8m1x4(tbl, idx) \
+        __riscv_vset_v_u8m1_u8m4(__riscv_vset_v_u8m1_u8m4(\
+        __riscv_vset_v_u8m1_u8m4(__riscv_vlmul_ext_v_u8m1_u8m4( \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 0), __riscv_vsetvlmax_e8m1())), 1, \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 1), __riscv_vsetvlmax_e8m1())), 2, \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 2), __riscv_vsetvlmax_e8m1())), 3, \
+        __riscv_vrgather_vv_u8m1(tbl, __riscv_vget_v_u8m4_u8m1(idx, 3), __riscv_vsetvlmax_e8m1()))
+#endif // vcreate vs. vset/vlmul_ext construction of the gathered u8m2/u8m4 value
+
+/* Zvbb adds dedicated support for endianness swaps with vrev8, but if we can't
+ * use that, we have to emulate it with the standard V extension.
+ * Using LMUL=1 vrgathers could be faster than the srl+macc variant, but that
+ * would increase register pressure, and vrgather implementations performance
+ * varies a lot. */
+enum class simdutf_ByteFlip { NONE, V, ZVBB };
+
+template<simdutf_ByteFlip method>
+simdutf_really_inline static uint16_t simdutf_byteflip(uint16_t v) { // scalar overload: swaps the two bytes of v unless method is NONE
+  if (method == simdutf_ByteFlip::NONE)
+    return v;
+  return static_cast<uint16_t>((v*1u) << 8 | (v*1u) >> 8); // v*1u widens to unsigned before the shifts
+}
+
+#ifdef SIMDUTF_TARGET_ZVBB // only defined when Zvbb needs its own target region
+SIMDUTF_UNTARGET_REGION
+SIMDUTF_TARGET_ZVBB
+#endif // SIMDUTF_TARGET_ZVBB: the vector byteflip overloads below are compiled inside the Zvbb region
+
+template<simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m1_t simdutf_byteflip(vuint16m1_t v, size_t vl) { // vuint16m1_t overload; vl is passed straight to the intrinsics
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m1(v, vl); // dedicated Zvbb byte reverse
+#endif
+  if (method == simdutf_ByteFlip::V)
+    return __riscv_vmacc_vx_u16m1(__riscv_vsrl_vx_u16m1(v, 8, vl), 0x100, v, vl); // srl+macc emulation: (v >> 8) + 0x100 * v per 16-bit element
+  return v; // NONE, or ZVBB requested without SIMDUTF_HAS_ZVBB_INTRINSICS: returned unchanged
+}
+
+template<simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m2_t simdutf_byteflip(vuint16m2_t v, size_t vl) { // vuint16m2_t overload; vl is passed straight to the intrinsics
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m2(v, vl); // dedicated Zvbb byte reverse
+#endif
+  if (method == simdutf_ByteFlip::V)
+    return __riscv_vmacc_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 8, vl), 0x100, v, vl); // srl+macc emulation: (v >> 8) + 0x100 * v per 16-bit element
+  return v; // NONE, or ZVBB requested without SIMDUTF_HAS_ZVBB_INTRINSICS: returned unchanged
+}
+
+template<simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m4_t simdutf_byteflip(vuint16m4_t v, size_t vl) { // vuint16m4_t overload; vl is passed straight to the intrinsics
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m4(v, vl); // dedicated Zvbb byte reverse
+#endif
+  if (method == simdutf_ByteFlip::V)
+    return __riscv_vmacc_vx_u16m4(__riscv_vsrl_vx_u16m4(v, 8, vl), 0x100, v, vl); // srl+macc emulation: (v >> 8) + 0x100 * v per 16-bit element
+  return v; // NONE, or ZVBB requested without SIMDUTF_HAS_ZVBB_INTRINSICS: returned unchanged
+}
+
+template<simdutf_ByteFlip method>
+simdutf_really_inline static vuint16m8_t simdutf_byteflip(vuint16m8_t v, size_t vl) { // vuint16m8_t overload; vl is passed straight to the intrinsics
+#if SIMDUTF_HAS_ZVBB_INTRINSICS
+  if (method == simdutf_ByteFlip::ZVBB)
+    return __riscv_vrev8_v_u16m8(v, vl); // dedicated Zvbb byte reverse
+#endif
+  if (method == simdutf_ByteFlip::V)
+    return __riscv_vmacc_vx_u16m8(__riscv_vsrl_vx_u16m8(v, 8, vl), 0x100, v, vl); // srl+macc emulation: (v >> 8) + 0x100 * v per 16-bit element
+  return v; // NONE, or ZVBB requested without SIMDUTF_HAS_ZVBB_INTRINSICS: returned unchanged
+}
+
+#ifdef SIMDUTF_TARGET_ZVBB // leave the Zvbb region again and return to the plain rvv target
+SIMDUTF_UNTARGET_REGION
+SIMDUTF_TARGET_RVV
+#endif // SIMDUTF_TARGET_ZVBB
+
+#endif //  SIMDUTF_RVV_INTRINSICS_H
+/* end file src/simdutf/rvv/intrinsics.h */
+/* begin file src/simdutf/rvv/end.h */
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// nothing needed.
+#else // close the target region that rvv/begin.h opened with SIMDUTF_TARGET_RVV
+SIMDUTF_UNTARGET_REGION
+#endif // SIMDUTF_CAN_ALWAYS_RUN_RVV
+
+/* end file src/simdutf/rvv/end.h */
+
+#endif // SIMDUTF_IMPLEMENTATION_RVV
+
+#endif // SIMDUTF_RVV_H
+/* end file src/simdutf/rvv.h */
 /* begin file src/simdutf/fallback.h */
 #ifndef SIMDUTF_FALLBACK_H
 #define SIMDUTF_FALLBACK_H
@@ -3963,7 +4300,7 @@ template <typename T> struct simd8x64 {
 
 // Default Fallback to on unless a builtin implementation has already been selected.
 #ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
-#if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || SIMDUTF_CAN_ALWAYS_RUN_PPC64
+#if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV
 #define SIMDUTF_IMPLEMENTATION_FALLBACK 0
 #else
 #define SIMDUTF_IMPLEMENTATION_FALLBACK 1
@@ -4075,8 +4412,12 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
   simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;};
-
+  simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept; // base64 entry points added by the simdutf 5.0.0 update (fallback kernel)
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept; // reports through `result` rather than a bare count
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept; // takes only a length, no input buffer
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept; // the one base64 method not tagged simdutf_warn_unused
+};
 } // namespace fallback
 } // namespace simdutf
 
@@ -4121,7 +4462,7 @@ namespace simdutf {
 namespace scalar {
 namespace {
 namespace utf8 {
-#if SIMDUTF_IMPLEMENTATION_FALLBACK
+#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV
 // only used by the fallback kernel.
 // credit: based on code from Google Fuchsia (Apache Licensed)
 inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
@@ -4486,6 +4827,13 @@ simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char
 }
 
 namespace internal {
+// When exactly one implementation is compiled in, skip runtime dispatch and
+// bind to it directly (compile-time check). The rvv kernel is not counted and
+// has no case in get_single_implementation(): any build with rvv must dispatch.
+#define SIMDUTF_SINGLE_IMPLEMENTATION (SIMDUTF_IMPLEMENTATION_ICELAKE \
+              + SIMDUTF_IMPLEMENTATION_HASWELL + SIMDUTF_IMPLEMENTATION_WESTMERE \
+              + SIMDUTF_IMPLEMENTATION_ARM64 + SIMDUTF_IMPLEMENTATION_PPC64 \
+              + SIMDUTF_IMPLEMENTATION_FALLBACK == 1 && !SIMDUTF_IMPLEMENTATION_RVV)
 
 // Static array of known implementations. We're hoping these get baked into the executable
 // without requiring a static initializer.
@@ -4521,6 +4869,12 @@ static const ppc64::implementation* get_ppc64_singleton() {
   return &ppc64_singleton;
 }
 #endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+static const rvv::implementation* get_rvv_singleton() { // same accessor shape as the other get_*_singleton() helpers
+  static const rvv::implementation rvv_singleton{}; // function-local static: the rvv constructor (which probes for Zvbb) runs on first call
+  return &rvv_singleton;
+}
+#endif // SIMDUTF_IMPLEMENTATION_RVV
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
 static const fallback::implementation* get_fallback_singleton() {
   static const fallback::implementation fallback_singleton{};
@@ -4528,6 +4882,30 @@ static const fallback::implementation* get_fallback_singleton() {
 }
 #endif
 
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+static const implementation* get_single_implementation() { // returns the only compiled-in kernel, no CPU detection involved
+    return // operand and ';' come from whichever single #if branch below is active
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+    get_icelake_singleton();
+#endif
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+    get_haswell_singleton();
+#endif
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+    get_westmere_singleton();
+#endif
+#if SIMDUTF_IMPLEMENTATION_ARM64
+    get_arm64_singleton();
+#endif
+#if SIMDUTF_IMPLEMENTATION_PPC64
+    get_ppc64_singleton();
+#endif
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+    get_fallback_singleton();
+#endif // no rvv case: SIMDUTF_SINGLE_IMPLEMENTATION does not count the rvv kernel
+}
+#endif // SIMDUTF_SINGLE_IMPLEMENTATION
+
 /**
  * @private Detects best supported implementation on first use, and sets it
  */
@@ -4837,6 +5215,22 @@ class detect_best_supported_implementation_on_first_use final : public implement
     return set_best()->utf32_length_from_utf8(buf, len);
   }
 
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept override { // detector stub: forwards to the implementation returned by set_best()
+    return set_best()->maximal_binary_length_from_base64(input, length);
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept override { // detector stub: forwards to the implementation returned by set_best()
+    return set_best()->base64_to_binary(input, length, output);
+  }
+
+  simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept override { // detector stub: forwards to the implementation returned by set_best()
+    return set_best()->base64_length_from_binary(length);
+  }
+
+  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept override { // detector stub: forwards to the implementation returned by set_best()
+    return set_best()->binary_to_base64(input, length, output);
+  }
+
   simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
 
 private:
@@ -4860,6 +5254,9 @@ static const std::initializer_list<const implementation *>& get_available_implem
 #if SIMDUTF_IMPLEMENTATION_PPC64
     get_ppc64_singleton(),
 #endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+    get_rvv_singleton(),
+#endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
     get_fallback_singleton(),
 #endif
@@ -5152,7 +5549,7 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-    simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override {
+  simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override {
     return 0;
   }
 
@@ -5174,6 +5571,22 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char *, size_t) const noexcept override { // unsupported-CPU stub: arguments ignored
+    return 0;
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char *, size_t, char*) const noexcept override { // unsupported-CPU stub: arguments ignored, nothing is written
+    return result(error_code::OTHER, 0); // always reports error_code::OTHER
+  }
+
+  simdutf_warn_unused size_t base64_length_from_binary(size_t) const noexcept override { // unsupported-CPU stub: argument ignored
+    return 0;
+  }
+
+  size_t binary_to_base64(const char *, size_t, char*) const noexcept override { // unsupported-CPU stub: arguments ignored, nothing is written
+    return 0;
+  }
+
   unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
 };
 
@@ -5232,22 +5645,40 @@ SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_avail
   * The active implementation.
   */
 SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+    // Only one kernel is compiled in: seed the pointer with it directly and skip runtime detection.
+    static internal::atomic_ptr<const implementation> active_implementation{internal::get_single_implementation()};
+    return active_implementation;
+#else // several kernels: start from the detector object, which resolves the best one on first use
     static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
     static internal::atomic_ptr<const implementation> active_implementation{&detect_best_supported_implementation_on_first_use_singleton};
     return active_implementation;
+#endif // SIMDUTF_SINGLE_IMPLEMENTATION
 }
 
+
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+const implementation * get_default_implementation() { // single kernel: hands back the singleton itself, bypassing the atomic_ptr
+  return internal::get_single_implementation(); // NOTE(review): a pointer stored into get_active_implementation() is not consulted here - confirm this is intended
+}
+#else
+internal::atomic_ptr<const implementation>& get_default_implementation() { // several kernels: same object as get_active_implementation()
+  return get_active_implementation();
+}
+#endif // SIMDUTF_SINGLE_IMPLEMENTATION
+#define SIMDUTF_GET_CURRENT_IMPLEMENTION
+
 simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf8(buf, len);
+  return get_default_implementation()->validate_utf8(buf, len);
 }
 simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf8_with_errors(buf, len);
+  return get_default_implementation()->validate_utf8_with_errors(buf, len);
 }
 simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_ascii(buf, len);
+  return get_default_implementation()->validate_ascii(buf, len);
 }
 simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_ascii_with_errors(buf, len);
+  return get_default_implementation()->validate_ascii_with_errors(buf, len);
 }
 simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5257,31 +5688,31 @@ simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t leng
   #endif
 }
 simdutf_warn_unused size_t convert_latin1_to_utf8(const char * buf, size_t len, char* utf8_output) noexcept {
-  return get_active_implementation()->convert_latin1_to_utf8(buf, len,utf8_output);
+  return get_default_implementation()->convert_latin1_to_utf8(buf, len,utf8_output);
 }
 simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_latin1_to_utf16le(buf, len, utf16_output);
+  return get_default_implementation()->convert_latin1_to_utf16le(buf, len, utf16_output);
 }
 simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) noexcept{
-  return get_active_implementation()->convert_latin1_to_utf16be(buf, len, utf16_output);
+  return get_default_implementation()->convert_latin1_to_utf16be(buf, len, utf16_output);
 }
 simdutf_warn_unused size_t convert_latin1_to_utf32(const char * buf, size_t len, char32_t * latin1_output) noexcept {
-  return get_active_implementation()->convert_latin1_to_utf32(buf, len,latin1_output);
+  return get_default_implementation()->convert_latin1_to_utf32(buf, len,latin1_output);
 }
 simdutf_warn_unused size_t convert_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_latin1(buf, len,latin1_output);
+  return get_default_implementation()->convert_utf8_to_latin1(buf, len,latin1_output);
 }
 simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
+  return get_default_implementation()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
 }
 simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * buf, size_t len, char* latin1_output) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_latin1(buf, len,latin1_output);
+  return get_default_implementation()->convert_valid_utf8_to_latin1(buf, len,latin1_output);
 }
 simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
+  return get_default_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
 }
 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
+  return get_default_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
 }
 simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5291,16 +5722,16 @@ simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input,
   #endif
 }
 simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+  return get_default_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
 }
 simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+  return get_default_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
 }
 simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
+  return get_default_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
 }
 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
+  return get_default_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
 }
 simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5310,10 +5741,10 @@ simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexce
   #endif
 }
 simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16le(buf, len);
+  return get_default_implementation()->validate_utf16le(buf, len);
 }
 simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16be(buf, len);
+  return get_default_implementation()->validate_utf16be(buf, len);
 }
 simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5323,16 +5754,16 @@ simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size
   #endif
 }
 simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16le_with_errors(buf, len);
+  return get_default_implementation()->validate_utf16le_with_errors(buf, len);
 }
 simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16be_with_errors(buf, len);
+  return get_default_implementation()->validate_utf16be_with_errors(buf, len);
 }
 simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf32(buf, len);
+  return get_default_implementation()->validate_utf32(buf, len);
 }
 simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf32_with_errors(buf, len);
+  return get_default_implementation()->validate_utf32_with_errors(buf, len);
 }
 simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5342,13 +5773,13 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_
   #endif
 }
 simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
+  return get_default_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
+  return get_default_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
+  return get_default_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
 }
 simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5372,28 +5803,28 @@ simdutf_warn_unused size_t convert_latin1_to_utf16(const char * buf, size_t len,
   #endif
 }
 simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer);
+  return get_default_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer);
 }
 simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer);
+  return get_default_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16be_to_latin1(buf, len, latin1_buffer);
+  return get_default_implementation()->convert_valid_utf16be_to_latin1(buf, len, latin1_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16le_to_latin1(buf, len, latin1_buffer);
+  return get_default_implementation()->convert_valid_utf16le_to_latin1(buf, len, latin1_buffer);
 }
 simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
+  return get_default_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
 }
 simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * buf, size_t len, char* latin1_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
+  return get_default_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
 }
 simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
 }
 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
 }
 simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5410,10 +5841,10 @@ simdutf_warn_unused result convert_utf16_to_latin1_with_errors(const char16_t *
   #endif
 }
 simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
 }
 simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5430,19 +5861,19 @@ simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * buf, s
   #endif
 }
 simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
 }
 simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
 }
 simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
+  return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
 }
 simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5452,13 +5883,13 @@ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t l
   #endif
 }
 simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_output) noexcept {
-  return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
+  return get_default_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
 }
 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
+  return get_default_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
 }
 simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
+  return get_default_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
 }
 simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5468,10 +5899,10 @@ simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * b
   #endif
 }
 simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+  return get_default_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
 }
 simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+  return get_default_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5481,10 +5912,10 @@ simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, si
   #endif
 }
 simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+  return get_default_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+  return get_default_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
 }
 simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5494,10 +5925,10 @@ simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t l
   #endif
 }
 simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
+  return get_default_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
 }
 simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
+  return get_default_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
 }
 simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5507,10 +5938,10 @@ simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * b
   #endif
 }
 simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+  return get_default_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
 }
 simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+  return get_default_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5520,13 +5951,13 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, si
   #endif
 }
 simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+  return get_default_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
 }
 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+  return get_default_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
 }
 void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
-  get_active_implementation()->change_endianness_utf16(input, length, output);
+  get_default_implementation()->change_endianness_utf16(input, length, output);
 }
 simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5536,25 +5967,25 @@ simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) no
   #endif
 }
 simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf16le(input, length);
+  return get_default_implementation()->count_utf16le(input, length);
 }
 simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf16be(input, length);
+  return get_default_implementation()->count_utf16be(input, length);
 }
 simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf8(input, length);
+  return get_default_implementation()->count_utf8(input, length);
 }
 simdutf_warn_unused size_t latin1_length_from_utf8(const char * buf, size_t len) noexcept {
-  return get_active_implementation()->latin1_length_from_utf8(buf, len);
+  return get_default_implementation()->latin1_length_from_utf8(buf, len);
 }
 simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept {
-  return get_active_implementation()->latin1_length_from_utf16(len);
+  return get_default_implementation()->latin1_length_from_utf16(len);
 }
 simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept {
-  return get_active_implementation()->latin1_length_from_utf32(len);
+  return get_default_implementation()->latin1_length_from_utf32(len);
 }
 simdutf_warn_unused size_t utf8_length_from_latin1(const char * buf, size_t len) noexcept {
-  return get_active_implementation()->utf8_length_from_latin1(buf, len);
+  return get_default_implementation()->utf8_length_from_latin1(buf, len);
 }
 simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5564,10 +5995,10 @@ simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t
   #endif
 }
 simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf16le(input, length);
+  return get_default_implementation()->utf8_length_from_utf16le(input, length);
 }
 simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf16be(input, length);
+  return get_default_implementation()->utf8_length_from_utf16be(input, length);
 }
 simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
   #if SIMDUTF_IS_BIG_ENDIAN
@@ -5577,31 +6008,48 @@ simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_
   #endif
 }
 simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf16le(input, length);
+  return get_default_implementation()->utf32_length_from_utf16le(input, length);
 }
 simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf16be(input, length);
+  return get_default_implementation()->utf32_length_from_utf16be(input, length);
 }
 simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->utf16_length_from_utf8(input, length);
+  return get_default_implementation()->utf16_length_from_utf8(input, length);
 }
 simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept {
-  return get_active_implementation()->utf16_length_from_latin1(length);
+  return get_default_implementation()->utf16_length_from_latin1(length);
 }
 simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf32(input, length);
+  return get_default_implementation()->utf8_length_from_utf32(input, length);
 }
 simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf16_length_from_utf32(input, length);
+  return get_default_implementation()->utf16_length_from_utf32(input, length);
 }
 simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf8(input, length);
+  return get_default_implementation()->utf32_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept {
+  return get_default_implementation()->maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output);
 }
+
+simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
+  return get_default_implementation()->base64_length_from_binary(length);
+}
+
+size_t binary_to_base64(const char * input, size_t length, char* output) noexcept {
+  return get_default_implementation()->binary_to_base64(input, length, output);
+}
+
 simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
-  return get_active_implementation()->autodetect_encoding(buf, length);
+  return get_default_implementation()->autodetect_encoding(buf, length);
 }
 simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept {
-  return get_active_implementation()->detect_encodings(buf, length);
+  return get_default_implementation()->detect_encodings(buf, length);
 }
 const implementation * builtin_implementation() {
   static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
@@ -5667,7 +6115,7 @@ encoding_type check_bom(const uint8_t* byte, size_t length) {
             return encoding_type::UTF16_BE;
         } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
             return encoding_type::UTF32_BE;
-        } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
+        } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[2] == 0xbf) {
             return encoding_type::UTF8;
         }
         return encoding_type::unspecified;
@@ -5703,6 +6151,420 @@ namespace simdutf {
 /* end file src/error.cpp */
 // The large tables should be included once and they
 // should not depend on a kernel.
+/* begin file src/tables/base64_tables.h */
+#ifndef SIMDUTF_BASE64_TABLES_H
+#define SIMDUTF_BASE64_TABLES_H
+#include <array>
+#include <cstdint>
+
+namespace simdutf {
+namespace {
+namespace tables {
+namespace base64 {
+
+const char e0[256] = { // e0[b] is the standard base64 character for the top six bits of byte b (b >> 2), so every character appears four times in a row
+    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
+    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
+    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
+    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
+    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
+    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
+    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
+    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
+    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
+    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
+    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
+    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
+    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
+    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
+    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+    '8', '8', '8', '8', '9', '9', '9', '9', '+', '+', '+', '+', '/', '/', '/',
+    '/'};
+
+const char e1[256] = { // e1[i] is the standard base64 character for i & 0x3f: the 64-character alphabet repeated four times
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
+    '/'};
+
+const char e2[256] = { // same contents as e1: e2[i] is the standard base64 character for i & 0x3f
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
+    '/'};
+
+const int8_t decoding_table[256] = { // byte -> 6-bit value; both '+' and '-' give 62, both '/' and '_' give 63; -2 marks LF, CR and space; -1 marks every other byte
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, -2, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, 62, -1, 62, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+    61, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
+    11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1,
+    63, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+    43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1};
+
+/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN CPUS */
+
+const uint32_t d0[256] = { // d0[c]: bits contributed by the 1st character of a 4-character group (6-bit value << 2); 0x01ffffff marks a byte outside the standard alphabet
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
+    0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
+    0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
+    0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
+    0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
+    0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
+    0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
+    0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
+    0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
+    0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
+    0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+
+const uint32_t d1[256] = { // d1[c]: bits contributed by the 2nd character of a group, split across the two low bytes ((v >> 4) | ((v & 0xf) << 12)); 0x01ffffff marks an invalid byte
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
+    0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
+    0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
+    0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
+    0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
+    0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
+    0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
+    0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
+    0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
+    0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
+    0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+
+const uint32_t d2[256] = { // d2[c]: bits contributed by the 3rd character of a group (((v >> 2) << 8) | ((v & 3) << 22)); 0x01ffffff marks an invalid byte
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
+    0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
+    0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
+    0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
+    0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
+    0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
+    0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
+    0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
+    0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
+    0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
+    0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+
+const uint32_t d3[256] = { // d3[c]: bits contributed by the 4th character of a group (6-bit value << 16); 0x01ffffff marks an invalid byte
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
+    0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
+    0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
+    0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
+    0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
+    0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
+    0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
+    0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
+    0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
+    0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
+    0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint64_t thintable_epi8[256] = { // entry m packs, from the low byte upward, the positions 0..7 whose bit in the 8-bit mask m is clear
+    0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
+    0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
+    0x0000070605040300, 0x0000000706050403, 0x0007060504020100,
+    0x0000070605040201, 0x0000070605040200, 0x0000000706050402,
+    0x0000070605040100, 0x0000000706050401, 0x0000000706050400,
+    0x0000000007060504, 0x0007060503020100, 0x0000070605030201,
+    0x0000070605030200, 0x0000000706050302, 0x0000070605030100,
+    0x0000000706050301, 0x0000000706050300, 0x0000000007060503,
+    0x0000070605020100, 0x0000000706050201, 0x0000000706050200,
+    0x0000000007060502, 0x0000000706050100, 0x0000000007060501,
+    0x0000000007060500, 0x0000000000070605, 0x0007060403020100,
+    0x0000070604030201, 0x0000070604030200, 0x0000000706040302,
+    0x0000070604030100, 0x0000000706040301, 0x0000000706040300,
+    0x0000000007060403, 0x0000070604020100, 0x0000000706040201,
+    0x0000000706040200, 0x0000000007060402, 0x0000000706040100,
+    0x0000000007060401, 0x0000000007060400, 0x0000000000070604,
+    0x0000070603020100, 0x0000000706030201, 0x0000000706030200,
+    0x0000000007060302, 0x0000000706030100, 0x0000000007060301,
+    0x0000000007060300, 0x0000000000070603, 0x0000000706020100,
+    0x0000000007060201, 0x0000000007060200, 0x0000000000070602,
+    0x0000000007060100, 0x0000000000070601, 0x0000000000070600,
+    0x0000000000000706, 0x0007050403020100, 0x0000070504030201,
+    0x0000070504030200, 0x0000000705040302, 0x0000070504030100,
+    0x0000000705040301, 0x0000000705040300, 0x0000000007050403,
+    0x0000070504020100, 0x0000000705040201, 0x0000000705040200,
+    0x0000000007050402, 0x0000000705040100, 0x0000000007050401,
+    0x0000000007050400, 0x0000000000070504, 0x0000070503020100,
+    0x0000000705030201, 0x0000000705030200, 0x0000000007050302,
+    0x0000000705030100, 0x0000000007050301, 0x0000000007050300,
+    0x0000000000070503, 0x0000000705020100, 0x0000000007050201,
+    0x0000000007050200, 0x0000000000070502, 0x0000000007050100,
+    0x0000000000070501, 0x0000000000070500, 0x0000000000000705,
+    0x0000070403020100, 0x0000000704030201, 0x0000000704030200,
+    0x0000000007040302, 0x0000000704030100, 0x0000000007040301,
+    0x0000000007040300, 0x0000000000070403, 0x0000000704020100,
+    0x0000000007040201, 0x0000000007040200, 0x0000000000070402,
+    0x0000000007040100, 0x0000000000070401, 0x0000000000070400,
+    0x0000000000000704, 0x0000000703020100, 0x0000000007030201,
+    0x0000000007030200, 0x0000000000070302, 0x0000000007030100,
+    0x0000000000070301, 0x0000000000070300, 0x0000000000000703,
+    0x0000000007020100, 0x0000000000070201, 0x0000000000070200,
+    0x0000000000000702, 0x0000000000070100, 0x0000000000000701,
+    0x0000000000000700, 0x0000000000000007, 0x0006050403020100,
+    0x0000060504030201, 0x0000060504030200, 0x0000000605040302,
+    0x0000060504030100, 0x0000000605040301, 0x0000000605040300,
+    0x0000000006050403, 0x0000060504020100, 0x0000000605040201,
+    0x0000000605040200, 0x0000000006050402, 0x0000000605040100,
+    0x0000000006050401, 0x0000000006050400, 0x0000000000060504,
+    0x0000060503020100, 0x0000000605030201, 0x0000000605030200,
+    0x0000000006050302, 0x0000000605030100, 0x0000000006050301,
+    0x0000000006050300, 0x0000000000060503, 0x0000000605020100,
+    0x0000000006050201, 0x0000000006050200, 0x0000000000060502,
+    0x0000000006050100, 0x0000000000060501, 0x0000000000060500,
+    0x0000000000000605, 0x0000060403020100, 0x0000000604030201,
+    0x0000000604030200, 0x0000000006040302, 0x0000000604030100,
+    0x0000000006040301, 0x0000000006040300, 0x0000000000060403,
+    0x0000000604020100, 0x0000000006040201, 0x0000000006040200,
+    0x0000000000060402, 0x0000000006040100, 0x0000000000060401,
+    0x0000000000060400, 0x0000000000000604, 0x0000000603020100,
+    0x0000000006030201, 0x0000000006030200, 0x0000000000060302,
+    0x0000000006030100, 0x0000000000060301, 0x0000000000060300,
+    0x0000000000000603, 0x0000000006020100, 0x0000000000060201,
+    0x0000000000060200, 0x0000000000000602, 0x0000000000060100,
+    0x0000000000000601, 0x0000000000000600, 0x0000000000000006,
+    0x0000050403020100, 0x0000000504030201, 0x0000000504030200,
+    0x0000000005040302, 0x0000000504030100, 0x0000000005040301,
+    0x0000000005040300, 0x0000000000050403, 0x0000000504020100,
+    0x0000000005040201, 0x0000000005040200, 0x0000000000050402,
+    0x0000000005040100, 0x0000000000050401, 0x0000000000050400,
+    0x0000000000000504, 0x0000000503020100, 0x0000000005030201,
+    0x0000000005030200, 0x0000000000050302, 0x0000000005030100,
+    0x0000000000050301, 0x0000000000050300, 0x0000000000000503,
+    0x0000000005020100, 0x0000000000050201, 0x0000000000050200,
+    0x0000000000000502, 0x0000000000050100, 0x0000000000000501,
+    0x0000000000000500, 0x0000000000000005, 0x0000000403020100,
+    0x0000000004030201, 0x0000000004030200, 0x0000000000040302,
+    0x0000000004030100, 0x0000000000040301, 0x0000000000040300,
+    0x0000000000000403, 0x0000000004020100, 0x0000000000040201,
+    0x0000000000040200, 0x0000000000000402, 0x0000000000040100,
+    0x0000000000000401, 0x0000000000000400, 0x0000000000000004,
+    0x0000000003020100, 0x0000000000030201, 0x0000000000030200,
+    0x0000000000000302, 0x0000000000030100, 0x0000000000000301,
+    0x0000000000000300, 0x0000000000000003, 0x0000000000020100,
+    0x0000000000000201, 0x0000000000000200, 0x0000000000000002,
+    0x0000000000000100, 0x0000000000000001, 0x0000000000000000,
+    0x0000000000000000,
+};
+
+const uint8_t pshufb_combine_table[272] = { // 16-byte rows: row k keeps indices 0..7-k then 8..15 and pads with 0xff; only rows 0..8 (144 bytes) are listed, the remaining bytes are zero-initialized
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
+    0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08,
+    0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0x00, 0x01, 0x02, 0x03,
+    0x04, 0x05, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
+    0x0f, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
+    0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x08,
+    0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff,
+    0x00, 0x01, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff,
+    0xff, 0xff, 0xff, 0xff, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
+    0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x08, 0x09, 0x0a, 0x0b,
+    0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+};
+
+const unsigned char BitsSetTable256mul2[256] = { // BitsSetTable256mul2[i] is twice the number of set bits in i (0 for 0, 16 for 255)
+    0,  2,  2,  4,  2,  4,  4,  6,  2,  4,  4,  6,  4,  6,  6,  8,  2,  4,  4,
+    6,  4,  6,  6,  8,  4,  6,  6,  8,  6,  8,  8,  10, 2,  4,  4,  6,  4,  6,
+    6,  8,  4,  6,  6,  8,  6,  8,  8,  10, 4,  6,  6,  8,  6,  8,  8,  10, 6,
+    8,  8,  10, 8,  10, 10, 12, 2,  4,  4,  6,  4,  6,  6,  8,  4,  6,  6,  8,
+    6,  8,  8,  10, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,  10, 8,  10, 10,
+    12, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,  10, 8,  10, 10, 12, 6,  8,
+    8,  10, 8,  10, 10, 12, 8,  10, 10, 12, 10, 12, 12, 14, 2,  4,  4,  6,  4,
+    6,  6,  8,  4,  6,  6,  8,  6,  8,  8,  10, 4,  6,  6,  8,  6,  8,  8,  10,
+    6,  8,  8,  10, 8,  10, 10, 12, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,
+    10, 8,  10, 10, 12, 6,  8,  8,  10, 8,  10, 10, 12, 8,  10, 10, 12, 10, 12,
+    12, 14, 4,  6,  6,  8,  6,  8,  8,  10, 6,  8,  8,  10, 8,  10, 10, 12, 6,
+    8,  8,  10, 8,  10, 10, 12, 8,  10, 10, 12, 10, 12, 12, 14, 6,  8,  8,  10,
+    8,  10, 10, 12, 8,  10, 10, 12, 10, 12, 12, 14, 8,  10, 10, 12, 10, 12, 12,
+    14, 10, 12, 12, 14, 12, 14, 14, 16};
+
+const uint8_t to_base64_value[] = { // 256 entries: byte -> 6-bit value for the standard alphabet; 64 marks tab, LF, CR and space; 255 marks every other byte
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62,  255,
+    255, 255, 63,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+} // namespace base64
+} // namespace tables
+} // unnamed namespace
+} // namespace simdutf
+
+#endif // SIMDUTF_BASE64_TABLES_H
+/* end file src/tables/base64_tables.h */
 /* begin file src/tables/utf8_to_utf16_tables.h */
 #ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
 #define SIMDUTF_UTF8_TO_UTF16_TABLES_H
@@ -10707,7 +11569,12 @@ inline size_t latin1_length_from_utf32(size_t len) {
   return len; // a utf32 codepoint will always represent 1 latin1 character
 }
 
-
+inline simdutf_warn_unused uint32_t swap_bytes(const uint32_t word) {
+  return ((word << 24) & 0xff000000) | // lowest byte becomes the highest
+         ((word << 8) & 0xff0000) |    // second-lowest byte moves up one position
+         ((word >> 8) & 0xff00) |      // second-highest byte moves down one position
+         ((word >> 24) & 0xff);        // highest byte becomes the lowest
+}
 
 } // utf32 namespace
 } // unnamed namespace
@@ -10750,84 +11617,261 @@ inline size_t utf16_length_from_latin1(size_t len) {
 
 #endif
 /* end file src/scalar/latin1.h */
+/* begin file src/scalar/base64.h */
+#ifndef SIMDUTF_BASE64_H
+#define SIMDUTF_BASE64_H
 
-/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
-#define SIMDUTF_VALID_UTF32_TO_UTF8_H
-
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
 namespace simdutf {
 namespace scalar {
 namespace {
-namespace utf32_to_utf8 {
+namespace base64 {
 
-#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
-// only used by the fallback and POWER kernel
-inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
-	const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-				*utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
+// Scalar base64 decoder. Returns a result: for SUCCESS and BASE64_INPUT_REMAINDER, count is the number of bytes written to dst;
+// for INVALID_BASE64_CHARACTER, count is the offset within src of the offending character. The destination buffer must be large enough.
+// This function assumes that the padding (=) has been removed. Characters whose to_base64_value entry is 64 are skipped.
+result base64_tail_decode(char *dst, const char *src, size_t length) {
+  const char *srcend = src + length;
+  const char *srcinit = src; // kept to report the offset of an invalid character
+  const char *dstinit = dst; // kept to report the number of bytes written
+
+  uint32_t x;
+  size_t idx;
+  uint8_t buffer[4]; // up to four 6-bit values collected by the slow path
+  while (true) {
+    while (src + 4 <= srcend && // fast path: four characters are combined through the d0..d3 tables
+           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
+                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) { // NOTE(review): a larger value presumably flags a character the fast tables reject - confirm against the table definitions
+      if(match_system(endianness::BIG)) {
+        x = scalar::utf32::swap_bytes(x);
+      }
+      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+      dst += 3;
+      src += 4;
+    }
+    idx = 0;
+    // slow path: gather up to four 6-bit values one character at a time.
+    while (idx < 4 && src < srcend) {
+      char c = *src;
+      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      buffer[idx] = uint8_t(code);
+      if (code <= 63) { // a valid 6-bit value: keep it
+        idx++;
+      } else if (code > 64) { // neither a value nor a skippable character (code == 64 falls through and is skipped)
+        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       }
+      src++;
+    }
+    if (idx != 4) { // the input ended before a full group of four was gathered
+      if (idx == 2) { // two characters yield one output byte
+        uint32_t triple =
+            (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 1);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 1);
+        }
+        dst += 1;
+
+      } else if (idx == 3) { // three characters yield two output bytes
+        uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+                          (uint32_t(buffer[1]) << 2 * 6) +
+                          (uint32_t(buffer[2]) << 1 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 2);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 2);
+        }
+        dst += 2;
+      } else if (idx == 1) { // a single leftover character cannot encode a byte
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)};
+      }
+      return {SUCCESS, size_t(dst - dstinit)};
     }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
+
+    uint32_t triple = // a full group: four 6-bit values packed into 24 bits
+        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
+        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
+    if(match_system(endianness::BIG)) {
+      triple <<= 8;
+      std::memcpy(dst, &triple, 3);
     } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
+      triple = scalar::utf32::swap_bytes(triple);
+      triple >>= 8;
+      std::memcpy(dst, &triple, 3);
     }
+    dst += 3;
   }
-  return utf8_output - start;
 }
-#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 
-} // utf32_to_utf8 namespace
+// Scalar base64 encoder: encodes srclen bytes from src into dst. Returns the number of bytes written.
+// The destination buffer must be large enough. It will add padding (=) if needed.
+size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
+  char *out = dst;
+  size_t i = 0;
+  uint8_t t1, t2, t3;
+  for (; i + 2 < srclen; i += 3) { // each full group of three input bytes produces four output characters
+    t1 = (uint8_t)src[i];
+    t2 = (uint8_t)src[i + 1];
+    t3 = (uint8_t)src[i + 2];
+    *out++ = tables::base64::e0[t1];
+    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = tables::base64::e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
+    *out++ = tables::base64::e2[t3];
+  }
+  switch (srclen - i) { // 0, 1 or 2 input bytes remain
+  case 0:
+    break;
+  case 1: // one byte left: two characters followed by "=="
+    t1 = (uint8_t)src[i];
+    *out++ = tables::base64::e0[t1];
+    *out++ = tables::base64::e1[(t1 & 0x03) << 4];
+    *out++ = '=';
+    *out++ = '=';
+    break;
+  default: /* case 2: two bytes left, three characters followed by "=" */
+    t1 = (uint8_t)src[i];
+    t2 = (uint8_t)src[i + 1];
+    *out++ = tables::base64::e0[t1];
+    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = tables::base64::e2[(t2 & 0x0F) << 2];
+    *out++ = '=';
+  }
+  return (size_t)(out - dst);
+}
+
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept { // size estimate computed from length and up to two trailing '=' only
+  // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
+  size_t padding = 0; // number of trailing '=' characters (0, 1 or 2)
+  if(length > 0) {
+    if(input[length - 1] == '=') {
+      padding++;
+      if(length > 1 && input[length - 2] == '=') {
+        padding++;
+      }
+    }
+  }
+  size_t actual_length = length - padding;
+  if(actual_length % 4 == 0) { // only full groups: three bytes per four characters
+    return actual_length / 4 * 3;
+  }
+  // if we have a valid input, then the remainder must be 2 or 3 adding one or two extra bytes.
+  return  actual_length / 4 * 3 + (actual_length %4)  - 1; // a remainder of 1 adds nothing
+}
+
+simdutf_warn_unused simdutf_really_inline result base64_to_binary(const char * input, size_t length, char* output) noexcept { // strips up to two trailing '=' and then decodes with base64_tail_decode
+  if(length > 0 && input[length - 1] == '=') { // drop the first trailing '='
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') { // drop a second trailing '='
+      length -= 1;
+    }
+  }
+  if(length == 0) { // nothing left to decode
+    return {SUCCESS, 0};
+  }
+  return base64_tail_decode(output, input, length);
+}
+
+simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept { // four output characters per started group of three input bytes
+  return (length + 2)/3 * 4; // We use padding to make the length a multiple of 4.
+}
+
+simdutf_really_inline size_t binary_to_base64(const char * input, size_t length, char* output) noexcept { // forwards to tail_encode_base64 (note the different argument order)
+  return tail_encode_base64(output, input, length); // number of bytes written to output
+}
+} // namespace base64
 } // unnamed namespace
 } // namespace scalar
 } // namespace simdutf
 
 #endif
-/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
-#ifndef SIMDUTF_UTF32_TO_UTF8_H
-#define SIMDUTF_UTF32_TO_UTF8_H
+/* end file src/scalar/base64.h */
+
+/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
+#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
+#define SIMDUTF_VALID_UTF32_TO_UTF8_H
 
 namespace simdutf {
 namespace scalar {
 namespace {
 namespace utf32_to_utf8 {
 
-inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
+// only used by the fallback and POWER kernel; converts len UTF-32 words to UTF-8 without range checks and returns the number of bytes written
+inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
+	const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  char* start{utf8_output}; // kept to compute the number of bytes written
+  while (pos < len) {
+    // try to convert the next block of 2 ASCII characters
+    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF80FFFFFF80) == 0) { // both 32-bit words are below 0x80
+        *utf8_output++ = char(buf[pos]);
+				*utf8_output++ = char(buf[pos+1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t word = data[pos];
+    if((word & 0xFFFFFF80)==0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if((word & 0xFFFFF800)==0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if((word & 0xFFFF0000)==0) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>12) | 0b11100000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word>>18) | 0b11110000);
+      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos ++;
+    }
+  }
+  return utf8_output - start;
+}
+#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
+
+} // utf32_to_utf8 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
+/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
+#ifndef SIMDUTF_UTF32_TO_UTF8_H
+#define SIMDUTF_UTF32_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf8 {
+
+inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
   size_t pos = 0;
   char* start{utf8_output};
   while (pos < len) {
@@ -12156,7 +13200,7 @@ inline size_t convert(const char* buf, size_t len, char* latin_output) {
       // range check -
       uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
       if (code_point < 0x80 || 0xFF < code_point) {
-        return 0; // We only care about the range 129-255 which is Non-ASCII latin1 characters. A code_point beneath 0x80 is invalid as it's already covered by bytes whose leading bit is zero. 
+        return 0; // We only care about the range 129-255 which is Non-ASCII latin1 characters. A code_point beneath 0x80 is invalid as it's already covered by bytes whose leading bit is zero.
       }
       *latin_output++ = char(code_point);
       pos += 2;
@@ -12307,7 +13351,7 @@ inline size_t convert(const char16_t* buf, size_t len, char* latin_output) {
 
   // Only copy to latin_output if there were no errors
   std::memcpy(latin_output, temp_output.data(), len);
-  
+
   return current_write - temp_output.data();
 }
 
@@ -14535,8 +15579,9 @@ std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf,
   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
 
   uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 8 < end) {
+  while (buf + 16 + safety_margin < end) {
     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
     uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
 
@@ -14768,8 +15813,9 @@ std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* b
   const char32_t* end = buf + len;
 
   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+  const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 8 < end) {
+  while (buf + 16 + safety_margin < end) {
     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
     uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
 
@@ -15111,6 +16157,395 @@ std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32
   return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
 }
 /* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
+/* begin file src/arm64/arm_base64.cpp */
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+size_t encode_base64(char *dst, const char *src, size_t srclen) { // NEON base64 encoder; returns the number of bytes written to dst
+  // credit: Wojciech Muła
+  uint8_t *out = (uint8_t *)dst;
+  constexpr static uint8_t source_table[64] = { // base64 alphabet stored 4-way interleaved; the vld4q_u8 below de-interleaves it into 'A'..'/' order
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
+  };
+  const uint8x16_t v3f = vdupq_n_u8(0x3f); // mask keeping the low six bits of each byte
+  const uint8x16x4_t table = vld4q_u8(source_table);
+  size_t i = 0;
+  for (; i + 16 * 3 <= srclen; i += 16 * 3) { // each iteration turns 48 input bytes into 64 output characters
+    const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i); // de-interleaving load: val[k] holds the k-th byte of each 3-byte group
+    uint8x16x4_t result;
+    result.val[0] = vshrq_n_u8(in.val[0], 2);
+    result.val[1] =
+        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f);
+    result.val[2] =
+        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f);
+    result.val[3] = vandq_u8(in.val[2], v3f);
+    result.val[0] = vqtbl4q_u8(table, result.val[0]); // map each 6-bit index to its alphabet character
+    result.val[1] = vqtbl4q_u8(table, result.val[1]);
+    result.val[2] = vqtbl4q_u8(table, result.val[2]);
+    result.val[3] = vqtbl4q_u8(table, result.val[3]);
+    vst4q_u8(out, result); // interleaving store of the four character lanes
+    out += 64;
+  }
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i); // the scalar routine handles the remaining bytes and the padding
+
+  return size_t((char *)out - dst);
+}
+
+static inline void compress(uint8x16_t data, uint16_t mask, char *output) { // always stores 16 bytes at output; NOTE(review): set mask bits presumably mark bytes to drop (compress_block advances by the popcount of ~mask) - confirm
+  if (mask == 0) { // empty mask: store the 16 bytes unchanged
+    vst1q_u8((uint8_t *)output, data);
+    return;
+  }
+  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
+  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+  uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1], // one shuffle pattern per 8-byte half
+                               tables::base64::thintable_epi8[mask2]};
+  uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t off =
+      simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
+#else
+  const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
+#endif
+
+  compactmask = vaddq_u8(compactmask, off); // add 8 to the indices of the second half so they address bytes 8..15
+  uint8x16_t pruned = vqtbl1q_u8(data, compactmask);
+
+  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+  // then load the corresponding mask, what it does is to write
+  // only the first pop1 bytes from the first 8 bytes, and then
+  // it fills in with the bytes from the second 8 bytes + some filling
+  // at the end.
+  compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8);
+  uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
+  vst1q_u8((uint8_t *)output, answer);
+}
+
+struct block64 { // 64 bytes held as four 16-byte NEON vectors
+  uint8x16_t chunks[4];
+};
+static_assert(sizeof(block64) == 64, "block64 is not 64 bytes"); // load_block/copy_block move exactly 64 bytes
+uint64_t to_base64_mask(block64 *b, bool *error) { // classifies the 64 bytes of *b, rewrites them in place, sets *error, and returns a 64-bit mask with one bit per flagged byte
+  uint8x16_t v0f = vdupq_n_u8(0xf);
+
+  uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
+  uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
+  uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
+  uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f);
+  // Needed by the decoding step.
+  uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4);
+  uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
+  uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
+  uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t lut_lo =
+      simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                              0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4);
+#else
+  const uint8x16_t lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                             0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
+#endif
+  uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0); // classification bits selected by the low nibble
+  uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
+  uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
+  uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t lut_hi =
+      simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, 0x20,
+                              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+#else
+  const uint8x16_t lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+                             0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+#endif
+  uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0); // classification bits selected by the high nibble
+  uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
+  uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
+  uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);
+
+  uint8_t checks = // largest (lo & hi) value over all 64 bytes; zero means no byte was flagged
+      vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
+                         vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t bit_mask =
+      simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
+  uint64_t badcharmask = 0;
+  *error = checks > 0x3; // NOTE(review): values 1..3 presumably denote ignorable characters and larger values invalid ones - confirm against the LUT design
+  if (checks) {
+    // Add each of the elements next to each other, successively, to stuff each
+    // 8 byte mask into one.
+    uint8x16_t test0 = vtstq_u8(lo0, hi0); // all-ones in each byte where lo & hi is non-zero
+    uint8x16_t test1 = vtstq_u8(lo1, hi1);
+    uint8x16_t test2 = vtstq_u8(lo2, hi2);
+    uint8x16_t test3 = vtstq_u8(lo3, hi3);
+    uint8x16_t sum0 =
+        vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask));
+    uint8x16_t sum1 =
+        vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask));
+    sum0 = vpaddq_u8(sum0, sum1);
+    sum0 = vpaddq_u8(sum0, sum0);
+    badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+  }
+  // This is the transformation step that can be done while we are waiting for
+  // sum0
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t roll_lut =
+      simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0,
+                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+#else
+  const uint8x16_t roll_lut = {0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                               0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+#endif
+  uint8x16_t v2f = vdupq_n_u8(0x2f); // 0x2f is '/', which gets its own roll_lut slot: the compare yields 0xFF, lowering its index by one
+  uint8x16_t roll0 =
+      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], v2f), hi_nibbles0));
+  uint8x16_t roll1 =
+      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], v2f), hi_nibbles1));
+  uint8x16_t roll2 =
+      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], v2f), hi_nibbles2));
+  uint8x16_t roll3 =
+      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], v2f), hi_nibbles3));
+  b->chunks[0] = vaddq_u8(b->chunks[0], roll0); // add the per-byte offset (wrapping) to every input byte, in place
+  b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
+  b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
+  b->chunks[3] = vaddq_u8(b->chunks[3], roll3);
+  return badcharmask;
+}
+
+void copy_block(block64 *b, char *output) { // stores the four 16-byte chunks of *b to output[0..63]
+  vst1q_u8((uint8_t *)output, b->chunks[0]);
+  vst1q_u8((uint8_t *)output + 16, b->chunks[1]);
+  vst1q_u8((uint8_t *)output + 32, b->chunks[2]);
+  vst1q_u8((uint8_t *)output + 48, b->chunks[3]);
+}
+
+uint64_t compress_block(block64 *b, uint64_t mask, char *output) { // compresses each chunk of *b into output and returns the number of zero bits in mask
+  uint64_t popcounts = // per-byte bit counts of ~mask
+      vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
+  uint64_t offsets = popcounts * 0x0101010101010101; // byte k now holds the running sum of the counts of bytes 0..k
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]); // starts after the count for mask bits 0..15
+  compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]); // starts after the count for mask bits 0..31
+  compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]); // starts after the count for mask bits 0..47
+  return offsets >> 56; // total count over all 64 mask bits
+}
+
+void load_block(block64 *b, const char *src) { // loads src[0..63] into the four 16-byte chunks of *b
+  b->chunks[0] = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
+  b->chunks[1] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 16);
+  b->chunks[2] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 32);
+  b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
+}
+
+// decode 64 bytes (read from src as 6-bit values, one per byte) and output 48 bytes to out
+void base64_decode_block(char *out, const char *src) {
+  uint8x16x4_t str = vld4q_u8((uint8_t *)src); // de-interleaving load: val[k] holds the k-th value of each group of four
+  uint8x16x3_t outvec;
+  outvec.val[0] =
+      vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+  outvec.val[1] =
+      vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+  outvec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+  vst3q_u8((uint8_t *)out, outvec); // interleaving store of the three output byte lanes
+}
+
+result compress_decode_base64(char *dst, const char *src, size_t srclen) { // NEON base64 decoder; count is the input offset for INVALID_BASE64_CHARACTER and the number of bytes written otherwise
+  size_t equalsigns = 0; // number of trailing '=' characters removed (0, 1 or 2)
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    srclen--;
+    equalsigns = 1;
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  const char *const srcinit = src;
+  const char *const dstinit = dst;
+  const char *const srcend = src + srclen;
+
+  constexpr size_t block_size = 10; // the staging buffer holds block_size blocks of 64 bytes
+  char buffer[block_size * 64]; // staging area for the blocks rewritten by to_base64_mask, awaiting decoding
+  char *bufferptr = buffer; // write cursor into the staging buffer
+  if (srclen >= 64) {
+    const char *const srcend64 = src + srclen - 64; // last position from which a full 64-byte block can be loaded
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      bool error = false;
+      uint64_t badcharmask = to_base64_mask(&b, &error);
+      if (error) {
+        src -= 64; // rewind to the start of the block to locate the offending character
+
+        while (src < srcend &&
+               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+          src++;
+        }
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else {
+        // optimization opportunity: if bufferptr == buffer and mask == 0, we
+        // can avoid the call to compress_block and decode directly.
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+        //          base64_decode_block(dst, &b);
+        // dst += 48;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) { // at least block_size - 1 full blocks are staged: decode them
+        for (size_t i = 0; i < (block_size - 1); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        std::memcpy(buffer, buffer + (block_size - 1) * 64, // move the partially filled last block to the front
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64;
+      }
+    }
+  }
+  char *buffer_start = buffer; // read cursor into the staging buffer
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64); // number of staged values beyond the last full block
+  if (last_block != 0 && srcend - src + last_block >= 64) { // enough input remains to try to complete the partial block
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (val > 64) {
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+      bufferptr += (val <= 63); // advance only for a 6-bit value; a code of 64 is skipped
+      src++;
+    }
+  }
+
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { // decode every complete staged block
+    base64_decode_block(dst, buffer_start);
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) { // a partial block remains: decode it four values at a time
+    while (buffer_start + 4 < bufferptr) { // more than four values remain
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple); // NOTE(review): the unconditional swap presumably assumes a little-endian target - confirm
+      std::memcpy(dst, &triple, 4); // stores four bytes but dst advances by three only
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    if (buffer_start + 4 <= bufferptr) { // exactly four values remain: store three bytes only
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3);
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+    // bring in src content
+    int leftover = int(bufferptr - buffer_start);
+    if (leftover > 0) {
+      while (leftover < 4 && src < srcend) { // top up the group with values taken from the remaining input
+        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        if (val > 64) {
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+        }
+        buffer_start[leftover] = char(val);
+        leftover += (val <= 63);
+        src++;
+      }
+
+      if (leftover == 1) { // a single leftover value cannot encode a byte
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)};
+      }
+      if (leftover == 2) { // two values yield one output byte
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+        triple >>= 8;
+        std::memcpy(dst, &triple, 1);
+        dst += 1;
+      } else if (leftover == 3) { // three values yield two output bytes
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6) +
+                          (uint32_t(buffer_start[2]) << 1 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+        triple >>= 8;
+
+        std::memcpy(dst, &triple, 2);
+        dst += 2;
+      } else { // a full group of four values yields three output bytes
+        uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                           (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                           (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                           (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                          << 8;
+        triple = scalar::utf32::swap_bytes(triple);
+        std::memcpy(dst, &triple, 3);
+        dst += 3;
+      }
+    }
+  }
+  if (src < srcend + equalsigns) { // hand whatever input remains to the scalar decoder
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    if (r.error == error_code::INVALID_BASE64_CHARACTER) {
+      r.count += size_t(src - srcinit); // turn the tail-relative offset into an offset from the start of the input
+      return r;
+    } else {
+      r.count += size_t(dst - dstinit); // add the bytes already written before the tail
+    }
+    return r;
+  }
+  return {SUCCESS, size_t(dst - dstinit)};
+}
+/* end file src/arm64/arm_base64.cpp */
+
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
@@ -17418,6 +18853,23 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
   return utf8::count_code_points(input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // the arm64 kernel reuses the scalar helper
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length); // NEON decoder; note the (dst, src, len) argument order
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
+  return scalar::base64::base64_length_from_binary(length); // the arm64 kernel reuses the scalar helper
+}
+
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
+  return encode_base64(output, input, length); // NEON encoder; note the (dst, src, len) argument order
+}
+
+
 } // namespace arm64
 } // namespace simdutf
 
@@ -17756,6 +19208,21 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
   return scalar::utf8::count_code_points(input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // the fallback kernel forwards to the scalar helper
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  return scalar::base64::base64_to_binary(input, length, output); // the fallback kernel forwards to the scalar decoder
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
+  return scalar::base64::base64_length_from_binary(length); // the fallback kernel forwards to the scalar helper
+}
+
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
+  return scalar::base64::binary_to_base64(input, length, output); // the fallback kernel forwards to the scalar encoder
+}
 } // namespace fallback
 } // namespace simdutf
 
@@ -20486,17 +21953,17 @@ const char32_t* validate_utf32(const char32_t* buf, size_t len) {
 static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, char *utf8_output, int mask_output) {
   __mmask64 nonascii = _mm512_movepi8_mask(input);
   size_t output_size = input_len + (size_t)count_ones(nonascii);
-  
+
   // Mask to denote whether the byte is a leading byte that is not ascii
   __mmask64 sixth =
       _mm512_cmpge_epu8_mask(input, _mm512_set1_epi8(-64)); //binary representation of -64: 1100 0000
-  
+
   const uint64_t alternate_bits = UINT64_C(0x5555555555555555);
   uint64_t ascii = ~nonascii;
   // the bits in ascii are inverted and zeros are interspersed in between them
   uint64_t maskA = ~_pdep_u64(ascii, alternate_bits);
   uint64_t maskB = ~_pdep_u64(ascii>>32, alternate_bits);
-  
+
   // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD)
   __m512i input_interleaved = _mm512_permutexvar_epi8(_mm512_set_epi32(
     0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818,
@@ -20504,35 +21971,35 @@ static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len,
     0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808,
     0x27072606, 0x25052404, 0x23032202, 0x21012000
   ), input);
-  
+
   // double size of each byte, and insert the leading byte 1100 0010
 
-/* 
+/*
 upscale the bytes to 16-bit value, adding the 0b11000000 leading byte in the process.
 We adjust for the bytes that have their two most significant bits. This takes care of the first 32 bytes, assuming we interleaved the bytes. */
-  __m512i outputA = _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8); 
+  __m512i outputA = _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8);
   outputA = _mm512_mask_add_epi16(
-                                  outputA, 
-                                 (__mmask32)sixth, 
-                                  outputA, 
+                                  outputA,
+                                 (__mmask32)sixth,
+                                  outputA,
                                   _mm512_set1_epi16(1 - 0x4000)); // 1- 0x4000 = 1100 0000 0000 0001????
-  
+
   // in the second 32-bit half, set first or second option based on whether original input is leading byte (second case) or not (first case)
   __m512i leadingB = _mm512_mask_blend_epi16(
-                                              (__mmask32)(sixth>>32), 
+                                              (__mmask32)(sixth>>32),
                                               _mm512_set1_epi16(0x00c2), // 0000 0000 1101 0010
                                               _mm512_set1_epi16(0x40c3));// 0100 0000 1100 0011
   __m512i outputB = _mm512_ternarylogic_epi32(
-                                              input_interleaved, 
-                                              leadingB, 
-                                              _mm512_set1_epi16((short)0xff00), 
+                                              input_interleaved,
+                                              leadingB,
+                                              _mm512_set1_epi16((short)0xff00),
                                               (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB
-  
+
   // prune redundant bytes
   outputA = _mm512_maskz_compress_epi8(maskA, outputA);
   outputB = _mm512_maskz_compress_epi8(maskB, outputB);
-  
-  
+
+
   size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32;
 
   if(mask_output) {
@@ -20553,7 +22020,7 @@ We adjust for the bytes that have their two most significant bits. This takes ca
   }
   return output_size;
 }
- 
+
 static inline size_t latin1_to_utf8_avx512_branch(__m512i input, char *utf8_output) {
   __mmask64 nonascii = _mm512_movepi8_mask(input);
   if(nonascii) {
@@ -20563,7 +22030,7 @@ static inline size_t latin1_to_utf8_avx512_branch(__m512i input, char *utf8_outp
     return 64;
   }
 }
- 
+
 size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, char *utf8_output) {
   char *start = utf8_output;
   size_t pos = 0;
@@ -20628,14 +22095,14 @@ size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len,
 /* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
 std::pair<const char*, char32_t*> avx512_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
     size_t rounded_len = len & ~0xF;  // Round down to nearest multiple of 16
-    
-    for (size_t i = 0; i < rounded_len; i += 16) { 
+
+    for (size_t i = 0; i < rounded_len; i += 16) {
         // Load 16 Latin1 characters into a 128-bit register
         __m128i in = _mm_loadu_si128((__m128i*)&buf[i]);
-        
+
         // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using vpmovzxbd
         __m512i out = _mm512_cvtepu8_epi32(in);
-        
+
         // Store the results back to memory
         _mm512_storeu_si512((__m512i*)&utf32_output[i], out);
     }
@@ -20644,6 +22111,299 @@ std::pair<const char*, char32_t*> avx512_convert_latin1_to_utf32(const char* buf
     return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
 }
 /* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
+/* begin file src/icelake/icelake_base64.inl.cpp */
+// file included directly
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+struct block64 { // one 64-byte unit of work for the decoder helpers below
+  __m512i chunks[1]; // a single 512-bit register holds the whole block
+};
+
+size_t encode_base64(char *dst, const char *src, size_t srclen) {
+  // credit: Wojciech Muła
+  // Base64-encodes srclen bytes from src into dst; vector loop first, scalar tail last.
+  const uint8_t *input = (const uint8_t *)src;
+
+  uint8_t *out = (uint8_t *)dst;
+  static const char *lookup_tbl =
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+  // shuffle_input picks, for every output character, the source bytes it depends on.
+  const __m512i shuffle_input = _mm512_setr_epi32(
+      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
+      0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
+      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
+  const __m512i lookup =
+      _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl)); // the 64-character alphabet fills exactly one register
+  const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a));
+  size_t i = 0;
+  for (; i + 64 <= srclen; i += 48) { // a full 64-byte load must be in bounds although only 48 bytes are consumed
+    const __m512i v =
+        _mm512_loadu_si512(reinterpret_cast<const __m512i *>(input + i));
+    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
+    const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
+    const __m512i result = _mm512_permutexvar_epi8(indices, lookup); // alphabet lookup, indexed by the multishift output
+    _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
+    out += 64; // 48 input bytes become 64 output characters
+  }
+  return i / 3 * 4 + // characters produced by the vector loop, plus whatever the scalar tail reports
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+}
+
+static inline uint64_t to_base64_mask(block64 *b, bool *error) { // translates the block in place; returns one bit per byte that is not a base64 digit
+  __m512i input = b->chunks[0];
+  const __m512i ascii_space_tbl = _mm512_set_epi8( // per 16-byte lane: slot 0 -> 32 (space), 9 -> '\t', 10 -> '\n', 13 -> '\r'
+      0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9,
+      0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0,
+      32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
+  __m512i lookup0 = _mm512_set_epi8( // table entries for byte values 0..63; negative entries mark non-digits
+      -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+      52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
+      -128, -128, -128, -64, -128, -128, -128, -128, -128, -128, -128, -128,
+      -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -64, -128,
+      -128, -64, -64, -128, -128, -128, -128, -128, -128, -128, -128, -64);
+  __m512i lookup1 = _mm512_set_epi8( // table entries for byte values 64..127
+      -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41,
+      40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, -128,
+      -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14,
+      13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
+  const __m512i combined = _mm512_or_si512(translated, input); // OR with the input so bytes that already have the top bit set are flagged too
+  const __mmask64 mask = _mm512_movepi8_mask(combined);
+  if (mask) {
+    const __mmask64 spaces = _mm512_cmpeq_epi8_mask(
+        _mm512_shuffle_epi8(ascii_space_tbl, input), input);
+    *error |= (mask != spaces); // error unless the flagged bytes are exactly the ASCII spaces listed above
+  }
+  b->chunks[0] = translated;
+
+  return mask;
+}
+
+static inline void copy_block(block64 *b, char *output) { // writes the block's 64 bytes to output unchanged
+  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]);
+}
+
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { // drops the bytes whose bit is set in mask
+  uint64_t nmask = ~mask; // bits set for the bytes to keep
+  __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); // kept bytes packed to the front, remainder zeroed
+  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); // always stores 64 bytes, regardless of how many were kept
+  return _mm_popcnt_u64(nmask); // number of bytes kept
+}
+
+static inline void load_block(block64 *b, const char *src) { // unaligned load of 64 bytes from src into the block
+  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+}
+
+static inline void base64_decode(char *out, __m512i str) { // str holds 64 already-translated 6-bit values; 48 bytes are written to out
+  const __m512i merge_ab_and_bc =
+      _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140)); // pairs of 6-bit values combined into 12-bit values (0x40 == 1 << 6)
+  const __m512i merged =
+      _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); // pairs of 12-bit values combined into 24-bit values (0x1000 == 1 << 12)
+  const __m512i pack = _mm512_set_epi8( // keeps three bytes (indices 2, 1, 0) of every 32-bit group; the top 16 slots are unused
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
+      52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
+      28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
+      5, 6, 0, 1, 2);
+  const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
+  _mm512_mask_storeu_epi8(
+      (__m512i *)out, 0xffffffffffff,
+      shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
+}
+// decode 64 bytes (already translated to 6-bit values) read from src and output 48 bytes
+static inline void base64_decode_block(char *out, const char *src) {
+  base64_decode(out,
+                _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)));
+}
+static inline void base64_decode_block(char *out, block64 *b) { // same as the pointer overload, but the 64 values are already in a register
+  base64_decode(out, b->chunks[0]);
+}
+
+result compress_decode_base64(char *dst, const char *src, size_t srclen) { // decodes base64 text from src into dst
+  size_t equalsigns = 0; // number of trailing '=' characters removed from the input (0, 1 or 2)
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    srclen--;
+    equalsigns = 1;
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  const char *const srcinit = src;
+  const char *const dstinit = dst;
+  const char *const srcend = src + srclen;
+  // buffer stages translated 6-bit values from blocks that had characters removed.
+  // figure out why block_size == 2 is sometimes best???
+  constexpr size_t block_size = 6;
+  char buffer[block_size * 64];
+  char *bufferptr = buffer;
+  if (srclen >= 64) {
+    const char *const srcend64 = src + srclen - 64;
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      bool error = false;
+      uint64_t badcharmask = to_base64_mask(&b, &error);
+      if (error) {
+        src -= 64; // rewind and rescan this block byte by byte to locate the offending character
+        while (src < srcend &&
+               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+          src++;
+        }
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; // count is an offset into src here
+      }
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else if (bufferptr != buffer) { // clean block, but earlier values are still staged: append to keep the order
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+      } else { // clean block and nothing staged: decode straight into dst
+        base64_decode_block(dst, &b);
+        dst += 48;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) { // at least block_size - 1 full blocks staged: flush them
+        for (size_t i = 0; i < (block_size - 1); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64;
+      }
+    }
+  }
+  // Fewer than 64 source bytes remain from here on.
+  char *buffer_start = buffer;
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64);
+  if (last_block != 0 && srcend - src + last_block >= 64) {
+    // Top up the partially filled staged block from src, one character at a time.
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (val > 64) {
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+      bufferptr += (val <= 63); // only values 0..63 are kept; a value of 64 is skipped (presumably whitespace - confirm against the table)
+      src++;
+    }
+  }
+
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { // decode every complete staged block
+    base64_decode_block(dst, buffer_start);
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) { // a partial staged block is left: finish it in 4-value groups
+    while (buffer_start + 4 < bufferptr) {
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 4); // stores 4 bytes although dst advances by only 3
+      dst += 3;
+      buffer_start += 4;
+    }
+    if (buffer_start + 4 <= bufferptr) { // exactly one full group left: store only its 3 bytes
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3);
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+    // bring in src content
+    int leftover = int(bufferptr - buffer_start);
+    if (leftover > 0) {
+      while (leftover < 4 && src < srcend) {
+        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        if (val > 64) {
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+        }
+        buffer_start[leftover] = char(val);
+        leftover += (val <= 63); // a value of 64 is not kept: the slot is overwritten by the next character
+        src++;
+      }
+      // leftover now counts the 6-bit values available for the last group (1..4).
+      if (leftover == 1) {
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)}; // a single 6-bit value cannot form a byte
+      }
+      if (leftover == 2) { // two values -> one output byte
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+        triple >>= 8;
+        std::memcpy(dst, &triple, 1);
+        dst += 1;
+      } else if (leftover == 3) { // three values -> two output bytes
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6) +
+                          (uint32_t(buffer_start[2]) << 1 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+        triple >>= 8;
+
+        std::memcpy(dst, &triple, 2);
+        dst += 2;
+      } else { // four values -> three output bytes
+        uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                           (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                           (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                           (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                          << 8;
+        triple = scalar::utf32::swap_bytes(triple);
+        std::memcpy(dst, &triple, 3);
+        dst += 3;
+      }
+    }
+  }
+  if (src < srcend + equalsigns) { // unread source (or stripped padding) remains: hand the rest to the scalar decoder
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    if (r.error == error_code::INVALID_BASE64_CHARACTER) {
+      r.count += size_t(src - srcinit); // rebase the count onto the start of src
+      return r;
+    } else {
+      r.count += size_t(dst - dstinit); // add the bytes already written to dst by this function
+    }
+    return r;
+  }
+  return {SUCCESS, size_t(dst - dstinit)}; // count is the number of bytes written to dst
+}
+/* end file src/icelake/icelake_base64.inl.cpp */
 
 
 #include <cstdint>
@@ -21129,15 +22889,15 @@ simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(co
 
   // First, try to convert as much as possible using the SIMD implementation.
   inlen = icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
-  
+
   // If we have completely converted the string
   if(inlen == len) {
     return {simdutf::SUCCESS, len};
   }
-  
+
   // Else if there are remaining bytes, use the scalar function to process them.
-  // Note: This is assuming scalar::utf8_to_latin1::convert_with_errors is a function that takes 
-  // the input buffer, length, and output buffer, and returns a result object with an error code 
+  // Note: This is assuming scalar::utf8_to_latin1::convert_with_errors is a function that takes
+  // the input buffer, length, and output buffer, and returns a result object with an error code
   // and the number of characters processed.
   result res = scalar::utf8_to_latin1::convert_with_errors(buf + inlen, len - inlen, latin1_output + inlen);
   res.count += inlen; // Add the number of characters processed by the SIMD implementation
@@ -21692,7 +23452,7 @@ simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t
   const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
   size_t answer =  length / sizeof(__m512i) * sizeof(__m512i); // Number of 512-bit chunks that fits into the length.
   size_t i = 0;
-  __m512i unrolled_popcount{0}; 
+  __m512i unrolled_popcount{0};
 
   const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
 
@@ -21972,6 +23732,22 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
   return implementation::count_utf8(input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // icelake has no dedicated routine here; the shared scalar helper computes the bound
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length); // the helper takes the destination first, then the source and its length
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
+  return scalar::base64::base64_length_from_binary(length); // depends only on the byte count, so the scalar helper is reused
+}
+
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
+  return encode_base64(output, input, length); // the helper takes the destination first, then the source and its length
+}
+
 } // namespace icelake
 } // namespace simdutf
 
@@ -22581,10 +24357,10 @@ std::pair<const char*, char16_t*> avx2_convert_latin1_to_utf16(const char* latin
 
         // Zero extend each byte in xmm0 to word and put it in another xmm register
         __m128i xmm1 = _mm_cvtepu8_epi16(xmm0);
-        
+
         // Shift xmm0 to the right by 8 bytes
         xmm0 = _mm_srli_si128(xmm0, 8);
-        
+
         // Zero extend each byte in the shifted xmm0 to word in xmm0
         xmm0 = _mm_cvtepu8_epi16(xmm0);
 
@@ -22593,10 +24369,10 @@ std::pair<const char*, char16_t*> avx2_convert_latin1_to_utf16(const char* latin
             xmm0 = _mm_shuffle_epi8(xmm0, swap);
             xmm1 = _mm_shuffle_epi8(xmm1, swap);
         }
-        
+
         // Store the contents of xmm1 into the address pointed by (output + i)
         _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i), xmm1);
-        
+
         // Store the contents of xmm0 into the address pointed by (output + i + 8)
         _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + i + 8), xmm0);
     }
@@ -22608,14 +24384,14 @@ std::pair<const char*, char16_t*> avx2_convert_latin1_to_utf16(const char* latin
 /* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */
 std::pair<const char*, char32_t*> avx2_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
     size_t rounded_len = ((len | 7) ^ 7);  // Round down to nearest multiple of 8
-    
-    for (size_t i = 0; i < rounded_len; i += 8) { 
+
+    for (size_t i = 0; i < rounded_len; i += 8) {
         // Load 8 Latin1 characters into a 64-bit register
         __m128i in = _mm_loadl_epi64((__m128i*)&buf[i]);
-        
+
         // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using vpmovzxbd
         __m256i out = _mm256_cvtepu8_epi32(in);
-        
+
         // Store the results back to memory
         _mm256_storeu_si256((__m256i*)&utf32_output[i], out);
     }
@@ -24520,6 +26296,503 @@ size_t convert_masked_utf8_to_latin1(const char *input,
 }
 /* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */
 
+/* begin file src/haswell/avx2_base64.cpp */
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+__m256i lookup_pshufb_improved(const __m256i input) { // turns 32 six-bit indices into base64 characters by adding a per-range offset
+  // credit: Wojciech Muła
+  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); // saturating subtract: indices 0..51 give 0, 52..63 give 1..12
+  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); // all-ones where the index is below 26
+  result =
+      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); // indices below 26 select slot 13 of shift_LUT ('A')
+  const __m256i shift_LUT = _mm256_setr_epi8( // offset per slot, repeated for both 128-bit lanes
+      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
+
+      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+
+  result = _mm256_shuffle_epi8(shift_LUT, result);
+  return _mm256_add_epi8(result, input); // character = index + offset of its range
+}
+
+size_t encode_base64(char *dst, const char *src, size_t srclen) { // base64-encodes srclen bytes from src into dst
+  // credit: Wojciech Muła
+  const uint8_t *input = (const uint8_t *)src;
+
+  uint8_t *out = (uint8_t *)dst;
+  const __m256i shuf =
+      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
+
+                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
+  size_t i = 0;
+  for (; i + 100 <= srclen; i += 96) { // unrolled x4: 96 bytes consumed, but the last 16-byte load ends at i + 100
+    const __m128i lo0 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
+    const __m128i hi0 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
+    const __m128i lo1 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
+    const __m128i hi1 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
+    const __m128i lo2 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
+    const __m128i hi2 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
+    const __m128i lo3 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
+    const __m128i hi3 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
+
+    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
+    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
+    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
+    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
+
+    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
+
+    const __m256i t1_0 =
+        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
+    const __m256i t1_1 =
+        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
+    const __m256i t1_2 =
+        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
+    const __m256i t1_3 =
+        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
+
+    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
+
+    const __m256i t3_0 =
+        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
+    const __m256i t3_1 =
+        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
+    const __m256i t3_2 =
+        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
+    const __m256i t3_3 =
+        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
+
+    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
+    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
+    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
+    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
+
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved(input0));
+    out += 32;
+
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved(input1));
+    out += 32;
+
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved(input2));
+    out += 32;
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved(input3));
+    out += 32;
+  }
+  for (; i + 28 <= srclen; i += 24) { // single step: 24 bytes consumed, but the second 16-byte load ends at i + 28
+    // lo = [xxxx|DDDC|CCBB|BAAA]
+    // hi = [xxxx|HHHG|GGFF|FEEE]
+    const __m128i lo =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
+    const __m128i hi =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
+
+    // bytes from groups A, B and C are needed in separate 32-bit lanes
+    // in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
+    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
+
+    // same mask/multiply sequence as the unrolled loop above, applied to one register
+
+    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+    const __m256i indices = _mm256_or_si256(t1, t3);
+
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved(indices));
+    out += 32;
+  }
+  return i / 3 * 4 + // characters produced by the vector loops, plus whatever the scalar tail reports
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+}
+
+static inline void compress(__m128i data, uint16_t mask, char *output) { // removes the bytes whose bit is set in mask; always stores 16 bytes
+  if (mask == 0) { // nothing to remove: store the register as is
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
+    return;
+  }
+  // this particular implementation was inspired by work done by @animetosho
+  // we do it in two steps, first 8 bytes and then second 8 bytes
+  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
+  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+  // next line just loads the 64-bit values thintable_epi8[mask1] and
+  // thintable_epi8[mask2] into a 128-bit register, using only
+  // two instructions on most compilers.
+
+  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
+                                    tables::base64::thintable_epi8[mask1]);
+  // we increment by 0x08 the second half of the mask
+  shufmask =
+      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+  // this is the version "nearly pruned"
+  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
+  // we still need to put the two halves together.
+  // we compute the popcount of the first half:
+  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+  // then load the corresponding mask, what it does is to write
+  // only the first pop1 bytes from the first 8 bytes, and then
+  // it fills in with the bytes from the second 8 bytes + some filling
+  // at the end.
+  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
+      tables::base64::pshufb_combine_table + pop1 * 8));
+  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
+
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
+}
+
+static inline void compress(__m256i data, uint32_t mask, char *output) { // 32-byte variant built from two 16-byte compressions
+  if (mask == 0) { // nothing to remove: store the register as is
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
+    return;
+  }
+  compress(_mm256_castsi256_si128(data), uint16_t(mask), output); // low 16 bytes
+  compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16), // high 16 bytes ...
+           output + _mm_popcnt_u32(~mask & 0xFFFF)); // ... placed right after the bytes kept from the low half
+}
+
+struct block64 { // one 64-byte unit of work for the decoder helpers below
+  __m256i chunks[2]; // two 256-bit registers hold the block (bytes 0..31 and 32..63)
+};
+
+static inline uint32_t to_base64_mask(__m256i *src, bool *error) { // translates 32 bytes in place; returns one bit per flagged byte
+  const __m256i ascii_space_tbl = // per 16-byte lane: slot 0 -> 0x20 (space), 9 -> '\t', 0xa -> '\n', 0xd -> '\r'
+      _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
+                       0x0, 0x0, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
+                       0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0x0, 0xd, 0x0, 0x0);
+  // credit: aqrit
+  const __m256i delta_asso = _mm256_setr_epi8(
+      0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+      0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  const __m256i delta_values = _mm256_setr_epi8(
+      int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
+      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
+      int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+      int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
+      int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
+      int8_t(0xB9), int8_t(0xB9));
+  const __m256i check_asso = _mm256_setr_epi8(
+      0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+      0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+      0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  const __m256i check_values = _mm256_setr_epi8(
+      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
+      int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
+      int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
+      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
+      int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
+      int8_t(0x91), int8_t(0x80));
+  const __m256i shifted = _mm256_srli_epi32(*src, 3);
+  // Two hashes of each input byte: one selects the translation delta, the other the validity check value.
+  const __m256i delta_hash =
+      _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
+  const __m256i check_hash =
+      _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
+
+  const __m256i out =
+      _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); // translated value = input + delta
+  const __m256i chk =
+      _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); // sign bit of each byte is what ends up in mask
+  const int mask = _mm256_movemask_epi8(chk);
+  if (mask) {
+    __m256i ascii_space =
+        _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
+    *error |= (mask != _mm256_movemask_epi8(ascii_space)); // error unless the flagged bytes are exactly the ASCII spaces listed above
+  }
+  *src = out;
+  return (uint32_t)mask;
+}
+// Runs the 32-byte classifier on both halves of `b` (translating them in
+// place) and returns the combined 64-bit mask: bits 0..31 describe chunks[0],
+// bits 32..63 describe chunks[1]. `*error` is reset first, then set if either
+// half reports an error.
+static inline uint64_t to_base64_mask(block64 *b, bool *error) {
+  *error = false;
+  const uint64_t low_bits = to_base64_mask(&b->chunks[0], error);
+  const uint64_t high_bits = to_base64_mask(&b->chunks[1], error);
+  return (high_bits << 32) | low_bits;
+}
+
+// Stores the 64 bytes held in `b` to `output` (unaligned stores).
+static inline void copy_block(block64 *b, char *output) {
+  for (int half = 0; half < 2; half++) {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32 * half),
+                        b->chunks[half]);
+  }
+}
+
+// Writes to `output` the bytes of `b` whose bit in `mask` is clear, and
+// returns how many bytes were kept (the number of zero bits in `mask`).
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  const uint64_t kept = ~mask;
+  // The second half starts right after the bytes kept from the first half.
+  char *const second_half = output + _mm_popcnt_u64(kept & 0xFFFFFFFF);
+  compress(b->chunks[0], uint32_t(mask), output);
+  compress(b->chunks[1], uint32_t(mask >> 32), second_half);
+  return _mm_popcnt_u64(kept);
+}
+
+// Fills `b` with the 64 bytes starting at `src` (unaligned loads).
+static inline void load_block(block64 *b, const char *src) {
+  for (int half = 0; half < 2; half++) {
+    b->chunks[half] = _mm256_loadu_si256(
+        reinterpret_cast<const __m256i *>(src + 32 * half));
+  }
+}
+
+// Packs the 32 bytes of `str` (one 6-bit value per byte) into 24 output
+// bytes. Two 16-byte stores are issued, at `out` and `out + 12`, so 28 bytes
+// of `out` are written even though only the first 24 are meaningful.
+static inline void base64_decode(char *out, __m256i str) {
+  // credit: aqrit
+  // Combine adjacent bytes as (first << 6) + second, giving 12-bit values.
+  const __m256i twelve_bits =
+      _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
+  // Combine adjacent 12-bit values as (first << 12) + second: 24-bit groups.
+  const __m256i twentyfour_bits =
+      _mm256_madd_epi16(twelve_bits, _mm256_set1_epi32(0x00011000));
+  // Within each 128-bit lane, take the three low bytes of every 32-bit group
+  // in reverse order and pack them into the first 12 bytes of the lane.
+  const __m256i byte_order =
+      _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+                       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+  const __m256i packed = _mm256_shuffle_epi8(twentyfour_bits, byte_order);
+
+  // The second store overlaps the 4 padding bytes of the first one.
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                   _mm256_castsi256_si128(packed));
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 12),
+                   _mm256_extracti128_si256(packed, 1));
+}
+// Decodes the 64 bytes at `src` into 48 bytes at `out`. Because the second
+// half is decoded straight into `out + 24`, up to 52 bytes of `out` are
+// written.
+static inline void base64_decode_block(char *out, const char *src) {
+  const __m256i first_half =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+  base64_decode(out, first_half);
+  const __m256i second_half =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+  base64_decode(out + 24, second_half);
+}
+// Same as base64_decode_block(char *, const char *), but the second half is
+// decoded into a scratch array and only its 24 meaningful bytes are copied,
+// so nothing past `out + 48` is written.
+static inline void base64_decode_block_safe(char *out, const char *src) {
+  const __m256i first_half =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+  base64_decode(out, first_half);
+  const __m256i second_half =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+  char scratch[32]; // absorbs the bytes base64_decode writes past the 24th
+  base64_decode(scratch, second_half);
+  std::memcpy(out + 24, scratch, 24);
+}
+// Decodes an already-loaded 64-byte block into 48 bytes at `out`; up to 52
+// bytes of `out` are written because each half is stored with 16-byte writes.
+static inline void base64_decode_block(char *out, block64 *b) {
+  for (int half = 0; half < 2; half++) {
+    base64_decode(out + 24 * half, b->chunks[half]);
+  }
+}
+// Decodes an already-loaded 64-byte block into 48 bytes at `out` without
+// writing past `out + 48`: the second half goes through a scratch array.
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+  char scratch[32]; // absorbs the bytes base64_decode writes past the 24th
+  base64_decode(out, b->chunks[0]);
+  base64_decode(scratch, b->chunks[1]);
+  std::memcpy(out + 24, scratch, 24);
+}
+
+// Decodes `srclen` bytes of base64 text at `src` into `dst`.
+//
+// Returns {SUCCESS, bytes written} on success,
+// {INVALID_BASE64_CHARACTER, offset in src} when a byte is rejected, and
+// {BASE64_INPUT_REMAINDER, bytes written} when a single 6-bit value is left
+// over. Bytes whose table value is exactly 64 are skipped rather than decoded
+// (presumably whitespace -- confirm against tables::base64::to_base64_value).
+//
+// Strategy: process the input 64 bytes at a time. Blocks with no skipped
+// bytes are decoded directly (when nothing is pending); otherwise the kept
+// bytes are appended to a local staging buffer which is decoded in 64-byte
+// units once it is nearly full. What remains is handled by scalar code.
+result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+  // Drop up to two trailing '=' characters, remembering how many were seen.
+  size_t equalsigns = 0;
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    srclen--;
+    equalsigns = 1;
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  // base64_decode_block writes a few bytes past the 48 it produces. Once dst
+  // reaches this pointer (63 bytes before the maximal output end) the *_safe
+  // variant, which writes exactly 48 bytes, is used instead.
+  char *end_of_safe_64byte_zone =
+      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
+
+  const char *const srcinit = src;
+  const char *const dstinit = dst;
+  const char *const srcend = src + srclen;
+
+  // Staging area for 6-bit values coming from blocks that contained skipped
+  // bytes; bufferptr is the write cursor.
+  constexpr size_t block_size = 6;
+  static_assert(block_size >= 2, "block_size must be at least two");
+  char buffer[block_size * 64];
+  char *bufferptr = buffer;
+  if (srclen >= 64) {
+    const char *const srcend64 = src + srclen - 64;
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      bool error = false;
+      // Translates b in place; set bits mark bytes that must be removed.
+      uint64_t badcharmask = to_base64_mask(&b, &error);
+      if (error) {
+        // Rewind and rescan with the scalar table to locate the first byte
+        // whose table value exceeds 64, which is the offset reported.
+        src -= 64;
+        while (src < srcend &&
+               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+          src++;
+        }
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else if (bufferptr != buffer) {
+        // Clean block, but earlier values are still pending: keep the order
+        // by appending to the staging buffer.
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+      } else {
+        // Clean block and nothing pending: decode straight to the output.
+        if (dst >= end_of_safe_64byte_zone) {
+          base64_decode_block_safe(dst, &b);
+        } else {
+          base64_decode_block(dst, &b);
+        }
+        dst += 48;
+      }
+      // Flush once at least block_size - 1 full blocks are staged, so the
+      // next iteration can always append up to 64 more bytes.
+      if (bufferptr >= (block_size - 1) * 64 + buffer) {
+        for (size_t i = 0; i < (block_size - 2); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        if (dst >= end_of_safe_64byte_zone) {
+          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
+        } else {
+          base64_decode_block(dst, buffer + (block_size - 2) * 64);
+        }
+        dst += 48;
+        // Move the partially filled last block to the front of the buffer.
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64;
+      }
+    }
+  }
+
+  char *buffer_start = buffer;
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64);
+  if (last_block != 0 && srcend - src + last_block >= 64) {
+
+    // Top up the partially filled staged block from the remaining input so
+    // that it can be decoded as a full 64-byte block.
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (val > 64) {
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+      // Advance only for real 6-bit values; a value of 64 is overwritten by
+      // the next byte.
+      bufferptr += (val <= 63);
+      src++;
+    }
+  }
+
+  // Decode every complete 64-byte block that is staged.
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+    if (dst >= end_of_safe_64byte_zone) {
+      base64_decode_block_safe(dst, buffer_start);
+    } else {
+      base64_decode_block(dst, buffer_start);
+    }
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) {
+    // Scalar decode of the staged leftovers, four 6-bit values at a time.
+    // While more than four remain, a 4-byte store is used (the fourth byte
+    // is overwritten by the following group).
+    while (buffer_start + 4 < bufferptr) {
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 4);
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // The last complete group is stored with exactly 3 bytes.
+    if (buffer_start + 4 <= bufferptr) {
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3);
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+    // bring in src content
+    int leftover = int(bufferptr - buffer_start);
+    if (leftover > 0) {
+      while (leftover < 4 && src < srcend) {
+        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        if (val > 64) {
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+        }
+        buffer_start[leftover] = char(val);
+        leftover += (val <= 63);
+        src++;
+      }
+
+      // One 6-bit value cannot encode a byte.
+      if (leftover == 1) {
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)};
+      }
+      if (leftover == 2) {
+        // Two values -> one output byte.
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+        triple >>= 8;
+        std::memcpy(dst, &triple, 1);
+        dst += 1;
+      } else if (leftover == 3) {
+        // Three values -> two output bytes.
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6) +
+                          (uint32_t(buffer_start[2]) << 1 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+        triple >>= 8;
+        std::memcpy(dst, &triple, 2);
+        dst += 2;
+      } else {
+        // A full group of four values -> three output bytes.
+        uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                           (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                           (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                           (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                          << 8;
+        triple = scalar::utf32::swap_bytes(triple);
+        std::memcpy(dst, &triple, 3);
+        dst += 3;
+      }
+    }
+  }
+  // Hand whatever input is left to the scalar tail decoder. The call is also
+  // made with an empty remainder when '=' padding was stripped.
+  if (src < srcend + equalsigns) {
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    if (r.error == error_code::INVALID_BASE64_CHARACTER) {
+      // Convert the offset to one relative to the start of the input.
+      r.count += size_t(src - srcinit);
+      return r;
+    } else {
+      // Otherwise the count is extended by the bytes already written.
+      r.count += size_t(dst - dstinit);
+    }
+    return r;
+  }
+  return {SUCCESS, size_t(dst - dstinit)};
+}
+/* end file src/haswell/avx2_base64.cpp */
+
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
@@ -26835,6 +29108,21 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
   return utf8::count_code_points(input, length);
 }
 
+// haswell: forwards unchanged to the scalar helper of the same name.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// haswell: decodes with the AVX2 kernel compress_decode_base64. Note the
+// argument order of the kernel: destination first, then source and length.
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
+// haswell: forwards unchanged to the scalar helper of the same name.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
+  return scalar::base64::base64_length_from_binary(length);
+}
+
+// haswell: encodes with encode_base64. Note the argument order of the
+// kernel: destination first, then source and length.
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
+  return encode_base64(output, input, length);
+}
 } // namespace haswell
 } // namespace simdutf
 
@@ -28366,6 +30654,22 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
   return scalar::utf8::count_code_points(input, length);
 }
 
+
+// ppc64: no vectorized version here; forwards to the scalar helper.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+// ppc64: no vectorized version here; forwards to the scalar decoder.
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  return scalar::base64::base64_to_binary(input, length, output);
+}
+
+// ppc64: forwards to the scalar helper.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
+  return scalar::base64::base64_length_from_binary(length);
+}
+
+// ppc64: no vectorized version here; forwards to the scalar encoder.
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
+  return scalar::base64::binary_to_base64(input, length, output);
+}
 } // namespace ppc64
 } // namespace simdutf
 
@@ -28373,6 +30677,1334 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
 /* end file src/simdutf/ppc64/end.h */
 /* end file src/ppc64/implementation.cpp */
 #endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+/* begin file src/rvv/implementation.cpp */
+
+
+
+
+
+/* begin file src/simdutf/rvv/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "rvv"
+// #define SIMDUTF_IMPLEMENTATION rvv
+
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// nothing needed.
+#else
+SIMDUTF_TARGET_RVV
+#endif
+/* end file src/simdutf/rvv/begin.h */
+namespace simdutf {
+namespace rvv {
+namespace {
+#ifndef SIMDUTF_RVV_H
+#error "rvv.h must be included"
+#endif
+
+} // unnamed namespace
+} // namespace rvv
+} // namespace simdutf
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace rvv {
+
+/* begin file src/rvv/rvv_length_from.inl.cpp */
+
+// Counting UTF-16LE code points is the same computation as the UTF-32 length
+// of the input, so this simply delegates.
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t *src, size_t len) const noexcept {
+  return utf32_length_from_utf16le(src, len);
+}
+
+// Counting UTF-16BE code points is the same computation as the UTF-32 length
+// of the input, so this simply delegates.
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t *src, size_t len) const noexcept {
+  return utf32_length_from_utf16be(src, len);
+}
+
+// Counting UTF-8 code points is the same computation as the UTF-32 length of
+// the input, so this simply delegates.
+simdutf_warn_unused size_t implementation::count_utf8(const char *src, size_t len) const noexcept {
+  return utf32_length_from_utf8(src, len);
+}
+
+// Returns the code-point count of the UTF-8 input (presumably because every
+// code point that fits Latin-1 occupies exactly one output byte).
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char *src, size_t len) const noexcept {
+  return utf32_length_from_utf8(src, len);
+}
+
+// Returns `len` unchanged: one Latin-1 byte per UTF-16 code unit.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t len) const noexcept {
+  return len;
+}
+
+// Returns `len` unchanged: one Latin-1 byte per UTF-32 code unit.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t len) const noexcept {
+  return len;
+}
+
+// Returns `len` unchanged: one UTF-16 code unit per Latin-1 byte.
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t len) const noexcept {
+  return len;
+}
+
+// Returns `len` unchanged: one UTF-32 code unit per Latin-1 byte.
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t len) const noexcept {
+  return len;
+}
+
+// Counts the bytes of `src` that are not UTF-8 continuation bytes, i.e. the
+// number of code points when the input is valid UTF-8.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char *src, size_t len) const noexcept {
+  size_t code_points = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t*)src, vl);
+    // Read as signed, continuation bytes 0x80..0xBF are the values <= -65.
+    const vbool1_t starts_code_point = __riscv_vmsgt_vx_i8m8_b1(bytes, -65, vl);
+    code_points += __riscv_vcpop_m_b1(starts_code_point, vl);
+    src += vl;
+    len -= vl;
+  }
+  return code_points;
+}
+
+// Counts the UTF-16 code units of `src` (after the byte flip selected by
+// `bflip`) that lie outside 0xDC00..0xDFFF, i.e. every unit except low
+// surrogates. For valid input this is the number of code points.
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t rvv_utf32_length_from_utf16(const char16_t *src, size_t len) {
+  size_t code_points = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t*)src, vl);
+    units = simdutf_byteflip<bflip>(units, vl);
+    const vbool2_t above_range = __riscv_vmsgtu_vx_u16m8_b2(units, 0xDFFF, vl);
+    const vbool2_t below_range = __riscv_vmsltu_vx_u16m8_b2(units, 0xDC00, vl);
+    const vbool2_t not_low_surrogate = __riscv_vmor_mm_b2(above_range, below_range, vl);
+    code_points += __riscv_vcpop_m_b2(not_low_surrogate, vl);
+    src += vl;
+    len -= vl;
+  }
+  return code_points;
+}
+
+// Little-endian input: the shared kernel is used with no byte flip.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t *src, size_t len) const noexcept {
+  return rvv_utf32_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
+}
+
+// Big-endian input: runs the shared kernel with a byte flip, choosing the
+// ZVBB flavour when supports_zvbb() reports it and the V flavour otherwise.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t *src, size_t len) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf32_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len)
+             : rvv_utf32_length_from_utf16<simdutf_ByteFlip::V>(src, len);
+}
+
+// UTF-8 size of a Latin-1 buffer: one byte per input byte, plus one extra
+// byte for every input byte with the high bit set.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *src, size_t len) const noexcept {
+  size_t total = len;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t*)src, vl);
+    // Negative as signed == high bit set == needs two UTF-8 bytes.
+    const vbool1_t non_ascii = __riscv_vmslt_vx_i8m8_b1(bytes, 0, vl);
+    total += __riscv_vcpop_m_b1(non_ascii, vl);
+    src += vl;
+    len -= vl;
+  }
+  return total;
+}
+
+// UTF-8 size of a UTF-16 buffer (after the byte flip selected by `bflip`).
+// Every code unit contributes one byte, one more if it is above 0x7F, and one
+// more if it is above 0x7FF and outside the surrogate range 0xD800..0xDFFF.
+// Each surrogate unit therefore contributes 2 bytes.
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t rvv_utf8_length_from_utf16(const char16_t *src, size_t len) {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t*)src, vl);
+    units = simdutf_byteflip<bflip>(units, vl);
+    const vbool2_t two_or_more = __riscv_vmsgtu_vx_u16m8_b2(units, 0x7F, vl);
+    const vbool2_t three_or_more = __riscv_vmsgtu_vx_u16m8_b2(units, 0x7FF, vl);
+    const vbool2_t below_surrogates = __riscv_vmsltu_vx_u16m8_b2(units, 0xD800, vl);
+    const vbool2_t above_surrogates = __riscv_vmsgtu_vx_u16m8_b2(units, 0xDFFF, vl);
+    const vbool2_t not_surrogate = __riscv_vmor_mm_b2(below_surrogates, above_surrogates, vl);
+    const vbool2_t exactly_three = __riscv_vmand_mm_b2(three_or_more, not_surrogate, vl);
+    total += vl + __riscv_vcpop_m_b2(two_or_more, vl) + __riscv_vcpop_m_b2(exactly_three, vl);
+    src += vl;
+    len -= vl;
+  }
+  return total;
+}
+
+// Little-endian input: the shared kernel is used with no byte flip.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t *src, size_t len) const noexcept {
+  return rvv_utf8_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
+}
+
+// Big-endian input: runs the shared kernel with a byte flip, choosing the
+// ZVBB flavour when supports_zvbb() reports it and the V flavour otherwise.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t *src, size_t len) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf8_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len)
+             : rvv_utf8_length_from_utf16<simdutf_ByteFlip::V>(src, len);
+}
+
+// UTF-8 size of a UTF-32 buffer: one byte per code point, plus one for each
+// threshold (0x7F, 0x7FF, 0xFFFF) that the code point exceeds.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t code_points = __riscv_vle32_v_u32m8((uint32_t*)src, vl);
+    const vbool4_t over_1_byte = __riscv_vmsgtu_vx_u32m8_b4(code_points, 0x7F, vl);
+    const vbool4_t over_2_bytes = __riscv_vmsgtu_vx_u32m8_b4(code_points, 0x7FF, vl);
+    const vbool4_t over_3_bytes = __riscv_vmsgtu_vx_u32m8_b4(code_points, 0xFFFF, vl);
+    total += vl + __riscv_vcpop_m_b4(over_1_byte, vl) + __riscv_vcpop_m_b4(over_2_bytes, vl) + __riscv_vcpop_m_b4(over_3_bytes, vl);
+    src += vl;
+    len -= vl;
+  }
+  return total;
+}
+
+// UTF-16 size of a UTF-8 buffer: one code unit for every non-continuation
+// byte, plus one more for every byte above 0xEF (lead byte of a 4-byte
+// sequence, which needs a surrogate pair).
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char *src, size_t len) const noexcept {
+  size_t units = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t*)src, vl);
+    // Read as signed, continuation bytes 0x80..0xBF are the values <= -65.
+    const vbool1_t starts_code_point = __riscv_vmsgt_vx_i8m8_b1(bytes, -65, vl);
+    const vbool1_t four_byte_lead = __riscv_vmsgtu_vx_u8m8_b1(
+            __riscv_vreinterpret_u8m8(bytes), (uint8_t)0b11101111, vl);
+    units += __riscv_vcpop_m_b1(starts_code_point, vl) + __riscv_vcpop_m_b1(four_byte_lead, vl);
+    src += vl;
+    len -= vl;
+  }
+  return units;
+}
+
+// UTF-16 size of a UTF-32 buffer: one code unit per code point, plus one
+// more for every code point above 0xFFFF.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t *src, size_t len) const noexcept {
+  size_t units = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t code_points = __riscv_vle32_v_u32m8((uint32_t*)src, vl);
+    const vbool4_t needs_pair = __riscv_vmsgtu_vx_u32m8_b4(code_points, 0xFFFF, vl);
+    units += vl + __riscv_vcpop_m_b4(needs_pair, vl);
+    src += vl;
+    len -= vl;
+  }
+  return units;
+}
+
+/* end file src/rvv/rvv_length_from.inl.cpp */
+/* begin file src/rvv/rvv_validate.inl.cpp */
+
+
+// Returns true when no byte of `src` has its high bit set. All bytes are
+// OR-ed into one accumulator vector, which is inspected once at the end.
+simdutf_warn_unused bool implementation::validate_ascii(const char *src, size_t len) const noexcept {
+  const size_t vlmax = __riscv_vsetvlmax_e8m8();
+  vint8m8_t seen_bits = __riscv_vmv_v_x_i8m8(0, vlmax);
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t*)src, vl);
+    // Tail-undisturbed: lanes beyond vl keep what earlier iterations stored.
+    seen_bits = __riscv_vor_vv_i8m8_tu(seen_bits, seen_bits, bytes, vl);
+    src += vl;
+    len -= vl;
+  }
+  const vbool1_t has_high_bit = __riscv_vmslt_vx_i8m8_b1(seen_bits, 0, vlmax);
+  return __riscv_vfirst_m_b1(has_high_bit, vlmax) < 0;
+}
+
+// Scans `src` for the first byte with the high bit set. Returns
+// {TOO_LARGE, index of that byte}, or {SUCCESS, len} when there is none.
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *src, size_t len) const noexcept {
+  const char *const start = src;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t*)src, vl);
+    const vbool1_t non_ascii = __riscv_vmslt_vx_i8m8_b1(bytes, 0, vl);
+    const long first_bad = __riscv_vfirst_m_b1(non_ascii, vl);
+    if (first_bad >= 0) {
+      return result(error_code::TOO_LARGE, src - start + first_bad);
+    }
+    src += vl;
+    len -= vl;
+  }
+  return result(error_code::SUCCESS, src - start);
+}
+
+/* Returns a close estimation of the number of valid UTF-8 bytes up to the
+ * first invalid one, but never overestimating.
+ *
+ * The last `tail` bytes of the input are never consumed by the vector loop;
+ * callers are expected to finish with a scalar validator starting at the
+ * returned offset. Inputs shorter than `tail` return 0.
+ *
+ * Continuation-byte tests go through uint8_t: with a signed `char` the
+ * expression `(c >> 6) == 0b10` is never true for bytes 0x80..0xBF. */
+simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, size_t len) {
+  const char *beg = src;
+  size_t tail = 32; // minimum of 3
+  if (len < tail) return 0;
+
+  /* validate the first `tail` bytes, extended to the end of a character that
+   * straddles the boundary (at most 3 continuation bytes) */
+  {
+    size_t idx = tail;
+    while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10)
+      ++idx;
+    if (idx > tail + 3 || !scalar::utf8::validate(src, idx))
+      return 0;
+  }
+
+  static const uint64_t err1m[] = { 0x0202020202020202, 0x4915012180808080 };
+  static const uint64_t err2m[] = { 0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB };
+  static const uint64_t err3m[] = { 0x0101010101010101, 0X01010101BABAAEE6 };
+
+  /* 16-entry nibble lookup tables for the three-way error classification */
+  const vuint8m1_t err1tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
+  const vuint8m1_t err2tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
+  const vuint8m1_t err3tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
+
+  /* the loop reads up to 3 bytes past each chunk, which stay inside `tail` */
+  size_t n = len - tail;
+
+  for (size_t vl; n > 0; n -= vl, src += vl) {
+    vl = __riscv_vsetvl_e8m4(n);
+    vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const*)src, vl);
+
+    /* fast path: ASCII */
+    if (__riscv_vfirst(__riscv_vmsgtu(v0, 0b01111111, vl), vl) < 0)
+      continue;
+
+    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
+     * https://arxiv.org/abs/2010.03090 */
+    /* v1, v2, v3: the input shifted down by 1, 2 and 3 bytes */
+    vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, src[vl+0], vl);
+    vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, src[vl+1], vl);
+    vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, src[vl+2], vl);
+
+    vuint8m4_t s1 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(__riscv_vreinterpret_v_u8m4_u16m4(v2), 4, __riscv_vsetvlmax_e16m4()));
+    vuint8m4_t s3 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(__riscv_vreinterpret_v_u8m4_u16m4(v3), 4, __riscv_vsetvlmax_e16m4()));
+
+    vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl);
+    vuint8m4_t idx1 = __riscv_vand_vx_u8m4(s1, 0xF, vl);
+    vuint8m4_t idx3 = __riscv_vand_vx_u8m4(s3, 0xF, vl);
+
+    vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1);
+    vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2);
+    vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3);
+    vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4(__riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl));
+
+    vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000-1, vl);
+    vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000-1, vl);
+    vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl);
+    vbool2_t err34 = __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl);
+    vbool2_t errm = __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl);
+    if (__riscv_vfirst_m_b2(errm , vl) >= 0)
+      break;
+  }
+
+  /* we need to validate the last character: step back over continuation
+   * bytes so that the scalar validator restarts on a character boundary */
+  while (tail < len && (uint8_t(src[0]) >> 6) == 0b10) --src, ++tail;
+  return src - beg;
+}
+
+// Validates UTF-8 in two stages: the vector kernel reports a prefix it has
+// accepted, and the scalar validator checks everything after that prefix.
+simdutf_warn_unused bool implementation::validate_utf8(const char *src, size_t len) const noexcept {
+  const size_t accepted = rvv_count_valid_utf8(src, len);
+  const char *const rest = src + accepted;
+  return scalar::utf8::validate(rest, len - accepted);
+}
+
+// Validates UTF-8 in two stages (vector prefix, then scalar remainder) and
+// reports the scalar result with its count rebased to the start of `src`.
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *src, size_t len) const noexcept {
+  const size_t accepted = rvv_count_valid_utf8(src, len);
+  const result remainder = scalar::utf8::validate_with_errors(src + accepted, len - accepted);
+  return result(remainder.error, accepted + remainder.count);
+}
+
+// True when the error-reporting validator returns SUCCESS.
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *src, size_t len) const noexcept {
+  return validate_utf16le_with_errors(src, len).error == error_code::SUCCESS;
+}
+
+// True when the error-reporting validator returns SUCCESS.
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *src, size_t len) const noexcept {
+  return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS;
+}
+
+// Validates surrogate pairing in a UTF-16 buffer (after the byte flip
+// selected by `bflip`).
+//
+// A code unit is in error when exactly one of these holds: the previous unit
+// is a high surrogate (0xD800..0xDBFF), or the unit itself is a low surrogate
+// (0xDC00..0xDFFF). Returns {SURROGATE, index} on the first violation, with
+// the index moved back by one when the culprit is the preceding unpaired high
+// surrogate; {SURROGATE, len - 1} when the input ends on a high surrogate;
+// {SUCCESS, len} otherwise.
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static result rvv_validate_utf16_with_errors(const char16_t *src, size_t len) {
+  const char16_t *beg = src;
+  // Last code unit of the previous chunk (0 before the first chunk), slid in
+  // as the "previous unit" of the first lane.
+  uint16_t last = 0;
+  for (size_t vl; len > 0; len -= vl, src += vl, last = simdutf_byteflip<bflip>(src[-1])) {
+    vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t*)src, vl);
+    v1 = simdutf_byteflip<bflip>(v1, vl);
+    // v0[i] is the unit preceding v1[i].
+    vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl);
+
+    vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2(__riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl);
+    vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2(__riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl);
+
+    long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl);
+    if (idx >= 0) {
+      // Fetch the unit before the mismatch to decide which one is at fault.
+      last = idx > 0 ? simdutf_byteflip<bflip>(src[idx-1]) : last;
+      return result(error_code::SURROGATE, src - beg + idx - (last - 0xD800u < 0x400u));
+    }
+  }
+  if (last - 0xD800u < 0x400u)
+    return result(error_code::SURROGATE, src - beg - 1); /* end on high surrogate */
+  else
+    return result(error_code::SUCCESS, src - beg);
+}
+
+// Little-endian input: the shared validator is used with no byte flip.
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *src, size_t len) const noexcept {
+  return rvv_validate_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len);
+}
+
+// Big-endian input: runs the shared validator with a byte flip, choosing the
+// ZVBB flavour when supports_zvbb() reports it and the V flavour otherwise.
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *src, size_t len) const noexcept {
+  return supports_zvbb()
+             ? rvv_validate_utf16_with_errors<simdutf_ByteFlip::ZVBB>(src, len)
+             : rvv_validate_utf16_with_errors<simdutf_ByteFlip::V>(src, len);
+}
+
+// Returns true when every code point is at most 0x10FFFF and outside the
+// surrogate range. Two running maxima are kept, one of the raw values and one
+// of the values offset by 0xFFFF2000 (which maps 0xD800..0xDFFF above
+// 0xFFFFF7FF); the input is valid when neither maximum moved from its seed.
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t *src, size_t len) const noexcept {
+  const size_t vlmax = __riscv_vsetvlmax_e32m8();
+  vuint32m8_t running_max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax);
+  vuint32m8_t running_max_offset = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax);
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t code_points = __riscv_vle32_v_u32m8((uint32_t*)src, vl);
+    const vuint32m8_t shifted = __riscv_vadd_vx_u32m8(code_points, 0xFFFF2000, vl);
+    // Tail-undisturbed: lanes beyond vl keep what earlier iterations stored.
+    running_max = __riscv_vmaxu_vv_u32m8_tu(running_max, running_max, code_points, vl);
+    running_max_offset = __riscv_vmaxu_vv_u32m8_tu(running_max_offset, running_max_offset, shifted, vl);
+    src += vl;
+    len -= vl;
+  }
+  const vbool4_t too_large = __riscv_vmsne_vx_u32m8_b4(running_max, 0x10FFFF, vlmax);
+  const vbool4_t surrogate = __riscv_vmsne_vx_u32m8_b4(running_max_offset, 0xFFFFF7FF, vlmax);
+  return __riscv_vfirst_m_b4(__riscv_vmor_mm_b4(too_large, surrogate, vlmax), vlmax) < 0;
+}
+
+// Validates UTF-32 chunk by chunk. Within a chunk, a code point above
+// 0x10FFFF is reported first as {TOO_LARGE, index}; otherwise a surrogate is
+// reported as {SURROGATE, index}. Returns {SUCCESS, len} when nothing is
+// wrong.
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *src, size_t len) const noexcept {
+  const char32_t *const start = src;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t code_points = __riscv_vle32_v_u32m8((uint32_t*)src, vl);
+    // Adding 0xFFFF2000 maps 0xD800..0xDFFF above 0xFFFFF7FF.
+    const vuint32m8_t shifted = __riscv_vadd_vx_u32m8(code_points, 0xFFFF2000, vl);
+    const long first_too_large = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(code_points, 0x10FFFF, vl), vl);
+    if (first_too_large >= 0) {
+      return result(error_code::TOO_LARGE, src - start + first_too_large);
+    }
+    const long first_surrogate = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(shifted, 0xFFFFF7FF, vl), vl);
+    if (first_surrogate >= 0) {
+      return result(error_code::SURROGATE, src - start + first_surrogate);
+    }
+    src += vl;
+    len -= vl;
+  }
+  return result(error_code::SUCCESS, src - start);
+}
+
+/* end file src/rvv/rvv_validate.inl.cpp */
+
+/* begin file src/rvv/rvv_latin1_to.inl.cpp */
+
+// Converts `len` Latin-1 bytes at `src` to UTF-8 at `dst` and returns the
+// number of bytes written. Bytes below 0x80 are copied; each byte with the
+// high bit set becomes a two-byte sequence.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char *src, size_t len, char *dst) const noexcept {
+  char *beg = dst;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
+    vl = __riscv_vsetvl_e8m2(len);
+    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t*)src, vl);
+    // Bytes that are negative as signed values have the high bit set.
+    vbool4_t nascii = __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl);
+    size_t cnt = __riscv_vcpop_m_b4(nascii, vl);
+    // One output byte per input byte plus one extra per non-ASCII byte.
+    vlOut = vl + cnt;
+    if (cnt == 0) {
+      // Pure ASCII chunk: store as-is (vlOut == vl here).
+      __riscv_vse8_v_u8m2((uint8_t*)dst, v1, vlOut);
+      continue;
+    }
+
+    // v0: candidate lead byte 0b110000xx built from the top two input bits.
+    vuint8m2_t v0 = __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl);
+    // v1: for non-ASCII lanes only, clear bit 6 to form the trailing byte
+    // 0b10xxxxxx; ASCII lanes are left untouched by the mask.
+    v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl);
+
+    // Build 16-bit lanes equal to v0 + 256 * v1 (v0 + v1 + 0xFF * v1), so the
+    // byte view interleaves them as v0, v1, v0, v1, ...
+    vuint8m4_t wide = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4(__riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl));
+    // Drop bytes equal to 0xC0 or 0xC1: these are the lead bytes generated
+    // for ASCII lanes, which need no lead byte.
+    vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2(__riscv_vsub_vx_u8m4(wide, 0b11000000, vl*2), 1, vl*2);
+    vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl*2);
+
+    __riscv_vse8_v_u8m4((uint8_t*)dst, comp, vlOut);
+  }
+  return dst - beg;
+}
+
+// Zero-extends each Latin-1 byte to a 16-bit code unit and stores it at
+// `dst`. Returns the number of code units written (equal to `len`).
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char *src, size_t len, char16_t *dst) const noexcept {
+  char16_t *const out_begin = dst;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m4(len);
+    const vuint8m4_t narrow = __riscv_vle8_v_u8m4((uint8_t*)src, vl);
+    const vuint16m8_t widened = __riscv_vzext_vf2_u16m8(narrow, vl);
+    __riscv_vse16_v_u16m8((uint16_t*)dst, widened, vl);
+    src += vl;
+    dst += vl;
+    len -= vl;
+  }
+  return dst - out_begin;
+}
+
+// Zero-extends each Latin-1 byte to 16 bits and shifts it into the upper
+// byte, so that the stored code unit is the byte-swapped form of the value.
+// Returns the number of code units written (equal to `len`).
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char *src, size_t len, char16_t *dst) const noexcept {
+  char16_t *const out_begin = dst;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m4(len);
+    const vuint8m4_t narrow = __riscv_vle8_v_u8m4((uint8_t*)src, vl);
+    const vuint16m8_t widened = __riscv_vzext_vf2_u16m8(narrow, vl);
+    const vuint16m8_t swapped = __riscv_vsll_vx_u16m8(widened, 8, vl);
+    __riscv_vse16_v_u16m8((uint16_t*)dst, swapped, vl);
+    src += vl;
+    dst += vl;
+    len -= vl;
+  }
+  return dst - out_begin;
+}
+
+// Zero-extends each Latin-1 byte to a 32-bit code point and stores it at
+// `dst`. Returns the number of code points written (equal to `len`).
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char *src, size_t len, char32_t *dst) const noexcept {
+  char32_t *const out_begin = dst;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m2(len);
+    const vuint8m2_t narrow = __riscv_vle8_v_u8m2((uint8_t*)src, vl);
+    const vuint32m8_t widened = __riscv_vzext_vf4_u32m8(narrow, vl);
+    __riscv_vse32_v_u32m8((uint32_t*)dst, widened, vl);
+    src += vl;
+    dst += vl;
+    len -= vl;
+  }
+  return dst - out_begin;
+}
+
+/* end file src/rvv/rvv_latin1_to.inl.cpp */
+/* begin file src/rvv/rvv_utf8_to.inl.cpp */
+// Stores `vl` UTF-32 code points as UTF-16 at `dst` and returns the number of
+// 16-bit code units written.
+//
+// Code points above 0xFFFF are replaced by a surrogate pair packed in one
+// 32-bit lane; the lanes are then viewed as 2*vl 16-bit halves and the unused
+// halves are squeezed out. `m4even` is supplied by the caller and is OR-ed
+// into the keep-mask (the visible caller builds it as "even index", which
+// keeps the low half of every lane even when it is zero -- confirm if other
+// callers exist). `bflip` selects the byte flip applied before storing.
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl, vbool4_t m4even) {
+  /* convert [000000000000aaaa|aaaaaabbbbbbbbbb]
+   * to      [110111bbbbbbbbbb|110110aaaaaaaaaa] */
+  vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl);
+  // Low half receives the top 10 bits, high half the bottom 10 bits.
+  sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl),
+                             __riscv_vsrl_vx_u32m4(sur, 10, vl), vl);
+  sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl);
+  // Tag the halves: 0xD800 in the low half, 0xDC00 in the high half.
+  sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl);
+  /* merge 1 byte utf32 and 2 byte sur */
+  // (i.e. lanes <= 0xFFFF keep the code point, the others take the pair)
+  vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl);
+  vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4(__riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl));
+  /* compress and store */
+  // Keep every non-zero 16-bit half plus the halves selected by m4even.
+  vbool4_t mOut = __riscv_vmor_mm_b4(__riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl*2), m4even, vl*2);
+  vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl*2);
+  // From here on vl is the number of 16-bit units kept.
+  vl = __riscv_vcpop_m_b4(mOut, vl*2);
+  __riscv_vse16_v_u16m4(dst, simdutf_byteflip<bflip>(vout, vl), vl);
+  return vl;
+};
+
+/* Shared UTF-8 -> UTF-16 / UTF-32 conversion kernel.
+ *
+ * Template parameters:
+ *   Tdst     - output code unit type: uint16_t (UTF-16) or uint32_t (UTF-32)
+ *   bflip    - byte-flip mode applied to UTF-16 output; NONE selects the
+ *              little-endian scalar fallback, any other value the big-endian one
+ *   validate - when true the input is checked and 0 is returned on invalid UTF-8
+ *
+ * Inputs shorter than `tail` bytes are handed to the scalar converter. For
+ * longer inputs everything except the last `tail` bytes is converted with
+ * vector code (which may read up to three bytes past the current chunk), and
+ * the remaining bytes, together with a character that straddles the boundary,
+ * are re-parsed by the scalar converter.
+ *
+ * Returns the number of Tdst units written, or 0 if validation fails or the
+ * scalar conversion of the tail returns 0. */
+template<typename Tdst, simdutf_ByteFlip bflip, bool validate=true>
+simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, size_t len, Tdst *dst) {
+  static_assert(std::is_same<Tdst, uint16_t>() || std::is_same<Tdst, uint32_t>(), "invalid type");
+  constexpr bool is16 = std::is_same<Tdst, uint16_t>();
+  constexpr endianness endian = bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG;
+  const auto scalar = [](char const *in, size_t count, Tdst *out) {
+    return is16 ? scalar::utf8_to_utf16::convert<endian>(in, count, (char16_t*)out)
+                : scalar::utf8_to_utf32::convert(in, count, (char32_t*)out);
+  };
+
+  size_t tail = 32; // the minimum value is 3
+  if (len < tail) return scalar(src, len, dst);
+
+  /* validate the first `tail` bytes with the scalar validator, extended over
+   * the continuation bytes that follow so that no character is cut in half;
+   * more than three continuation bytes in a row is an error in itself */
+  if (validate) {
+    size_t idx = tail;
+    while (idx < len && (src[idx] >> 6) == 0b10)
+      ++idx;
+    if (idx > tail + 3 || !scalar::utf8::validate(src, idx))
+      return 0;
+  }
+
+  /* number of bytes handled by the vector loop */
+  size_t n = len - tail;
+  Tdst *beg = dst;
+
+  /* 16-entry lookup tables for the nibble-based error classification */
+  static const uint64_t err1m[] = { 0x0202020202020202, 0x4915012180808080 };
+  static const uint64_t err2m[] = { 0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB };
+  static const uint64_t err3m[] = { 0x0101010101010101, 0X01010101BABAAEE6 };
+
+  const vuint8m1_t err1tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
+  const vuint8m1_t err2tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
+  const vuint8m1_t err3tbl = __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
+
+  /* mask with every even element index set, used when storing UTF-16 */
+  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
+  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
+
+  for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) {
+    vl = __riscv_vsetvl_e8m2(n);
+
+    vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const*)src, vl);
+    uint64_t max = __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl));
+
+    /* fast path: ASCII */
+    if (max < 0b10000000) {
+      vlOut = vl;
+      if (is16) __riscv_vse16_v_u16m4((uint16_t*)dst, simdutf_byteflip<bflip>(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut), vlOut);
+      else      __riscv_vse32_v_u32m8((uint32_t*)dst, __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut);
+      continue;
+    }
+
+    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
+     * https://arxiv.org/abs/2010.03090 */
+    /* v1/v2/v3 are the input shifted down by one/two/three bytes; the bytes
+     * shifted in come from just past this chunk (they lie inside the tail) */
+    vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, src[vl+0], vl);
+    vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, src[vl+1], vl);
+    vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, src[vl+2], vl);
+
+    if (validate) {
+      vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v2), 4, __riscv_vsetvlmax_e16m2()));
+      vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_v_u8m2_u16m2(v3), 4, __riscv_vsetvlmax_e16m2()));
+
+      vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl);
+      vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl);
+      vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl);
+
+      vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1);
+      vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2);
+      vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3);
+      vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(__riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl));
+
+      vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000-1, vl);
+      vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000-1, vl);
+      vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl);
+      vbool4_t err34 = __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl);
+      vbool4_t errm = __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl);
+      if (__riscv_vfirst_m_b4(errm , vl) >= 0)
+        return 0;
+    }
+
+    /* decoding */
+
+    /* mask of non continuation bytes */
+    vbool4_t m = __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl);
+    vlOut = __riscv_vcpop_m_b4(m, vl);
+
+    /* extract first and second bytes */
+    vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl);
+    vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl);
+
+    /* fast path: one and two byte */
+    if (max < 0b11100000) {
+      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
+
+      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
+      b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
+
+      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut);
+      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
+      if (is16) __riscv_vse16_v_u16m4((uint16_t*)dst, simdutf_byteflip<bflip>(b12, vlOut), vlOut);
+      else      __riscv_vse32_v_u32m8((uint32_t*)dst, __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut);
+      continue;
+    }
+
+    /* fast path: one, two and three byte */
+    if (max < 0b11110000) {
+      vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
+
+      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
+      b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut);
+
+      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
+      vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut);
+
+      vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
+      b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut);
+
+      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(b1, __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1<<6, m1, vlOut), vlOut);
+      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
+      vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut);
+      if (is16) __riscv_vse16_v_u16m4((uint16_t*)dst, simdutf_byteflip<bflip>(b123, vlOut), vlOut);
+      else      __riscv_vse32_v_u32m8((uint32_t*)dst, __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut);
+      continue;
+    }
+
+    /* extract third and fourth bytes */
+    vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
+    vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);
+
+    #define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx) \
+      vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \
+      vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \
+      vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \
+      vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \
+      /* remove prefix from trailing bytes */ \
+      c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \
+      c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \
+      c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \
+      /* remove prefix from leading bytes
+       *
+       * We could also use vrgather here, but it increases register pressure,
+       * and its performance varies widely on current platforms. It might be
+       * worth reconsidering, though, once there is more hardware available.
+       * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
+       *
+       * We shift left and then right by the number of bits in the prefix,
+       * which can be calculated as follows:
+       *         x                                max(x-10, 0)
+       * 0xxx -> 0000-0111 -> shift by 0 or 1  -> 0
+       * 10xx -> 1000-1011 -> don't care
+       * 110x -> 1100,1101 -> shift by 3       -> 2,3
+       * 1110 -> 1110      -> shift by 4       -> 4
+       * 1111 -> 1111      -> shift by 5       -> 5
+       *
+       * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
+       * just need to manually detect and handle the one special case:
+       */ \
+      vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \
+      shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut); \
+      c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \
+      c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut); \
+      /* unconditionally widen and combine to c1234 */ \
+      vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c3, 1<<6, vlOut), c4, vlOut); \
+      vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(__riscv_vwmulu_vx_u16m2(c1, 1<<6, vlOut), c2, vlOut); \
+      vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(__riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \
+      /* derive required right-shift amount from `shift` to reduce
+       * c1234 to the required number of bytes */ \
+      c1234 = __riscv_vsrl_vv_u32m4(c1234, __riscv_vzext_vf4_u32m4(__riscv_vmul_vx_u8m1( \
+              __riscv_vrsub_vx_u8m1( __riscv_vssubu_vx_u8m1(shift, 2, vlOut), 3, vlOut), 6, vlOut), vlOut), vlOut); \
+      /* store result in desired format */ \
+      if (is16) vlDst = rvv_utf32_store_utf16_m4<bflip>((uint16_t*)dst, c1234, vlOut, m4even); \
+      else      vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t*)dst, c1234, vlOut);
+
+    /* Unrolling this manually reduces register pressure and allows
+     * us to terminate early. */
+    {
+      size_t vlOutm2 = vlOut, vlDst;
+      vlOut = __riscv_vsetvl_e8m1(vlOut);
+      SIMDUTF_RVV_UTF8_TO_COMMON_M1(0)
+      if (vlOutm2 == vlOut) {
+        /* everything fit into the first half; the loop increment adds vlOut to dst */
+        vlOut = vlDst;
+        continue;
+      }
+
+      dst += vlDst;
+      vlOut = vlOutm2 - vlOut;
+    }
+    {
+      size_t vlDst;
+      SIMDUTF_RVV_UTF8_TO_COMMON_M1(1)
+      vlOut = vlDst;
+    }
+
+#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1
+  }
+
+  /* validate the last character and reparse it + tail */
+  if (len > tail) {
+    /* the vector loop stopped inside a character: drop its output unit and
+     * move src back to its first byte so the scalar code decodes it again */
+    if ((src[0] >> 6) == 0b10)
+      --dst;
+    while ((src[0] >> 6) == 0b10 && tail < len)
+      --src, ++tail;
+    if (is16) {
+      /* go back one more, when on high surrogate; dst may already be back at
+       * the start of the output, in which case there is nothing before it
+       * to inspect */
+      if (dst > beg && simdutf_byteflip<bflip>((uint16_t)dst[-1]) >= 0xD800 && simdutf_byteflip<bflip>((uint16_t)dst[-1]) <= 0xDBFF)
+        --dst;
+    }
+  }
+  size_t ret = scalar(src, tail, dst);
+  if (ret == 0) return 0;
+  return (size_t)(dst - beg) + ret;
+}
+
+
+/* Converts `len` bytes of UTF-8 at `src` to Latin-1 at `dst`, validating on the way.
+ * Returns the number of bytes written, or 0 when an error is detected
+ * (0 is also the result for empty input). */
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char *src, size_t len, char *dst) const noexcept {
+  const char *beg = dst;
+  /* `last` is the final byte of the previous chunk; the initial value is above
+   * 0x7F, so the first chunk always goes through the checks below */
+  uint8_t last = 0b10000000;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut, last = src[-1]) {
+    vl = __riscv_vsetvl_e8m2(len);
+    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t*)src, vl);
+    /* bytes below 0xC0 each produce one output byte; bytes >= 0xC0 are dropped */
+    vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+    vlOut = __riscv_vcpop_m_b4(m, vl);
+    if (vlOut != vl || last > 0b01111111) {
+      /* v0[i] is the byte preceding v1[i] */
+      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);
+
+      /* previous byte is >= 0xC0 */
+      vbool4_t leading0  = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl);
+      /* current byte is in 0x80..0xBF (signed compare against 0xC0) */
+      vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl);
+      /* previous byte is >= 0xC0 but neither 0xC2 nor 0xC3 */
+      vbool4_t tobig = __riscv_vmand_mm_b4(leading0, __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl), 1, vl), vl);
+      /* error: `tobig`, or leading0 and trailing1 disagree */
+      if (__riscv_vfirst_m_b4(__riscv_vmor_mm_b4(tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl), vl) >= 0)
+        return 0;
+
+      /* set bit 6 on bytes that follow 0xC3, then drop the bytes not selected by `m` */
+      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), v1, v1, 0b01000000, vl);
+      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
+    }
+    __riscv_vse8_v_u8m2((uint8_t*)dst, v1, vlOut);
+  }
+  /* the input ended on a byte >= 0xC0 */
+  if (last > 0b10111111)
+    return 0;
+  return dst - beg;
+}
+
+/* UTF-8 -> Latin-1 conversion returning a `result`.
+ * The vectorized converter runs first; when it returns 0 the scalar
+ * converter is run on the same input and its `result` is returned. */
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf8_to_latin1(src, len, dst);
+  if (written == 0) {
+    return scalar::utf8_to_latin1::convert_with_errors(src, len, dst);
+  }
+  return result(error_code::SUCCESS, written);
+}
+
+/* Converts `len` bytes of UTF-8 at `src` to Latin-1 at `dst` without any checks.
+ * Returns the number of bytes written. */
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char *src, size_t len, char *dst) const noexcept {
+  const char *const outStart = dst;
+  /* final byte of the previous chunk; starts above 0x7F so the first chunk is always fixed up */
+  uint8_t prevByte = 0b11000000;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m2(len);
+    vuint8m2_t cur = __riscv_vle8_v_u8m2((uint8_t*)src, vl);
+    /* every byte below 0xC0 yields one output byte */
+    vbool4_t keep = __riscv_vmsltu_vx_u8m2_b4(cur, 0b11000000, vl);
+    const size_t produced = __riscv_vcpop_m_b4(keep, vl);
+    const bool needsFixup = (produced != vl) || (prevByte > 0b01111111);
+    if (needsFixup) {
+      vuint8m2_t before = __riscv_vslide1up_vx_u8m2(cur, prevByte, vl);
+      /* bytes preceded by 0xC3 get bit 6 set */
+      vbool4_t afterC3 = __riscv_vmseq_vx_u8m2_b4(before, 0b11000011, vl);
+      cur = __riscv_vor_vx_u8m2_mu(afterC3, cur, cur, 0b01000000, vl);
+      cur = __riscv_vcompress_vm_u8m2(cur, keep, vl);
+    }
+    __riscv_vse8_v_u8m2((uint8_t*)dst, cur, produced);
+    len -= vl;
+    src += vl;
+    dst += produced;
+    prevByte = src[-1];
+  }
+  return dst - outStart;
+}
+
+/* Validating UTF-8 -> UTF-16LE: forwards to the shared kernel with no byte flip. */
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char *src, size_t len, char16_t *dst) const noexcept {
+  uint16_t *const out = (uint16_t*)dst;
+  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE>(src, len, out);
+}
+
+/* Validating UTF-8 -> UTF-16BE: uses the ZVBB byte-flip variant of the shared
+ * kernel when supports_zvbb() is true, the V variant otherwise. */
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char *src, size_t len, char16_t *dst) const noexcept {
+  uint16_t *const out = (uint16_t*)dst;
+  return supports_zvbb()
+    ? rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB>(src, len, out)
+    : rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V>(src, len, out);
+}
+
+/* UTF-8 -> UTF-16LE returning a `result`.
+ * When the vectorized converter returns 0, the little-endian scalar
+ * converter is run on the same input and its `result` is returned. */
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char *src, size_t len, char16_t *dst) const noexcept {
+  const size_t written = convert_utf8_to_utf16le(src, len, dst);
+  if (written == 0) {
+    return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(src, len, dst);
+  }
+  return result(error_code::SUCCESS, written);
+}
+
+/* UTF-8 -> UTF-16BE returning a `result`.
+ * When the vectorized converter returns 0, the big-endian scalar
+ * converter is run on the same input and its `result` is returned. */
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char *src, size_t len, char16_t *dst) const noexcept {
+  const size_t written = convert_utf8_to_utf16be(src, len, dst);
+  if (written == 0) {
+    return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(src, len, dst);
+  }
+  return result(error_code::SUCCESS, written);
+}
+
+/* UTF-8 -> UTF-16LE with validation switched off in the shared kernel. */
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char *src, size_t len, char16_t *dst) const noexcept {
+  uint16_t *const out = (uint16_t*)dst;
+  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE, false>(src, len, out);
+}
+
+/* UTF-8 -> UTF-16BE with validation switched off; the byte-flip variant
+ * (ZVBB or V) is chosen by supports_zvbb(). */
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char *src, size_t len, char16_t *dst) const noexcept {
+  uint16_t *const out = (uint16_t*)dst;
+  return supports_zvbb()
+    ? rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB, false>(src, len, out)
+    : rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V, false>(src, len, out);
+}
+
+/* Validating UTF-8 -> UTF-32: forwards to the shared kernel with 32-bit output. */
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char *src, size_t len, char32_t *dst) const noexcept {
+  uint32_t *const out = (uint32_t*)dst;
+  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE>(src, len, out);
+}
+
+/* UTF-8 -> UTF-32 returning a `result`.
+ * When the vectorized converter returns 0, the scalar converter is run on
+ * the same input and its `result` is returned. */
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char *src, size_t len, char32_t *dst) const noexcept {
+  const size_t written = convert_utf8_to_utf32(src, len, dst);
+  if (written == 0) {
+    return scalar::utf8_to_utf32::convert_with_errors(src, len, dst);
+  }
+  return result(error_code::SUCCESS, written);
+}
+
+/* UTF-8 -> UTF-32 with validation switched off in the shared kernel. */
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char *src, size_t len, char32_t *dst) const noexcept {
+  uint32_t *const out = (uint32_t*)dst;
+  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE, false>(src, len, out);
+}
+
+/* end file src/rvv/rvv_utf8_to.inl.cpp */
+/* begin file src/rvv/rvv_utf16_to.inl.cpp */
+/* Converts `len` UTF-16 code units at `src` to Latin-1 bytes at `dst`.
+ * Each chunk is byte-flipped according to `bflip` before it is inspected.
+ *
+ * Returns (SUCCESS, number of units converted), or (TOO_LARGE, index of the
+ * first code unit above 0xFF, counted from the start of the input). Chunks
+ * before the offending one have already been written to `dst`. */
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static result rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) {
+  const char16_t *const beg = src;
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl);
+    v = simdutf_byteflip<bflip>(v, vl);
+    long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl);
+    if (idx >= 0)
+      /* units consumed by earlier chunks plus the index inside this chunk */
+      return result(error_code::TOO_LARGE, src - beg + idx);
+    __riscv_vse8_v_u8m4((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl);
+  }
+  return result(error_code::SUCCESS, src - beg);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept {
+  result res = convert_utf16le_to_latin1_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept {
+  result res = convert_utf16be_to_latin1_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) const noexcept {
+  return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) const noexcept {
+  if (supports_zvbb())
+    return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
+  else
+    return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+}
+
+/* UTF-16LE -> Latin-1 without checks: every code unit is narrowed to 8 bits.
+ * Returns the number of code units consumed, which equals the bytes written. */
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept {
+  const char16_t *const first = src;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t*)src, vl);
+    vuint8m4_t narrowed = __riscv_vncvt_x_x_w_u8m4(units, vl);
+    __riscv_vse8_v_u8m4((uint8_t*)dst, narrowed, vl);
+    len -= vl;
+    src += vl;
+    dst += vl;
+  }
+  return src - first;
+}
+
+/* UTF-16BE -> Latin-1 without checks: the units are loaded as they are in
+ * memory and the upper 8 bits of each loaded value are stored (narrowing
+ * shift right by 8). Returns the number of code units consumed. */
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t *src, size_t len, char *dst) const noexcept {
+  const char16_t *const first = src;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t*)src, vl);
+    vuint8m4_t upper = __riscv_vnsrl_wx_u8m4(units, 8, vl);
+    __riscv_vse8_v_u8m4((uint8_t*)dst, upper, vl);
+    len -= vl;
+    src += vl;
+    dst += vl;
+  }
+  return src - first;
+}
+
+/* Converts `len` UTF-16 code units at `src` to UTF-8 at `dst`; input units are
+ * byte-flipped according to `bflip` before use.
+ *
+ * Each chunk takes the cheapest path that fits: all units below 0x80, all
+ * below 0x800, or a 1/2/3-byte path for the units in front of the first unit
+ * in 0xD800..0xDFFF. Surrogate pairs are encoded by the scalar loop at the end.
+ *
+ * Returns (SUCCESS, number of bytes written) or (SURROGATE, index of the
+ * offending code unit, counted from the start of the input). */
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static result rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) {
+  size_t n = len;
+  const char16_t *srcBeg = src;
+  const char *dstBeg = dst;
+  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
+  /* mask of byte indices with index % 4 == 2 */
+  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(__riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
+
+  for (size_t vl, vlOut; n > 0; ) {
+    vl = __riscv_vsetvl_e16m2(n);
+
+    vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const*)src, vl);
+    v = simdutf_byteflip<bflip>(v, vl);
+    /* units that need two or more UTF-8 bytes */
+    vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80-1, vl);
+
+    if (__riscv_vfirst_m_b8(m234,vl) < 0) { /* 1 byte utf8 */
+      vlOut = vl;
+      __riscv_vse8_v_u8m1((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut), vlOut);
+      n -= vl, src += vl, dst += vlOut;
+      continue;
+    }
+
+    /* units that need three or more UTF-8 bytes */
+    vbool8_t m34  = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800-1, vl);
+
+    if (__riscv_vfirst_m_b8(m34,vl) < 0) { /* 1/2 byte utf8 */
+      /* 0: [     aaa|aabbbbbb]
+       * 1: [aabbbbbb|        ] vsll 8
+       * 2: [        |   aaaaa] vsrl 6
+       * 3: [00111111|00011111]
+       * 4: [  bbbbbb|000aaaaa] (1|2)&3
+       * 5: [10000000|11000000]
+       * 6: [10bbbbbb|110aaaaa] 4|5 */
+      vuint16m2_t twoByte  =
+        __riscv_vand_vx_u16m2(__riscv_vor_vv_u16m2(
+          __riscv_vsll_vx_u16m2(v, 8, vl),
+          __riscv_vsrl_vx_u16m2(v, 6, vl),
+        vl), 0b0011111100011111, vl);
+      /* units outside m234 keep their original value */
+      vuint16m2_t vout16 = __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl);
+      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+
+      /* Every high byte that is zero should be compressed
+       * low bytes should never be compressed, so we set them
+       * to all ones, and then create a non-zero bytes mask */
+      vbool4_t mcomp = __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vor_vx_u16m2(vout16, 0xFF, vl)), 0, vl*2);
+      vlOut = __riscv_vcpop_m_b4(mcomp, vl*2);
+
+      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl*2);
+      __riscv_vse8_v_u8m2((uint8_t*)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
+      continue;
+    }
+
+    /* units in 0xD800..0xDFFF; only the units in front of the first one are
+     * converted with vector code below */
+    vbool8_t sur = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl);
+    long first = __riscv_vfirst_m_b8(sur, vl);
+    /* NOTE: when no surrogate was found `first` is negative, so `tail` is
+     * non-zero in that case as well; the scalar loop below then leaves at
+     * its first check */
+    size_t tail = vl - first;
+    vl = first < 0 ? vl : first;
+
+    if (vl > 0) { /* 1/2/3 byte utf8 */
+      /* in: [aaaabbbb|bbcccccc]
+       * v1: [0bcccccc|        ] vsll  8
+       * v1: [10cccccc|        ] vsll  8 & 0b00111111 | 0b10000000
+       * v2: [        |110bbbbb] vsrl  6 & 0b00111111 | 0b11000000
+       * v2: [        |10bbbbbb] vsrl  6 & 0b00111111 | 0b10000000
+       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
+       *  1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
+       *  2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
+       *  3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] [10cccccc]
+       */
+      vuint16m2_t v1, v2, v3, v12;
+      v1 = __riscv_vor_vx_u16m2_mu(m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl);
+      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
+
+      v2 = __riscv_vor_vx_u16m2(__riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111, vl), 0b10000000, vl);
+      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34,vl), v2, v2, 0b01000000, vl);
+      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000, vl);
+      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
+
+      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1<<8, vl);
+      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
+      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+
+      /* keep every non-zero byte, plus the byte at index 2 of each 32-bit
+       * lane, which is kept even when it is zero */
+      vbool2_t mcomp = __riscv_vmor_mm_b2(m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl*4), vl*4);
+      vlOut = __riscv_vcpop_m_b2(mcomp, vl*4);
+
+      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl*4);
+      __riscv_vse8_v_u8m4((uint8_t*)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
+    }
+
+    /* scalar handling of consecutive surrogate pairs; the loop is left at the
+     * first unit outside 0xD800..0xDFFF (all three `break` branches) and the
+     * vector code takes over again */
+    if (tail) while (n) {
+      uint16_t word = simdutf_byteflip<bflip>(src[0]);
+      if((word & 0xFF80)==0) {
+        break;
+      } else if((word & 0xF800)==0) {
+        break;
+      } else if ((word & 0xF800) != 0xD800) {
+        break;
+      } else {
+        // must be a surrogate pair
+        if (n <= 1) return result(error_code::SURROGATE, src - srcBeg);
+        uint16_t diff = word - 0xD800;
+        if (diff > 0x3FF) return result(error_code::SURROGATE, src - srcBeg);
+        uint16_t diff2 = simdutf_byteflip<bflip>(src[1]) - 0xDC00;
+        if (diff2 > 0x3FF) return result(error_code::SURROGATE, src - srcBeg);
+
+        uint32_t value = ((diff + 0x40) << 10) + diff2 ;
+
+        // will generate four UTF-8 bytes
+        // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+        *dst++ = (char)( (value>>18)             | 0b11110000);
+        *dst++ = (char)(((value>>12) & 0b111111) | 0b10000000);
+        *dst++ = (char)(((value>> 6) & 0b111111) | 0b10000000);
+        *dst++ = (char)(( value      & 0b111111) | 0b10000000);
+        src += 2;
+        n -= 2;
+      }
+    }
+  }
+
+  return result(error_code::SUCCESS, dst - dstBeg);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept {
+  result res = convert_utf16le_to_utf8_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept {
+  result res = convert_utf16be_to_utf8_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) const noexcept {
+  return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) const noexcept {
+  if (supports_zvbb())
+    return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
+  else
+    return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+}
+
+/* The "valid input" UTF-16LE -> UTF-8 entry point shares the checking implementation. */
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf16le_to_utf8(src, len, dst);
+  return written;
+}
+
+/* The "valid input" UTF-16BE -> UTF-8 entry point shares the checking implementation. */
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf16be_to_utf8(src, len, dst);
+  return written;
+}
+
+/* Converts `len` UTF-16 code units at `src` to UTF-32 at `dst`; input units are
+ * byte-flipped according to `bflip` before use.
+ *
+ * Returns (SUCCESS, number of code points written) or (SURROGATE, index of
+ * the offending code unit, counted from the start of the input). */
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static result rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) {
+  const char16_t *const srcBeg = src;
+  char32_t *const dstBeg = dst;
+
+  /* last code unit of the previous chunk (0 before the first chunk) */
+  uint16_t last = 0;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut, last = simdutf_byteflip<bflip>(src[-1])) {
+    vl = __riscv_vsetvl_e16m2(len);
+    vuint16m2_t v1 = __riscv_vle16_v_u16m2((uint16_t const*)src, vl);
+    v1 = simdutf_byteflip<bflip>(v1, vl);
+    /* v0[i] is the unit preceding v1[i] */
+    vuint16m2_t v0 = __riscv_vslide1up_vx_u16m2(v1, last, vl);
+
+    /* previous unit is a high surrogate / current unit is a low surrogate */
+    vbool8_t surhi0 = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v0, 0xFC00, vl), 0xD800, vl);
+    vbool8_t surlo1 = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v1, 0xFC00, vl), 0xDC00, vl);
+
+    /* no surrogates */
+    if (__riscv_vfirst_m_b8(__riscv_vmor_mm_b8(surhi0, surlo1, vl), vl) < 0) {
+      vlOut = vl;
+      __riscv_vse32_v_u32m4((uint32_t*)dst, __riscv_vzext_vf2_u32m4(v1, vl), vl);
+      continue;
+    }
+
+    /* a high surrogate must be followed by a low one and a low one must be
+     * preceded by a high one; any position where the two masks differ is an error */
+    long idx = __riscv_vfirst_m_b8(__riscv_vmxor_mm_b8(surhi0, surlo1, vl), vl);
+    if (idx >= 0) {
+      last = idx > 0 ? simdutf_byteflip<bflip>(src[idx-1]) : last;
+      /* when the preceding unit is a high surrogate, report its position */
+      return result(error_code::SURROGATE, src - srcBeg + idx - (last - 0xD800u < 0x400u));
+    }
+
+    /* current unit is a high surrogate; `next` is the unit after this chunk, or 0 at the end */
+    vbool8_t surhi1 = __riscv_vmseq_vx_u16m2_b8(__riscv_vand_vx_u16m2(v1, 0xFC00, vl), 0xD800, vl);
+    uint16_t next = vl < len ? simdutf_byteflip<bflip>(src[vl]) : 0;
+
+    /* for high surrogates: (hi << 10) + lo + 0xFCA02400, which equals
+     * ((hi - 0xD800) << 10) + (lo - 0xDC00) + 0x10000 in 32-bit arithmetic */
+    vuint32m4_t wide    = __riscv_vzext_vf2_u32m4(v1, vl);
+    vuint32m4_t slided  = __riscv_vslide1down_vx_u32m4(wide, next, vl);
+    vuint32m4_t aligned = __riscv_vsll_vx_u32m4_mu(surhi1, wide, wide, 10, vl);
+    vuint32m4_t added   = __riscv_vadd_vv_u32m4_mu(surhi1, aligned, aligned, slided, vl);
+    vuint32m4_t utf32   = __riscv_vadd_vx_u32m4_mu(surhi1, added, added, 0xFCA02400, vl);
+    /* drop the lanes that hold low surrogates */
+    vbool8_t m = __riscv_vmnot_m_b8(surlo1, vl);
+    vlOut = __riscv_vcpop_m_b8(m, vl);
+    vuint32m4_t comp = __riscv_vcompress_vm_u32m4(utf32, m, vl);
+    __riscv_vse32_v_u32m4((uint32_t*)dst, comp, vlOut);
+  }
+
+  if (last - 0xD800u < 0x400u)
+    return result(error_code::SURROGATE, src - srcBeg - 1); /* end on high surrogate */
+  else
+    return result(error_code::SUCCESS, dst - dstBeg);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  result res = convert_utf16le_to_utf32_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  result res = convert_utf16be_to_utf32_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  if (supports_zvbb())
+    return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
+  else
+    return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+}
+
+/* The "valid input" UTF-16LE -> UTF-32 entry point shares the checking implementation. */
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  const size_t written = convert_utf16le_to_utf32(src, len, dst);
+  return written;
+}
+
+/* The "valid input" UTF-16BE -> UTF-32 entry point shares the checking implementation. */
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  const size_t written = convert_utf16be_to_utf32(src, len, dst);
+  return written;
+}
+/* end file src/rvv/rvv_utf16_to.inl.cpp */
+/* begin file src/rvv/rvv_utf32_to.inl.cpp */
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t *src, size_t len, char *dst) const noexcept {
+  result res = convert_utf32_to_latin1_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
+}
+
+/* Converts `len` UTF-32 code points at `src` to Latin-1 bytes at `dst`.
+ * Returns (SUCCESS, number of code points converted), or (TOO_LARGE, index
+ * of the first value above 0xFF, counted from the start of the input). */
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t *src, size_t len, char *dst) const noexcept {
+  const char32_t *const first = src;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e32m8(len);
+    vuint32m8_t points = __riscv_vle32_v_u32m8((uint32_t*)src, vl);
+    vbool4_t tooLarge = __riscv_vmsgtu_vx_u32m8_b4(points, 255, vl);
+    long idx = __riscv_vfirst_m_b4(tooLarge, vl);
+    if (idx >= 0) {
+      return result(error_code::TOO_LARGE, src - first + idx);
+    }
+    /* Narrow 32 -> 16 -> 8 bits in two steps. vcompress is not used here,
+     * because its performance varies widely on current platforms; this might
+     * be worth reconsidering once there is more hardware available. */
+    vuint16m4_t half = __riscv_vncvt_x_x_w_u16m4(points, vl);
+    vuint8m2_t bytes = __riscv_vncvt_x_x_w_u8m2(half, vl);
+    __riscv_vse8_v_u8m2((uint8_t*)dst, bytes, vl);
+    len -= vl;
+    src += vl;
+    dst += vl;
+  }
+  return result(error_code::SUCCESS, src - first);
+}
+
+/* The "valid input" UTF-32 -> Latin-1 entry point shares the checking implementation. */
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t *src, size_t len, char *dst) const noexcept {
+  const size_t written = convert_utf32_to_latin1(src, len, dst);
+  return written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t *src, size_t len, char *dst) const noexcept {
+  size_t n = len;
+  const char32_t *srcBeg = src;
+  const char *dstBeg = dst;
+  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
+  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(__riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
+
+  for (size_t vl, vlOut; n > 0; ) {
+    vl = __riscv_vsetvl_e32m4(n);
+
+    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const*)src, vl);
+    vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80-1, vl);
+    vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl);
+
+    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
+      vlOut = vl;
+      __riscv_vse8_v_u8m1((uint8_t*)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut), vlOut);
+      n -= vl, src += vl, dst += vlOut;
+      continue;
+    }
+
+    vbool8_t m34  = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800-1, vl);
+
+    if (__riscv_vfirst_m_b8(m34,vl) < 0) { /* 1/2 byte utf8 */
+      /* 0: [     aaa|aabbbbbb]
+       * 1: [aabbbbbb|        ] vsll 8
+       * 2: [        |   aaaaa] vsrl 6
+       * 3: [00111111|00111111]
+       * 4: [  bbbbbb|000aaaaa] (1|2)&3
+       * 5: [10000000|11000000]
+       * 6: [10bbbbbb|110aaaaa] 4|5 */
+      vuint16m2_t twoByte  =
+        __riscv_vand_vx_u16m2(__riscv_vor_vv_u16m2(
+          __riscv_vsll_vx_u16m2(vn, 8, vl),
+          __riscv_vsrl_vx_u16m2(vn, 6, vl),
+        vl), 0b0011111100111111, vl);
+      vuint16m2_t vout16 = __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl);
+      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+
+      /* Every high byte that is zero should be compressed
+       * low bytes should never be compressed, so we set them
+       * to all ones, and then create a non-zero bytes mask */
+      vbool4_t mcomp = __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(__riscv_vor_vx_u16m2(vout16, 0xFF, vl)), 0, vl*2);
+      vlOut = __riscv_vcpop_m_b4(mcomp, vl*2);
+
+      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl*2);
+      __riscv_vse8_v_u8m2((uint8_t*)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
+      continue;
+    }
+
+    vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(__riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
+    long idx = __riscv_vfirst_m_b8(sur, vl);
+    if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx);
+
+    vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000-1, vl);
+    long first = __riscv_vfirst_m_b8(m4, vl);
+    size_t tail = vl - first;
+    vl = first < 0 ? vl : first;
+
+    if (vl > 0) { /* 1/2/3 byte utf8 */
+      /* vn: [aaaabbbb|bbcccccc]
+       * v1: [0bcccccc|        ] vsll  8
+       * v1: [10cccccc|        ] vsll  8 & 0b00111111 | 0b10000000
+       * v2: [        |110bbbbb] vsrl  6 & 0b00111111 | 0b11000000
+       * v2: [        |10bbbbbb] vsrl  6 & 0b00111111 | 0b10000000
+       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
+       *  1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
+       *  2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
+       *  3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] [10cccccc]
+       */
+      vuint16m2_t v1, v2, v3, v12;
+      v1 = __riscv_vor_vx_u16m2_mu(m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl);
+      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
+
+      v2 = __riscv_vor_vx_u16m2(__riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111, vl), 0b10000000, vl);
+      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34,vl), v2, v2, 0b01000000, vl);
+      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000, vl);
+      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
+
+      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1<<8, vl);
+      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
+      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+
+      vbool2_t mcomp = __riscv_vmor_mm_b2(m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl*4), vl*4);
+      vlOut = __riscv_vcpop_m_b2(mcomp, vl*4);
+
+      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl*4);
+      __riscv_vse8_v_u8m4((uint8_t*)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
+    }
+
+    if (tail) while (n) {
+      uint32_t word = src[0];
+      if (word < 0x10000) break;
+      if (word > 0x10FFFF) return result(error_code::TOO_LARGE, src - srcBeg);
+      *dst++ = (uint8_t)(( word>>18)             | 0b11110000);
+      *dst++ = (uint8_t)(((word>>12) & 0b111111) | 0b10000000);
+      *dst++ = (uint8_t)(((word>> 6) & 0b111111) | 0b10000000);
+      *dst++ = (uint8_t)(( word      & 0b111111) | 0b10000000);
+      ++src;
+      --n;
+    }
+  }
+
+  return result(error_code::SUCCESS, dst - dstBeg);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t *src, size_t len, char *dst) const noexcept { // UTF-32 -> UTF-8; returns 0 when the conversion reports an error
+  result res = convert_utf32_to_utf8_with_errors(src, len, dst); // delegate to the error-reporting variant
+  return res.error == error_code::SUCCESS ? res.count : 0; // res.count on success, 0 on any error
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t *src, size_t len, char *dst) const noexcept {
+  return convert_utf32_to_utf8(src, len, dst); // no dedicated "valid input" path here: reuses the validating converter
+}
+
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static result rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len, char16_t *dst) { // UTF-32 -> UTF-16 with validation; bflip is forwarded to simdutf_byteflip and to the store helper
+  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
+  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); // mask of even element indices ((index & 1) == 0)
+  const char16_t *dstBeg = dst;
+  const char32_t *srcBeg = src;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { // vl = code points consumed, vlOut = 16-bit units produced in this round
+    vl = __riscv_vsetvl_e32m4(len);
+    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t*)src, vl);
+    vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl); // v - 0xE000 (mod 2^32)
+    long idx;
+    idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl); // off > 0xFFFFF7FF <=> v in [0xD800, 0xDFFF]
+    if (idx >= 0) return result(error_code::SURROGATE, src - srcBeg + idx); // count = input index of the offending code point
+    idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl); // first lane that does not fit in 16 bits
+    if (idx < 0) { // fast path: every lane fits in a single 16-bit unit
+      vlOut = vl;
+      vuint16m2_t n = simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); // narrow to 16 bits, then byte-flip according to bflip
+      __riscv_vse16_v_u16m2((uint16_t*)dst, n, vlOut);
+      continue;
+    }
+    idx = __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); // first lane above 0x10FFFF
+    if (idx >= 0) return result(error_code::TOO_LARGE, src - srcBeg + idx);
+    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t*)dst, v, vl, m4even); // helper defined outside this view; its return value is used as the number of 16-bit units written
+  }
+  return result(error_code::SUCCESS, dst - dstBeg); // on success, count = number of 16-bit units written
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t *src, size_t len, char16_t *dst) const noexcept { // UTF-32 -> UTF-16LE; returns 0 when the conversion reports an error
+  result res = convert_utf32_to_utf16le_with_errors(src, len, dst); // delegate to the error-reporting variant
+  return res.error == error_code::SUCCESS ? res.count : 0; // res.count on success, 0 on any error
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t *src, size_t len, char16_t *dst) const noexcept { // UTF-32 -> UTF-16BE; returns 0 when the conversion reports an error
+  result res = convert_utf32_to_utf16be_with_errors(src, len, dst); // delegate to the error-reporting variant
+  return res.error == error_code::SUCCESS ? res.count : 0; // res.count on success, 0 on any error
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len, dst); // little-endian output: instantiate the kernel with ByteFlip::NONE
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t *src, size_t len, char16_t *dst) const noexcept { // big-endian output: same kernel, byte-flipped
+  if (supports_zvbb()) // runtime choice between the two byte-flip instantiations
+    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
+  else
+    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+}
+
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len, char16_t *dst) { // UTF-32 -> UTF-16 without any validation checks; returns the number of 16-bit units written
+  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
+  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); // mask of even element indices ((index & 1) == 0)
+  char16_t *dstBeg = dst;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { // vl = code points consumed, vlOut = 16-bit units produced in this round
+    vl = __riscv_vsetvl_e32m4(len);
+    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t*)src, vl);
+    if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) < 0) { // fast path: no lane exceeds 0xFFFF
+      vlOut = vl;
+      vuint16m2_t n = simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); // narrow to 16 bits, then byte-flip according to bflip
+      __riscv_vse16_v_u16m2((uint16_t*)dst, n, vlOut);
+      continue;
+    }
+    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t*)dst, v, vl, m4even); // helper defined outside this view; its return value is used as the number of 16-bit units written
+  }
+  return dst - dstBeg;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::NONE>(src, len, dst); // little-endian output: instantiate the kernel with ByteFlip::NONE
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t *src, size_t len, char16_t *dst) const noexcept { // big-endian output: same kernel, byte-flipped
+  if (supports_zvbb()) // runtime choice between the two byte-flip instantiations
+    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::ZVBB>(src, len, dst);
+  else
+    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::V>(src, len, dst);
+}
+/* end file src/rvv/rvv_utf32_to.inl.cpp */
+
+simdutf_warn_unused int implementation::detect_encodings(const char *input, size_t length) const noexcept { // returns the BOM's encoding, or an OR of encoding_type flags the input validates as
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if (bom_encoding != encoding_type::unspecified)
+    return bom_encoding;
+  int out = 0; // accumulates one flag per encoding that validates; stays 0 if none does
+  if (validate_utf8(input, length))
+    out |= encoding_type::UTF8;
+  if (length % 2 == 0) { // UTF-16 is only tried when the byte length is a whole number of 16-bit units
+    if (validate_utf16(reinterpret_cast<const char16_t*>(input), length/2))
+      out |= encoding_type::UTF16_LE;
+  }
+  if (length % 4 == 0) { // UTF-32 is only tried when the byte length is a whole number of 32-bit units
+    if (validate_utf32(reinterpret_cast<const char32_t*>(input), length/4))
+      out |= encoding_type::UTF32_LE;
+  }
+
+  return out;
+}
+
+template<simdutf_ByteFlip bflip>
+simdutf_really_inline static void rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) { // copies len 16-bit units from src to dst, passing each vector through simdutf_byteflip<bflip>
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { // strip-mined loop: vl units are handled per iteration
+    vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t*)src, vl);
+    __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip<bflip>(v, vl), vl);
+  }
+}
+
+void implementation::change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) const noexcept { // byte-swaps len UTF-16 units from src into dst
+  if (supports_zvbb()) // runtime choice between the two byte-flip instantiations
+    return rvv_change_endianness_utf16<simdutf_ByteFlip::ZVBB>(src, len, dst);
+  else
+    return rvv_change_endianness_utf16<simdutf_ByteFlip::V>(src, len, dst);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // rvv has no vectorized version here: forwards to the scalar routine
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  return scalar::base64::base64_to_binary(input, length, output); // rvv has no vectorized decoder here: forwards to the scalar routine
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
+  return scalar::base64::base64_length_from_binary(length); // forwards to the scalar routine
+}
+
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
+  return scalar::base64::binary_to_base64(input, length, output); // rvv has no vectorized encoder here: forwards to the scalar routine
+}
+} // namespace rvv
+} // namespace simdutf
+
+/* begin file src/simdutf/rvv/end.h */
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// nothing needed.
+#else
+SIMDUTF_UNTARGET_REGION
+#endif
+
+/* end file src/simdutf/rvv/end.h */
+/* end file src/rvv/implementation.cpp */
+#endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
 /* begin file src/westmere/implementation.cpp */
 /* begin file src/simdutf/westmere/begin.h */
@@ -28999,7 +32631,7 @@ std::pair<const char* const, char* const> sse_convert_latin1_to_utf8(
 
   // each latin1 takes 1-2 utf8 bytes
   // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then adjust the pointer)
-  // so the last write can exceed the utf8_output size by 8-1 bytes 
+  // so the last write can exceed the utf8_output size by 8-1 bytes
   // by reserving 8 extra input bytes, we expect the output to have 8-16 bytes free
   while (latin_input + 16 + 8 <= end) {
     // Load 16 Latin1 characters (16 bytes) into a 128-bit register
@@ -29012,7 +32644,7 @@ std::pair<const char* const, char* const> sse_convert_latin1_to_utf8(
       utf8_output += 16;
       continue;
     }
-    
+
 
     // assuming a/b are bytes and A/B are uint16 of the same value
     // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
@@ -29079,7 +32711,7 @@ std::pair<const char*, char32_t*> sse_convert_latin1_to_utf32(const char* buf, s
         __m128i in_shifted2 = _mm_srli_si128(in, 8);
         __m128i in_shifted3 = _mm_srli_si128(in, 12);
 
-        // expand 8-bit to 32-bit unit      
+        // expand 8-bit to 32-bit unit
         __m128i out1 = _mm_cvtepu8_epi32(in);
         __m128i out2 = _mm_cvtepu8_epi32(in_shifted1);
         __m128i out3 = _mm_cvtepu8_epi32(in_shifted2);
@@ -30260,7 +33892,7 @@ std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf,
   const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); //1100 0000 1000 0000
   const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); //1111 1111 1000 0000
   const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); //1111 1111 1111 1111 0000 0000 0000 0000
-  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); //0111 1111 1111 1111 1111 1111 1111 1111 
+  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); //0111 1111 1111 1111 1111 1111 1111 1111
   __m128i running_max = _mm_setzero_si128();
   __m128i forbidden_bytemask = _mm_setzero_si128();
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
@@ -30275,15 +33907,15 @@ std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf,
 
     // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned saturation
     __m128i in_16 = _mm_packus_epi32(
-                                      _mm_and_si128(in, v_7fffffff), 
+                                      _mm_and_si128(in, v_7fffffff),
                                       _mm_and_si128(nextin, v_7fffffff)
-                                      );//in this context pack the two __m128 into a single 
+                                      );//in this context pack the two __m128 into a single
     //By ensuring the highest bit is set to 0(&v_7fffffff), we're making sure all values are interpreted as non-negative, or specifically, the values are within the range of valid Unicode code points.
-    //remember : having leading byte 0 means a positive number by the two complements system. Unicode is well beneath the range where you'll start getting issues so that's OK. 
+    //remember : having leading byte 0 means a positive number by the two complements system. Unicode is well beneath the range where you'll start getting issues so that's OK.
 
     // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
 
-    // Check for ASCII fast path 
+    // Check for ASCII fast path
 
     // ASCII fast path!!!!
       // We eagerly load another 32 bytes, hoping that they will be ASCII too.
@@ -30322,7 +33954,7 @@ std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf,
     }
 
     // no bits set above 7th bit -- find out all the ASCII characters
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this takes four bytes at a time and compares: 
+    const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this takes four bytes at a time and compares:
                                                       _mm_and_si128(in_16, v_ff80), // the vector that get only the first 9 bits of each 16-bit/2-byte units
                                                        v_0000 //
                                                        ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is of format 0000 0000 0000 0XXX XXXX
@@ -30346,11 +33978,11 @@ std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf,
       // t1 = [000a|aaaa|0000|0000]
       const __m128i t1 = _mm_and_si128(t0, v_1f00); // potentital first utf8 byte
       // t2 = [0000|0000|00bb|bbbb]
-      const __m128i t2 = _mm_and_si128(in_16, v_003f);// potential second utf8 byte 
+      const __m128i t2 = _mm_and_si128(in_16, v_003f);// potential second utf8 byte
       // t3 = [000a|aaaa|00bb|bbbb]
-      const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together 
+      const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together
       // t4 = [110a|aaaa|10bb|bbbb]
-      const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit 
+      const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
 
       // 2. merge ASCII and 2-byte codewords
       const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
@@ -30897,6 +34529,511 @@ std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32
   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
 }
 /* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
+/* begin file src/westmere/sse_base64.cpp */
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+__m128i lookup_pshufb_improved(const __m128i input) { // maps 16 bytes holding 6-bit values (0..63) to their base64 ASCII characters
+  // credit: Wojciech Muła
+  // reduce  0..51 -> 0
+  //        52..61 -> 1 .. 10
+  //            62 -> 11
+  //            63 -> 12
+  __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51));
+
+  // distinguish between ranges 0..25 and 26..51:
+  //         0 .. 25 -> becomes 13 (selects 'A' in shift_LUT)
+  //        26 .. 51 -> remains 0 (selects 'a' - 26 in shift_LUT)
+  const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input); // all-ones in lanes where input < 26
+  result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
+
+  const __m128i shift_LUT = _mm_setr_epi8(
+      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+
+  // read shift
+  result = _mm_shuffle_epi8(shift_LUT, result);
+
+  return _mm_add_epi8(result, input); // character = 6-bit value + per-range offset
+}
+
+size_t encode_base64(char *dst, const char *src, size_t srclen) { // base64-encodes srclen bytes from src into dst; returns the number of characters written
+  // credit: Wojciech Muła
+  // SSE (lookup: pshufb improved unrolled)
+  const uint8_t *input = (const uint8_t *)src;
+
+  uint8_t *out = (uint8_t *)dst;
+  const __m128i shuf =
+      _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
+
+  size_t i = 0;
+  for (; i + 52 <= srclen; i += 48) { // unrolled x4: 48 input bytes consumed, but the last 16-byte load reaches input[i + 51]
+    __m128i in0 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
+    __m128i in1 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
+    __m128i in2 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
+    __m128i in3 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
+
+    in0 = _mm_shuffle_epi8(in0, shuf); // same per-register steps as the documented single-register loop below
+    in1 = _mm_shuffle_epi8(in1, shuf);
+    in2 = _mm_shuffle_epi8(in2, shuf);
+    in3 = _mm_shuffle_epi8(in3, shuf);
+
+    const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00));
+    const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00));
+    const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00));
+    const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00));
+
+    const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040));
+    const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040));
+    const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040));
+    const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040));
+
+    const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0));
+    const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0));
+    const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0));
+    const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0));
+
+    const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010));
+    const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010));
+    const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010));
+    const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010));
+
+    const __m128i input0 = _mm_or_si128(t1_0, t3_0);
+    const __m128i input1 = _mm_or_si128(t1_1, t3_1);
+    const __m128i input2 = _mm_or_si128(t1_2, t3_2);
+    const __m128i input3 = _mm_or_si128(t1_3, t3_3);
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved(input0));
+    out += 16;
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved(input1));
+    out += 16;
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved(input2));
+    out += 16;
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved(input3));
+    out += 16;
+  }
+  for (; i + 16 <= srclen; i += 12) { // one register per round: 16 bytes loaded, 12 consumed, 16 characters written
+
+    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
+
+    // bytes from groups A, B and C are needed in separate 32-bit lanes
+    // in = [DDDD|CCCC|BBBB|AAAA]
+    //
+    //      an input triplet has layout
+    //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
+    //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next
+    //        triplet
+    //
+    //      shuffling changes the order of bytes: 1, 0, 2, 1
+    //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
+    //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
+    //                  processed bits
+    in = _mm_shuffle_epi8(in, shuf);
+
+    // unpacking
+
+    // t0    = [0000cccc|cc000000|aaaaaa00|00000000]
+    const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
+    // t1    = [00000000|00cccccc|00000000|00aaaaaa]
+    //          (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned
+    //          multiplication)
+    const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
+
+    // t2    = [00000000|00dddddd|000000bb|bbbb0000]
+    const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
+    // t3    = [00dddddd|00000000|00bbbbbb|00000000]
+    //          (d * (1 << 8), b * (1 << 4))
+    const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
+
+    // res   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
+    const __m128i indices = _mm_or_si128(t1, t3);
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved(indices));
+    out += 16;
+  }
+
+  return i / 3 * 4 + // i only advances by 48 or 12, so i / 3 * 4 is exactly the number of characters written by the SIMD loops
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i); // remaining bytes go through the scalar tail encoder
+}
+static inline void compress(__m128i data, uint16_t mask, char *output) { // stores data to output with the bytes flagged in mask squeezed out; always performs a full 16-byte store
+  if (mask == 0) { // nothing to remove: plain store
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
+    return;
+  }
+
+  // this particular implementation was inspired by work done by @animetosho
+  // we do it in two steps, first 8 bytes and then second 8 bytes
+  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
+  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+  // next line just loads the 64-bit values thintable_epi8[mask1] and
+  // thintable_epi8[mask2] into a 128-bit register, using only
+  // two instructions on most compilers.
+
+  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
+                                    tables::base64::thintable_epi8[mask1]);
+  // we increment by 0x08 the second half of the mask
+  shufmask =
+      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+  // this is the version "nearly pruned"
+  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
+  // we still need to put the two halves together.
+  // we compute the popcount of the first half:
+  int pop1 = tables::base64::BitsSetTable256mul2[mask1]; // NOTE(review): table is defined elsewhere; the name suggests it stores 2 * popcount - confirm
+  // then load the corresponding mask, what it does is to write
+  // only the first pop1 bytes from the first 8 bytes, and then
+  // it fills in with the bytes from the second 8 bytes + some filling
+  // at the end.
+  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
+      tables::base64::pshufb_combine_table + pop1 * 8));
+  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
+}
+
+struct block64 { // 64 bytes of data held as four 128-bit SSE registers
+  __m128i chunks[4]; // chunks[k] is loaded from / stored to byte offset 16 * k by load_block / copy_block
+};
+
+static inline uint16_t to_base64_mask(__m128i *src, bool *error) { // translates 16 input characters in place; returns a bitmask of flagged bytes and ORs into *error if a flagged byte is not whitespace
+  const __m128i ascii_space_tbl = // indexed by low nibble: matches exactly the bytes 0x20, 0x09, 0x0a and 0x0d
+      _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
+                    0x0, 0xd, 0x0, 0x0);
+  // credit: aqrit
+  const __m128i delta_asso =
+      _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
+                    0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  const __m128i delta_values =
+      _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+                    int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+                    int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+                    int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
+  const __m128i check_asso =
+      _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                    0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  const __m128i check_values =
+      _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                    int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+                    int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+                    int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+  const __m128i shifted = _mm_srli_epi32(*src, 3);
+
+  const __m128i delta_hash = // per-byte table index built from the input byte
+      _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted);
+  const __m128i check_hash = // second index, used only for the validity check
+      _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted);
+
+  const __m128i out = // translated bytes: input plus a looked-up delta (saturating add)
+      _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src);
+  const __m128i chk = // a set sign bit flags the byte (collected into mask below)
+      _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src);
+  const int mask = _mm_movemask_epi8(chk);
+  if (mask) { // only pay for the whitespace test when something was flagged
+    __m128i ascii_space =
+        _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src);
+    *error |= (mask != _mm_movemask_epi8(ascii_space)); // error unless the flagged bytes are exactly the whitespace bytes
+  }
+  *src = out; // caller's register now holds the translated bytes
+  return (uint16_t)mask;
+}
+static inline uint64_t to_base64_mask(block64 *b, bool *error) { // 64-byte version: translates all four chunks in place and returns the combined 64-bit mask
+  *error = 0; // reset first; the per-chunk calls only ever OR into it
+  uint64_t m0 = to_base64_mask(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask(&b->chunks[1], error);
+  uint64_t m2 = to_base64_mask(&b->chunks[2], error);
+  uint64_t m3 = to_base64_mask(&b->chunks[3], error);
+  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); // bit k corresponds to byte k of the block
+}
+
+static inline void copy_block(block64 *b, char *output) { // stores the four chunks of b to output[0..63] (unaligned stores)
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), b->chunks[0]);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), b->chunks[1]);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), b->chunks[2]);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), b->chunks[3]);
+}
+
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { // compresses each chunk with its 16-bit slice of mask; returns the count of zero bits in mask
+  uint64_t nmask = ~mask; // set bits now mark the bytes that are counted below
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16),
+           output + _mm_popcnt_u64(nmask & 0xFFFF)); // each chunk starts after the counted bytes of the chunks before it
+  compress(b->chunks[2], uint16_t(mask >> 32),
+           output + _mm_popcnt_u64(nmask & 0xFFFFFFFF));
+  compress(b->chunks[3], uint16_t(mask >> 48),
+           output + _mm_popcnt_u64(nmask & 0xFFFFFFFFFFFFULL));
+  return _mm_popcnt_u64(nmask); // the caller advances its write pointer by this amount
+}
+
+static inline void load_block(block64 *b, const char *src) { // fills b with the 64 bytes at src[0..63] (unaligned loads)
+  b->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+  b->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
+  b->chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
+  b->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
+}
+
+static inline void base64_decode(char *out, __m128i str) { // packs the 16 bytes of str into 12 output bytes; NOTE: the store touches out[0..15]
+  // credit: aqrit
+
+  const __m128i pack_shuffle = // gathers 3 bytes out of every 32-bit lane; the -1 entries zero the last 4 output bytes
+      _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+
+  const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140)); // per byte pair: b0 * 0x40 + b1
+  const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000)); // per 16-bit pair: w0 * 0x1000 + w1
+  const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle);
+  // Store the output:
+  // this writes 16 bytes, but we only need 12.
+  _mm_storeu_si128((__m128i *)out, t2);
+}
+// decode 64 bytes and output 48 bytes
+static inline void base64_decode_block(char *out, const char *src) { // src: 64 bytes in memory; the final 16-byte store reaches out[51], 4 bytes past the 48 produced
+  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+  base64_decode(out + 12,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
+  base64_decode(out + 24,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
+  base64_decode(out + 36,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
+}
+static inline void base64_decode_block_safe(char *out, const char *src) { // same result as base64_decode_block, but never writes past out[47]
+  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+  base64_decode(out + 12,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
+  base64_decode(out + 24, // this 16-byte store ends at out[39], still inside the 48-byte window
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
+  char buffer[16]; // scratch target for the last 16-byte store
+  base64_decode(buffer,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
+  std::memcpy(out + 36, buffer, 12); // copy only the 12 meaningful bytes
+}
+static inline void base64_decode_block(char *out, block64 *b) { // register-resident variant; the final 16-byte store reaches out[51], 4 bytes past the 48 produced
+  base64_decode(out, b->chunks[0]);
+  base64_decode(out + 12, b->chunks[1]);
+  base64_decode(out + 24, b->chunks[2]);
+  base64_decode(out + 36, b->chunks[3]);
+}
+static inline void base64_decode_block_safe(char *out, block64 *b) { // register-resident variant that never writes past out[47]
+  base64_decode(out, b->chunks[0]);
+  base64_decode(out + 12, b->chunks[1]);
+  base64_decode(out + 24, b->chunks[2]); // this 16-byte store ends at out[39], still inside the 48-byte window
+  char buffer[16]; // scratch target for the last 16-byte store
+  base64_decode(buffer, b->chunks[3]);
+  std::memcpy(out + 36, buffer, 12); // copy only the 12 meaningful bytes
+}
+
+result compress_decode_base64(char *dst, const char *src, size_t srclen) { // decodes base64 from src into dst; result.count is bytes written on success, or an input offset for INVALID_BASE64_CHARACTER
+  size_t equalsigns = 0; // number of trailing '=' stripped from the input (0, 1 or 2)
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    srclen--;
+    equalsigns = 1;
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  char *end_of_safe_64byte_zone = // from this output position on, the *_safe decoders (no overshooting store) are used
+      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
+
+  const char *const srcinit = src;
+  const char *const dstinit = dst;
+  const char *const srcend = src + srclen;
+
+  constexpr size_t block_size = 6;
+  static_assert(block_size >= 2, "block should of size 2 or more");
+  char buffer[block_size * 64]; // staging area for translated values once flagged bytes have been removed
+  char *bufferptr = buffer; // next free position in buffer
+  if (srclen >= 64) {
+    const char *const srcend64 = src + srclen - 64; // last position from which a full 64-byte load is in bounds
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      bool error = false;
+      uint64_t badcharmask = to_base64_mask(&b, &error); // translates b in place; set bits mark bytes to drop
+      if (error) {
+        src -= 64; // rewind and rescan the block to locate the offending character
+        while (src < srcend &&
+               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+          src++;
+        }
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; // count = input offset where the scan stopped
+      }
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else if (bufferptr != buffer) { // clean block, but earlier data is still staged: append so output order is preserved
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+      } else { // clean block and nothing staged: decode straight into dst
+        if (dst >= end_of_safe_64byte_zone) {
+          base64_decode_block_safe(dst, &b);
+        } else {
+          base64_decode_block(dst, &b);
+        }
+        dst += 48;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) { // at least block_size - 1 full blocks staged: flush them
+        for (size_t i = 0; i < (block_size - 2); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        if (dst >= end_of_safe_64byte_zone) { // only the last flushed block is checked against the safe zone
+          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
+        } else {
+          base64_decode_block(dst, buffer + (block_size - 2) * 64);
+        }
+        dst += 48;
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64; // what remains staged now sits at the front of buffer
+      }
+    }
+  }
+
+  char *buffer_start = buffer; // read cursor into the staged data
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64); // bytes staged in the final, partial 64-byte block
+  if (last_block != 0 && srcend - src + last_block >= 64) { // enough input left to possibly complete that block: top it up one character at a time
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (val > 64) { // table values above 64 are reported as invalid characters
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+      bufferptr += (val <= 63); // a table value of 64 is consumed but not kept (presumably whitespace - table defined elsewhere)
+      src++;
+    }
+  }
+
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { // decode every complete staged block
+    if (dst >= end_of_safe_64byte_zone) {
+      base64_decode_block_safe(dst, buffer_start);
+    } else {
+      base64_decode_block(dst, buffer_start);
+    }
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) { // staged values remain that do not fill a block: finish them with scalar code
+    while (buffer_start + 4 < bufferptr) { // more than 4 values left: a 4-byte store is used, the 4th byte is overwritten by the next group
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8; // four 6-bit values packed into the top 24 bits
+      triple = scalar::utf32::swap_bytes(triple); // presumably a 32-bit byte swap so memcpy emits the most significant byte first - helper defined elsewhere
+      std::memcpy(dst, &triple, 4);
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    if (buffer_start + 4 <= bufferptr) { // exactly 4 values left: write exactly 3 bytes
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3);
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+    // bring in src content
+    int leftover = int(bufferptr - buffer_start);
+    if (leftover > 0) {
+      while (leftover < 4 && src < srcend) { // try to complete the group from the remaining input
+        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        if (val > 64) {
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+        }
+        buffer_start[leftover] = char(val);
+        leftover += (val <= 63); // a table value of 64 is consumed but not kept
+        src++;
+      }
+
+      if (leftover == 1) { // a single 6-bit value cannot form an output byte
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)};
+      }
+      if (leftover == 2) { // 2 values -> 1 output byte
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+        triple >>= 8;
+        std::memcpy(dst, &triple, 1);
+        dst += 1;
+      } else if (leftover == 3) { // 3 values -> 2 output bytes
+        uint32_t triple = (uint32_t(buffer_start[0]) << 3 * 6) +
+                          (uint32_t(buffer_start[1]) << 2 * 6) +
+                          (uint32_t(buffer_start[2]) << 1 * 6);
+        triple = scalar::utf32::swap_bytes(triple);
+
+        triple >>= 8;
+
+        std::memcpy(dst, &triple, 2);
+        dst += 2;
+      } else { // the group was completed to 4 values -> 3 output bytes
+        uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                           (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                           (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                           (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                          << 8;
+        triple = scalar::utf32::swap_bytes(triple);
+        std::memcpy(dst, &triple, 3);
+        dst += 3;
+      }
+    }
+  }
+  if (src < srcend + equalsigns) { // unread input remains, or '=' padding was stripped: hand the rest (without the '=') to the scalar tail decoder
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    if (r.error == error_code::INVALID_BASE64_CHARACTER) {
+      r.count += size_t(src - srcinit); // rebase the tail decoder's count by the input already consumed
+      return r;
+    } else {
+      r.count += size_t(dst - dstinit); // otherwise add the bytes already written to dst
+    }
+    return r;
+  }
+  return {SUCCESS, size_t(dst - dstinit)};
+}
+/* end file src/westmere/sse_base64.cpp */
 
 } // unnamed namespace
 } // namespace westmere
@@ -33142,7 +37279,7 @@ simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char *
       __m128i input4 = _mm_loadu_si128((const __m128i *)(str + i + 3*sizeof(__m128i)));
       __m128i input12 = _mm_add_epi8(
                                       _mm_cmpgt_epi8(
-                                                    _mm_setzero_si128(), 
+                                                    _mm_setzero_si128(),
                                                     input1),
                                       _mm_cmpgt_epi8(
                                                     _mm_setzero_si128(),
@@ -33227,6 +37364,21 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
   return utf8::count_code_points(input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char * input, size_t length) const noexcept { // westmere entry point: upper bound on the decoded size of `input`
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // no SIMD-specific path here; forwards to the shared scalar helper
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept { // westmere entry point: decode base64 `input` into `output`
+  return compress_decode_base64(output, input, length); // note the swapped order: the helper is called with the destination first, then the source
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept { // westmere entry point: encoded size for `length` binary bytes
+  return scalar::base64::base64_length_from_binary(length); // depends only on the length, so the shared scalar helper is used as-is
+}
+
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept { // westmere entry point: encode binary `input` as base64 into `output`
+  return encode_base64(output, input, length); // note the swapped order: the helper is called with the destination first, then the source
+}
 } // namespace westmere
 } // namespace simdutf
 
diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h
index b0466f52d9d742..539b1ebfc28eb1 100644
--- a/deps/simdutf/simdutf.h
+++ b/deps/simdutf/simdutf.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-01-29 10:40:15 -0500. Do not edit! */
+/* auto-generated on 2024-03-18 10:58:28 -0400. Do not edit! */
 /* begin file include/simdutf.h */
 #ifndef SIMDUTF_H
 #define SIMDUTF_H
@@ -142,6 +142,30 @@
 // s390 IBM system. Big endian.
 #elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
 // RISC-V 64-bit
+#define SIMDUTF_IS_RISCV64 1
+
+#if __clang_major__ >= 19
+// Does the compiler support target regions for RISC-V
+#define SIMDUTF_HAS_RVV_TARGET_REGION 1
+#endif
+
+#if __riscv_v_intrinsic >= 11000 && !(__GNUC__ == 13 && __GNUC_MINOR__ == 2 && __GNUC_PATCHLEVEL__ == 0)
+#define SIMDUTF_HAS_RVV_INTRINSICS 1
+#endif
+
+#define SIMDUTF_HAS_ZVBB_INTRINSICS 0 // there is currently no way to detect this
+
+#if SIMDUTF_HAS_RVV_INTRINSICS && __riscv_vector && __riscv_v_min_vlen >= 128 && __riscv_v_elen >= 64
+// RISC-V V extension
+#define SIMDUTF_IS_RVV 1
+#if SIMDUTF_HAS_ZVBB_INTRINSICS && __riscv_zvbb >= 1000000
+// RISC-V Vector Basic Bit-manipulation
+#define SIMDUTF_IS_ZVBB 1
+#endif
+#endif
+
+#elif defined(__loongarch_lp64)
+// LoongArch 64-bit
 #else
 // The simdutf library is designed
 // for 64-bit processors and it seems that you are not
@@ -540,6 +564,8 @@ enum error_code {
   SURROGATE,    // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR
                 // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
                 // there must be no surrogate at all (Latin1)
+  INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid base64 string.
+  BASE64_INPUT_REMAINDER, // The base64 input terminates with a single character, excluding padding (=).
   OTHER         // Not related to validation/transcoding.
 };
 
@@ -567,14 +593,14 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 #define SIMDUTF_SIMDUTF_VERSION_H
 
 /** The version of simdutf being used (major.minor.revision) */
-#define SIMDUTF_VERSION "4.0.9"
+#define SIMDUTF_VERSION "5.0.0"
 
 namespace simdutf {
 enum {
   /**
    * The major version (MAJOR.minor.revision) of simdutf being used.
    */
-  SIMDUTF_VERSION_MAJOR = 4,
+  SIMDUTF_VERSION_MAJOR = 5,
   /**
    * The minor version (major.MINOR.revision) of simdutf being used.
    */
@@ -582,7 +608,7 @@ enum {
   /**
    * The revision (major.minor.REVISION) of simdutf being used.
    */
-  SIMDUTF_VERSION_REVISION = 9
+  SIMDUTF_VERSION_REVISION = 0
 };
 } // namespace simdutf
 
@@ -654,6 +680,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include <cpuid.h>
 #endif
 
+
 namespace simdutf {
 namespace internal {
 
@@ -675,7 +702,9 @@ enum instruction_set {
   AVX512BW = 0x4000,
   AVX512VL = 0x8000,
   AVX512VBMI2 = 0x10000,
-  AVX512VPOPCNTDQ = 0x2000
+  AVX512VPOPCNTDQ = 0x2000,
+  RVV = 0x4000,
+  ZVBB = 0x8000,
 };
 
 #if defined(__PPC64__)
@@ -684,6 +713,40 @@ static inline uint32_t detect_supported_architectures() {
   return instruction_set::ALTIVEC;
 }
 
+#elif SIMDUTF_IS_RISCV64
+
+#if defined(__linux__)
+#include <unistd.h>
+// We define these ourselves, for backwards compatibility (presumably with system headers that lack them; confirm)
+struct simdutf_riscv_hwprobe { int64_t key; uint64_t value; }; // one key/value query slot passed to the probe call
+#define simdutf_riscv_hwprobe(...) syscall(258, __VA_ARGS__) // function-like macro, so the bare struct name above still works as a type; 258 should be the Linux riscv_hwprobe syscall number (verify against kernel headers)
+#define SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0 4 // key queried by detect_supported_architectures()
+#define SIMDUTF_RISCV_HWPROBE_IMA_V    (1 << 2) // bit tested to report instruction_set::RVV
+#define SIMDUTF_RISCV_HWPROBE_EXT_ZVBB (1 << 17) // bit tested to report instruction_set::ZVBB
+#endif
+
+static inline uint32_t detect_supported_architectures() { // RISC-V 64 variant: returns an OR of instruction_set flags
+  uint32_t host_isa = instruction_set::DEFAULT;
+#if SIMDUTF_IS_RVV // compile-time knowledge: set when the build itself targets the vector extension
+  host_isa |= instruction_set::RVV;
+#endif
+#if SIMDUTF_IS_ZVBB // compile-time knowledge, same idea for Zvbb
+  host_isa |= instruction_set::ZVBB;
+#endif
+#if defined(__linux__) // runtime detection, only where the hwprobe shim above is defined
+  simdutf_riscv_hwprobe probes[] = { { SIMDUTF_RISCV_HWPROBE_KEY_IMA_EXT_0, 0 } }; // single query; value starts at 0 and is read back below
+  long ret = simdutf_riscv_hwprobe(&probes, sizeof probes/sizeof *probes, 0, nullptr, 0); // args: pairs, pair count, then 0/nullptr/0 (presumably cpusetsize, cpus, flags; confirm against the kernel docs)
+  if (ret == 0) { // a non-zero return leaves host_isa at whatever the compile-time checks produced
+    uint64_t extensions = probes[0].value;
+    if (extensions & SIMDUTF_RISCV_HWPROBE_IMA_V)
+      host_isa |= instruction_set::RVV;
+    if (extensions & SIMDUTF_RISCV_HWPROBE_EXT_ZVBB)
+      host_isa |= instruction_set::ZVBB;
+  }
+#endif
+  return host_isa;
+}
+
 #elif defined(__aarch64__) || defined(_M_ARM64)
 
 static inline uint32_t detect_supported_architectures() {
@@ -2222,6 +2285,63 @@ simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t le
  */
 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
 
+
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input         the base64 input to process
+ * @param length        the length of the base64 input in bytes
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are two possible reasons for
+ * failure: the input contains a number of base64 characters that, when divided by 4, leaves
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
+ * If you fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input         the base64 string to process
+ * @param length        the length of the string in bytes
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
+
+/**
+ * Provide the base64 length in bytes given the length of a binary input.
+ *
+ * @param length        the length of the input in bytes
+ * @return number of base64 bytes
+ */
+simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
+
+/**
+ * Convert a binary input to a base64 output. The output is always padded with equal signs so that it is
+ * a multiple of 4 bytes long.
+ *
+ * This function always succeeds.
+ *
+ * @param input         the binary to process
+ * @param length        the length of the input in bytes
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+ * @return number of written bytes, will be equal to base64_length_from_binary(length)
+ */
+size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
+
 /**
  * An implementation of simdutf for a particular CPU architecture.
  *
@@ -3282,6 +3402,61 @@ class implementation {
    */
   simdutf_warn_unused virtual size_t count_utf8(const char * input, size_t length) const noexcept = 0;
 
+  /**
+   * Provide the maximal binary length in bytes given the base64 input.
+   * In general, if the input contains ASCII spaces, the result will be less than
+   * the maximum length.
+   *
+   * @param input         the base64 input to process
+   * @param length        the length of the base64 input in bytes
+   * @return maximal number of binary bytes
+   */
+  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept = 0;
+
+  /**
+   * Convert a base64 input to a binary output.
+   *
+   * This function follows the WHATWG forgiving-base64 format, which means that it will
+   * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+   * equal signs at the end) or an unpadded input (without any equal signs at the end).
+   *
+   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+   *
+   * This function will fail in case of invalid input. There are two possible reasons for
+   * failure: the input contains a number of base64 characters that, when divided by 4, leaves
+   * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+   * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+   *
+   * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
+   * If you fail to provide that much space, the function may cause a buffer overflow.
+   *
+   * @param input         the base64 string to process
+   * @param length        the length of the string in bytes
+   * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
+   */
+  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept = 0;
+
+  /**
+   * Provide the base64 length in bytes given the length of a binary input.
+   *
+   * @param length        the length of the input in bytes
+   * @return number of base64 bytes
+   */
+  simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length) const noexcept = 0;
+
+  /**
+   * Convert a binary input to a base64 output. The output is always padded with equal signs so that it is
+   * a multiple of 4 bytes long.
+   *
+   * This function always succeeds.
+   *
+   * @param input         the binary to process
+   * @param length        the length of the input in bytes
+   * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+   * @return number of written bytes, will be equal to base64_length_from_binary(length)
+   */
+  virtual size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept = 0;
 
 
 protected: