Update benchmark project for code changes + enable ARM64 build

By design the benchmark project is not built, as it relies on Google Benchmark, which is retrieved using vcpkg. Vcpkg is now part of Visual Studio 2022, so building with VS 2022 works. However, one of the build steps of the CI pipeline builds CharLS with VS 2019 to ensure that VS 2019 can still be used. Enabling the benchmark project in the solution file for x86 and x64 would break the VS 2019 build. ARM64 builds are only supported in VS 2022, so enabling the benchmark project for that platform doesn't break VS 2019.
team-charls · Aug 24, 2024 · 738f77a · 738f77a
1 parent b035a73
commit 738f77a
Show file tree

Hide file tree

Showing 12 changed files with 144 additions and 46 deletions.
diff --git a/CharLS.sln b/CharLS.sln
@@ -142,12 +142,15 @@ Global
 		{E09F024E-A125-48AA-8E9D-7D1302BEAC97}.Release|x86.ActiveCfg = Release|Win32
 		{E09F024E-A125-48AA-8E9D-7D1302BEAC97}.Release|x86.Build.0 = Release|Win32
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|ARM64.ActiveCfg = Checked|ARM64
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|ARM64.Build.0 = Checked|ARM64
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|x64.ActiveCfg = Checked|x64
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|x86.ActiveCfg = Checked|Win32
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|ARM64.ActiveCfg = Debug|ARM64
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|ARM64.Build.0 = Debug|ARM64
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x64.ActiveCfg = Debug|x64
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x86.ActiveCfg = Debug|Win32
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|ARM64.ActiveCfg = Release|ARM64
+		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|ARM64.Build.0 = Release|ARM64
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x64.ActiveCfg = Release|x64
 		{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x86.ActiveCfg = Release|Win32
 		{5637C116-ABF5-4274-A71F-34433713A538}.Checked|ARM64.ActiveCfg = Checked|ARM64

diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
@@ -3,7 +3,7 @@
 
 #include <benchmark/benchmark.h>
 
-#include "../src/jpegls_preset_coding_parameters.h"
+#include "../src/jpegls_preset_coding_parameters.hpp"
 
 #include <cstdint>
 #include <memory>
@@ -12,7 +12,7 @@
 #pragma warning(disable : 26409) // Avoid calling new explicitly (triggered by BENCHMARK macro)
 
 
-int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const int32_t di) noexcept
+static int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const int32_t di) noexcept
 {
     constexpr int32_t near_lossless{};
 
@@ -36,7 +36,7 @@ int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const i
     return 4;
 }
 
-std::vector<int8_t> create_quantize_lut_lossless(const int32_t bit_count)
+static std::vector<int8_t> create_quantize_lut_lossless(const int32_t bit_count)
 {
     const charls::jpegls_pc_parameters preset{charls::compute_default((1 << static_cast<uint32_t>(bit_count)) - 1, 0)};
     const int32_t range{preset.maximum_sample_value + 1};
@@ -100,7 +100,8 @@ struct lossless_traits final
 };
 
 
-__declspec(noinline) int32_t get_predicted_value_default(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
+static __declspec(noinline) int32_t
+    get_predicted_value_default(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
 {
     if (ra < rb)
     {
@@ -126,13 +127,14 @@ __declspec(noinline) int32_t get_predicted_value_default(const int32_t ra, const
 constexpr size_t int32_t_bit_count = sizeof(int32_t) * 8;
 
 
-constexpr int32_t bit_wise_sign(const int32_t i) noexcept
+static constexpr int32_t bit_wise_sign(const int32_t i) noexcept
 {
     return i >> (int32_t_bit_count - 1);
 }
 
 
-__declspec(noinline) int32_t get_predicted_value_optimized(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
+static __declspec(noinline) int32_t
+    get_predicted_value_optimized(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
 {
     // sign trick reduces the number of if statements (branches)
     const int32_t sign{bit_wise_sign(rb - ra)};
@@ -153,7 +155,7 @@ __declspec(noinline) int32_t get_predicted_value_optimized(const int32_t ra, con
 
 
 #if defined(_M_X64) || defined(_M_ARM64)
-inline int countl_zero(const uint64_t value) noexcept
+inline static int countl_zero(const uint64_t value) noexcept
 {
     if (value == 0)
         return 64;
@@ -211,7 +213,7 @@ static void bm_quantize_gradient_lut(benchmark::State& state)
 BENCHMARK(bm_quantize_gradient_lut);
 
 
-int peek_zero_bits(uint64_t val_test) noexcept
+static int peek_zero_bits(uint64_t val_test) noexcept
 {
     for (int32_t count{}; count < 16; ++count)
     {
@@ -254,7 +256,7 @@ BENCHMARK(bm_peek_zero_bits_intrinsic);
 #endif
 
 
-std::vector<uint8_t> allocate_buffer(const size_t size)
+static std::vector<uint8_t> allocate_buffer(const size_t size)
 {
     std::vector<uint8_t> buffer;
     buffer.resize(size);
@@ -306,7 +308,7 @@ class overwrite_buffer
 };
 
 
-overwrite_buffer allocate_overwrite_buffer(const size_t size)
+static overwrite_buffer allocate_overwrite_buffer(const size_t size)
 {
     overwrite_buffer buffer;
     buffer.reset(size);
@@ -324,7 +326,7 @@ static void bm_resize_overwrite_buffer(benchmark::State& state)
 BENCHMARK(bm_resize_overwrite_buffer);
 
 
-int memset_buffer(uint8_t* data, const size_t size)
+static int memset_buffer(uint8_t* data, const size_t size) noexcept
 {
     memset(data, 0, size);
     return 0;
@@ -342,7 +344,7 @@ static void bm_memset_buffer(benchmark::State& state)
 BENCHMARK(bm_memset_buffer);
 
 
-bool has_ff_byte_classic(const unsigned int value)
+constexpr static bool has_ff_byte_classic(const unsigned int value) noexcept
 {
     // Check if any byte is equal to 0xFF
     return ((value & 0xFF) == 0xFF) || (((value >> 8) & 0xFF) == 0xFF) || (((value >> 16) & 0xFF) == 0xFF) ||
@@ -358,7 +360,7 @@ static void bm_has_ff_byte_classic(benchmark::State& state)
 }
 BENCHMARK(bm_has_ff_byte_classic);
 
-bool has_ff_byte_loop(const unsigned int value)
+static bool has_ff_byte_loop(const unsigned int value) noexcept
 {
     // Iterate over each byte and check if it is equal to 0xFF
     for (int i = 0; i < sizeof(unsigned int); ++i)
@@ -380,7 +382,8 @@ static void bm_has_ff_byte_loop(benchmark::State& state)
 }
 BENCHMARK(bm_has_ff_byte_loop);
 
-bool has_ff_byte_simd(const unsigned int value) {
+#if !defined(_M_ARM64)
+static bool has_ff_byte_simd(const unsigned int value) {
      // Use SSE instructions for parallel comparison
      const __m128i xmm_value = _mm_set1_epi32(value);
      const __m128i xmm_ff = _mm_set1_epi32(0xFF);
@@ -400,9 +403,9 @@ static void bm_has_ff_byte_simd(benchmark::State& state)
     }
 }
 BENCHMARK(bm_has_ff_byte_simd);
+#endif
 
-
-const std::byte* find_jpeg_marker_start_byte(const std::byte* position, const std::byte* end_position) noexcept
+static const std::byte* find_jpeg_marker_start_byte(const std::byte* position, const std::byte* end_position) noexcept
 {
     constexpr std::byte jpeg_marker_start_byte{0xFF};
 
@@ -484,7 +487,8 @@ T read_big_endian_unaligned(const void* buffer) noexcept
 #endif
 }
 
-uint32_t read_all_bytes_with_ff_check(const std::byte* position, const std::byte* end_position)
+#if !defined(_M_ARM64)
+static uint32_t read_all_bytes_with_ff_check(const std::byte* position, const std::byte* end_position)
 {
     uint32_t result{};
 
@@ -514,9 +518,10 @@ static void bm_read_all_bytes_with_ff_check(benchmark::State& state)
     }
 }
 BENCHMARK(bm_read_all_bytes_with_ff_check);
+#endif
 
-
-bool has_ff_byte_simd64(const uint64_t value)
+#if !defined(_M_ARM64)
+static bool has_ff_byte_simd64(const uint64_t value)
 {
     // Use SSE instructions for parallel comparison
     const __m128i xmm_value = _mm_set1_epi64x(value);
@@ -529,7 +534,7 @@ bool has_ff_byte_simd64(const uint64_t value)
     return _mm_testz_si128(comparison, comparison) == 0;
 }
 
-uint64_t read_all_bytes_with_ff_check64(const std::byte* position, const std::byte* end_position)
+static uint64_t read_all_bytes_with_ff_check64(const std::byte* position, const std::byte* end_position)
 {
     uint64_t result{};
 
@@ -557,9 +562,10 @@ static void bm_read_all_bytes_with_ff_check64(benchmark::State& state)
     }
 }
 BENCHMARK(bm_read_all_bytes_with_ff_check64);
+#endif
 
 
-uint32_t read_all_bytes_no_check(const std::byte* position, const std::byte* end_position)
+static uint32_t read_all_bytes_no_check(const std::byte* position, const std::byte* end_position) noexcept
 {
     uint32_t result{};
 
@@ -582,7 +588,7 @@ static void bm_read_all_bytes_no_check(benchmark::State& state)
 }
 BENCHMARK(bm_read_all_bytes_no_check);
 
-uint64_t read_all_bytes_no_check64(const std::byte* position, const std::byte* end_position)
+static uint64_t read_all_bytes_no_check64(const std::byte* position, const std::byte* end_position) noexcept
 {
     uint64_t result{};
 
@@ -605,7 +611,9 @@ static void bm_read_all_bytes_no_check64(benchmark::State& state)
 }
 BENCHMARK(bm_read_all_bytes_no_check64);
 
+// Tips to run the benchmark tests:
 
-
+// To run a single benchmark:
+// benchmark --benchmark_filter=bm_decode
 
 BENCHMARK_MAIN();
diff --git a/benchmark/benchmark.vcxproj b/benchmark/benchmark.vcxproj
@@ -177,6 +177,7 @@
   <ItemGroup>
     <ClCompile Include="benchmark.cpp" />
     <ClCompile Include="context_regular_mode.cpp" />
+    <ClCompile Include="decode.cpp" />
     <ClCompile Include="log2.cpp" />
   </ItemGroup>
   <ItemGroup>
@@ -187,6 +188,9 @@
   <ItemGroup>
     <ClInclude Include="context_regular_mode_v220.h" />
   </ItemGroup>
+  <ItemGroup>
+    <None Include="vcpkg.json" />
+  </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>

diff --git a/benchmark/benchmark.vcxproj.filters b/benchmark/benchmark.vcxproj.filters
@@ -20,10 +20,16 @@
     <ClCompile Include="log2.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="decode.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="context_regular_mode_v220.h">
       <Filter>Header Files</Filter>
     </ClInclude>
   </ItemGroup>
+  <ItemGroup>
+    <None Include="vcpkg.json" />
+  </ItemGroup>
 </Project>
diff --git a/benchmark/context_regular_mode.cpp b/benchmark/context_regular_mode.cpp
@@ -6,10 +6,11 @@
 #include "context_regular_mode_v220.h"
 
 #pragma warning(disable : 26409) // Avoid calling new explicitly (triggered by BENCHMARK macro)
+#pragma warning(disable : 4746) // volatile access of 'reset_threshold' is subject to /volatile:<iso|ms> setting; (in ARM64 mode)
 
 using namespace charls;
 
-context_regular_mode g_context;
+regular_mode_context g_context;
 jls_context_v220 g_context_v220;
 
 volatile int32_t error_value;
@@ -29,7 +30,7 @@ BENCHMARK(bm_regular_mode_update_variables_220);
 
 static void bm_regular_mode_update_variables(benchmark::State& state)
 {
-    g_context = context_regular_mode();
+    g_context = regular_mode_context();
 
     for (const auto _ : state)
     {
@@ -52,12 +53,12 @@ BENCHMARK(bm_regular_mode_get_golomb_coding_parameter_v220);
 
 static void bm_regular_mode_get_golomb_coding_parameter(benchmark::State& state)
 {
-    g_context = context_regular_mode();
+    g_context = regular_mode_context();
     g_context.update_variables_and_bias(error_value, near_lossless, reset_threshold);
 
     for (const auto _ : state)
     {
-        benchmark::DoNotOptimize(g_context.get_golomb_coding_parameter());
+        benchmark::DoNotOptimize(g_context.compute_golomb_coding_parameter());
     }
 }
 BENCHMARK(bm_regular_mode_get_golomb_coding_parameter);
diff --git a/benchmark/context_regular_mode_v220.h b/benchmark/context_regular_mode_v220.h
@@ -3,7 +3,8 @@
 
 #pragma once
 
-#include "../src/context_regular_mode.h"
+#include "../src/regular_mode_context.hpp"
+
 #include <cassert>
 #include <cstdint>
 
@@ -43,7 +44,7 @@ struct jls_context_v220 final
         int n{N};
 
         if (constexpr int limit{65536 * 256}; UNLIKELY(a >= limit || std::abs(b) >= limit))
-            impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
+            impl::throw_jpegls_error(jpegls_errc::invalid_data);
 
         if (n == reset_threshold)
         {
@@ -92,7 +93,7 @@ struct jls_context_v220 final
         }
 
         if (UNLIKELY(k == max_k_value))
-            impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
+            impl::throw_jpegls_error(jpegls_errc::invalid_data);
 
         return k;
     }

diff --git a/benchmark/decode.cpp b/benchmark/decode.cpp
@@ -0,0 +1,74 @@
+// Copyright (c) Team CharLS.
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <benchmark/benchmark.h>
+
+#include "../include/charls/charls.hpp"
+
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#pragma warning(disable : 26409) // Avoid calling new explicitly (triggered by BENCHMARK macro)
+
+using namespace charls;
+using std::byte;
+using std::ifstream;
+using std::ios;
+using std::vector;
+
+// Fills `destination` with exactly destination.size() bytes taken from `input`.
+// The container's storage is handed to the stream as a raw char buffer.
+template<typename Container>
+void read(std::istream& input, Container& destination)
+{
+    const auto byte_count{static_cast<std::streamsize>(destination.size())};
+    auto* const raw_bytes{reinterpret_cast<char*>(destination.data())};
+    input.read(raw_bytes, byte_count);
+}
+
+// Reads a range of bytes from a binary file into a newly allocated buffer.
+//
+// filename: path of the file to read.
+// offset:   start position counted from the beginning of the file; a negative value
+//           selects the last `bytes` bytes of the file instead.
+// bytes:    number of bytes to read; 0 means "everything from offset to the end of the file".
+//
+// Returns the bytes that were read.
+// Throws std::ifstream::failure when the file cannot be opened, positioned or read;
+// a message with the absolute path is written to std::cout before the exception is rethrown.
+vector<byte> read_file(const char* filename, long offset = 0, size_t bytes = 0)
+try
+{
+    ifstream input;
+    input.exceptions(ios::eofbit | ios::failbit | ios::badbit);
+    input.open(filename, ios::in | ios::binary);
+
+    // Determine the file size; keep it in the stream's own offset type to avoid
+    // truncating the size of large files.
+    input.seekg(0, ios::end);
+    const auto byte_count_file{static_cast<std::streamoff>(input.tellg())};
+
+    // Resolve the requested range BEFORE seeking: seeking to a negative position
+    // relative to the start of the file fails (and throws, as exceptions are enabled).
+    if (offset < 0)
+    {
+        offset = static_cast<long>(byte_count_file - static_cast<std::streamoff>(bytes));
+    }
+    if (bytes == 0)
+    {
+        bytes = static_cast<size_t>(byte_count_file - offset);
+    }
+
+    input.seekg(offset, ios::beg);
+
+    vector<byte> buffer(bytes);
+    read(input, buffer);
+
+    return buffer;
+}
+catch (const std::ifstream::failure&)
+{
+    std::cout << "Failed to open/read file: " << std::filesystem::absolute(filename) << "\n";
+    throw;
+}
+
+
+// Benchmarks decoding of one JPEG-LS file: the encoded bytes are loaded and the output
+// buffer is allocated once, up front; every iteration creates a decoder and decodes.
+static void bm_decode(benchmark::State& state)
+{
+    const auto encoded_image{read_file("d:/benchmark-test-image.jls")};
+
+    // The output buffer is created before the timing loop: std::vector initializes
+    // its elements, and that cost must not be attributed to the decoder.
+    const auto decoded_size{jpegls_decoder{encoded_image, true}.get_destination_size()};
+    vector<byte> decoded_image(decoded_size);
+
+    for (const auto _ : state)
+    {
+        jpegls_decoder decoder(encoded_image.data(), encoded_image.size());
+        decoder.decode(decoded_image);
+    }
+}
+BENCHMARK(bm_decode);