Skip to content

Commit

Permalink
Update benchmark project for code changes + enable ARM64 build
Browse files Browse the repository at this point in the history
By design the benchmark project is not built, as it relies on Google Benchmark, which is retrieved using vcpkg.
Vcpkg is now part of Visual Studio 2022, so building with VS 2022 works.
However, one of the build steps of the CI pipeline builds CharLS with VS 2019 to ensure that VS 2019 can still be used. Enabling the benchmark project in the solution file for x86 and x64 would break the VS 2019 build.
ARM64 builds are only supported in VS 2022, so enabling the benchmark project for that platform doesn't break VS 2019.
  • Loading branch information
vbaderks committed Aug 24, 2024
1 parent b035a73 commit 738f77a
Show file tree
Hide file tree
Showing 12 changed files with 144 additions and 46 deletions.
3 changes: 3 additions & 0 deletions CharLS.sln
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,15 @@ Global
{E09F024E-A125-48AA-8E9D-7D1302BEAC97}.Release|x86.ActiveCfg = Release|Win32
{E09F024E-A125-48AA-8E9D-7D1302BEAC97}.Release|x86.Build.0 = Release|Win32
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|ARM64.ActiveCfg = Checked|ARM64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|ARM64.Build.0 = Checked|ARM64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|x64.ActiveCfg = Checked|x64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Checked|x86.ActiveCfg = Checked|Win32
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|ARM64.ActiveCfg = Debug|ARM64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|ARM64.Build.0 = Debug|ARM64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x64.ActiveCfg = Debug|x64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Debug|x86.ActiveCfg = Debug|Win32
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|ARM64.ActiveCfg = Release|ARM64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|ARM64.Build.0 = Release|ARM64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x64.ActiveCfg = Release|x64
{F961EC29-4ACE-4D5E-B7ED-55681A678A90}.Release|x86.ActiveCfg = Release|Win32
{5637C116-ABF5-4274-A71F-34433713A538}.Checked|ARM64.ActiveCfg = Checked|ARM64
Expand Down
54 changes: 31 additions & 23 deletions benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include <benchmark/benchmark.h>

#include "../src/jpegls_preset_coding_parameters.h"
#include "../src/jpegls_preset_coding_parameters.hpp"

#include <cstdint>
#include <memory>
Expand All @@ -12,7 +12,7 @@
#pragma warning(disable : 26409) // Avoid calling new explicitly (triggered by BENCHMARK macro)


int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const int32_t di) noexcept
static int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const int32_t di) noexcept
{
constexpr int32_t near_lossless{};

Expand All @@ -36,7 +36,7 @@ int8_t quantize_gradient_org(const charls::jpegls_pc_parameters& preset, const i
return 4;
}

std::vector<int8_t> create_quantize_lut_lossless(const int32_t bit_count)
static std::vector<int8_t> create_quantize_lut_lossless(const int32_t bit_count)
{
const charls::jpegls_pc_parameters preset{charls::compute_default((1 << static_cast<uint32_t>(bit_count)) - 1, 0)};
const int32_t range{preset.maximum_sample_value + 1};
Expand Down Expand Up @@ -100,7 +100,8 @@ struct lossless_traits final
};


__declspec(noinline) int32_t get_predicted_value_default(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
static __declspec(noinline) int32_t
get_predicted_value_default(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
{
if (ra < rb)
{
Expand All @@ -126,13 +127,14 @@ __declspec(noinline) int32_t get_predicted_value_default(const int32_t ra, const
constexpr size_t int32_t_bit_count = sizeof(int32_t) * 8;


constexpr int32_t bit_wise_sign(const int32_t i) noexcept
// Shifts i right by (int32_t_bit_count - 1) bits, i.e. moves the sign bit of a
// 32-bit value down to the lowest bit position.
// NOTE(review): the branch-free "sign trick" callers presumably expect -1 for
// negative input and 0 otherwise; that relies on right-shifting a negative
// signed value being an arithmetic shift (implementation-defined before C++20).
static constexpr int32_t bit_wise_sign(const int32_t i) noexcept
{
return i >> (int32_t_bit_count - 1);
}


__declspec(noinline) int32_t get_predicted_value_optimized(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
static __declspec(noinline) int32_t
get_predicted_value_optimized(const int32_t ra, const int32_t rb, const int32_t rc) noexcept
{
// sign trick reduces the number of if statements (branches)
const int32_t sign{bit_wise_sign(rb - ra)};
Expand All @@ -153,7 +155,7 @@ __declspec(noinline) int32_t get_predicted_value_optimized(const int32_t ra, con


#if defined(_M_X64) || defined(_M_ARM64)
inline int countl_zero(const uint64_t value) noexcept
inline static int countl_zero(const uint64_t value) noexcept
{
if (value == 0)
return 64;
Expand Down Expand Up @@ -211,7 +213,7 @@ static void bm_quantize_gradient_lut(benchmark::State& state)
BENCHMARK(bm_quantize_gradient_lut);


int peek_zero_bits(uint64_t val_test) noexcept
static int peek_zero_bits(uint64_t val_test) noexcept
{
for (int32_t count{}; count < 16; ++count)
{
Expand Down Expand Up @@ -254,7 +256,7 @@ BENCHMARK(bm_peek_zero_bits_intrinsic);
#endif


std::vector<uint8_t> allocate_buffer(const size_t size)
static std::vector<uint8_t> allocate_buffer(const size_t size)
{
std::vector<uint8_t> buffer;
buffer.resize(size);
Expand Down Expand Up @@ -306,7 +308,7 @@ class overwrite_buffer
};


overwrite_buffer allocate_overwrite_buffer(const size_t size)
static overwrite_buffer allocate_overwrite_buffer(const size_t size)
{
overwrite_buffer buffer;
buffer.reset(size);
Expand All @@ -324,7 +326,7 @@ static void bm_resize_overwrite_buffer(benchmark::State& state)
BENCHMARK(bm_resize_overwrite_buffer);


int memset_buffer(uint8_t* data, const size_t size)
static int memset_buffer(uint8_t* data, const size_t size) noexcept
{
memset(data, 0, size);
return 0;
Expand All @@ -342,7 +344,7 @@ static void bm_memset_buffer(benchmark::State& state)
BENCHMARK(bm_memset_buffer);


bool has_ff_byte_classic(const unsigned int value)
constexpr static bool has_ff_byte_classic(const unsigned int value) noexcept
{
// Check if any byte is equal to 0xFF
return ((value & 0xFF) == 0xFF) || (((value >> 8) & 0xFF) == 0xFF) || (((value >> 16) & 0xFF) == 0xFF) ||
Expand All @@ -358,7 +360,7 @@ static void bm_has_ff_byte_classic(benchmark::State& state)
}
BENCHMARK(bm_has_ff_byte_classic);

bool has_ff_byte_loop(const unsigned int value)
static bool has_ff_byte_loop(const unsigned int value) noexcept
{
// Iterate over each byte and check if it is equal to 0xFF
for (int i = 0; i < sizeof(unsigned int); ++i)
Expand All @@ -380,7 +382,8 @@ static void bm_has_ff_byte_loop(benchmark::State& state)
}
BENCHMARK(bm_has_ff_byte_loop);

bool has_ff_byte_simd(const unsigned int value) {
#if !defined(_M_ARM64)
static bool has_ff_byte_simd(const unsigned int value) {
// Use SSE instructions for parallel comparison
const __m128i xmm_value = _mm_set1_epi32(value);
const __m128i xmm_ff = _mm_set1_epi32(0xFF);
Expand All @@ -400,9 +403,9 @@ static void bm_has_ff_byte_simd(benchmark::State& state)
}
}
BENCHMARK(bm_has_ff_byte_simd);
#endif


const std::byte* find_jpeg_marker_start_byte(const std::byte* position, const std::byte* end_position) noexcept
static const std::byte* find_jpeg_marker_start_byte(const std::byte* position, const std::byte* end_position) noexcept
{
constexpr std::byte jpeg_marker_start_byte{0xFF};

Expand Down Expand Up @@ -484,7 +487,8 @@ T read_big_endian_unaligned(const void* buffer) noexcept
#endif
}

uint32_t read_all_bytes_with_ff_check(const std::byte* position, const std::byte* end_position)
#if !defined(_M_ARM64)
static uint32_t read_all_bytes_with_ff_check(const std::byte* position, const std::byte* end_position)
{
uint32_t result{};

Expand Down Expand Up @@ -514,9 +518,10 @@ static void bm_read_all_bytes_with_ff_check(benchmark::State& state)
}
}
BENCHMARK(bm_read_all_bytes_with_ff_check);
#endif


bool has_ff_byte_simd64(const uint64_t value)
#if !defined(_M_ARM64)
static bool has_ff_byte_simd64(const uint64_t value)
{
// Use SSE instructions for parallel comparison
const __m128i xmm_value = _mm_set1_epi64x(value);
Expand All @@ -529,7 +534,7 @@ bool has_ff_byte_simd64(const uint64_t value)
return _mm_testz_si128(comparison, comparison) == 0;
}

uint64_t read_all_bytes_with_ff_check64(const std::byte* position, const std::byte* end_position)
static uint64_t read_all_bytes_with_ff_check64(const std::byte* position, const std::byte* end_position)
{
uint64_t result{};

Expand Down Expand Up @@ -557,9 +562,10 @@ static void bm_read_all_bytes_with_ff_check64(benchmark::State& state)
}
}
BENCHMARK(bm_read_all_bytes_with_ff_check64);
#endif


uint32_t read_all_bytes_no_check(const std::byte* position, const std::byte* end_position)
static uint32_t read_all_bytes_no_check(const std::byte* position, const std::byte* end_position) noexcept
{
uint32_t result{};

Expand All @@ -582,7 +588,7 @@ static void bm_read_all_bytes_no_check(benchmark::State& state)
}
BENCHMARK(bm_read_all_bytes_no_check);

uint64_t read_all_bytes_no_check64(const std::byte* position, const std::byte* end_position)
static uint64_t read_all_bytes_no_check64(const std::byte* position, const std::byte* end_position) noexcept
{
uint64_t result{};

Expand All @@ -605,7 +611,9 @@ static void bm_read_all_bytes_no_check64(benchmark::State& state)
}
BENCHMARK(bm_read_all_bytes_no_check64);

// Tips to run the benchmark tests:


// To run a single benchmark:
// benchmark --benchmark_filter=bm_decode

BENCHMARK_MAIN();
4 changes: 4 additions & 0 deletions benchmark/benchmark.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@
<ItemGroup>
<ClCompile Include="benchmark.cpp" />
<ClCompile Include="context_regular_mode.cpp" />
<ClCompile Include="decode.cpp" />
<ClCompile Include="log2.cpp" />
</ItemGroup>
<ItemGroup>
Expand All @@ -187,6 +188,9 @@
<ItemGroup>
<ClInclude Include="context_regular_mode_v220.h" />
</ItemGroup>
<ItemGroup>
<None Include="vcpkg.json" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
Expand Down
6 changes: 6 additions & 0 deletions benchmark/benchmark.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@
<ClCompile Include="log2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="decode.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="context_regular_mode_v220.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="vcpkg.json" />
</ItemGroup>
</Project>
9 changes: 5 additions & 4 deletions benchmark/context_regular_mode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
#include "context_regular_mode_v220.h"

#pragma warning(disable : 26409) // Avoid calling new explicitly (triggered by BENCHMARK macro)
#pragma warning(disable : 4746) // volatile access of 'reset_threshold' is subject to /volatile:<iso|ms> setting; (in ARM64 mode)

using namespace charls;

context_regular_mode g_context;
regular_mode_context g_context;
jls_context_v220 g_context_v220;

volatile int32_t error_value;
Expand All @@ -29,7 +30,7 @@ BENCHMARK(bm_regular_mode_update_variables_220);

static void bm_regular_mode_update_variables(benchmark::State& state)
{
g_context = context_regular_mode();
g_context = regular_mode_context();

for (const auto _ : state)
{
Expand All @@ -52,12 +53,12 @@ BENCHMARK(bm_regular_mode_get_golomb_coding_parameter_v220);

static void bm_regular_mode_get_golomb_coding_parameter(benchmark::State& state)
{
g_context = context_regular_mode();
g_context = regular_mode_context();
g_context.update_variables_and_bias(error_value, near_lossless, reset_threshold);

for (const auto _ : state)
{
benchmark::DoNotOptimize(g_context.get_golomb_coding_parameter());
benchmark::DoNotOptimize(g_context.compute_golomb_coding_parameter());
}
}
BENCHMARK(bm_regular_mode_get_golomb_coding_parameter);
7 changes: 4 additions & 3 deletions benchmark/context_regular_mode_v220.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

#pragma once

#include "../src/context_regular_mode.h"
#include "../src/regular_mode_context.hpp"

#include <cassert>
#include <cstdint>

Expand Down Expand Up @@ -43,7 +44,7 @@ struct jls_context_v220 final
int n{N};

if (constexpr int limit{65536 * 256}; UNLIKELY(a >= limit || std::abs(b) >= limit))
impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
impl::throw_jpegls_error(jpegls_errc::invalid_data);

if (n == reset_threshold)
{
Expand Down Expand Up @@ -92,7 +93,7 @@ struct jls_context_v220 final
}

if (UNLIKELY(k == max_k_value))
impl::throw_jpegls_error(jpegls_errc::invalid_encoded_data);
impl::throw_jpegls_error(jpegls_errc::invalid_data);

return k;
}
Expand Down
74 changes: 74 additions & 0 deletions benchmark/decode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright (c) Team CharLS.
// SPDX-License-Identifier: BSD-3-Clause

#include <benchmark/benchmark.h>

#include "../include/charls/charls.hpp"

#include <filesystem>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <vector>

#pragma warning(disable : 26409) // Avoid calling new explicitly (triggered by BENCHMARK macro)

using namespace charls;
using std::byte;
using std::ifstream;
using std::ios;
using std::vector;

/// Fills the complete storage of destination with bytes read from input.
/// Exactly destination.size() bytes are requested; how a short read is reported
/// depends on the exception mask configured on the stream.
template<typename Container>
void read(std::istream& input, Container& destination)
{
    input.read(reinterpret_cast<char*>(destination.data()), static_cast<std::streamsize>(destination.size()));
}

/// Reads (a part of) a binary file into a byte vector.
///
/// @param filename Path of the file to read.
/// @param offset   Position of the first byte to read. A negative value means
///                 "read the last `bytes` bytes of the file".
/// @param bytes    Number of bytes to read. 0 means "everything from offset up
///                 to the end of the file".
/// @return The requested bytes.
/// @throws std::ifstream::failure when the file cannot be opened, positioned or
///         read; a diagnostic is written to std::cout before rethrowing.
std::vector<std::byte> read_file(const char* filename, long offset = 0, size_t bytes = 0)
try
{
    std::ifstream input;
    input.exceptions(std::ios::eofbit | std::ios::failbit | std::ios::badbit);
    input.open(filename, std::ios::in | std::ios::binary);

    // Determine the file size; keep the full stream offset width to prevent truncation for large files.
    input.seekg(0, std::ios::end);
    const std::streamoff byte_count_file{static_cast<std::streamoff>(input.tellg())};

    // Normalize offset and byte count first: seeking with the raw (negative) offset would fail.
    std::streamoff start{offset};
    if (start < 0)
    {
        start = byte_count_file - static_cast<std::streamoff>(bytes);
    }
    if (bytes == 0)
    {
        bytes = static_cast<size_t>(byte_count_file - start);
    }

    input.seekg(start, std::ios::beg);

    std::vector<std::byte> buffer(bytes);
    read(input, buffer);

    return buffer;
}
catch (const std::ifstream::failure&)
{
    std::cout << "Failed to open/read file: " << std::filesystem::absolute(filename) << "\n";
    throw;
}


// Benchmark: measures a complete decode of a JPEG-LS encoded image.
// The encoded file is loaded once and the output buffer is allocated once;
// only decoder construction and the decode call run inside the timed loop.
static void bm_decode(benchmark::State& state)
{
    const auto encoded = read_file("d:/benchmark-test-image.jls");

    // Size the output buffer up front: std::vector initializes its elements and
    // that cost must stay outside the measured loop.
    const auto decoded_size = jpegls_decoder{encoded, true}.get_destination_size();
    vector<byte> decoded(decoded_size);

    for (const auto _ : state)
    {
        jpegls_decoder image_decoder(encoded.data(), encoded.size());
        image_decoder.decode(decoded);
    }
}
BENCHMARK(bm_decode);
Loading

0 comments on commit 738f77a

Please sign in to comment.