Skip to content

Commit

Permalink
Moved SignExtend into src/include and added bitpacking chunk size con…
Browse files Browse the repository at this point in the history
…stant
  • Loading branch information
benjaminwinger committed Sep 12, 2023
1 parent 2a04d85 commit 0d3327e
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 67 deletions.
8 changes: 5 additions & 3 deletions src/include/storage/copier/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ template<typename T>
class IntegerBitpacking : public CompressionAlg {
static const common::LogicalType LOGICAL_TYPE;
using U = std::make_unsigned_t<T>;
// This is an implementation detail of the fastpfor bitpacking algorithm
static constexpr uint64_t CHUNK_SIZE = 32;

public:
IntegerBitpacking() = default;
Expand All @@ -161,11 +163,11 @@ class IntegerBitpacking : public CompressionAlg {

uint64_t numValuesPerPage(uint8_t bitWidth, uint64_t pageSize) {
auto numValues = pageSize * 8 / bitWidth;
// Round down to nearest multiple of 32 to ensure that we don't write any extra values
// Rounding up could overflow the buffer
// Round down to nearest multiple of CHUNK_SIZE to ensure that we don't write any extra
// values. Rounding up could overflow the buffer.
// TODO(bmwinger): Pack extra values into the space at the end. This will probably be
// slower, but only needs to be done once.
numValues -= numValues % 32;
numValues -= numValues % CHUNK_SIZE;
return numValues;
}

Expand Down
49 changes: 49 additions & 0 deletions src/include/storage/copier/sign_extend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#pragma once

/* Adapted from
https://github.com/duckdb/duckdb/blob/312b9954507386305544a42c4f43c2bd410a64cb/src/include/duckdb/common/bitpacking.hpp#L190-L199
* Copyright 2018-2023 Stichting DuckDB Foundation
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.
*/

#include <string.h>

#include <cstdint>
#include <limits>
#include <type_traits>

namespace kuzu {
namespace storage {

// Copies the object representation of val into the sizeof(T) bytes starting at ptr.
// The copy is done with memcpy, so ptr does not have to be aligned for T.
template<typename T>
void Store(const T& val, uint8_t* ptr) {
    // memcpy takes a const void* source, so the address of val can be passed as-is;
    // no cast (and in particular no cast that strips const) is required.
    memcpy(ptr, &val, sizeof(val));
}

// Reads a T out of the sizeof(T) bytes starting at ptr and returns it by value.
// The bytes are copied with memcpy, so ptr does not have to be aligned for T.
template<typename T>
const T Load(const uint8_t* ptr) {
    T value;
    memcpy(&value, ptr, sizeof(T));
    return value;
}

// Sign bit extension
template<class T, class T_U = typename std::make_unsigned<T>::type, uint64_t CHUNK_SIZE>
static void SignExtend(uint8_t* dst, uint8_t width) {
T const mask = T_U(1) << (width - 1);
for (uint64_t i = 0; i < CHUNK_SIZE; ++i) {
T value = Load<T>(dst + i * sizeof(T));
value = value & ((T_U(1) << width) - T_U(1));
T result = (value ^ mask) - mask;
Store(result, dst + i * sizeof(T));
}
}
} // namespace storage
} // namespace kuzu
2 changes: 1 addition & 1 deletion src/storage/copier/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ set(ALL_OBJECT_FILES
${ALL_OBJECT_FILES} $<TARGET_OBJECTS:kuzu_storage_in_mem_csv_copier>
PARENT_SCOPE)

target_link_libraries(kuzu_storage_in_mem_csv_copier PRIVATE fastpfor duckdb)
target_link_libraries(kuzu_storage_in_mem_csv_copier PRIVATE fastpfor)
23 changes: 11 additions & 12 deletions src/storage/copier/compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
#include "common/exception.h"
#include "common/null_mask.h"
#include "common/vector/value_vector.h"
#include "duckdb/common/bitpacking.hpp"
#include "fastpfor/bitpackinghelpers.h"
#include "storage/copier/sign_extend.h"
#include <bit>

using namespace kuzu::common;
Expand Down Expand Up @@ -113,13 +113,13 @@ void IntegerBitpacking<T>::getValue(const uint8_t* buffer, common::offset_t pos,
const CompressionMetadata& metadata) const {
auto header = BitpackHeader::readHeader(metadata.data);
// TODO(bmwinger): optimize as in setValueFromUncompressed
auto chunkIndex = pos / 32;
auto posInChunk = pos % 32;
auto chunkIndex = pos / BITPACKING_CHUNK_SIZE;
auto posInChunk = pos % BITPACKING_CHUNK_SIZE;

U chunk[32];
U chunk[BITPACKING_CHUNK_SIZE];
FastPForLib::fastunpack(
(const uint32_t*)buffer + chunkIndex * header.bitWidth, chunk, header.bitWidth);
duckdb::SignExtend<T>((uint8_t*)chunk, header.bitWidth);
SignExtend<T, U, CHUNK_SIZE>((uint8_t*)chunk, header.bitWidth);
memcpy(dst, &chunk[posInChunk], sizeof(T));
}

Expand All @@ -132,13 +132,13 @@ uint64_t IntegerBitpacking<T>::compressNextPage(const uint8_t*& srcBuffer,
return 0;
}
auto numValues = std::min(numValuesRemaining, numValuesPerPage(bitWidth, dstBufferSize));
assert(dstBufferSize >= 32 + BitpackHeader::size());
assert(dstBufferSize >= BITPACKING_CHUNK_SIZE + BitpackHeader::size());
assert(dstBufferSize >= BitpackHeader::size() + numValues * bitWidth / 8);
for (auto i = 0ull; i < numValues; i += 32) {
assert(dstBuffer + 32 <= dstBufferEnd);
for (auto i = 0ull; i < numValues; i += CHUNK_SIZE) {
assert(dstBuffer + BITPACKING_CHUNK_SIZE <= dstBufferEnd);
FastPForLib::fastpack((const U*)srcBuffer + i, (uint32_t*)dstBuffer, bitWidth);
// fastpack packs 32 values at a time, i.e. 4 bytes per bit of width.
dstBuffer += bitWidth * 4;
dstBuffer += bitWidth * CHUNK_SIZE / 8;
}
srcBuffer += numValues * sizeof(U);
return numValues * bitWidth / 8;
Expand All @@ -149,17 +149,16 @@ void IntegerBitpacking<T>::decompressFromPage(const uint8_t* srcBuffer, uint64_t
uint8_t* dstBuffer, uint64_t dstOffset, uint64_t numValues,
const CompressionMetadata& metadata) {
auto header = BitpackHeader::readHeader(metadata.data);
auto chunkSize = 32;
// FIXME(bmwinger): will overflow data with fewer than 32 values
// assert(numValues >= chunkSize);
// But most of the time, the buffers are large enough.
// But we should either fix overflows via a slow unpack on the last chunk that works on
// an arbitrary number of values, or assert that the buffers are indeed large enough if possible
for (auto i = 0ull; i < numValues; i += chunkSize) {
for (auto i = 0ull; i < numValues; i += CHUNK_SIZE) {
FastPForLib::fastunpack(
(const uint32_t*)(srcBuffer + i * header.bitWidth), (U*)dstBuffer + i, header.bitWidth);
if (header.hasNegative) {
duckdb::SignExtend<T>(dstBuffer + i, header.bitWidth);
SignExtend<T, U, CHUNK_SIZE>(dstBuffer + i, header.bitWidth);
}
}
}
Expand Down
1 change: 0 additions & 1 deletion third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,3 @@ add_subdirectory(utf8proc)
add_subdirectory(pybind11)
add_subdirectory(re2)
add_subdirectory(fastpfor)
add_subdirectory(duckdb)
3 changes: 0 additions & 3 deletions third_party/duckdb/CMakeLists.txt

This file was deleted.

7 changes: 0 additions & 7 deletions third_party/duckdb/LICENSE

This file was deleted.

40 changes: 0 additions & 40 deletions third_party/duckdb/duckdb/common/bitpacking.hpp

This file was deleted.

0 comments on commit 0d3327e

Please sign in to comment.