diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3b78b17247f..ceae1e985df 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,7 +14,7 @@ add_subdirectory(transaction) add_library(kuzu STATIC ${ALL_OBJECT_FILES}) target_link_libraries(kuzu - PUBLIC antlr4_cypher antlr4_runtime utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads) + PUBLIC antlr4_cypher antlr4_runtime utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads fastpfor) target_include_directories(kuzu PUBLIC $ $) add_library(kuzu_shared SHARED ${ALL_OBJECT_FILES}) @@ -24,6 +24,6 @@ else() set_target_properties(kuzu_shared PROPERTIES OUTPUT_NAME kuzu) endif() target_link_libraries(kuzu_shared - PUBLIC antlr4_cypher antlr4_runtime utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads) + PUBLIC antlr4_cypher antlr4_runtime utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads fastpfor) target_include_directories(kuzu_shared PUBLIC $ $) diff --git a/src/common/file_utils.cpp b/src/common/file_utils.cpp index 1a341d5b2c9..497631984de 100644 --- a/src/common/file_utils.cpp +++ b/src/common/file_utils.cpp @@ -73,7 +73,7 @@ std::unique_ptr FileUtils::openFile(const std::string& path, int flags } void FileUtils::writeToFile( - FileInfo* fileInfo, uint8_t* buffer, uint64_t numBytes, uint64_t offset) { + FileInfo* fileInfo, const uint8_t* buffer, uint64_t numBytes, uint64_t offset) { auto fileSize = fileInfo->getFileSize(); if (fileSize == -1) { throw Exception(StringUtils::string_format("File {} not open.", fileInfo->path)); diff --git a/src/include/common/constants.h b/src/include/common/constants.h index b63172e67af..d5c2c6fc246 100644 --- a/src/include/common/constants.h +++ b/src/include/common/constants.h @@ -7,7 +7,7 @@ namespace kuzu { namespace common { -constexpr char KUZU_VERSION[] = "v0.0.8.7"; +constexpr char KUZU_VERSION[] = "v0.0.8.8"; constexpr uint64_t DEFAULT_VECTOR_CAPACITY_LOG_2 = 11; constexpr uint64_t DEFAULT_VECTOR_CAPACITY = (uint64_t)1 << DEFAULT_VECTOR_CAPACITY_LOG_2; diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 716559077b3..0b621c041e2 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -41,7 +41,7 @@ class FileUtils { static void readFromFile( FileInfo* fileInfo, void* buffer, uint64_t numBytes, uint64_t position); static void writeToFile( - FileInfo* fileInfo, uint8_t* buffer, uint64_t numBytes, uint64_t offset); + FileInfo* fileInfo, const uint8_t* buffer, uint64_t numBytes, uint64_t offset); // This function is a no-op if either file, from or to, does not exist. 
    static void overwriteFile(const std::string& from, const std::string& to);
    static void copyFile(const std::string& from, const std::string& to,
diff --git a/src/include/storage/copier/column_chunk.h b/src/include/storage/copier/column_chunk.h
index b2b6857eb54..9faf2a49d41 100644
--- a/src/include/storage/copier/column_chunk.h
+++ b/src/include/storage/copier/column_chunk.h
@@ -4,6 +4,7 @@
 #include "common/type_utils.h"
 #include "common/types/types.h"
 #include "common/vector/value_vector.h"
+#include "compression.h"
 #include "storage/buffer_manager/bm_file_handle.h"
 #include "storage/wal/wal.h"
 #include "transaction/transaction.h"
@@ -16,6 +17,7 @@ namespace kuzu {
 namespace storage {
 
 class NullColumnChunk;
+class CompressionAlg;
 
 struct BaseColumnChunkMetadata {
     common::page_idx_t pageIdx;
@@ -29,11 +31,13 @@ struct BaseColumnChunkMetadata {
 struct ColumnChunkMetadata : public BaseColumnChunkMetadata {
     uint64_t numValues;
+    CompressionMetadata compMeta;
 
     ColumnChunkMetadata() : BaseColumnChunkMetadata(), numValues{UINT64_MAX} {}
-    ColumnChunkMetadata(
-        common::page_idx_t pageIdx, common::page_idx_t numPages, uint64_t numNodesInChunk)
-        : BaseColumnChunkMetadata{pageIdx, numPages}, numValues(numNodesInChunk) {}
+    ColumnChunkMetadata(common::page_idx_t pageIdx, common::page_idx_t numPages,
+        uint64_t numNodesInChunk, CompressionMetadata compMeta)
+        : BaseColumnChunkMetadata{pageIdx, numPages}, numValues(numNodesInChunk),
+          compMeta(compMeta) {}
 };
 
 struct OverflowColumnChunkMetadata : public BaseColumnChunkMetadata {
@@ -90,7 +94,7 @@ class ColumnChunk {
     virtual void append(
         arrow::Array* array, common::offset_t startPosInChunk, uint32_t numValuesToAppend);
 
-    virtual common::page_idx_t flushBuffer(BMFileHandle* dataFH, common::page_idx_t startPageIdx);
+    ColumnChunkMetadata flushBuffer(BMFileHandle* dataFH, common::page_idx_t startPageIdx);
 
     // Returns the size of the data type in bytes
     static uint32_t getDataTypeSizeInChunk(common::LogicalType& dataType);
@@ -160,6 +164,9 @@ class ColumnChunk {
     std::vector<std::unique_ptr<ColumnChunk>> childrenChunks;
     std::unique_ptr<common::CSVReaderConfig> csvReaderConfig;
     uint64_t numValues;
+    std::function<ColumnChunkMetadata(
+        const uint8_t*, uint64_t, uint64_t, BMFileHandle*, common::page_idx_t)>
+        flushBufferFunction;
 };
 
 template<>
@@ -174,6 +181,7 @@ inline bool ColumnChunk::getValue<bool>(common::offset_t pos) const {
     return common::NullMask::isNull((uint64_t*)buffer.get(), pos);
 }
 
+// Stored as bitpacked booleans in-memory and on-disk
 class BoolColumnChunk : public ColumnChunk {
 public:
     BoolColumnChunk(
diff --git a/src/include/storage/copier/compression.h b/src/include/storage/copier/compression.h
new file mode 100644
index 00000000000..edb41c575aa
--- /dev/null
+++ b/src/include/storage/copier/compression.h
@@ -0,0 +1,312 @@
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "common/types/types.h"
+
+namespace kuzu {
+namespace common {
+class ValueVector;
+}
+
+namespace storage {
+
+struct PageElementCursor;
+
+// Returns the size of the data type in bytes
+uint32_t getDataTypeSizeInChunk(const common::LogicalType& dataType);
+
+// The compression type is written to the data header both so that we can usually catch mistakes
+// such as decompressing uncompressed data, and to allow for runtime-configurable compression in
+// the future.
+enum class CompressionType : uint8_t {
+    UNCOMPRESSED = 0,
+    INTEGER_BITPACKING = 1,
+    BOOLEAN_BITPACKING = 2,
+};
+
+struct CompressionMetadata {
+    CompressionType compression;
+    // Extra data to be used to store codec-specific information
+    uint8_t data;
+    explicit CompressionMetadata(
+        CompressionType compression = CompressionType::UNCOMPRESSED, uint8_t data = 0)
+        : compression{compression}, data{data} {}
+
+    // Returns the number of values which will be stored in the given data size.
+    // This must be consistent with the compression implementation for the given size.
+    uint64_t numValues(uint64_t dataSize, const common::LogicalType& dataType) const;
+};
+
+class CompressionAlg {
+public:
+    virtual ~CompressionAlg() = default;
+
+    // Takes a single uncompressed value from the srcBuffer and compresses it into the dstBuffer.
+    // Offsets refer to value offsets, not byte offsets.
+    virtual void setValueFromUncompressed(uint8_t* srcBuffer, common::offset_t posInSrc,
+        uint8_t* dstBuffer, common::offset_t posInDst,
+        const CompressionMetadata& metadata) const = 0;
+
+    // Reads a value from the buffer at the given position and stores it at the given memory
+    // address. dst should point to an uncompressed value.
+    virtual inline void getValue(const uint8_t* buffer, common::offset_t posInBuffer, uint8_t* dst,
+        common::offset_t posInDst, const CompressionMetadata& metadata) const = 0;
+
+    // TODO(bmwinger): this should probably be scoped, e.g. by having a separate class for handling
+    // compression which is returned by the compress function.
+    // Called when compression starts; will always be called before compressNextPage.
+    // Returns the compression metadata, which determines the number of values per page
+    // (currently this must be consistent across a ColumnChunk).
+    virtual CompressionMetadata startCompression(
+        const uint8_t* srcBuffer, uint64_t numValues) const = 0;
+
+    // Takes uncompressed data from the srcBuffer and compresses it into the dstBuffer.
+    //
+    // Stores only as much data in dstBuffer as will fit, and advances the srcBuffer pointer
+    // to the beginning of the next value to store.
+    // (This means that we can't start the next page on an unaligned value.
+    // Maybe instead we could use value offsets, but the compression algorithms
+    // usually work on aligned chunks anyway.)
+    //
+    // dstBufferSize is the size in bytes.
+    // numValuesRemaining is the number of values remaining in the srcBuffer to be compressed.
+    // compressNextPage must store the lesser of the number of values per page
+    // (as determined by startCompression) and the remaining number of values.
+    //
+    // Returns the size in bytes of the compressed data within the page (rounded up to the nearest
+    // byte).
+    virtual uint64_t compressNextPage(const uint8_t*& srcBuffer, uint64_t numValuesRemaining,
+        uint8_t* dstBuffer, uint64_t dstBufferSize,
+        const struct CompressionMetadata& metadata) const = 0;
+
+    // Takes compressed data from the srcBuffer and decompresses it into the dstBuffer.
+    // Offsets refer to value offsets, not byte offsets.
+    // srcBuffer points to the beginning of a page.
+    virtual void decompressFromPage(const uint8_t* srcBuffer, uint64_t srcOffset,
+        uint8_t* dstBuffer, uint64_t dstOffset, uint64_t numValues,
+        const CompressionMetadata& metadata) const = 0;
+};
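// --- Editorial aside (illustration only, not part of this patch): a minimal sketch of how a
// caller is expected to drive the CompressionAlg interface above, assuming hypothetical names
// `alg`, `PAGE_SIZE`, `dataType` and `writePage`; the real flush loop is CompressedFlushBuffer
// in column_chunk.cpp further below.
//
//   const uint8_t* src = chunkBuffer;                        // uncompressed values
//   uint64_t remaining = numValuesInChunk;
//   auto metadata = alg.startCompression(src, remaining);    // e.g. picks a bit width
//   auto valuesPerPage = metadata.numValues(PAGE_SIZE, dataType);
//   while (remaining > 0) {
//       uint8_t page[PAGE_SIZE];
//       // Compresses min(valuesPerPage, remaining) values and advances `src`.
//       auto compressedBytes = alg.compressNextPage(src, remaining, page, PAGE_SIZE, metadata);
//       writePage(page, compressedBytes);
//       remaining -= std::min<uint64_t>(valuesPerPage, remaining);
//   }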
+
+// Compression alg which does not compress values and instead just copies them.
+class CopyCompression : public CompressionAlg {
+public:
+    explicit CopyCompression(const common::LogicalType& logicalType)
+        : logicalType{logicalType}, numBytesPerValue{getDataTypeSizeInChunk(this->logicalType)} {
+        assert(numBytesPerValue > 0);
+    }
+
+    CopyCompression(const CopyCompression&) = default;
+
+    inline void setValueFromUncompressed(uint8_t* srcBuffer, common::offset_t posInSrc,
+        uint8_t* dstBuffer, common::offset_t posInDst,
+        const CompressionMetadata& metadata) const final {
+        memcpy(dstBuffer + posInDst * numBytesPerValue, srcBuffer + posInSrc * numBytesPerValue,
+            numBytesPerValue);
+    }
+
+    inline void getValue(const uint8_t* buffer, common::offset_t posInBuffer, uint8_t* dst,
+        common::offset_t posInDst, const CompressionMetadata& metadata) const override {
+        memcpy(dst + posInDst * numBytesPerValue, buffer + posInBuffer * numBytesPerValue,
+            numBytesPerValue);
+    }
+
+    static inline uint64_t numValues(uint64_t dataSize, const common::LogicalType& logicalType) {
+        return dataSize / getDataTypeSizeInChunk(logicalType);
+    }
+
+    inline CompressionMetadata startCompression(
+        const uint8_t* srcBuffer, uint64_t numValues) const override {
+        return CompressionMetadata();
+    }
+
+    inline uint64_t compressNextPage(const uint8_t*& srcBuffer, uint64_t numValuesRemaining,
+        uint8_t* dstBuffer, uint64_t dstBufferSize,
+        const struct CompressionMetadata& metadata) const override {
+        uint64_t numValues = std::min(numValuesRemaining, dstBufferSize / numBytesPerValue);
+        uint64_t sizeToCopy = numValues * numBytesPerValue;
+        assert(sizeToCopy <= dstBufferSize);
+        std::memcpy(dstBuffer, srcBuffer, sizeToCopy);
+        srcBuffer += sizeToCopy;
+        return sizeToCopy;
+    }
+
+    void decompressFromPage(const uint8_t* srcBuffer, uint64_t srcOffset, uint8_t* dstBuffer,
+        uint64_t dstOffset, uint64_t numValues,
+        const CompressionMetadata& metadata) const override {
+        std::memcpy(dstBuffer + dstOffset * numBytesPerValue,
+            srcBuffer + srcOffset * numBytesPerValue, numValues * numBytesPerValue);
+    }
+
+protected:
+    common::LogicalType logicalType;
+    const uint32_t numBytesPerValue;
+};
+
+// Six bits are needed for the bit width (fewer would suffice for smaller types, but the header
+// byte layout is kept the same for simplicity). One bit (the eighth) indicates whether there are
+// negative values. The seventh bit is unused.
+struct BitpackHeader {
+    uint8_t bitWidth;
+    bool hasNegative;
+    static const uint8_t NEGATIVE_FLAG = 0b10000000;
+    static const uint8_t BITWIDTH_MASK = 0b01111111;
+
+    uint8_t getDataByte() const {
+        uint8_t data = bitWidth;
+        if (hasNegative) {
+            data |= NEGATIVE_FLAG;
+        }
+        return data;
+    }
+
+    static BitpackHeader readHeader(uint8_t data);
+};
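// --- Editorial aside (illustration only, not part of this patch): how the single header byte
// above encodes a chunk. For int32 values in [-5, 100]: bit_width(100) = 7, plus one extra bit
// because negatives are stored in two's complement, so bitWidth = 8 and
//   getDataByte() = 0b0000'1000 | NEGATIVE_FLAG = 0b1000'1000,
// and BitpackHeader::readHeader(0b1000'1000) recovers {bitWidth = 8, hasNegative = true}.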
+
+template<typename T>
+class IntegerBitpacking : public CompressionAlg {
+    using U = std::make_unsigned_t<T>;
+    // This is an implementation detail of the fastpfor bitpacking algorithm
+    static constexpr uint64_t CHUNK_SIZE = 32;
+
+public:
+    IntegerBitpacking() = default;
+    IntegerBitpacking(const IntegerBitpacking&) = default;
+
+    void setValueFromUncompressed(uint8_t* srcBuffer, common::offset_t posInSrc,
+        uint8_t* dstBuffer, common::offset_t posInDst,
+        const CompressionMetadata& metadata) const final;
+
+    // Read a single value from the buffer
+    void getValue(const uint8_t* buffer, common::offset_t posInBuffer, uint8_t* dst,
+        common::offset_t posInDst, const CompressionMetadata& metadata) const final;
+
+    BitpackHeader getBitWidth(const uint8_t* srcBuffer, uint64_t numValues) const;
+
+    static inline uint64_t numValues(uint64_t dataSize, const BitpackHeader& header) {
+        if (header.bitWidth == 0) {
+            return UINT64_MAX;
+        }
+        auto numValues = dataSize * 8 / header.bitWidth;
+        // Round down to the nearest multiple of CHUNK_SIZE to ensure that we don't write any
+        // extra values. Rounding up could overflow the buffer.
+        // TODO(bmwinger): Pack extra values into the space at the end. This will probably be
+        // slower, but only needs to be done once.
+        numValues -= numValues % CHUNK_SIZE;
+        return numValues;
+    }
+
+    CompressionMetadata startCompression(
+        const uint8_t* srcBuffer, uint64_t numValues) const override {
+        auto header = getBitWidth(srcBuffer, numValues);
+        CompressionMetadata metadata{CompressionType::INTEGER_BITPACKING, header.getDataByte()};
+        return metadata;
+    }
+
+    uint64_t compressNextPage(const uint8_t*& srcBuffer, uint64_t numValuesRemaining,
+        uint8_t* dstBuffer, uint64_t dstBufferSize,
+        const struct CompressionMetadata& metadata) const final;
+
+    void decompressFromPage(const uint8_t* srcBuffer, uint64_t srcOffset, uint8_t* dstBuffer,
+        uint64_t dstOffset, uint64_t numValues,
+        const struct CompressionMetadata& metadata) const final;
+
+protected:
+    // Read multiple values from within a chunk. Cannot span multiple chunks.
+    void getValues(const uint8_t* chunkStart, uint8_t pos, uint8_t* dst, uint8_t numValuesToRead,
+        const BitpackHeader& header) const;
+
+    inline const uint8_t* getChunkStart(
+        const uint8_t* buffer, uint64_t pos, uint8_t bitWidth) const {
+        // The order of operations is important so that pos is rounded down to a multiple of
+        // CHUNK_SIZE.
+        return buffer + (pos / CHUNK_SIZE) * bitWidth * CHUNK_SIZE / 8;
+    }
+};
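// --- Editorial aside (illustration only, not part of this patch): size arithmetic for the
// chunked layout above, assuming a 4KB (4096-byte) page and bitWidth = 13:
//   4096 * 8 / 13          = 2520 values before rounding
//   2520 - (2520 % 32)     = 2496 values per page (a whole number of 32-value chunks)
//   getChunkStart(buf, 100, 13) = buf + (100 / 32) * 13 * 32 / 8 = buf + 156,
// i.e. value 100 lives in the fourth chunk, each chunk occupying 13 * 32 / 8 = 52 bytes.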
+
+class BooleanBitpacking : public CompressionAlg {
+public:
+    BooleanBitpacking() = default;
+    BooleanBitpacking(const BooleanBitpacking&) = default;
+
+    void setValueFromUncompressed(uint8_t* srcBuffer, common::offset_t posInSrc,
+        uint8_t* dstBuffer, common::offset_t posInDst,
+        const CompressionMetadata& metadata) const final;
+
+    void getValue(const uint8_t* buffer, common::offset_t posInBuffer, uint8_t* dst,
+        common::offset_t posInDst, const CompressionMetadata& metadata) const final;
+
+    static inline uint64_t numValues(uint64_t dataSize) { return dataSize * 8; }
+
+    inline CompressionMetadata startCompression(
+        const uint8_t* srcBuffer, uint64_t numValues) const override {
+        return CompressionMetadata{CompressionType::BOOLEAN_BITPACKING, 0};
+    }
+    uint64_t compressNextPage(const uint8_t*& srcBuffer, uint64_t numValuesRemaining,
+        uint8_t* dstBuffer, uint64_t dstBufferSize,
+        const struct CompressionMetadata& metadata) const final;
+
+    void decompressFromPage(const uint8_t* srcBuffer, uint64_t srcOffset, uint8_t* dstBuffer,
+        uint64_t dstOffset, uint64_t numValues, const CompressionMetadata& metadata) const final;
+};
+
+class CompressedFunctor {
+public:
+    CompressedFunctor(const CompressedFunctor&) = default;
+
+protected:
+    explicit CompressedFunctor(const common::LogicalType& logicalType) : copy{logicalType} {
+        switch (logicalType.getLogicalTypeID()) {
+        case common::LogicalTypeID::TIMESTAMP:
+        case common::LogicalTypeID::INT64: {
+            integerBitpacking = std::make_unique<IntegerBitpacking<int64_t>>();
+            break;
+        }
+        case common::LogicalTypeID::DATE:
+        case common::LogicalTypeID::INT32: {
+            integerBitpacking = std::make_unique<IntegerBitpacking<int32_t>>();
+            break;
+        }
+        default: {
+        }
+        }
+    }
+    const CopyCompression copy;
+    const BooleanBitpacking booleanBitpacking;
+    // Columns may use different compression types per chunk, but a column always has the same
+    // physical type, so we only need one integer bitpacking instance; it can't be hardcoded,
+    // though, if we want this to work for other logical types too.
+    std::shared_ptr<CompressionAlg> integerBitpacking;
+};
+
+class ReadCompressedValuesFromPage : public CompressedFunctor {
+public:
+    explicit ReadCompressedValuesFromPage(const common::LogicalType& logicalType)
+        : CompressedFunctor(logicalType) {}
+    ReadCompressedValuesFromPage(const ReadCompressedValuesFromPage&) = default;
+
+    void operator()(uint8_t* frame, PageElementCursor& pageCursor,
+        common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead,
+        const CompressionMetadata& metadata);
+};
+
+class LookupCompressedValueInPage : public CompressedFunctor {
+public:
+    explicit LookupCompressedValueInPage(const common::LogicalType& logicalType)
+        : CompressedFunctor(logicalType) {}
+    LookupCompressedValueInPage(const LookupCompressedValueInPage&) = default;
+
+    void operator()(uint8_t* frame, PageElementCursor& pageCursor, uint8_t* result,
+        uint32_t posInResult, const CompressionMetadata& metadata);
+};
+
+class WriteCompressedValueToPage : public CompressedFunctor {
+public:
+    explicit WriteCompressedValueToPage(const common::LogicalType& logicalType)
+        : CompressedFunctor(logicalType) {}
+    WriteCompressedValueToPage(const WriteCompressedValueToPage&) = default;
+
+    void operator()(uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector,
+        uint32_t posInVector, const CompressionMetadata& metadata);
+};
+
+} // namespace storage
+} // namespace kuzu
diff --git a/src/include/storage/copier/sign_extend.h b/src/include/storage/copier/sign_extend.h
new file mode 100644
index 00000000000..90a428f1f34
--- /dev/null
+++ b/src/include/storage/copier/sign_extend.h
@@ -0,0 +1,49 @@
+#pragma once
+
+/* Adapted from
+https://github.com/duckdb/duckdb/blob/312b9954507386305544a42c4f43c2bd410a64cb/src/include/duckdb/common/bitpacking.hpp#L190-L199
+ * Copyright 2018-2023 Stichting DuckDB Foundation
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge, publish, distribute,
+sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+*/
+
+#include <cstdint>
+
+#include <cstring>
+#include <type_traits>
+
+namespace kuzu {
+namespace storage {
+
+template<class T>
+void Store(const T& val, uint8_t* ptr) {
+    memcpy(ptr, (void*)&val, sizeof(val));
+}
+
+template<class T>
+const T Load(const uint8_t* ptr) {
+    T ret;
+    memcpy(&ret, ptr, sizeof(ret));
+    return ret;
+}
+
+// Sign bit extension
+template<class T, class T_U = typename std::make_unsigned<T>::type, uint64_t CHUNK_SIZE>
+static void SignExtend(uint8_t* dst, uint8_t width) {
+    T const mask = T_U(1) << (width - 1);
+    for (uint64_t i = 0; i < CHUNK_SIZE; ++i) {
+        T value = Load<T>(dst + i * sizeof(T));
+        value = value & ((T_U(1) << width) - T_U(1));
+        T result = (value ^ mask) - mask;
+        Store<T>(result, dst + i * sizeof(T));
+    }
+}
+} // namespace storage
+} // namespace kuzu
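// --- Editorial aside (illustration only, not part of this patch): what SignExtend computes for
// one value, using width = 3. The stored 3-bit pattern 0b101 is -3 in two's complement:
//   mask   = 1 << (3 - 1)              = 0b100
//   value  = 0b101 & ((1 << 3) - 1)    = 0b101
//   result = (0b101 ^ 0b100) - 0b100   = 0b001 - 0b100 = -3
// so the low `width` bits are kept and the upper bits are filled with the sign bit.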
diff --git a/src/include/storage/copier/var_list_column_chunk.h b/src/include/storage/copier/var_list_column_chunk.h
index f0aedc5cc61..17ea01e2653 100644
--- a/src/include/storage/copier/var_list_column_chunk.h
+++ b/src/include/storage/copier/var_list_column_chunk.h
@@ -45,6 +45,8 @@ class VarListColumnChunk : public ColumnChunk {
     void append(common::ValueVector* vector, common::offset_t startPosInChunk) final;
 
     inline void resizeDataColumnChunk(uint64_t numBytesForBuffer) {
+        // TODO(bmwinger): This won't work properly for booleans (it will be one eighth as many
+        // values as could fit)
         varListDataColumnChunk.resizeBuffer(
             numBytesForBuffer / varListDataColumnChunk.dataColumnChunk->getNumBytesPerValue());
     }
diff --git a/src/include/storage/store/node_column.h b/src/include/storage/store/node_column.h
index 4142ebf5263..053279c95b9 100644
--- a/src/include/storage/store/node_column.h
+++ b/src/include/storage/store/node_column.h
@@ -11,38 +11,20 @@ namespace transaction {
 class TransactionTests;
 }
 
+namespace copier {
+struct CompressionMetadata;
+}
+
 namespace storage {
 
 using read_node_column_func_t = std::function<void(uint8_t* frame, PageElementCursor& pageCursor,
-    common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead)>;
-using write_node_column_func_t = std::function<void(
-    uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector)>;
-
-struct FixedSizedNodeColumnFunc {
-    static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor,
-        common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead);
-    static void writeValueToPage(
-        uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVecto);
-
-    static void readInternalIDValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor,
-        common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead);
-    static void writeInternalIDValueToPage(
-        uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVecto);
-};
+    common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead,
+    const CompressionMetadata& metadata)>;
+using write_node_column_func_t = std::function<void(uint8_t* frame, uint16_t posInFrame,
+    common::ValueVector* vector, uint32_t posInVector, const CompressionMetadata& metadata)>;
 
-struct NullNodeColumnFunc {
-    static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor,
-        common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead);
-    static void writeValueToPage(
-        uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector);
-};
-
-struct BoolNodeColumnFunc {
-    static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor,
-        common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead);
-    static void writeValueToPage(
-        uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector);
-};
+using lookup_node_column_func_t = std::function<void(uint8_t* frame, PageElementCursor& pageCursor,
+    uint8_t* result, uint32_t posInResult, const CompressionMetadata& metadata)>;
 
 class NullNodeColumn;
 class StructNodeColumn;
@@ -56,13 +38,10 @@ class NodeColumn {
     friend class StructNodeColumn;
 
 public:
-    NodeColumn(const
catalog::Property& property, BMFileHandle* dataFH, BMFileHandle* metadataFH, - BufferManager* bufferManager, WAL* wal, transaction::Transaction* transaction, - RWPropertyStats propertyStatistics, bool requireNullColumn = true); NodeColumn(common::LogicalType dataType, const catalog::MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal, transaction::Transaction* transaction, RWPropertyStats PropertyStatistics, - bool requireNullColumn); + bool requireNullColumn = true); virtual ~NodeColumn() = default; // Expose for feature store @@ -103,9 +82,11 @@ class NodeColumn { virtual void scanInternal(transaction::Transaction* transaction, common::ValueVector* nodeIDVector, common::ValueVector* resultVector); void scanUnfiltered(transaction::Transaction* transaction, PageElementCursor& pageCursor, - uint64_t numValuesToScan, common::ValueVector* resultVector, uint64_t startPosInVector = 0); + uint64_t numValuesToScan, common::ValueVector* resultVector, + const CompressionMetadata& compMeta, uint64_t startPosInVector = 0); void scanFiltered(transaction::Transaction* transaction, PageElementCursor& pageCursor, - common::ValueVector* nodeIDVector, common::ValueVector* resultVector); + common::ValueVector* nodeIDVector, common::ValueVector* resultVector, + const CompressionMetadata& compMeta); virtual void lookupInternal(transaction::Transaction* transaction, common::ValueVector* nodeIDVector, common::ValueVector* resultVector); virtual void lookupValue(transaction::Transaction* transaction, common::offset_t nodeOffset, @@ -133,8 +114,9 @@ class NodeColumn { protected: StorageStructureID storageStructureID; common::LogicalType dataType; + // TODO(bmwinger): Remove. Only used by var_list_column_chunk for something which should be + // rewritten uint32_t numBytesPerFixedSizedValue; - uint32_t numValuesPerPage; BMFileHandle* dataFH; BMFileHandle* metadataFH; BufferManager* bufferManager; @@ -144,6 +126,7 @@ class NodeColumn { std::vector> childrenColumns; read_node_column_func_t readNodeColumnFunc; write_node_column_func_t writeNodeColumnFunc; + lookup_node_column_func_t lookupNodeColumnFunc; RWPropertyStats propertyStatistics; }; @@ -153,9 +136,6 @@ class BoolNodeColumn : public NodeColumn { BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal, transaction::Transaction* transaction, RWPropertyStats propertyStatistics, bool requireNullColumn = true); - - void batchLookup(transaction::Transaction* transaction, const common::offset_t* nodeOffsets, - size_t size, uint8_t* result) final; }; class NullNodeColumn : public NodeColumn { diff --git a/src/include/storage/store/string_node_column.h b/src/include/storage/store/string_node_column.h index 34fc7a6f36b..1b4ada6aba3 100644 --- a/src/include/storage/store/string_node_column.h +++ b/src/include/storage/store/string_node_column.h @@ -7,8 +7,8 @@ namespace kuzu { namespace storage { struct StringNodeColumnFunc { - static void writeStringValuesToPage( - uint8_t* frame, uint16_t posInFrame, common::ValueVector* vector, uint32_t posInVector); + static void writeStringValuesToPage(uint8_t* frame, uint16_t posInFrame, + common::ValueVector* vector, uint32_t posInVector, const CompressionMetadata& metadata); }; class StringNodeColumn : public NodeColumn { diff --git a/src/storage/copier/CMakeLists.txt b/src/storage/copier/CMakeLists.txt index 1d341338e3f..4f4134365ef 100644 --- a/src/storage/copier/CMakeLists.txt +++ b/src/storage/copier/CMakeLists.txt @@ -1,6 +1,7 @@ 
add_library(kuzu_storage_in_mem_csv_copier OBJECT column_chunk.cpp + compression.cpp node_group.cpp npy_reader.cpp struct_column_chunk.cpp @@ -11,3 +12,5 @@ add_library(kuzu_storage_in_mem_csv_copier set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ PARENT_SCOPE) + +target_link_libraries(kuzu_storage_in_mem_csv_copier PRIVATE fastpfor) diff --git a/src/storage/copier/column_chunk.cpp b/src/storage/copier/column_chunk.cpp index e2d1dd938bf..3d959832d82 100644 --- a/src/storage/copier/column_chunk.cpp +++ b/src/storage/copier/column_chunk.cpp @@ -2,6 +2,7 @@ #include "arrow/array.h" #include "common/types/value/nested.h" +#include "storage/copier/compression.h" #include "storage/copier/string_column_chunk.h" #include "storage/copier/struct_column_chunk.h" #include "storage/copier/table_copy_utils.h" @@ -14,6 +15,59 @@ using namespace kuzu::transaction; namespace kuzu { namespace storage { +ColumnChunkMetadata fixedSizedFlushBuffer(const uint8_t* buffer, uint64_t bufferSize, + uint64_t numValues, BMFileHandle* dataFH, page_idx_t startPageIdx) { + FileUtils::writeToFile(dataFH->getFileInfo(), buffer, bufferSize, + startPageIdx * BufferPoolConstants::PAGE_4KB_SIZE); + return ColumnChunkMetadata(startPageIdx, ColumnChunk::getNumPagesForBytes(bufferSize), + numValues, CompressionMetadata()); +} + +ColumnChunkMetadata booleanFlushBuffer(const uint8_t* buffer, uint64_t bufferSize, + uint64_t numValues, BMFileHandle* dataFH, page_idx_t startPageIdx) { + // Since we compress into memory, storage is the same as fixed-sized values, + // but we need to mark it as being boolean compressed. + FileUtils::writeToFile(dataFH->getFileInfo(), buffer, bufferSize, + startPageIdx * BufferPoolConstants::PAGE_4KB_SIZE); + return ColumnChunkMetadata(startPageIdx, ColumnChunk::getNumPagesForBytes(bufferSize), + numValues, CompressionMetadata(CompressionType::BOOLEAN_BITPACKING)); +} + +class CompressedFlushBuffer { + std::shared_ptr alg; + const LogicalType& dataType; + +public: + CompressedFlushBuffer(std::unique_ptr alg, LogicalType& dataType) + : alg{std::move(alg)}, dataType{dataType} {} + + CompressedFlushBuffer(const CompressedFlushBuffer& other) = default; + + ColumnChunkMetadata operator()(const uint8_t* buffer, uint64_t bufferSize, uint64_t numValues, + BMFileHandle* dataFH, page_idx_t startPageIdx) { + int64_t valuesRemaining = numValues; + const uint8_t* bufferStart = buffer; + auto compressedBuffer = std::make_unique(BufferPoolConstants::PAGE_4KB_SIZE); + auto numPages = 0; + auto metadata = alg->startCompression(buffer, numValues); + auto numValuesPerPage = metadata.numValues(BufferPoolConstants::PAGE_4KB_SIZE, dataType); + do { + auto compressedSize = alg->compressNextPage(bufferStart, valuesRemaining, + compressedBuffer.get(), BufferPoolConstants::PAGE_4KB_SIZE, metadata); + // Avoid underflows + if (numValuesPerPage > valuesRemaining) { + valuesRemaining = 0; + } else { + valuesRemaining -= numValuesPerPage; + } + FileUtils::writeToFile(dataFH->getFileInfo(), compressedBuffer.get(), compressedSize, + (startPageIdx + numPages) * BufferPoolConstants::PAGE_4KB_SIZE); + numPages++; + } while (valuesRemaining > 0); + return ColumnChunkMetadata(startPageIdx, numPages, numValues, metadata); + } +}; + ColumnChunk::ColumnChunk( LogicalType dataType, std::unique_ptr csvReaderConfig, bool hasNullChunk) : dataType{std::move(dataType)}, numBytesPerValue{getDataTypeSizeInChunk(this->dataType)}, @@ -21,6 +75,15 @@ ColumnChunk::ColumnChunk( if (hasNullChunk) { nullChunk = std::make_unique(); } + switch 
(this->dataType.getPhysicalType()) {
+    case PhysicalTypeID::BOOL: {
+        flushBufferFunction = booleanFlushBuffer;
+        break;
+    }
+    default: {
+        flushBufferFunction = fixedSizedFlushBuffer;
+    }
+    }
 }
 
 void ColumnChunk::initialize(offset_t capacity) {
@@ -335,10 +398,8 @@ page_idx_t ColumnChunk::getNumPages() const {
     return numPagesToFlush;
 }
 
-page_idx_t ColumnChunk::flushBuffer(BMFileHandle* dataFH, page_idx_t startPageIdx) {
-    FileUtils::writeToFile(dataFH->getFileInfo(), buffer.get(), bufferSize,
-        startPageIdx * BufferPoolConstants::PAGE_4KB_SIZE);
-    return getNumPagesForBuffer();
+ColumnChunkMetadata ColumnChunk::flushBuffer(BMFileHandle* dataFH, page_idx_t startPageIdx) {
+    return flushBufferFunction(buffer.get(), bufferSize, numValues, dataFH, startPageIdx);
 }
 
 uint32_t ColumnChunk::getDataTypeSizeInChunk(LogicalType& dataType) {
diff --git a/src/storage/copier/compression.cpp b/src/storage/copier/compression.cpp
new file mode 100644
index 00000000000..697ee2717cf
--- /dev/null
+++ b/src/storage/copier/compression.cpp
@@ -0,0 +1,312 @@
+#include "storage/copier/compression.h"
+
+#include <cstdlib>
+
+#include <cstring>
+
+#include "arrow/array.h"
+#include "common/exception/not_implemented.h"
+#include "common/exception/storage.h"
+#include "common/null_mask.h"
+#include "common/types/types.h"
+#include "common/vector/value_vector.h"
+#include "fastpfor/bitpackinghelpers.h"
+#include "storage/copier/sign_extend.h"
+#include "storage/store/node_column.h"
+#include <bit>
+
+using namespace kuzu::common;
+namespace arrow {
+class Array;
+}
+
+namespace kuzu {
+namespace storage {
+
+uint32_t getDataTypeSizeInChunk(const common::LogicalType& dataType) {
+    using namespace common;
+    switch (dataType.getLogicalTypeID()) {
+    case LogicalTypeID::STRUCT: {
+        return 0;
+    }
+    case LogicalTypeID::STRING: {
+        return sizeof(ku_string_t);
+    }
+    case LogicalTypeID::VAR_LIST: {
+        return sizeof(offset_t);
+    }
+    case LogicalTypeID::INTERNAL_ID: {
+        return sizeof(offset_t);
+    }
+    case LogicalTypeID::SERIAL: {
+        return sizeof(int64_t);
+    }
+    default: {
+        auto size = StorageUtils::getDataTypeSize(dataType);
+        assert(size <= BufferPoolConstants::PAGE_4KB_SIZE);
+        return size;
+    }
+    }
+}
+
+uint64_t CompressionMetadata::numValues(uint64_t pageSize, const LogicalType& dataType) const {
+    switch (compression) {
+    case CompressionType::UNCOMPRESSED: {
+        return CopyCompression::numValues(pageSize, dataType);
+    }
+    case CompressionType::INTEGER_BITPACKING: {
+        switch (dataType.getPhysicalType()) {
+        case PhysicalTypeID::INT64:
+            return IntegerBitpacking<int64_t>::numValues(pageSize, BitpackHeader::readHeader(data));
+        case PhysicalTypeID::INT32:
+            return IntegerBitpacking<int32_t>::numValues(pageSize, BitpackHeader::readHeader(data));
+        default: {
+            throw common::StorageException(
+                "Attempted to read from a column chunk which uses integer bitpacking but does not "
+                "have a supported integer physical type: " +
+                PhysicalTypeUtils::physicalTypeToString(dataType.getPhysicalType()));
+        }
+        }
+    }
+    case CompressionType::BOOLEAN_BITPACKING: {
+        return BooleanBitpacking::numValues(pageSize);
+    }
+    default: {
+        throw common::StorageException(
+            "Unknown compression type with ID " + std::to_string((uint8_t)compression));
+    }
+    }
+}
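// --- Editorial aside (illustration only, not part of this patch): the read path below uses
// CompressionMetadata::numValues to turn an offset within a node group into a page cursor
// (see NodeColumn::getPageCursorForOffset later in this diff). With bitWidth = 13 a 4KB page
// holds 2496 values, so offset 10000 maps to:
//   pageIdx (relative to the chunk's start page) = 10000 / 2496 = 4
//   elemPosInPage                                = 10000 % 2496 = 16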
+
+template<typename T>
+BitpackHeader IntegerBitpacking<T>::getBitWidth(
+    const uint8_t* srcBuffer, uint64_t numValues) const {
+    auto max = 0ull;
+    auto hasNegative = false;
+    for (int i = 0; i < numValues; i++) {
+        T value = ((T*)srcBuffer)[i];
+        auto abs = std::abs(value);
+        if (abs > max) {
+            max = abs;
+        }
+        if (value < 0) {
+            hasNegative = true;
+        }
+    }
+    if (hasNegative) {
+        // Needs an extra bit for two's complement encoding
+        return BitpackHeader{static_cast<uint8_t>(std::bit_width(max) + 1), true};
+    } else {
+        return BitpackHeader{static_cast<uint8_t>(std::bit_width(max)), false};
+    }
+}
+
+template<typename T>
+void IntegerBitpacking<T>::setValueFromUncompressed(uint8_t* srcBuffer, common::offset_t posInSrc,
+    uint8_t* dstBuffer, common::offset_t posInDst, const CompressionMetadata& metadata) const {
+    auto header = BitpackHeader::readHeader(metadata.data);
+    // This is a fairly naive implementation which uses fastunpack/fastpack
+    // to modify the data by decompressing/compressing a single chunk of values.
+    //
+    // TODO(bmwinger): modify the data in-place
+    //
+    // Data can be considered to be stored in aligned chunks of 32 values
+    // with a size of 32 * bitWidth bits,
+    // or bitWidth 32-bit values (we cast the buffer to a uint32_t* later).
+    auto chunkStart = getChunkStart(dstBuffer, posInDst, header.bitWidth);
+    auto posInChunk = posInDst % CHUNK_SIZE;
+    auto value = ((T*)srcBuffer)[posInSrc];
+    // If there are negatives, the effective bit width is smaller
+    auto valueSize = std::bit_width((U)std::abs(value));
+    if (!header.hasNegative && value < 0) {
+        throw NotImplementedException(
+            "Setting negative values to a chunk stored without negatives is not implemented yet");
+    }
+    if ((header.hasNegative && valueSize > header.bitWidth - 1) ||
+        (!header.hasNegative && valueSize > header.bitWidth)) {
+        throw NotImplementedException(
+            "Setting values larger than the bit width is not implemented yet");
+    }
+
+    U chunk[CHUNK_SIZE];
+    FastPForLib::fastunpack((const uint32_t*)chunkStart, chunk, header.bitWidth);
+    chunk[posInChunk] = (U)value;
+    FastPForLib::fastpack(chunk, (uint32_t*)chunkStart, header.bitWidth);
+}
+
+template<typename T>
+void IntegerBitpacking<T>::getValues(const uint8_t* chunkStart, uint8_t pos, uint8_t* dst,
+    uint8_t numValuesToRead, const BitpackHeader& header) const {
+    // TODO(bmwinger): optimize as in setValueFromUncompressed
+    assert(pos + numValuesToRead <= CHUNK_SIZE);
+
+    U chunk[CHUNK_SIZE];
+    FastPForLib::fastunpack((const uint32_t*)chunkStart, chunk, header.bitWidth);
+    if (header.hasNegative) {
+        SignExtend<T, U, CHUNK_SIZE>((uint8_t*)chunk, header.bitWidth);
+    }
+    memcpy(dst, &chunk[pos], sizeof(T) * numValuesToRead);
+}
+
+template<typename T>
+void IntegerBitpacking<T>::getValue(const uint8_t* buffer, offset_t posInBuffer, uint8_t* dst,
+    offset_t posInDst, const CompressionMetadata& metadata) const {
+    auto header = BitpackHeader::readHeader(metadata.data);
+    auto chunkStart = getChunkStart(buffer, posInBuffer, header.bitWidth);
+    getValues(chunkStart, posInBuffer % CHUNK_SIZE, dst + posInDst * sizeof(T), 1, header);
+}
+
+template<typename T>
+uint64_t IntegerBitpacking<T>::compressNextPage(const uint8_t*& srcBuffer,
+    uint64_t numValuesRemaining, uint8_t* dstBuffer, uint64_t dstBufferSize,
+    const struct CompressionMetadata& metadata) const {
+    auto header = BitpackHeader::readHeader(metadata.data);
+    auto bitWidth = header.bitWidth;
+
+    if (bitWidth == 0) {
+        return 0;
+    }
+    auto numValuesToCompress = std::min(numValuesRemaining, numValues(dstBufferSize, header));
+    assert(dstBufferSize >= CHUNK_SIZE);
+    assert(dstBufferSize >= numValuesToCompress * bitWidth / 8);
+    for (auto i = 0ull; i < numValuesToCompress; i += CHUNK_SIZE) {
+        FastPForLib::fastpack(
+            (const U*)srcBuffer + i, (uint32_t*)(dstBuffer + i * bitWidth / 8), bitWidth);
+    }
+    srcBuffer += numValuesToCompress * sizeof(U);
+    return numValuesToCompress * bitWidth / 8;
+}
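// --- Editorial aside (illustration only, not part of this patch): one page-sized step of the
// flush loop, assuming int32 values, bitWidth = 8 and a 4096-byte destination buffer:
//   numValues(4096, header) = 4096 * 8 / 8 = 4096 (already a multiple of 32)
//   numValuesToCompress     = min(numValuesRemaining, 4096)
// With 10000 values remaining, the first call packs 4096 values (128 chunks of 32), advances
// srcBuffer by 4096 * sizeof(uint32_t) bytes and returns 4096 * 8 / 8 = 4096 compressed bytes;
// the next call sees 5904 values remaining, and so on until the chunk is fully flushed.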
+
+template<typename T>
+void IntegerBitpacking<T>::decompressFromPage(const uint8_t* srcBuffer, uint64_t srcOffset,
+    uint8_t* dstBuffer, uint64_t dstOffset, uint64_t numValues,
+    const CompressionMetadata& metadata) const {
+    auto header = BitpackHeader::readHeader(metadata.data);
+
+    auto srcCursor = getChunkStart(srcBuffer, srcOffset, header.bitWidth);
+    auto valuesInFirstChunk = std::min(CHUNK_SIZE - (srcOffset % CHUNK_SIZE), numValues);
+    auto bytesPerChunk = CHUNK_SIZE / 8 * header.bitWidth;
+    auto dstIndex = dstOffset;
+
+    // Copy values which aren't aligned to the start of the chunk
+    if (valuesInFirstChunk < CHUNK_SIZE) {
+        getValues(srcCursor, srcOffset % CHUNK_SIZE, dstBuffer + dstIndex * sizeof(U),
+            valuesInFirstChunk, header);
+        if (numValues == valuesInFirstChunk) {
+            return;
+        }
+        // Start at the end of the first partial chunk
+        srcCursor += bytesPerChunk;
+        dstIndex += valuesInFirstChunk;
+    }
+
+    // Use fastunpack to directly unpack the full-sized chunks
+    for (; dstIndex < dstOffset + numValues - numValues % CHUNK_SIZE; dstIndex += CHUNK_SIZE) {
+        FastPForLib::fastunpack(
+            (const uint32_t*)srcCursor, (U*)dstBuffer + dstIndex, header.bitWidth);
+        if (header.hasNegative) {
+            SignExtend<T, U, CHUNK_SIZE>(dstBuffer + dstIndex * sizeof(U), header.bitWidth);
+        }
+        srcCursor += bytesPerChunk;
+    }
+    // Copy remaining values from within the last chunk.
+    if (dstIndex < dstOffset + numValues) {
+        getValues(srcCursor, 0, dstBuffer + dstIndex * sizeof(U), dstOffset + numValues - dstIndex,
+            header);
+    }
+}
+
+// Uses unsigned types since the storage is unsigned
+// TODO: Doesn't currently support int16
+// template class IntegerBitpacking<int16_t>;
+template class IntegerBitpacking<int32_t>;
+template class IntegerBitpacking<int64_t>;
+
+void BooleanBitpacking::setValueFromUncompressed(uint8_t* srcBuffer, offset_t posInSrc,
+    uint8_t* dstBuffer, offset_t posInDst, const CompressionMetadata& metadata) const {
+    auto val = ((bool*)srcBuffer)[posInSrc];
+    common::NullMask::setNull((uint64_t*)dstBuffer, posInDst, val);
+}
+
+void BooleanBitpacking::getValue(const uint8_t* buffer, offset_t posInBuffer, uint8_t* dst,
+    offset_t posInDst, const CompressionMetadata& metadata) const {
+    *(dst + posInDst) = common::NullMask::isNull((uint64_t*)buffer, posInBuffer);
+}
+
+uint64_t BooleanBitpacking::compressNextPage(const uint8_t*& srcBuffer, uint64_t numValuesRemaining,
+    uint8_t* dstBuffer, uint64_t dstBufferSize, const struct CompressionMetadata& metadata) const {
+    // TODO(bmwinger): Optimize, e.g. using an integer bitpacking function
+    auto numValuesToCompress = std::min(numValuesRemaining, numValues(dstBufferSize));
+    for (auto i = 0ull; i < numValuesToCompress; i++) {
+        common::NullMask::setNull((uint64_t*)dstBuffer, i, srcBuffer[i]);
+    }
+    srcBuffer += numValuesToCompress / 8;
+    // Will be a multiple of 8 except for the last iteration
+    return numValuesToCompress / 8 + (bool)(numValuesToCompress % 8);
+}
+
+void BooleanBitpacking::decompressFromPage(const uint8_t* srcBuffer, uint64_t srcOffset,
+    uint8_t* dstBuffer, uint64_t dstOffset, uint64_t numValues,
+    const CompressionMetadata& metadata) const {
+    // TODO(bmwinger): Optimize, e.g.
using an integer bitpacking function + for (auto i = 0ull; i < numValues; i++) { + ((bool*)dstBuffer)[dstOffset + i] = + common::NullMask::isNull((uint64_t*)srcBuffer, srcOffset + i); + } +} + +// Reads the header from the buffer and advances the buffer pointer to the end of the header +BitpackHeader BitpackHeader::readHeader(uint8_t data) { + BitpackHeader header; + header.bitWidth = data & BITWIDTH_MASK; + header.hasNegative = data & NEGATIVE_FLAG; + return header; +} + +void ReadCompressedValuesFromPage::operator()(uint8_t* frame, PageElementCursor& pageCursor, + common::ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead, + const CompressionMetadata& metadata) { + switch (metadata.compression) { + case CompressionType::UNCOMPRESSED: + return copy.decompressFromPage(frame, pageCursor.elemPosInPage, resultVector->getData(), + posInVector, numValuesToRead, metadata); + case CompressionType::INTEGER_BITPACKING: + return integerBitpacking->decompressFromPage(frame, pageCursor.elemPosInPage, + resultVector->getData(), posInVector, numValuesToRead, metadata); + case CompressionType::BOOLEAN_BITPACKING: + return booleanBitpacking.decompressFromPage(frame, pageCursor.elemPosInPage, + resultVector->getData(), posInVector, numValuesToRead, metadata); + } +} + +void LookupCompressedValueInPage::operator()(uint8_t* frame, PageElementCursor& pageCursor, + uint8_t* result, uint32_t posInResult, const CompressionMetadata& metadata) { + switch (metadata.compression) { + case CompressionType::UNCOMPRESSED: + return copy.getValue(frame, pageCursor.elemPosInPage, result, posInResult, metadata); + case CompressionType::INTEGER_BITPACKING: + return integerBitpacking->getValue( + frame, pageCursor.elemPosInPage, result, posInResult, metadata); + case CompressionType::BOOLEAN_BITPACKING: + return booleanBitpacking.getValue( + frame, pageCursor.elemPosInPage, result, posInResult, metadata); + } +} + +void WriteCompressedValueToPage::operator()(uint8_t* frame, uint16_t posInFrame, + common::ValueVector* vector, uint32_t posInVector, const CompressionMetadata& metadata) { + switch (metadata.compression) { + case CompressionType::UNCOMPRESSED: + return copy.setValueFromUncompressed( + vector->getData(), posInVector, frame, posInFrame, metadata); + case CompressionType::INTEGER_BITPACKING: + return integerBitpacking->setValueFromUncompressed( + vector->getData(), posInVector, frame, posInFrame, metadata); + case CompressionType::BOOLEAN_BITPACKING: + return booleanBitpacking.setValueFromUncompressed( + vector->getData(), posInVector, frame, posInFrame, metadata); + } +} + +} // namespace storage +} // namespace kuzu diff --git a/src/storage/store/node_column.cpp b/src/storage/store/node_column.cpp index 34feaf1874c..239cf8e6929 100644 --- a/src/storage/store/node_column.cpp +++ b/src/storage/store/node_column.cpp @@ -1,5 +1,9 @@ #include "storage/store/node_column.h" +#include + +#include "storage/copier/column_chunk.h" +#include "storage/copier/compression.h" #include "storage/storage_structure/storage_structure.h" #include "storage/store/property_statistics.h" #include "storage/store/string_node_column.h" @@ -15,103 +19,169 @@ using namespace kuzu::transaction; namespace kuzu { namespace storage { -void FixedSizedNodeColumnFunc::readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, - ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead) { - auto numBytesPerValue = resultVector->getNumBytesPerValue(); - memcpy(resultVector->getData() + posInVector * 
numBytesPerValue, - frame + pageCursor.elemPosInPage * numBytesPerValue, numValuesToRead * numBytesPerValue); -} +struct FixedSizedNodeColumnFunc { + static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, + ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead, + const CompressionMetadata& metadata) { + auto numBytesPerValue = resultVector->getNumBytesPerValue(); + memcpy(resultVector->getData() + posInVector * numBytesPerValue, + frame + pageCursor.elemPosInPage * numBytesPerValue, + numValuesToRead * numBytesPerValue); + } -void FixedSizedNodeColumnFunc::writeValueToPage( - uint8_t* frame, uint16_t posInFrame, ValueVector* vector, uint32_t posInVector) { - auto numBytesPerValue = vector->getNumBytesPerValue(); - memcpy(frame + posInFrame * numBytesPerValue, - vector->getData() + posInVector * numBytesPerValue, numBytesPerValue); -} + class LookupFixedValueInPage { + const LogicalType& logicalType; -void FixedSizedNodeColumnFunc::readInternalIDValuesFromPage(uint8_t* frame, - PageElementCursor& pageCursor, ValueVector* resultVector, uint32_t posInVector, - uint32_t numValuesToRead) { - auto resultData = (internalID_t*)resultVector->getData(); - for (auto i = 0u; i < numValuesToRead; i++) { - auto posInFrame = pageCursor.elemPosInPage + i; - resultData[posInVector + i].offset = *(offset_t*)(frame + (posInFrame * sizeof(offset_t))); + public: + explicit LookupFixedValueInPage(const LogicalType& logicalType) + : logicalType{logicalType} {} + + void operator()(uint8_t* frame, PageElementCursor& pageCursor, uint8_t* result, + uint32_t posInResult, const CompressionMetadata& metadata) { + auto numBytesPerValue = getDataTypeSizeInChunk(logicalType); + memcpy(result + posInResult * numBytesPerValue, + frame + pageCursor.elemPosInPage * numBytesPerValue, numBytesPerValue); + } + }; + + static void writeValueToPage(uint8_t* frame, uint16_t posInFrame, ValueVector* vector, + uint32_t posInVector, const CompressionMetadata& metadata) { + auto numBytesPerValue = vector->getNumBytesPerValue(); + memcpy(frame + posInFrame * numBytesPerValue, + vector->getData() + posInVector * numBytesPerValue, numBytesPerValue); + } + + static void readInternalIDValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, + ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead, + const CompressionMetadata& metadata) { + auto resultData = (internalID_t*)resultVector->getData(); + for (auto i = 0u; i < numValuesToRead; i++) { + auto posInFrame = pageCursor.elemPosInPage + i; + resultData[posInVector + i].offset = + *(offset_t*)(frame + (posInFrame * sizeof(offset_t))); + } } -} -void FixedSizedNodeColumnFunc::writeInternalIDValueToPage( - uint8_t* frame, uint16_t posInFrame, ValueVector* vector, uint32_t posInVector) { - auto relID = vector->getValue(posInVector); - memcpy(frame + posInFrame * sizeof(offset_t), &relID.offset, sizeof(offset_t)); -} + static void writeInternalIDValueToPage(uint8_t* frame, uint16_t posInFrame, ValueVector* vector, + uint32_t posInVector, const CompressionMetadata& metadata) { + auto relID = vector->getValue(posInVector); + memcpy(frame + posInFrame * sizeof(offset_t), &relID.offset, sizeof(offset_t)); + } +}; + +struct NullNodeColumnFunc { + static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, + ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead, + const CompressionMetadata& metadata) { + // Read bit-packed null flags from the frame into the result vector + // Casting to uint64_t 
should be safe as long as the page size is a multiple of 8 bytes. + // Otherwise, it could read off the end of the page. + resultVector->setNullFromBits( + (uint64_t*)frame, pageCursor.elemPosInPage, posInVector, numValuesToRead); + } + + static void writeValueToPage(uint8_t* frame, uint16_t posInFrame, ValueVector* vector, + uint32_t posInVector, const CompressionMetadata& metadata) { + // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. + // Otherwise, it could read off the end of the page. + NullMask::setNull( + (uint64_t*)frame, posInFrame, NullMask::isNull(vector->getNullMaskData(), posInVector)); + } +}; + +struct BoolNodeColumnFunc { + static void readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, + ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead, + const CompressionMetadata& metadata) { + // Read bit-packed null flags from the frame into the result vector + // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. + // Otherwise, it could read off the end of the page. + // + // Currently, the frame stores bitpacked bools, but the value_vector does not + for (auto i = 0; i < numValuesToRead; i++) { + resultVector->setValue( + posInVector + i, NullMask::isNull((uint64_t*)frame, pageCursor.elemPosInPage + i)); + } + } -void NullNodeColumnFunc::readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, - ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead) { - // Read bit-packed null flags from the frame into the result vector - // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. - // Otherwise, it could read off the end of the page. - resultVector->setNullFromBits( - (uint64_t*)frame, pageCursor.elemPosInPage, posInVector, numValuesToRead); -} + static void writeValueToPage(uint8_t* frame, uint16_t posInFrame, ValueVector* vector, + uint32_t posInVector, const CompressionMetadata& metadata) { + // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. + // Otherwise, it could read/write off the end of the page. + NullMask::copyNullMask(vector->getValue(posInVector) ? &NullMask::ALL_NULL_ENTRY : + &NullMask::NO_NULL_ENTRY, + posInVector, (uint64_t*)frame, posInFrame, 1); + } -void NullNodeColumnFunc::writeValueToPage( - uint8_t* frame, uint16_t posInFrame, ValueVector* vector, uint32_t posInVector) { - // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. - // Otherwise, it could read off the end of the page. - NullMask::setNull( - (uint64_t*)frame, posInFrame, NullMask::isNull(vector->getNullMaskData(), posInVector)); -} + static void lookupValueInPage(uint8_t* frame, PageElementCursor& pageCursor, uint8_t* result, + uint32_t posInResult, const CompressionMetadata& metadata) { + result[posInResult] = NullMask::isNull((uint64_t*)frame, pageCursor.elemPosInPage); + } +}; -void BoolNodeColumnFunc::readValuesFromPage(uint8_t* frame, PageElementCursor& pageCursor, - ValueVector* resultVector, uint32_t posInVector, uint32_t numValuesToRead) { - // Read bit-packed null flags from the frame into the result vector - // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. - // Otherwise, it could read off the end of the page. 
- // - // Currently, the frame stores bitpacked bools, but the value_vector does not - for (auto i = 0; i < numValuesToRead; i++) { - resultVector->setValue( - posInVector + i, NullMask::isNull((uint64_t*)frame, pageCursor.elemPosInPage + i)); +static read_node_column_func_t getReadNodeColumnFunc(const LogicalType& logicalType) { + switch (logicalType.getLogicalTypeID()) { + case LogicalTypeID::INTERNAL_ID: + return FixedSizedNodeColumnFunc::readInternalIDValuesFromPage; + case LogicalTypeID::BOOL: + return BoolNodeColumnFunc::readValuesFromPage; + case LogicalTypeID::INT64: + case LogicalTypeID::INT32: + case LogicalTypeID::INT16: + case LogicalTypeID::DATE: + case LogicalTypeID::TIMESTAMP: + return ReadCompressedValuesFromPage(logicalType); + default: + return FixedSizedNodeColumnFunc::readValuesFromPage; } } -void BoolNodeColumnFunc::writeValueToPage( - uint8_t* frame, uint16_t posInFrame, ValueVector* vector, uint32_t posInVector) { - // Casting to uint64_t should be safe as long as the page size is a multiple of 8 bytes. - // Otherwise, it could read/write off the end of the page. - NullMask::copyNullMask( - vector->getValue(posInVector) ? &NullMask::ALL_NULL_ENTRY : &NullMask::NO_NULL_ENTRY, - posInVector, (uint64_t*)frame, posInFrame, 1); +static lookup_node_column_func_t getLookupNodeColumnFunc(const LogicalType& logicalType) { + switch (logicalType.getLogicalTypeID()) { + case LogicalTypeID::BOOL: + return BoolNodeColumnFunc::lookupValueInPage; + case LogicalTypeID::INT64: + case LogicalTypeID::INT32: + case LogicalTypeID::INT16: + case LogicalTypeID::DATE: + case LogicalTypeID::TIMESTAMP: + return LookupCompressedValueInPage(logicalType); + default: + return FixedSizedNodeColumnFunc::LookupFixedValueInPage(logicalType); + } } -NodeColumn::NodeColumn(const Property& property, BMFileHandle* dataFH, BMFileHandle* metadataFH, - BufferManager* bufferManager, WAL* wal, Transaction* transaction, - RWPropertyStats propertyStatistics, bool requireNullColumn) - : NodeColumn{*property.getDataType(), *property.getMetadataDAHInfo(), dataFH, metadataFH, - bufferManager, wal, transaction, propertyStatistics, requireNullColumn} {} +static write_node_column_func_t getWriteNodeColumnFunc(const LogicalType& logicalType) { + switch (logicalType.getLogicalTypeID()) { + case LogicalTypeID::INTERNAL_ID: + return FixedSizedNodeColumnFunc::writeInternalIDValueToPage; + case LogicalTypeID::BOOL: + return BoolNodeColumnFunc::writeValueToPage; + case LogicalTypeID::INT64: + case LogicalTypeID::INT32: + case LogicalTypeID::DATE: + case LogicalTypeID::TIMESTAMP: + return WriteCompressedValueToPage(logicalType); + default: + return FixedSizedNodeColumnFunc::writeValueToPage; + } +} NodeColumn::NodeColumn(LogicalType dataType, const MetadataDAHInfo& metaDAHeaderInfo, BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal, transaction::Transaction* transaction, RWPropertyStats propertyStatistics, bool requireNullColumn) - : storageStructureID{StorageStructureID::newDataID()}, dataType{std::move(dataType)}, - dataFH{dataFH}, metadataFH{metadataFH}, bufferManager{bufferManager}, + : storageStructureID{StorageStructureID::newDataID()}, dataType{dataType}, dataFH{dataFH}, + metadataFH{metadataFH}, bufferManager{bufferManager}, propertyStatistics{propertyStatistics}, wal{wal} { metadataDA = std::make_unique>(*metadataFH, StorageStructureID::newMetadataID(), metaDAHeaderInfo.dataDAHPageIdx, bufferManager, wal, transaction); - numBytesPerFixedSizedValue = 
ColumnChunk::getDataTypeSizeInChunk(this->dataType); + numBytesPerFixedSizedValue = getDataTypeSizeInChunk(this->dataType); + readNodeColumnFunc = getReadNodeColumnFunc(this->dataType); + lookupNodeColumnFunc = getLookupNodeColumnFunc(this->dataType); + writeNodeColumnFunc = getWriteNodeColumnFunc(this->dataType); assert(numBytesPerFixedSizedValue <= BufferPoolConstants::PAGE_4KB_SIZE); - numValuesPerPage = - numBytesPerFixedSizedValue == 0 ? - 0 : - PageUtils::getNumElementsInAPage(numBytesPerFixedSizedValue, false /* hasNull */); - readNodeColumnFunc = this->dataType.getLogicalTypeID() == LogicalTypeID::INTERNAL_ID ? - FixedSizedNodeColumnFunc::readInternalIDValuesFromPage : - FixedSizedNodeColumnFunc::readValuesFromPage; - writeNodeColumnFunc = this->dataType.getLogicalTypeID() == LogicalTypeID::INTERNAL_ID ? - FixedSizedNodeColumnFunc::writeInternalIDValueToPage : - FixedSizedNodeColumnFunc::writeValueToPage; if (requireNullColumn) { nullColumn = std::make_unique(metaDAHeaderInfo.nullDAHPageIdx, dataFH, metadataFH, bufferManager, wal, transaction, propertyStatistics); @@ -123,22 +193,10 @@ void NodeColumn::batchLookup( for (auto i = 0u; i < size; ++i) { auto nodeOffset = nodeOffsets[i]; auto cursor = getPageCursorForOffset(transaction->getType(), nodeOffset); + auto nodeGroupIdx = StorageUtils::getNodeGroupIdx(nodeOffset); + auto chunkMeta = metadataDA->get(nodeGroupIdx, transaction->getType()); readFromPage(transaction, cursor.pageIdx, [&](uint8_t* frame) -> void { - memcpy(result + i * numBytesPerFixedSizedValue, - frame + (cursor.elemPosInPage * numBytesPerFixedSizedValue), - numBytesPerFixedSizedValue); - }); - } -} - -void BoolNodeColumn::batchLookup( - Transaction* transaction, const offset_t* nodeOffsets, size_t size, uint8_t* result) { - for (auto i = 0u; i < size; ++i) { - auto nodeOffset = nodeOffsets[i]; - auto cursor = getPageCursorForOffset(transaction->getType(), nodeOffset); - readFromPage(transaction, cursor.pageIdx, [&](uint8_t* frame) -> void { - // De-compress bitpacked bools - result[i] = NullMask::isNull((uint64_t*)frame, cursor.elemPosInPage); + lookupNodeColumnFunc(frame, cursor, result, i, chunkMeta.compMeta); }); } } @@ -156,11 +214,13 @@ void NodeColumn::scan(transaction::Transaction* transaction, node_group_idx_t no nullColumn->scan(transaction, nodeGroupIdx, startOffsetInGroup, endOffsetInGroup, resultVector, offsetInVector); } - auto pageCursor = PageUtils::getPageElementCursorForPos(startOffsetInGroup, numValuesPerPage); auto chunkMeta = metadataDA->get(nodeGroupIdx, transaction->getType()); + auto pageCursor = PageUtils::getPageElementCursorForPos(startOffsetInGroup, + chunkMeta.compMeta.numValues(BufferPoolConstants::PAGE_4KB_SIZE, dataType)); pageCursor.pageIdx += chunkMeta.pageIdx; auto numValuesToScan = endOffsetInGroup - startOffsetInGroup; - scanUnfiltered(transaction, pageCursor, numValuesToScan, resultVector, offsetInVector); + scanUnfiltered( + transaction, pageCursor, numValuesToScan, resultVector, chunkMeta.compMeta, offsetInVector); } void NodeColumn::scan(node_group_idx_t nodeGroupIdx, ColumnChunk* columnChunk) { @@ -182,24 +242,28 @@ void NodeColumn::scanInternal( auto startNodeOffset = nodeIDVector->readNodeOffset(0); assert(startNodeOffset % DEFAULT_VECTOR_CAPACITY == 0); auto cursor = getPageCursorForOffset(transaction->getType(), startNodeOffset); + auto nodeGroupIdx = StorageUtils::getNodeGroupIdx(startNodeOffset); + auto chunkMeta = metadataDA->get(nodeGroupIdx, transaction->getType()); if 
(nodeIDVector->state->selVector->isUnfiltered()) { - scanUnfiltered( - transaction, cursor, nodeIDVector->state->selVector->selectedSize, resultVector); + scanUnfiltered(transaction, cursor, nodeIDVector->state->selVector->selectedSize, + resultVector, chunkMeta.compMeta); } else { - scanFiltered(transaction, cursor, nodeIDVector, resultVector); + scanFiltered(transaction, cursor, nodeIDVector, resultVector, chunkMeta.compMeta); } } void NodeColumn::scanUnfiltered(Transaction* transaction, PageElementCursor& pageCursor, - uint64_t numValuesToScan, ValueVector* resultVector, uint64_t startPosInVector) { + uint64_t numValuesToScan, ValueVector* resultVector, const CompressionMetadata& compMeta, + uint64_t startPosInVector) { uint64_t numValuesScanned = 0; while (numValuesScanned < numValuesToScan) { uint64_t numValuesToScanInPage = - std::min((uint64_t)numValuesPerPage - pageCursor.elemPosInPage, + std::min((uint64_t)compMeta.numValues(BufferPoolConstants::PAGE_4KB_SIZE, dataType) - + pageCursor.elemPosInPage, numValuesToScan - numValuesScanned); readFromPage(transaction, pageCursor.pageIdx, [&](uint8_t* frame) -> void { readNodeColumnFunc(frame, pageCursor, resultVector, numValuesScanned + startPosInVector, - numValuesToScanInPage); + numValuesToScanInPage, compMeta); }); numValuesScanned += numValuesToScanInPage; pageCursor.nextPage(); @@ -207,20 +271,21 @@ void NodeColumn::scanUnfiltered(Transaction* transaction, PageElementCursor& pag } void NodeColumn::scanFiltered(Transaction* transaction, PageElementCursor& pageCursor, - ValueVector* nodeIDVector, ValueVector* resultVector) { + ValueVector* nodeIDVector, ValueVector* resultVector, const CompressionMetadata& compMeta) { auto numValuesToScan = nodeIDVector->state->getOriginalSize(); auto numValuesScanned = 0u; auto posInSelVector = 0u; while (numValuesScanned < numValuesToScan) { uint64_t numValuesToScanInPage = - std::min((uint64_t)numValuesPerPage - pageCursor.elemPosInPage, + std::min((uint64_t)compMeta.numValues(BufferPoolConstants::PAGE_4KB_SIZE, dataType) - + pageCursor.elemPosInPage, numValuesToScan - numValuesScanned); if (StorageStructure::isInRange( nodeIDVector->state->selVector->selectedPositions[posInSelVector], numValuesScanned, numValuesScanned + numValuesToScanInPage)) { readFromPage(transaction, pageCursor.pageIdx, [&](uint8_t* frame) -> void { - readNodeColumnFunc( - frame, pageCursor, resultVector, numValuesScanned, numValuesToScanInPage); + readNodeColumnFunc(frame, pageCursor, resultVector, numValuesScanned, + numValuesToScanInPage, compMeta); }); } numValuesScanned += numValuesToScanInPage; @@ -254,8 +319,11 @@ void NodeColumn::lookupInternal( void NodeColumn::lookupValue(transaction::Transaction* transaction, offset_t nodeOffset, ValueVector* resultVector, uint32_t posInVector) { auto cursor = getPageCursorForOffset(transaction->getType(), nodeOffset); + auto nodeGroupIdx = StorageUtils::getNodeGroupIdx(nodeOffset); + auto chunkMeta = metadataDA->get(nodeGroupIdx, transaction->getType()); readFromPage(transaction, cursor.pageIdx, [&](uint8_t* frame) -> void { - readNodeColumnFunc(frame, cursor, resultVector, posInVector, 1 /* numValuesToRead */); + readNodeColumnFunc( + frame, cursor, resultVector, posInVector, 1 /* numValuesToRead */, chunkMeta.compMeta); }); } @@ -271,12 +339,11 @@ page_idx_t NodeColumn::append( ColumnChunk* columnChunk, page_idx_t startPageIdx, uint64_t nodeGroupIdx) { // Main column chunk. 
page_idx_t numPagesFlushed = 0; - auto numPagesForChunk = columnChunk->flushBuffer(dataFH, startPageIdx); - ColumnChunkMetadata metadata{startPageIdx, numPagesForChunk, columnChunk->getNumValues()}; + auto metadata = columnChunk->flushBuffer(dataFH, startPageIdx); metadataDA->resize(nodeGroupIdx + 1); metadataDA->update(nodeGroupIdx, metadata); - numPagesFlushed += numPagesForChunk; - startPageIdx += numPagesForChunk; + numPagesFlushed += metadata.numPages; + startPageIdx += metadata.numPages; // Null column chunk. auto numPagesForNullChunk = nullColumn->append(columnChunk->getNullChunk(), startPageIdx, nodeGroupIdx); @@ -333,9 +400,11 @@ void NodeColumn::writeInternal( void NodeColumn::writeValue( offset_t nodeOffset, ValueVector* vectorToWriteFrom, uint32_t posInVectorToWriteFrom) { auto walPageInfo = createWALVersionOfPageForValue(nodeOffset); + auto nodeGroupIdx = StorageUtils::getNodeGroupIdx(nodeOffset); + auto chunkMeta = metadataDA->get(nodeGroupIdx, TransactionType::WRITE); try { - writeNodeColumnFunc( - walPageInfo.frame, walPageInfo.posInPage, vectorToWriteFrom, posInVectorToWriteFrom); + writeNodeColumnFunc(walPageInfo.frame, walPageInfo.posInPage, vectorToWriteFrom, + posInVectorToWriteFrom, chunkMeta.compMeta); } catch (Exception& e) { bufferManager->unpin(*wal->fileHandle, walPageInfo.pageIdxInWAL); dataFH->releaseWALPageIdxLock(walPageInfo.originalPageIdx); @@ -401,8 +470,10 @@ PageElementCursor NodeColumn::getPageCursorForOffset( TransactionType transactionType, offset_t nodeOffset) { auto nodeGroupIdx = StorageUtils::getNodeGroupIdx(nodeOffset); auto offsetInNodeGroup = nodeOffset - StorageUtils::getStartOffsetOfNodeGroup(nodeGroupIdx); - auto pageCursor = PageUtils::getPageElementCursorForPos(offsetInNodeGroup, numValuesPerPage); - pageCursor.pageIdx += metadataDA->get(nodeGroupIdx, transactionType).pageIdx; + auto chunkMeta = metadataDA->get(nodeGroupIdx, transactionType); + auto pageCursor = PageUtils::getPageElementCursorForPos(offsetInNodeGroup, + chunkMeta.compMeta.numValues(BufferPoolConstants::PAGE_4KB_SIZE, dataType)); + pageCursor.pageIdx += chunkMeta.pageIdx; return pageCursor; } @@ -414,12 +485,7 @@ BoolNodeColumn::BoolNodeColumn(const MetadataDAHInfo& metaDAHeaderInfo, BMFileHa BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal, Transaction* transaction, RWPropertyStats propertyStatistics, bool requireNullColumn) : NodeColumn{LogicalType(LogicalTypeID::BOOL), metaDAHeaderInfo, dataFH, metadataFH, - bufferManager, wal, transaction, propertyStatistics, requireNullColumn} { - readNodeColumnFunc = BoolNodeColumnFunc::readValuesFromPage; - writeNodeColumnFunc = BoolNodeColumnFunc::writeValueToPage; - // 8 values per byte (on-disk) - numValuesPerPage = PageUtils::getNumElementsInAPage(1, false /*requireNullColumn*/) * 8; -} + bufferManager, wal, transaction, propertyStatistics, requireNullColumn} {} NullNodeColumn::NullNodeColumn(page_idx_t metaDAHPageIdx, BMFileHandle* dataFH, BMFileHandle* metadataFH, BufferManager* bufferManager, WAL* wal, Transaction* transaction, @@ -429,9 +495,6 @@ NullNodeColumn::NullNodeColumn(page_idx_t metaDAHPageIdx, BMFileHandle* dataFH, false /*requireNullColumn*/} { readNodeColumnFunc = NullNodeColumnFunc::readValuesFromPage; writeNodeColumnFunc = NullNodeColumnFunc::writeValueToPage; - - // 8 values per byte - numValuesPerPage = PageUtils::getNumElementsInAPage(1, false /*requireNullColumn*/) * 8; } void NullNodeColumn::scan( @@ -476,14 +539,13 @@ void NullNodeColumn::lookup( page_idx_t NullNodeColumn::append( 
ColumnChunk* columnChunk, page_idx_t startPageIdx, uint64_t nodeGroupIdx) {
-    auto numPagesFlushed = columnChunk->flushBuffer(dataFH, startPageIdx);
+    auto metadata = columnChunk->flushBuffer(dataFH, startPageIdx);
     metadataDA->resize(nodeGroupIdx + 1);
-    metadataDA->update(nodeGroupIdx,
-        ColumnChunkMetadata{startPageIdx, numPagesFlushed, columnChunk->getNumValues()});
+    metadataDA->update(nodeGroupIdx, metadata);
     if (static_cast<NullColumnChunk*>(columnChunk)->mayHaveNull()) {
         propertyStatistics.setHasNull(DUMMY_WRITE_TRANSACTION);
     }
-    return numPagesFlushed;
+    return metadata.numPages;
 }
 
 void NullNodeColumn::setNull(offset_t nodeOffset) {
@@ -546,8 +608,8 @@ std::unique_ptr NodeColumnFactory::createNodeColumn(const LogicalTyp
     BufferManager* bufferManager, WAL* wal, Transaction* transaction, RWPropertyStats stats) {
     switch (dataType.getLogicalTypeID()) {
     case LogicalTypeID::BOOL: {
-        return std::make_unique<BoolNodeColumn>(metaDAHeaderInfo, dataFH, metadataFH, bufferManager,
-            wal, transaction, stats, true /* requireNullColumn */);
+        return std::make_unique<BoolNodeColumn>(
+            metaDAHeaderInfo, dataFH, metadataFH, bufferManager, wal, transaction, stats);
     }
     case LogicalTypeID::INT64:
     case LogicalTypeID::INT32:
@@ -560,8 +622,8 @@ std::unique_ptr NodeColumnFactory::createNodeColumn(const LogicalTyp
     case LogicalTypeID::INTERVAL:
     case LogicalTypeID::INTERNAL_ID:
     case LogicalTypeID::FIXED_LIST: {
-        return std::make_unique<NodeColumn>(dataType, metaDAHeaderInfo, dataFH, metadataFH,
-            bufferManager, wal, transaction, stats, true /* requireNullColumn */);
+        return std::make_unique<NodeColumn>(
+            dataType, metaDAHeaderInfo, dataFH, metadataFH, bufferManager, wal, transaction, stats);
     }
     case LogicalTypeID::BLOB:
     case LogicalTypeID::STRING: {
diff --git a/src/storage/store/string_node_column.cpp b/src/storage/store/string_node_column.cpp
index 338b2b90211..df49d36bc1e 100644
--- a/src/storage/store/string_node_column.cpp
+++ b/src/storage/store/string_node_column.cpp
@@ -9,8 +9,8 @@ using namespace kuzu::transaction;
 namespace kuzu {
 namespace storage {
 
-void StringNodeColumnFunc::writeStringValuesToPage(
-    uint8_t* frame, uint16_t posInFrame, ValueVector* vector, uint32_t posInVector) {
+void StringNodeColumnFunc::writeStringValuesToPage(uint8_t* frame, uint16_t posInFrame,
+    ValueVector* vector, uint32_t posInVector, const CompressionMetadata& metadata) {
     auto kuStrInFrame = (ku_string_t*)(frame + (posInFrame * sizeof(ku_string_t)));
     auto kuStrInVector = vector->getValue<ku_string_t>(posInVector);
     memcpy(kuStrInFrame->prefix, kuStrInVector.prefix,
diff --git a/src/storage/store/var_list_node_column.cpp b/src/storage/store/var_list_node_column.cpp
index ba0292b9018..1051ba9604b 100644
--- a/src/storage/store/var_list_node_column.cpp
+++ b/src/storage/store/var_list_node_column.cpp
@@ -140,11 +140,13 @@ offset_t VarListNodeColumn::readOffset(
     Transaction* transaction, node_group_idx_t nodeGroupIdx, offset_t offsetInNodeGroup) {
     auto offsetVector = std::make_unique<ValueVector>(LogicalTypeID::INT64);
     offsetVector->state = DataChunkState::getSingleValueDataChunkState();
-    auto pageCursor = PageUtils::getPageElementCursorForPos(offsetInNodeGroup, numValuesPerPage);
-    pageCursor.pageIdx += metadataDA->get(nodeGroupIdx, transaction->getType()).pageIdx;
+    auto chunkMeta = metadataDA->get(nodeGroupIdx, transaction->getType());
+    auto pageCursor = PageUtils::getPageElementCursorForPos(offsetInNodeGroup,
+        chunkMeta.compMeta.numValues(BufferPoolConstants::PAGE_4KB_SIZE, dataType));
+    pageCursor.pageIdx += chunkMeta.pageIdx;
     readFromPage(transaction, pageCursor.pageIdx, [&](uint8_t* frame) -> void {
-        readNodeColumnFunc(
-            frame, pageCursor, offsetVector.get(), 0 /* posInVector */, 1 /* numValuesToRead */);
+        readNodeColumnFunc(frame, pageCursor, offsetVector.get(), 0 /* posInVector */,
+            1 /* numValuesToRead */, chunkMeta.compMeta);
     });
     return offsetVector->getValue<offset_t>(0);
 }
diff --git a/test/storage/CMakeLists.txt b/test/storage/CMakeLists.txt
index 8741bd6ec57..2e3e10493e7 100644
--- a/test/storage/CMakeLists.txt
+++ b/test/storage/CMakeLists.txt
@@ -4,3 +4,4 @@ add_kuzu_test(wal_record_test wal_record_test.cpp)
 add_kuzu_test(wal_replayer_test wal_replayer_test.cpp)
 add_kuzu_test(wal_test wal_test.cpp)
 add_kuzu_test(table_statistics_test table_statistics_test.cpp)
+add_kuzu_test(compression_test compression_test.cpp)
diff --git a/test/storage/compression_test.cpp b/test/storage/compression_test.cpp
new file mode 100644
index 00000000000..a226d02d417
--- /dev/null
+++ b/test/storage/compression_test.cpp
@@ -0,0 +1,172 @@
+#include "gtest/gtest.h"
+#include "storage/copier/compression.h"
+#include <bit>
+
+using namespace kuzu::common;
+using namespace kuzu::storage;
+
+template<typename T>
+void test_compression(CompressionAlg& alg, std::vector<T> src) {
+    auto pageSize = 4096;
+    std::vector<uint8_t> dest(pageSize);
+
+    auto metadata = alg.startCompression((uint8_t*)src.data(), src.size());
+    // For simplicity, we'll ignore the possibility of it requiring multiple pages
+    // That's tested separately
+
+    auto numValuesRemaining = src.size();
+    const uint8_t* srcCursor = (uint8_t*)src.data();
+    alg.compressNextPage(srcCursor, numValuesRemaining, dest.data(), pageSize, metadata);
+    std::vector<T> decompressed(src.size());
+    alg.decompressFromPage(dest.data(), 0, (uint8_t*)decompressed.data(), 0, src.size(), metadata);
+    EXPECT_EQ(src, decompressed);
+    // works with all bit widths
+    T value = 0;
+    alg.setValueFromUncompressed((uint8_t*)&value, 0, (uint8_t*)dest.data(), 1, metadata);
+    alg.decompressFromPage(dest.data(), 0, (uint8_t*)decompressed.data(), 0, src.size(), metadata);
+    src[1] = value;
+    EXPECT_EQ(decompressed, src);
+    EXPECT_EQ(decompressed[1], value);
+
+    for (int i = 0; i < src.size(); i++) {
+        alg.getValue(dest.data(), i, (uint8_t*)decompressed.data(), i, metadata);
+        EXPECT_EQ(decompressed[i], src[i]);
+    }
+    EXPECT_EQ(decompressed, src);
+
+    // Decompress part of a page
+    decompressed.clear();
+    decompressed.resize(src.size() / 2);
+    alg.decompressFromPage(
+        dest.data(), src.size() / 3, (uint8_t*)decompressed.data(), 0, src.size() / 2, metadata);
+    auto expected = std::vector(src);
+    expected.erase(expected.begin(), expected.begin() + src.size() / 3);
+    expected.resize(src.size() / 2);
+    EXPECT_EQ(decompressed, expected);
+
+    decompressed.clear();
+    decompressed.resize(src.size() / 2);
+    alg.decompressFromPage(
+        dest.data(), src.size() / 7, (uint8_t*)decompressed.data(), 0, src.size() / 2, metadata);
+    expected = std::vector(src);
+    expected.erase(expected.begin(), expected.begin() + src.size() / 7);
+    expected.resize(src.size() / 2);
+    EXPECT_EQ(decompressed, expected);
+}
+
+TEST(CompressionTests, BooleanBitpackingTest) {
+    std::vector src{true, false, true, true, false, true, false};
+    auto alg = BooleanBitpacking();
+    test_compression(alg, src);
+}
+
+TEST(CompressionTests, CopyCompressionTest) {
+    std::vector src{true, false, true, true, false, true, false};
+    auto alg = CopyCompression(LogicalType(LogicalTypeID::BOOL));
+    test_compression(alg, src);
+}
+
+TEST(CompressionTests, IntegerPackingTest32) {
+    std::vector<int32_t> src(128, 6);
+    auto alg = IntegerBitpacking<int32_t>();
+    ASSERT_EQ(alg.getBitWidth((uint8_t*)src.data(), src.size()).bitWidth, std::bit_width(6u));
+    test_compression(alg, src);
+}
+
+TEST(CompressionTests, IntegerPackingTest64) {
+    std::vector<int64_t> src(128, 6);
+    auto alg = IntegerBitpacking<int64_t>();
+    ASSERT_EQ(alg.getBitWidth((uint8_t*)src.data(), src.size()).bitWidth, std::bit_width(6u));
+    test_compression(alg, src);
+}
+
+TEST(CompressionTests, IntegerPackingTestNegative32) {
+    std::vector<int32_t> src(128, -6);
+    src[5] = 20;
+    auto alg = IntegerBitpacking<int32_t>();
+    ASSERT_EQ(alg.getBitWidth((uint8_t*)src.data(), src.size()).bitWidth, std::bit_width(20u) + 1);
+    test_compression(alg, src);
+}
+
+TEST(CompressionTests, IntegerPackingTestNegative64) {
+    std::vector<int64_t> src(128, -6);
+    src[5] = 20;
+    auto alg = IntegerBitpacking<int64_t>();
+    ASSERT_EQ(alg.getBitWidth((uint8_t*)src.data(), src.size()).bitWidth, std::bit_width(20u) + 1);
+    test_compression(alg, src);
+}
+
+TEST(CompressionTests, CopyMultiPage) {
+    int64_t numValues = 512;
+    std::vector<int64_t> src(numValues, -6);
+
+    auto alg = CopyCompression(LogicalType(LogicalTypeID::INT64));
+    auto pageSize = 64;
+    auto metadata = alg.startCompression((uint8_t*)src.data(), src.size());
+    auto numValuesRemaining = numValues;
+    auto numValuesPerPage = metadata.numValues(pageSize, LogicalType(LogicalTypeID::INT64));
+    const uint8_t* srcCursor = (uint8_t*)src.data();
+    // TODO: accumulate output and then decompress
+    while (numValuesRemaining > 0) {
+        std::vector<uint8_t> dest(pageSize);
+        auto compressedSize =
+            alg.compressNextPage(srcCursor, numValuesRemaining, dest.data(), pageSize, metadata);
+        numValuesRemaining -= numValuesPerPage;
+        ASSERT_EQ(compressedSize, pageSize);
+    }
+    ASSERT_EQ((int64_t*)srcCursor - src.data(), numValues);
+}
+
+void integerPackingMultiPage(std::vector<int64_t> src) {
+    auto alg = IntegerBitpacking<int64_t>();
+    auto pageSize = 4096;
+    auto metadata = alg.startCompression((uint8_t*)src.data(), src.size());
+    auto numValuesPerPage = metadata.numValues(pageSize, LogicalType(LogicalTypeID::INT64));
+    int64_t numValuesRemaining = src.size();
+    const uint8_t* srcCursor = (uint8_t*)src.data();
+    auto pages = src.size() / numValuesPerPage + 1;
+    std::vector<std::vector<uint8_t>> dest(pages, std::vector<uint8_t>(pageSize));
+    size_t pageNum = 0;
+    while (numValuesRemaining > 0) {
+        ASSERT_LT(pageNum, pages);
+        alg.compressNextPage(
+            srcCursor, numValuesRemaining, dest[pageNum++].data(), pageSize, metadata);
+        numValuesRemaining -= numValuesPerPage;
+    }
+    ASSERT_EQ(srcCursor, (uint8_t*)(src.data() + src.size()));
+    for (int i = 0; i < src.size(); i++) {
+        auto page = i / numValuesPerPage;
+        auto indexInPage = i % numValuesPerPage;
+        int64_t value;
+        alg.getValue(dest[page].data(), indexInPage, (uint8_t*)&value, 0, metadata);
+        EXPECT_EQ(src[i] - value, 0);
+        EXPECT_EQ(src[i], value);
+    }
+    std::vector<int64_t> decompressed(src.size());
+    for (int i = 0; i < src.size(); i += numValuesPerPage) {
+        auto page = i / numValuesPerPage;
+        alg.decompressFromPage(dest[page].data(), i % numValuesPerPage,
+            (uint8_t*)decompressed.data(), i, numValuesPerPage, metadata);
+    }
+    ASSERT_EQ(decompressed, src);
+}
+
+TEST(CompressionTests, IntegerPackingMultiPage) {
+    int64_t numValues = 10000;
+    std::vector<int64_t> src(numValues);
+    for (int i = 0; i < numValues; i++) {
+        src[i] = i;
+    }
+
+    integerPackingMultiPage(src);
+}
+
+TEST(CompressionTests, IntegerPackingMultiPageNegative) {
+    int64_t numValues = 10000;
+    std::vector<int64_t> src(numValues);
+    for (int i = 0; i < numValues; i++) {
+        src[i] = -i;
+    }
+
+    integerPackingMultiPage(src);
+}
diff --git a/test/test_files/tinysnb/function/table.test
b/test/test_files/tinysnb/function/table.test index a8133c287c5..e445f091cb1 100644 --- a/test/test_files/tinysnb/function/table.test +++ b/test/test_files/tinysnb/function/table.test @@ -79,4 +79,4 @@ height -LOG ReturnDBVersion -STATEMENT CALL db_version() RETURN version ---- 1 -v0.0.8.7 +v0.0.8.8 diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 9a5c2b4fced..328aef3bf43 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(utf8proc) add_subdirectory(pybind11) add_subdirectory(re2) add_subdirectory(serd) +add_subdirectory(fastpfor) diff --git a/third_party/fastpfor/CMakeLists.txt b/third_party/fastpfor/CMakeLists.txt new file mode 100644 index 00000000000..edb19c8bfc0 --- /dev/null +++ b/third_party/fastpfor/CMakeLists.txt @@ -0,0 +1,6 @@ +add_library(fastpfor + STATIC + fastpfor/bitpacking.cpp fastpfor/bitpackingaligned.cpp) + +target_include_directories(fastpfor PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/third_party/fastpfor/LICENSE b/third_party/fastpfor/LICENSE new file mode 100644 index 00000000000..37ec93a14fd --- /dev/null +++ b/third_party/fastpfor/LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. 
+ +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. 
+ +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/fastpfor/README b/third_party/fastpfor/README new file mode 100644 index 00000000000..698326b27b2 --- /dev/null +++ b/third_party/fastpfor/README @@ -0,0 +1,3 @@ +https://github.com/lemire/FastPFor + +Modified to remove 24 and 32-bit versions of the bitpackingaligned functions since we won't need them, and to remove unneeded includes in common.h. 
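The kernels vendored below each pack or unpack exactly one block of 32 values at a fixed bit width N, reading or writing N packed 32-bit words per block; the new IntegerBitpacking compression presumably drives them block by block. A minimal sketch of that calling pattern, assuming only the __fastpack6/__fastunpack6 signatures that appear in bitpacking.cpp below (the helper functions and their names are illustrative and not part of this patch):

    #include <cstddef>
    #include <cstdint>
    #include "bitpacking.h" // vendored FastPFor declarations of __fastpack*/__fastunpack*

    // Hypothetical helper: pack numBlocks * 32 values at bit width 6.
    // Each kernel call consumes 32 values and emits 6 packed 32-bit words (32 * 6 bits).
    static void packAll6(const uint32_t* in, uint32_t* out, std::size_t numBlocks) {
        for (std::size_t block = 0; block < numBlocks; ++block) {
            __fastpack6(in + block * 32, out + block * 6);
        }
    }

    // Hypothetical helper: the inverse, expanding 6 packed words back into 32 values per block.
    static void unpackAll6(const uint32_t* in, uint32_t* out, std::size_t numBlocks) {
        for (std::size_t block = 0; block < numBlocks; ++block) {
            __fastunpack6(in + block * 6, out + block * 32);
        }
    }

Fixing the block size at 32 values lets every bit width be fully unrolled at compile time, which is why the file below consists of one pack/unpack function per bit width rather than a single generic loop.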
diff --git a/third_party/fastpfor/fastpfor/bitpacking.cpp b/third_party/fastpfor/fastpfor/bitpacking.cpp new file mode 100644 index 00000000000..60bc428d605 --- /dev/null +++ b/third_party/fastpfor/fastpfor/bitpacking.cpp @@ -0,0 +1,1652 @@ +#include "bitpacking.h" +#include +#include + +namespace { + +template +typename std::enable_if<(DELTA + SHR) < 32>::type unpack_single_out( + const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + *out = ((*in) >> SHR) % (1 << DELTA); +} + +template +typename std::enable_if<(DELTA + SHR) >= 32>::type unpack_single_out( + const uint32_t *__restrict__ &in, uint32_t *__restrict__ out) { + *out = (*in) >> SHR; + ++in; + + static const uint8_t NEXT_SHR = SHR + DELTA - 32; + *out |= ((*in) % (1U << NEXT_SHR)) << (32 - SHR); +} + +template +typename std::enable_if<(DELTA + SHR) < 32>::type unpack_single_out( + const uint32_t *__restrict__ in, uint64_t *__restrict__ out) { + *out = ((static_cast(*in)) >> SHR) % (1ULL << DELTA); +} + +template +typename std::enable_if<(DELTA + SHR) >= 32 && (DELTA + SHR) < 64>::type +unpack_single_out(const uint32_t *__restrict__ &in, + uint64_t *__restrict__ out) { + *out = static_cast(*in) >> SHR; + ++in; + if (DELTA + SHR > 32) { + static const uint8_t NEXT_SHR = SHR + DELTA - 32; + *out |= static_cast((*in) % (1U << NEXT_SHR)) << (32 - SHR); + } +} + +template +typename std::enable_if<(DELTA + SHR) >= 64>::type unpack_single_out( + const uint32_t *__restrict__ &in, uint64_t *__restrict__ out) { + *out = static_cast(*in) >> SHR; + ++in; + + *out |= static_cast(*in) << (32 - SHR); + ++in; + + if (DELTA + SHR > 64) { + static const uint8_t NEXT_SHR = DELTA + SHR - 64; + *out |= static_cast((*in) % (1U << NEXT_SHR)) << (64 - SHR); + } +} + +template + typename std::enable_if < + DELTA + SHL<32>::type pack_single_in(const uint32_t in, + uint32_t *__restrict__ out) { + if (SHL == 0) { + *out = in & MASK; + } else { + *out |= (in & MASK) << SHL; + } +} + +template +typename std::enable_if= 32>::type pack_single_in( + const uint32_t in, uint32_t *__restrict__ &out) { + *out |= in << SHL; + ++out; + + if (DELTA + SHL > 32) { + *out = (in & MASK) >> (32 - SHL); + } +} + +template + typename std::enable_if < + DELTA + SHL<32>::type pack_single_in64(const uint64_t in, + uint32_t *__restrict__ out) { + if (SHL == 0) { + *out = static_cast(in & MASK); + } else { + *out |= (in & MASK) << SHL; + } +} + +template + typename std::enable_if < DELTA + SHL >= 32 && + DELTA + SHL<64>::type pack_single_in64(const uint64_t in, + uint32_t *__restrict__ &out) { + if (SHL == 0) { + *out = static_cast(in & MASK); + } else { + *out |= (in & MASK) << SHL; + } + + ++out; + + if (DELTA + SHL > 32) { + *out = static_cast((in & MASK) >> (32 - SHL)); + } +} + +template +typename std::enable_if= 64>::type pack_single_in64( + const uint64_t in, uint32_t *__restrict__ &out) { + *out |= in << SHL; + ++out; + + *out = static_cast((in & MASK) >> (32 - SHL)); + ++out; + + if (DELTA + SHL > 64) { + *out = (in & MASK) >> (64 - SHL); + } +} + +template +struct Unroller { + static void Unpack(const uint32_t *__restrict__ &in, + uint32_t *__restrict__ out) { + unpack_single_out(in, out + OINDEX); + + Unroller::Unpack(in, out); + } + + static void Unpack(const uint32_t *__restrict__ &in, + uint64_t *__restrict__ out) { + unpack_single_out(in, out + OINDEX); + + Unroller::Unpack(in, out); + } + + static void Pack(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + pack_single_in(in[OINDEX], + out); + + Unroller::Pack(in, out); + } + + static void 
Pack(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + pack_single_in64( + in[OINDEX], out); + + Unroller::Pack(in, out); + } + + static void PackNoMask(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + pack_single_in(in[OINDEX], out); + + Unroller::PackNoMask(in, out); + } + + static void PackNoMask(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + pack_single_in64(in[OINDEX], + out); + + Unroller::PackNoMask(in, out); + } +}; + +template +struct Unroller { + enum { SHIFT = (DELTA * 31) % 32 }; + + static void Unpack(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + out[31] = (*in) >> SHIFT; + } + + static void Unpack(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + out[31] = (*in) >> SHIFT; + if (DELTA > 32) { + ++in; + out[31] |= static_cast(*in) << (32 - SHIFT); + } + } + + static void Pack(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out |= (in[31] << SHIFT); + } + + static void Pack(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out |= (in[31] << SHIFT); + if (DELTA > 32) { + ++out; + *out = static_cast(in[31] >> (32 - SHIFT)); + } + } + + static void PackNoMask(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out |= (in[31] << SHIFT); + } + + static void PackNoMask(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + *out |= (in[31] << SHIFT); + if (DELTA > 32) { + ++out; + *out = static_cast(in[31] >> (32 - SHIFT)); + } + } +}; +} // namespace + +// Special cases +void __fastunpack0(const uint32_t *__restrict__, uint32_t *__restrict__ out) { + for (uint32_t i = 0; i < 32; ++i) *(out++) = 0; +} + +void __fastunpack0(const uint32_t *__restrict__, uint64_t *__restrict__ out) { + for (uint32_t i = 0; i < 32; ++i) *(out++) = 0; +} + +void __fastpack0(const uint32_t *__restrict__, uint32_t *__restrict__) {} +void __fastpack0(const uint64_t *__restrict__, uint32_t *__restrict__) {} + +void __fastpackwithoutmask0(const uint32_t *__restrict__, + uint32_t *__restrict__) {} +void __fastpackwithoutmask0(const uint64_t *__restrict__, + uint32_t *__restrict__) {} + +// fastunpack for 32 bits +void __fastunpack1(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<1>::Unpack(in, out); +} + +void __fastunpack2(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<2>::Unpack(in, out); +} + +void __fastunpack3(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<3>::Unpack(in, out); +} + +void __fastunpack4(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (uint32_t outer = 0; outer < 4; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<5>::Unpack(in, out); +} + +void __fastunpack6(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<6>::Unpack(in, out); +} + +void __fastunpack7(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<7>::Unpack(in, out); +} + +void __fastunpack8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (uint32_t outer = 0; outer < 8; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) + *(out++) = ((*in) >> inwordpointer) % (1U << 8); + ++in; + } +} + +void __fastunpack9(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { 
+ Unroller<9>::Unpack(in, out); +} + +void __fastunpack10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<10>::Unpack(in, out); +} + +void __fastunpack11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<11>::Unpack(in, out); +} + +void __fastunpack12(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<12>::Unpack(in, out); +} + +void __fastunpack13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<13>::Unpack(in, out); +} + +void __fastunpack14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<14>::Unpack(in, out); +} + +void __fastunpack15(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<15>::Unpack(in, out); +} + +void __fastunpack16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (uint32_t outer = 0; outer < 16; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 16) + *(out++) = ((*in) >> inwordpointer) % (1U << 16); + ++in; + } +} + +void __fastunpack17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<17>::Unpack(in, out); +} + +void __fastunpack18(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<18>::Unpack(in, out); +} + +void __fastunpack19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<19>::Unpack(in, out); +} + +void __fastunpack20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<20>::Unpack(in, out); +} + +void __fastunpack21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<21>::Unpack(in, out); +} + +void __fastunpack22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<22>::Unpack(in, out); +} + +void __fastunpack23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<23>::Unpack(in, out); +} + +void __fastunpack24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<24>::Unpack(in, out); +} + +void __fastunpack25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<25>::Unpack(in, out); +} + +void __fastunpack26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<26>::Unpack(in, out); +} + +void __fastunpack27(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<27>::Unpack(in, out); +} + +void __fastunpack28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<28>::Unpack(in, out); +} + +void __fastunpack29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<29>::Unpack(in, out); +} + +void __fastunpack30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<30>::Unpack(in, out); +} + +void __fastunpack31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<31>::Unpack(in, out); +} + +void __fastunpack32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) out[k] = in[k]; +} + +// fastupack for 64 bits +void __fastunpack1(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<1>::Unpack(in, out); +} + +void __fastunpack2(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<2>::Unpack(in, out); +} + +void __fastunpack3(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<3>::Unpack(in, out); +} + +void __fastunpack4(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + for (uint32_t 
outer = 0; outer < 4; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<5>::Unpack(in, out); +} + +void __fastunpack6(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<6>::Unpack(in, out); +} + +void __fastunpack7(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<7>::Unpack(in, out); +} + +void __fastunpack8(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + for (uint32_t outer = 0; outer < 8; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) { + *(out++) = ((*in) >> inwordpointer) % (1U << 8); + } + ++in; + } +} + +void __fastunpack9(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<9>::Unpack(in, out); +} + +void __fastunpack10(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<10>::Unpack(in, out); +} + +void __fastunpack11(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<11>::Unpack(in, out); +} + +void __fastunpack12(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<12>::Unpack(in, out); +} + +void __fastunpack13(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<13>::Unpack(in, out); +} + +void __fastunpack14(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<14>::Unpack(in, out); +} + +void __fastunpack15(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<15>::Unpack(in, out); +} + +void __fastunpack16(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + for (uint32_t outer = 0; outer < 16; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 16) + *(out++) = ((*in) >> inwordpointer) % (1U << 16); + ++in; + } +} + +void __fastunpack17(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<17>::Unpack(in, out); +} + +void __fastunpack18(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<18>::Unpack(in, out); +} + +void __fastunpack19(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<19>::Unpack(in, out); +} + +void __fastunpack20(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<20>::Unpack(in, out); +} + +void __fastunpack21(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<21>::Unpack(in, out); +} + +void __fastunpack22(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<22>::Unpack(in, out); +} + +void __fastunpack23(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<23>::Unpack(in, out); +} + +void __fastunpack24(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<24>::Unpack(in, out); +} + +void __fastunpack25(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<25>::Unpack(in, out); +} + +void __fastunpack26(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<26>::Unpack(in, out); +} + +void __fastunpack27(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<27>::Unpack(in, out); +} + +void __fastunpack28(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<28>::Unpack(in, out); +} + +void __fastunpack29(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + 
Unroller<29>::Unpack(in, out); +} + +void __fastunpack30(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<30>::Unpack(in, out); +} + +void __fastunpack31(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<31>::Unpack(in, out); +} + +void __fastunpack32(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) out[k] = in[k]; +} + +void __fastunpack33(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<33>::Unpack(in, out); +} + +void __fastunpack34(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<34>::Unpack(in, out); +} + +void __fastunpack35(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<35>::Unpack(in, out); +} + +void __fastunpack36(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<36>::Unpack(in, out); +} + +void __fastunpack37(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<37>::Unpack(in, out); +} + +void __fastunpack38(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<38>::Unpack(in, out); +} + +void __fastunpack39(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<39>::Unpack(in, out); +} + +void __fastunpack40(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<40>::Unpack(in, out); +} + +void __fastunpack41(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<41>::Unpack(in, out); +} + +void __fastunpack42(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<42>::Unpack(in, out); +} + +void __fastunpack43(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<43>::Unpack(in, out); +} + +void __fastunpack44(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<44>::Unpack(in, out); +} + +void __fastunpack45(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<45>::Unpack(in, out); +} + +void __fastunpack46(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<46>::Unpack(in, out); +} + +void __fastunpack47(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<47>::Unpack(in, out); +} + +void __fastunpack48(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<48>::Unpack(in, out); +} + +void __fastunpack49(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<49>::Unpack(in, out); +} + +void __fastunpack50(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<50>::Unpack(in, out); +} + +void __fastunpack51(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<51>::Unpack(in, out); +} + +void __fastunpack52(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<52>::Unpack(in, out); +} + +void __fastunpack53(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<53>::Unpack(in, out); +} + +void __fastunpack54(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<54>::Unpack(in, out); +} + +void __fastunpack55(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<55>::Unpack(in, out); +} + +void __fastunpack56(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<56>::Unpack(in, out); +} + +void __fastunpack57(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<57>::Unpack(in, out); +} + +void __fastunpack58(const 
uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<58>::Unpack(in, out); +} + +void __fastunpack59(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<59>::Unpack(in, out); +} + +void __fastunpack60(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<60>::Unpack(in, out); +} + +void __fastunpack61(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<61>::Unpack(in, out); +} + +void __fastunpack62(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<62>::Unpack(in, out); +} + +void __fastunpack63(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + Unroller<63>::Unpack(in, out); +} + +void __fastunpack64(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) { + out[k] = in[k * 2]; + out[k] |= static_cast(in[k * 2 + 1]) << 32; + } +} + +// fastpack for 32 bits + +void __fastpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<1>::Pack(in, out); +} + +void __fastpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<2>::Pack(in, out); +} + +void __fastpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<3>::Pack(in, out); +} + +void __fastpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<4>::Pack(in, out); +} + +void __fastpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<5>::Pack(in, out); +} + +void __fastpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<6>::Pack(in, out); +} + +void __fastpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<7>::Pack(in, out); +} + +void __fastpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<8>::Pack(in, out); +} + +void __fastpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<9>::Pack(in, out); +} + +void __fastpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<10>::Pack(in, out); +} + +void __fastpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<11>::Pack(in, out); +} + +void __fastpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<12>::Pack(in, out); +} + +void __fastpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<13>::Pack(in, out); +} + +void __fastpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<14>::Pack(in, out); +} + +void __fastpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<15>::Pack(in, out); +} + +void __fastpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<16>::Pack(in, out); +} + +void __fastpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<17>::Pack(in, out); +} + +void __fastpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<18>::Pack(in, out); +} + +void __fastpack19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<19>::Pack(in, out); +} + +void __fastpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<20>::Pack(in, out); +} + +void __fastpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<21>::Pack(in, out); +} + +void __fastpack22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<22>::Pack(in, out); +} + +void __fastpack23(const uint32_t 
*__restrict__ in, uint32_t *__restrict__ out) { + Unroller<23>::Pack(in, out); +} + +void __fastpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<24>::Pack(in, out); +} + +void __fastpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<25>::Pack(in, out); +} + +void __fastpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<26>::Pack(in, out); +} + +void __fastpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<27>::Pack(in, out); +} + +void __fastpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<28>::Pack(in, out); +} + +void __fastpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<29>::Pack(in, out); +} + +void __fastpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<30>::Pack(in, out); +} + +void __fastpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<31>::Pack(in, out); +} + +void __fastpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) out[k] = in[k]; +} + +// fastpack for 64 bits + +void __fastpack1(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<1>::Pack(in, out); +} + +void __fastpack2(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<2>::Pack(in, out); +} + +void __fastpack3(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<3>::Pack(in, out); +} + +void __fastpack4(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<4>::Pack(in, out); +} + +void __fastpack5(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<5>::Pack(in, out); +} + +void __fastpack6(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<6>::Pack(in, out); +} + +void __fastpack7(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<7>::Pack(in, out); +} + +void __fastpack8(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<8>::Pack(in, out); +} + +void __fastpack9(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<9>::Pack(in, out); +} + +void __fastpack10(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<10>::Pack(in, out); +} + +void __fastpack11(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<11>::Pack(in, out); +} + +void __fastpack12(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<12>::Pack(in, out); +} + +void __fastpack13(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<13>::Pack(in, out); +} + +void __fastpack14(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<14>::Pack(in, out); +} + +void __fastpack15(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<15>::Pack(in, out); +} + +void __fastpack16(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<16>::Pack(in, out); +} + +void __fastpack17(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<17>::Pack(in, out); +} + +void __fastpack18(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<18>::Pack(in, out); +} + +void __fastpack19(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<19>::Pack(in, out); +} + +void __fastpack20(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<20>::Pack(in, out); +} + +void 
__fastpack21(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<21>::Pack(in, out); +} + +void __fastpack22(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<22>::Pack(in, out); +} + +void __fastpack23(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<23>::Pack(in, out); +} + +void __fastpack24(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<24>::Pack(in, out); +} + +void __fastpack25(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<25>::Pack(in, out); +} + +void __fastpack26(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<26>::Pack(in, out); +} + +void __fastpack27(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<27>::Pack(in, out); +} + +void __fastpack28(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<28>::Pack(in, out); +} + +void __fastpack29(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<29>::Pack(in, out); +} + +void __fastpack30(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<30>::Pack(in, out); +} + +void __fastpack31(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<31>::Pack(in, out); +} + +void __fastpack32(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) { + out[k] = static_cast<uint32_t>(in[k]); + } +} + +void __fastpack33(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<33>::Pack(in, out); +} + +void __fastpack34(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<34>::Pack(in, out); +} + +void __fastpack35(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<35>::Pack(in, out); +} + +void __fastpack36(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<36>::Pack(in, out); +} + +void __fastpack37(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<37>::Pack(in, out); +} + +void __fastpack38(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<38>::Pack(in, out); +} + +void __fastpack39(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<39>::Pack(in, out); +} + +void __fastpack40(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<40>::Pack(in, out); +} + +void __fastpack41(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<41>::Pack(in, out); +} + +void __fastpack42(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<42>::Pack(in, out); +} + +void __fastpack43(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<43>::Pack(in, out); +} + +void __fastpack44(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<44>::Pack(in, out); +} + +void __fastpack45(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<45>::Pack(in, out); +} + +void __fastpack46(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<46>::Pack(in, out); +} + +void __fastpack47(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<47>::Pack(in, out); +} + +void __fastpack48(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<48>::Pack(in, out); +} + +void __fastpack49(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<49>::Pack(in, out); +} + +void __fastpack50(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { +
Unroller<50>::Pack(in, out); +} + +void __fastpack51(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<51>::Pack(in, out); +} + +void __fastpack52(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<52>::Pack(in, out); +} + +void __fastpack53(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<53>::Pack(in, out); +} + +void __fastpack54(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<54>::Pack(in, out); +} + +void __fastpack55(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<55>::Pack(in, out); +} + +void __fastpack56(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<56>::Pack(in, out); +} + +void __fastpack57(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<57>::Pack(in, out); +} + +void __fastpack58(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<58>::Pack(in, out); +} + +void __fastpack59(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<59>::Pack(in, out); +} + +void __fastpack60(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<60>::Pack(in, out); +} + +void __fastpack61(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<61>::Pack(in, out); +} + +void __fastpack62(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<62>::Pack(in, out); +} + +void __fastpack63(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + Unroller<63>::Pack(in, out); +} + +void __fastpack64(const uint64_t *__restrict__ in, uint32_t *__restrict__ out) { + for (int i = 0; i < 32; ++i) { + out[2 * i] = static_cast<uint32_t>(in[i]); + out[2 * i + 1] = in[i] >> 32; + } +} + +// fastpackwithoutmask for 32 bits +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask1(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<1>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask2(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<2>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask3(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<3>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask4(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<4>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask5(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<5>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask6(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<6>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask7(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<7>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<8>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask9(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) {
+ Unroller<9>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<10>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<11>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask12(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<12>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<13>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<14>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask15(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<15>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<16>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<17>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask18(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<18>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<19>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<20>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<21>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<22>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<23>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<24>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<25>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<26>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void 
__fastpackwithoutmask27(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<27>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<28>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<29>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<30>::PackNoMask(in, out); +} + +/*assumes that integers fit in the prescribed number of bits */ +void __fastpackwithoutmask31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<31>::PackNoMask(in, out); +} + +void __fastpackwithoutmask32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int k = 0; k < 32; ++k) out[k] = in[k]; +} + +// fastpackwithoutmask for 64 bits +void __fastpackwithoutmask1(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<1>::PackNoMask(in, out); +} + +void __fastpackwithoutmask2(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<2>::PackNoMask(in, out); +} + +void __fastpackwithoutmask3(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<3>::PackNoMask(in, out); +} + +void __fastpackwithoutmask4(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<4>::PackNoMask(in, out); +} + +void __fastpackwithoutmask5(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<5>::PackNoMask(in, out); +} + +void __fastpackwithoutmask6(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<6>::PackNoMask(in, out); +} + +void __fastpackwithoutmask7(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<7>::PackNoMask(in, out); +} + +void __fastpackwithoutmask8(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<8>::PackNoMask(in, out); +} + +void __fastpackwithoutmask9(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<9>::PackNoMask(in, out); +} + +void __fastpackwithoutmask10(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<10>::PackNoMask(in, out); +} + +void __fastpackwithoutmask11(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<11>::PackNoMask(in, out); +} + +void __fastpackwithoutmask12(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<12>::PackNoMask(in, out); +} + +void __fastpackwithoutmask13(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<13>::PackNoMask(in, out); +} + +void __fastpackwithoutmask14(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<14>::PackNoMask(in, out); +} + +void __fastpackwithoutmask15(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<15>::PackNoMask(in, out); +} + +void __fastpackwithoutmask16(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<16>::PackNoMask(in, out); +} + +void __fastpackwithoutmask17(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<17>::PackNoMask(in, out); +} + +void __fastpackwithoutmask18(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + 
Unroller<18>::PackNoMask(in, out); +} + +void __fastpackwithoutmask19(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<19>::PackNoMask(in, out); +} + +void __fastpackwithoutmask20(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<20>::PackNoMask(in, out); +} + +void __fastpackwithoutmask21(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<21>::PackNoMask(in, out); +} + +void __fastpackwithoutmask22(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<22>::PackNoMask(in, out); +} + +void __fastpackwithoutmask23(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<23>::PackNoMask(in, out); +} + +void __fastpackwithoutmask24(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<24>::PackNoMask(in, out); +} + +void __fastpackwithoutmask25(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<25>::PackNoMask(in, out); +} + +void __fastpackwithoutmask26(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<26>::PackNoMask(in, out); +} + +void __fastpackwithoutmask27(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<27>::PackNoMask(in, out); +} + +void __fastpackwithoutmask28(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<28>::PackNoMask(in, out); +} + +void __fastpackwithoutmask29(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<29>::PackNoMask(in, out); +} + +void __fastpackwithoutmask30(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<30>::PackNoMask(in, out); +} + +void __fastpackwithoutmask31(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<31>::PackNoMask(in, out); +} + +void __fastpackwithoutmask32(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int i = 0; i < 32; ++i) { + out[i] = static_cast<uint32_t>(in[i]); + } +} + +void __fastpackwithoutmask33(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<33>::PackNoMask(in, out); +} + +void __fastpackwithoutmask34(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<34>::PackNoMask(in, out); +} + +void __fastpackwithoutmask35(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<35>::PackNoMask(in, out); +} + +void __fastpackwithoutmask36(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<36>::PackNoMask(in, out); +} + +void __fastpackwithoutmask37(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<37>::PackNoMask(in, out); +} + +void __fastpackwithoutmask38(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<38>::PackNoMask(in, out); +} + +void __fastpackwithoutmask39(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<39>::PackNoMask(in, out); +} + +void __fastpackwithoutmask40(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<40>::PackNoMask(in, out); +} + +void __fastpackwithoutmask41(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<41>::PackNoMask(in, out); +} + +void __fastpackwithoutmask42(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<42>::PackNoMask(in, out); +} + +void __fastpackwithoutmask43(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<43>::PackNoMask(in, out); +} + +void __fastpackwithoutmask44(const uint64_t *__restrict__
in, + uint32_t *__restrict__ out) { + Unroller<44>::PackNoMask(in, out); +} + +void __fastpackwithoutmask45(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<45>::PackNoMask(in, out); +} + +void __fastpackwithoutmask46(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<46>::PackNoMask(in, out); +} + +void __fastpackwithoutmask47(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<47>::PackNoMask(in, out); +} + +void __fastpackwithoutmask48(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<48>::PackNoMask(in, out); +} + +void __fastpackwithoutmask49(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<49>::PackNoMask(in, out); +} + +void __fastpackwithoutmask50(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<50>::PackNoMask(in, out); +} + +void __fastpackwithoutmask51(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<51>::PackNoMask(in, out); +} + +void __fastpackwithoutmask52(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<52>::PackNoMask(in, out); +} + +void __fastpackwithoutmask53(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<53>::PackNoMask(in, out); +} + +void __fastpackwithoutmask54(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<54>::PackNoMask(in, out); +} + +void __fastpackwithoutmask55(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<55>::PackNoMask(in, out); +} + +void __fastpackwithoutmask56(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<56>::PackNoMask(in, out); +} + +void __fastpackwithoutmask57(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<57>::PackNoMask(in, out); +} + +void __fastpackwithoutmask58(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<58>::PackNoMask(in, out); +} + +void __fastpackwithoutmask59(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<59>::PackNoMask(in, out); +} + +void __fastpackwithoutmask60(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<60>::PackNoMask(in, out); +} + +void __fastpackwithoutmask61(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<61>::PackNoMask(in, out); +} + +void __fastpackwithoutmask62(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<62>::PackNoMask(in, out); +} + +void __fastpackwithoutmask63(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + Unroller<63>::PackNoMask(in, out); +} + +void __fastpackwithoutmask64(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out) { + for (int i = 0; i < 32; ++i) { + out[2 * i] = static_cast<uint32_t>(in[i]); + out[2 * i + 1] = in[i] >> 32; + } +} diff --git a/third_party/fastpfor/fastpfor/bitpacking.h b/third_party/fastpfor/fastpfor/bitpacking.h new file mode 100644 index 00000000000..4edf09f7e6b --- /dev/null +++ b/third_party/fastpfor/fastpfor/bitpacking.h @@ -0,0 +1,487 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/.
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef BITPACKING_H_ +#define BITPACKING_H_ +#include "common.h" + +void __fastunpack0(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastunpack10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack12(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack15(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack18(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack27(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastunpack32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); + +void __fastunpack0(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack1(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack2(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack3(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack4(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack5(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack6(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack7(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); 
+void __fastunpack8(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack9(const uint32_t *__restrict__ in, uint64_t *__restrict__ out); +void __fastunpack10(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack11(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack12(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack13(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack14(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack15(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack16(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack17(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack18(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack19(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack20(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack21(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack22(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack23(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack24(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack25(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack26(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack27(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack28(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack29(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack30(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack31(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack32(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack33(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack34(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack35(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack36(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack37(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack38(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack39(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack40(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack41(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack42(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack43(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack44(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack45(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack46(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack47(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack48(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack49(const uint32_t *__restrict__ in, + uint64_t 
*__restrict__ out); +void __fastunpack50(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack51(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack52(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack53(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack54(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack55(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack56(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack57(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack58(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack59(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack60(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack61(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack62(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack63(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); +void __fastunpack64(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out); + +void __fastpack0(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack1(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack2(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack3(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack4(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack5(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack6(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack7(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack8(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack9(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack10(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack11(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack12(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack13(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack14(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack15(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack16(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack17(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack18(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack19(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack20(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack21(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack22(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack23(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack24(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack25(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack26(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack27(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); 
+void __fastpack28(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack29(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack30(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack31(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack32(const uint32_t *__restrict__ in, uint32_t *__restrict__ out); + +void __fastpack0(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack1(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack2(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack3(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack4(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack5(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack6(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack7(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack8(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack9(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack10(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack11(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack12(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack13(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack14(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack15(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack16(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack17(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack18(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack19(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack20(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack21(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack22(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack23(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack24(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack25(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack26(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack27(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack28(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack29(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack30(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack31(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack32(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack33(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack34(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack35(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack36(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack37(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack38(const uint64_t *__restrict__ in, uint32_t *__restrict__ 
out); +void __fastpack39(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack40(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack41(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack42(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack43(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack44(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack45(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack46(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack47(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack48(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack49(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack50(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack51(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack52(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack53(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack54(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack55(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack56(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack57(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack58(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack59(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack60(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack61(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack62(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack63(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); +void __fastpack64(const uint64_t *__restrict__ in, uint32_t *__restrict__ out); + +void __fastpackwithoutmask0(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask1(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask2(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask3(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask4(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask5(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask6(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask7(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask9(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask10(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask11(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask12(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask13(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask14(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask15(const 
uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask17(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask18(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask19(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask20(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask21(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask22(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask23(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask24(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask25(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask26(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask27(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask28(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask29(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask30(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask31(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask32(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out); + +void __fastpackwithoutmask0(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask1(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask2(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask3(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask4(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask5(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask6(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask7(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask8(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask9(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask10(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask11(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask12(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask13(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask14(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask15(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask16(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask17(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask18(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask19(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void 
__fastpackwithoutmask20(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask21(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask22(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask23(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask24(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask25(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask26(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask27(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask28(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask29(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask30(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask31(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask32(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask33(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask34(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask35(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask36(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask37(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask38(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask39(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask40(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask41(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask42(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask43(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask44(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask45(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask46(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask47(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask48(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask49(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask50(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask51(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask52(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask53(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask54(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask55(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask56(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask57(const uint64_t *__restrict__ in, + uint32_t 
*__restrict__ out); +void __fastpackwithoutmask58(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask59(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask60(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask61(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask62(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask63(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); +void __fastpackwithoutmask64(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out); + +#endif /* BITPACKING_H_ */ diff --git a/third_party/fastpfor/fastpfor/bitpackingaligned.cpp b/third_party/fastpfor/fastpfor/bitpackingaligned.cpp new file mode 100644 index 00000000000..690b7a6feea --- /dev/null +++ b/third_party/fastpfor/fastpfor/bitpackingaligned.cpp @@ -0,0 +1,5794 @@ +#include "bitpackingaligned.h" + +namespace FastPForLib { + +uint32_t *nullpacker(const uint32_t *__restrict__ /*in*/, + uint32_t *__restrict__ out) { + return out; +} + +const uint32_t *nullunpacker8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + memset(out, 0, 8 * 4); + return in; +} + +uint32_t *__fastpackwithoutmask1_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 7; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask2_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 14; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask3_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 21; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask4_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask5_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 25; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (5 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask6_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (6 - 4); + ++in; + 
*out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 10; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask7_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (7 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 17; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask8_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask9_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (9 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (9 - 8); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask10_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (10 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (10 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask11_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (11 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (11 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 13; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask12_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (12 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (12 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask13_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (13 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (13 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (13 - 8); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask14_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (14 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> 
(14 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (14 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask15_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (15 - 13); + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (15 - 11); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (15 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask16_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask17_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (17 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (17 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (17 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (17 - 8); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask18_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (18 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (18 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (18 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (18 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask19_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (19 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (19 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (19 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (19 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask20_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (20 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (20 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask21_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (21 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (21 - 20); + ++in; + *out |= 
((*in)) << 20; + ++out; + *out = ((*in)) >> (21 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (21 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (21 - 8); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask22_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (22 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (22 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (22 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (22 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (22 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask23_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (23 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (23 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (23 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (23 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (23 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask24_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask25_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (25 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (25 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (25 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (25 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (25 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (25 - 8); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask26_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (26 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (26 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (26 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (26 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (26 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (26 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask27_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (27 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (27 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (27 - 12); + 
++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (27 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in)) >> (27 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (27 - 24); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask28_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask29_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (29 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (29 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (29 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (29 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (29 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (29 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (29 - 8); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask30_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (30 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (30 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (30 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (30 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (30 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (30 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (30 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask31_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (31 - 30); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (31 - 29); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (31 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (31 - 27); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (31 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (31 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (31 - 24); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask32_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + + return out; +} + +const uint32_t *__fastunpack1_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) & 1; + out++; + *out = ((*in) >> 1) & 1; + out++; + *out = ((*in) >> 2) & 1; + out++; + *out = ((*in) >> 3) & 1; + out++; + *out = ((*in) >> 4) & 1; 
+ out++; + *out = ((*in) >> 5) & 1; + out++; + *out = ((*in) >> 6) & 1; + out++; + *out = ((*in) >> 7) & 1; + out++; + + return in + 1; +} + +const uint32_t *__fastunpack2_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack3_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 3); + out++; + *out = ((*in) >> 3) % (1U << 3); + out++; + *out = ((*in) >> 6) % (1U << 3); + out++; + *out = ((*in) >> 9) % (1U << 3); + out++; + *out = ((*in) >> 12) % (1U << 3); + out++; + *out = ((*in) >> 15) % (1U << 3); + out++; + *out = ((*in) >> 18) % (1U << 3); + out++; + *out = ((*in) >> 21) % (1U << 3); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack4_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack5_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 5); + out++; + *out = ((*in) >> 5) % (1U << 5); + out++; + *out = ((*in) >> 10) % (1U << 5); + out++; + *out = ((*in) >> 15) % (1U << 5); + out++; + *out = ((*in) >> 20) % (1U << 5); + out++; + *out = ((*in) >> 25) % (1U << 5); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 3)) << (5 - 3); + out++; + *out = ((*in) >> 3) % (1U << 5); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack6_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out = ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack7_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 7); + out++; + *out = ((*in) >> 7) % (1U << 7); + out++; + *out = ((*in) >> 14) % (1U << 7); + out++; + *out = ((*in) >> 21) % (1U << 7); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 3)) << (7 - 3); + out++; + *out = ((*in) >> 3) % (1U << 7); + out++; + *out = ((*in) >> 10) % (1U << 7); + out++; + *out = ((*in) >> 17) % (1U << 7); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack8_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) 
>> 24); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack9_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 9); + out++; + *out = ((*in) >> 9) % (1U << 9); + out++; + *out = ((*in) >> 18) % (1U << 9); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 4)) << (9 - 4); + out++; + *out = ((*in) >> 4) % (1U << 9); + out++; + *out = ((*in) >> 13) % (1U << 9); + out++; + *out = ((*in) >> 22) % (1U << 9); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 8)) << (9 - 8); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack10_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << (10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack11_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 11); + out++; + *out = ((*in) >> 11) % (1U << 11); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 1)) << (11 - 1); + out++; + *out = ((*in) >> 1) % (1U << 11); + out++; + *out = ((*in) >> 12) % (1U << 11); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 2)) << (11 - 2); + out++; + *out = ((*in) >> 2) % (1U << 11); + out++; + *out = ((*in) >> 13) % (1U << 11); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack12_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack13_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 13); + out++; + *out = ((*in) >> 13) % (1U << 13); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 7)) << (13 - 7); + out++; + *out = ((*in) >> 7) % (1U << 13); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 1)) << (13 - 1); + out++; + *out = ((*in) >> 1) % (1U << 13); + out++; + *out = ((*in) >> 14) % (1U << 13); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 8)) << (13 - 8); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack14_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack15_8(const uint32_t 
*__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 15); + out++; + *out = ((*in) >> 15) % (1U << 15); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 13)) << (15 - 13); + out++; + *out = ((*in) >> 13) % (1U << 15); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 11)) << (15 - 11); + out++; + *out = ((*in) >> 11) % (1U << 15); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 9)) << (15 - 9); + out++; + *out = ((*in) >> 9) % (1U << 15); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack16_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack17_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 2)) << (17 - 2); + out++; + *out = ((*in) >> 2) % (1U << 17); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 4)) << (17 - 4); + out++; + *out = ((*in) >> 4) % (1U << 17); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 6)) << (17 - 6); + out++; + *out = ((*in) >> 6) % (1U << 17); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 8)) << (17 - 8); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack18_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack19_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 6)) << (19 - 6); + out++; + *out = ((*in) >> 6) % (1U << 19); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 12)) << (19 - 12); + out++; + *out = ((*in) >> 12) % (1U << 19); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 18)) << (19 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 5)) << (19 - 5); + out++; + *out = ((*in) >> 5) % (1U << 19); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack20_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 
12); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack21_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 10)) << (21 - 10); + out++; + *out = ((*in) >> 10) % (1U << 21); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 20)) << (21 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 9)) << (21 - 9); + out++; + *out = ((*in) >> 9) % (1U << 21); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 19)) << (21 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 8)) << (21 - 8); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack22_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack23_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 14)) << (23 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 5)) << (23 - 5); + out++; + *out = ((*in) >> 5) % (1U << 23); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 19)) << (23 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 10)) << (23 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 1)) << (23 - 1); + out++; + *out = ((*in) >> 1) % (1U << 23); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack24_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack25_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 18)) << (25 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 11)) << (25 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 4)) << (25 - 4); + out++; + *out = ((*in) >> 4) % (1U << 25); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 22)) << (25 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 15)) << (25 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 8)) << (25 - 8); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack26_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ 
out) { + + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack27_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 22)) << (27 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 17)) << (27 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 12)) << (27 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 7)) << (27 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 2)) << (27 - 2); + out++; + *out = ((*in) >> 2) % (1U << 27); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 24)) << (27 - 24); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack28_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack29_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 26)) << (29 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 23)) << (29 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 20)) << (29 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 17)) << (29 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 14)) << (29 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 11)) << (29 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 8)) << (29 - 8); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack30_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= 
((*in) % (1U << 16)) << (30 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack31_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 31); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 30)) << (31 - 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 29)) << (31 - 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 28)) << (31 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 27)) << (31 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 26)) << (31 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 25)) << (31 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 24)) << (31 - 24); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack32_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + + return in; +} + +const uint32_t *fastunpack_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + switch (bit) { + case 0: + return nullunpacker8(in, out); + + case 1: + return __fastunpack1_8(in, out); + + case 2: + return __fastunpack2_8(in, out); + + case 3: + return __fastunpack3_8(in, out); + + case 4: + return __fastunpack4_8(in, out); + + case 5: + return __fastunpack5_8(in, out); + + case 6: + return __fastunpack6_8(in, out); + + case 7: + return __fastunpack7_8(in, out); + + case 8: + return __fastunpack8_8(in, out); + + case 9: + return __fastunpack9_8(in, out); + + case 10: + return __fastunpack10_8(in, out); + + case 11: + return __fastunpack11_8(in, out); + + case 12: + return __fastunpack12_8(in, out); + + case 13: + return __fastunpack13_8(in, out); + + case 14: + return __fastunpack14_8(in, out); + + case 15: + return __fastunpack15_8(in, out); + + case 16: + return __fastunpack16_8(in, out); + + case 17: + return __fastunpack17_8(in, out); + + case 18: + return __fastunpack18_8(in, out); + + case 19: + return __fastunpack19_8(in, out); + + case 20: + return __fastunpack20_8(in, out); + + case 21: + return __fastunpack21_8(in, out); + + case 22: + return __fastunpack22_8(in, out); + + case 23: + return __fastunpack23_8(in, out); + + case 24: + return __fastunpack24_8(in, out); + + case 25: + return __fastunpack25_8(in, out); + + case 26: + return __fastunpack26_8(in, out); + + case 27: + return __fastunpack27_8(in, out); + + case 28: + return __fastunpack28_8(in, out); + + case 29: + return __fastunpack29_8(in, out); + + case 30: + return __fastunpack30_8(in, out); + + case 31: + return __fastunpack31_8(in, out); + + case 32: + return __fastunpack32_8(in, out); + + default: + break; + } + throw std::logic_error("number of bits is unsupported"); +} + +/*assumes that integers fit in the prescribed number of bits*/ +uint32_t *fastpackwithoutmask_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit) { + switch (bit) { + case 0: + return nullpacker(in, out); + + case 1: + return __fastpackwithoutmask1_8(in, out); + + case 2: + return __fastpackwithoutmask2_8(in, out); + + case 3: + return __fastpackwithoutmask3_8(in, out); + + case 4: + return __fastpackwithoutmask4_8(in, 
out); + + case 5: + return __fastpackwithoutmask5_8(in, out); + + case 6: + return __fastpackwithoutmask6_8(in, out); + + case 7: + return __fastpackwithoutmask7_8(in, out); + + case 8: + return __fastpackwithoutmask8_8(in, out); + + case 9: + return __fastpackwithoutmask9_8(in, out); + + case 10: + return __fastpackwithoutmask10_8(in, out); + + case 11: + return __fastpackwithoutmask11_8(in, out); + + case 12: + return __fastpackwithoutmask12_8(in, out); + + case 13: + return __fastpackwithoutmask13_8(in, out); + + case 14: + return __fastpackwithoutmask14_8(in, out); + + case 15: + return __fastpackwithoutmask15_8(in, out); + + case 16: + return __fastpackwithoutmask16_8(in, out); + + case 17: + return __fastpackwithoutmask17_8(in, out); + + case 18: + return __fastpackwithoutmask18_8(in, out); + + case 19: + return __fastpackwithoutmask19_8(in, out); + + case 20: + return __fastpackwithoutmask20_8(in, out); + + case 21: + return __fastpackwithoutmask21_8(in, out); + + case 22: + return __fastpackwithoutmask22_8(in, out); + + case 23: + return __fastpackwithoutmask23_8(in, out); + + case 24: + return __fastpackwithoutmask24_8(in, out); + + case 25: + return __fastpackwithoutmask25_8(in, out); + + case 26: + return __fastpackwithoutmask26_8(in, out); + + case 27: + return __fastpackwithoutmask27_8(in, out); + + case 28: + return __fastpackwithoutmask28_8(in, out); + + case 29: + return __fastpackwithoutmask29_8(in, out); + + case 30: + return __fastpackwithoutmask30_8(in, out); + + case 31: + return __fastpackwithoutmask31_8(in, out); + + case 32: + return __fastpackwithoutmask32_8(in, out); + + default: + break; + } + throw std::logic_error("number of bits is unsupported"); +} + +const uint32_t *nullunpacker16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + memset(out, 0, 16 * 4); + return in; +} + +uint32_t *__fastpackwithoutmask1_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 15; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask2_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 26; + ++in; + *out |= ((*in)) << 28; + ++in; + *out |= ((*in)) << 30; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask3_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 27; + 
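+  // the next 3-bit value straddles the 32-bit boundary: its low 2 bits land in bits 30-31 of this word, and the `>> (3 - 1)` below carries its high bit into bit 0 of the following word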
++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (3 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 13; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask4_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 28; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask5_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 25; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (5 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 23; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (5 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 11; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask6_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (6 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (6 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 26; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask7_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (7 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 24; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (7 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (7 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 9; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask8_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) 
<< 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 24; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask9_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (9 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 22; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (9 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 17; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (9 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 21; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (9 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask10_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 20; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (10 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 18; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (10 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (10 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (10 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 22; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask11_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (11 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (11 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (11 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (11 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (11 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask12_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (12 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (12 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (12 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (12 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 20; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask13_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ 
out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (13 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (13 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (13 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (13 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (13 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (13 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask14_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (14 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (14 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (14 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 16; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (14 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (14 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (14 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 18; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask15_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 15; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (15 - 13); + ++in; + *out |= ((*in)) << 13; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (15 - 11); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (15 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (15 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (15 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (15 - 3); + ++in; + *out |= ((*in)) << 3; + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (15 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask16_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 16; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask17_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (17 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (17 - 4); + ++in; + *out |= 
((*in)) << 4; + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (17 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (17 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (17 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (17 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (17 - 14); + ++in; + *out |= ((*in)) << 14; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (17 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask18_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (18 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (18 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (18 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (18 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (18 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (18 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (18 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (18 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask19_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (19 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (19 - 12); + ++in; + *out |= ((*in)) << 12; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (19 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (19 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (19 - 11); + ++in; + *out |= ((*in)) << 11; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (19 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (19 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (19 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (19 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask20_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (20 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (20 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (20 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (20 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (20 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (20 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + 
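+  // 16 values at 20 bits each fill exactly 10 output words; the body above is the same 8-value, 5-word sequence written out twice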
++in; + + return out; +} + +uint32_t *__fastpackwithoutmask21_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (21 - 10); + ++in; + *out |= ((*in)) << 10; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (21 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (21 - 9); + ++in; + *out |= ((*in)) << 9; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (21 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (21 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (21 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (21 - 7); + ++in; + *out |= ((*in)) << 7; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (21 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (21 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (21 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask22_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (22 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (22 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (22 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (22 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (22 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (22 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (22 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (22 - 8); + ++in; + *out |= ((*in)) << 8; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (22 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (22 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask23_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (23 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (23 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (23 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (23 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (23 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (23 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (23 - 6); + ++in; + *out |= ((*in)) << 6; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (23 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (23 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (23 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (23 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask24_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 
24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (24 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (24 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask25_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (25 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (25 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (25 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (25 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (25 - 15); + ++in; + *out |= ((*in)) << 15; + ++out; + *out = ((*in)) >> (25 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (25 - 1); + ++in; + *out |= ((*in)) << 1; + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (25 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (25 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (25 - 5); + ++in; + *out |= ((*in)) << 5; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (25 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (25 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask26_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (26 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (26 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (26 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (26 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (26 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (26 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (26 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (26 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (26 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (26 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (26 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (26 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask27_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (27 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (27 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (27 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (27 - 7); + ++in; + *out |= ((*in)) << 7; + ++out; + *out = ((*in)) >> (27 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (27 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (27 - 19); + ++in; + *out |= ((*in)) << 19; + 
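+  // each `>> (27 - k)` term drops the low bits of the value already stored in the previous word and places its remaining high bits at the bottom of the new word, so the next value can be OR-ed in at offset k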
++out; + *out = ((*in)) >> (27 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (27 - 9); + ++in; + *out |= ((*in)) << 9; + ++out; + *out = ((*in)) >> (27 - 4); + ++in; + *out |= ((*in)) << 4; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (27 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (27 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (27 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask28_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + *out = (*in); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (28 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (28 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (28 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (28 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (28 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (28 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask29_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (29 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (29 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (29 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (29 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (29 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = ((*in)) >> (29 - 11); + ++in; + *out |= ((*in)) << 11; + ++out; + *out = ((*in)) >> (29 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (29 - 5); + ++in; + *out |= ((*in)) << 5; + ++out; + *out = ((*in)) >> (29 - 2); + ++in; + *out |= ((*in)) << 2; + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (29 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (29 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (29 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (29 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (29 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask30_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (30 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (30 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (30 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (30 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (30 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (30 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (30 - 16); + ++in; + *out |= ((*in)) << 16; + ++out; + *out = ((*in)) >> (30 - 14); + ++in; + *out |= ((*in)) << 14; + ++out; + *out = 
((*in)) >> (30 - 12); + ++in; + *out |= ((*in)) << 12; + ++out; + *out = ((*in)) >> (30 - 10); + ++in; + *out |= ((*in)) << 10; + ++out; + *out = ((*in)) >> (30 - 8); + ++in; + *out |= ((*in)) << 8; + ++out; + *out = ((*in)) >> (30 - 6); + ++in; + *out |= ((*in)) << 6; + ++out; + *out = ((*in)) >> (30 - 4); + ++in; + *out |= ((*in)) << 4; + ++out; + *out = ((*in)) >> (30 - 2); + ++in; + *out |= ((*in)) << 2; + ++out; + ++in; + + return out; +} + +uint32_t *__fastpackwithoutmask31_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++in; + *out |= ((*in)) << 31; + ++out; + *out = ((*in)) >> (31 - 30); + ++in; + *out |= ((*in)) << 30; + ++out; + *out = ((*in)) >> (31 - 29); + ++in; + *out |= ((*in)) << 29; + ++out; + *out = ((*in)) >> (31 - 28); + ++in; + *out |= ((*in)) << 28; + ++out; + *out = ((*in)) >> (31 - 27); + ++in; + *out |= ((*in)) << 27; + ++out; + *out = ((*in)) >> (31 - 26); + ++in; + *out |= ((*in)) << 26; + ++out; + *out = ((*in)) >> (31 - 25); + ++in; + *out |= ((*in)) << 25; + ++out; + *out = ((*in)) >> (31 - 24); + ++in; + *out |= ((*in)) << 24; + ++out; + *out = ((*in)) >> (31 - 23); + ++in; + *out |= ((*in)) << 23; + ++out; + *out = ((*in)) >> (31 - 22); + ++in; + *out |= ((*in)) << 22; + ++out; + *out = ((*in)) >> (31 - 21); + ++in; + *out |= ((*in)) << 21; + ++out; + *out = ((*in)) >> (31 - 20); + ++in; + *out |= ((*in)) << 20; + ++out; + *out = ((*in)) >> (31 - 19); + ++in; + *out |= ((*in)) << 19; + ++out; + *out = ((*in)) >> (31 - 18); + ++in; + *out |= ((*in)) << 18; + ++out; + *out = ((*in)) >> (31 - 17); + ++in; + *out |= ((*in)) << 17; + ++out; + *out = ((*in)) >> (31 - 16); + ++in; + + return out + 1; +} + +uint32_t *__fastpackwithoutmask32_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + *out = (*in); + ++out; + ++in; + + return out; +} + +const uint32_t *__fastunpack1_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) & 1; + out++; + *out = ((*in) >> 1) & 1; + out++; + *out = ((*in) >> 2) & 1; + out++; + *out = ((*in) >> 3) & 1; + out++; + *out = ((*in) >> 4) & 1; + out++; + *out = ((*in) >> 5) & 1; + out++; + *out = ((*in) >> 6) & 1; + out++; + *out = ((*in) >> 7) & 1; + out++; + *out = ((*in) >> 8) & 1; + out++; + *out = ((*in) >> 9) & 1; + out++; + *out = ((*in) >> 10) & 1; + out++; + *out = ((*in) >> 11) & 1; + out++; + *out = ((*in) >> 12) & 1; + out++; + *out = ((*in) >> 13) & 1; + out++; + *out = ((*in) >> 14) & 1; + out++; + *out = ((*in) >> 15) & 1; + out++; + + return in + 1; +} + +const uint32_t *__fastunpack2_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 2); + out++; + *out = ((*in) >> 2) % (1U << 2); + out++; + *out = ((*in) >> 4) % (1U << 2); + out++; + *out = ((*in) >> 6) % (1U << 2); + out++; + *out = ((*in) >> 8) % (1U << 2); + out++; + *out = ((*in) >> 10) % (1U << 2); + out++; + *out = ((*in) >> 12) % (1U << 2); + out++; + *out = ((*in) >> 14) % (1U << 2); + out++; + *out = ((*in) >> 16) % (1U << 2); + 
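+  // `% (1U << 2)` is a power-of-two modulus, equivalent to `& 0x3`; all sixteen 2-bit values come from the same 32-bit word, so this block consumes exactly one input word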
out++; + *out = ((*in) >> 18) % (1U << 2); + out++; + *out = ((*in) >> 20) % (1U << 2); + out++; + *out = ((*in) >> 22) % (1U << 2); + out++; + *out = ((*in) >> 24) % (1U << 2); + out++; + *out = ((*in) >> 26) % (1U << 2); + out++; + *out = ((*in) >> 28) % (1U << 2); + out++; + *out = ((*in) >> 30); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack3_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 3); + out++; + *out = ((*in) >> 3) % (1U << 3); + out++; + *out = ((*in) >> 6) % (1U << 3); + out++; + *out = ((*in) >> 9) % (1U << 3); + out++; + *out = ((*in) >> 12) % (1U << 3); + out++; + *out = ((*in) >> 15) % (1U << 3); + out++; + *out = ((*in) >> 18) % (1U << 3); + out++; + *out = ((*in) >> 21) % (1U << 3); + out++; + *out = ((*in) >> 24) % (1U << 3); + out++; + *out = ((*in) >> 27) % (1U << 3); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 1)) << (3 - 1); + out++; + *out = ((*in) >> 1) % (1U << 3); + out++; + *out = ((*in) >> 4) % (1U << 3); + out++; + *out = ((*in) >> 7) % (1U << 3); + out++; + *out = ((*in) >> 10) % (1U << 3); + out++; + *out = ((*in) >> 13) % (1U << 3); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack4_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 4); + out++; + *out = ((*in) >> 4) % (1U << 4); + out++; + *out = ((*in) >> 8) % (1U << 4); + out++; + *out = ((*in) >> 12) % (1U << 4); + out++; + *out = ((*in) >> 16) % (1U << 4); + out++; + *out = ((*in) >> 20) % (1U << 4); + out++; + *out = ((*in) >> 24) % (1U << 4); + out++; + *out = ((*in) >> 28); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack5_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 5); + out++; + *out = ((*in) >> 5) % (1U << 5); + out++; + *out = ((*in) >> 10) % (1U << 5); + out++; + *out = ((*in) >> 15) % (1U << 5); + out++; + *out = ((*in) >> 20) % (1U << 5); + out++; + *out = ((*in) >> 25) % (1U << 5); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 3)) << (5 - 3); + out++; + *out = ((*in) >> 3) % (1U << 5); + out++; + *out = ((*in) >> 8) % (1U << 5); + out++; + *out = ((*in) >> 13) % (1U << 5); + out++; + *out = ((*in) >> 18) % (1U << 5); + out++; + *out = ((*in) >> 23) % (1U << 5); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 1)) << (5 - 1); + out++; + *out = ((*in) >> 1) % (1U << 5); + out++; + *out = ((*in) >> 6) % (1U << 5); + out++; + *out = ((*in) >> 11) % (1U << 5); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack6_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 6); + out++; + *out = ((*in) >> 6) % (1U << 6); + out++; + *out = ((*in) >> 12) % (1U << 6); + out++; + *out = ((*in) >> 18) % (1U << 6); + out++; + *out = ((*in) >> 24) % (1U << 6); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 4)) << (6 - 4); + out++; + *out = ((*in) >> 4) % (1U << 6); + out++; + *out = ((*in) >> 10) % (1U << 6); + out++; + *out = ((*in) >> 16) % (1U << 6); + out++; + *out = ((*in) >> 22) % (1U << 6); + out++; + *out = 
((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 2)) << (6 - 2); + out++; + *out = ((*in) >> 2) % (1U << 6); + out++; + *out = ((*in) >> 8) % (1U << 6); + out++; + *out = ((*in) >> 14) % (1U << 6); + out++; + *out = ((*in) >> 20) % (1U << 6); + out++; + *out = ((*in) >> 26); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack7_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 7); + out++; + *out = ((*in) >> 7) % (1U << 7); + out++; + *out = ((*in) >> 14) % (1U << 7); + out++; + *out = ((*in) >> 21) % (1U << 7); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 3)) << (7 - 3); + out++; + *out = ((*in) >> 3) % (1U << 7); + out++; + *out = ((*in) >> 10) % (1U << 7); + out++; + *out = ((*in) >> 17) % (1U << 7); + out++; + *out = ((*in) >> 24) % (1U << 7); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 6)) << (7 - 6); + out++; + *out = ((*in) >> 6) % (1U << 7); + out++; + *out = ((*in) >> 13) % (1U << 7); + out++; + *out = ((*in) >> 20) % (1U << 7); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 2)) << (7 - 2); + out++; + *out = ((*in) >> 2) % (1U << 7); + out++; + *out = ((*in) >> 9) % (1U << 7); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack8_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 8); + out++; + *out = ((*in) >> 8) % (1U << 8); + out++; + *out = ((*in) >> 16) % (1U << 8); + out++; + *out = ((*in) >> 24); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack9_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 9); + out++; + *out = ((*in) >> 9) % (1U << 9); + out++; + *out = ((*in) >> 18) % (1U << 9); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 4)) << (9 - 4); + out++; + *out = ((*in) >> 4) % (1U << 9); + out++; + *out = ((*in) >> 13) % (1U << 9); + out++; + *out = ((*in) >> 22) % (1U << 9); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 8)) << (9 - 8); + out++; + *out = ((*in) >> 8) % (1U << 9); + out++; + *out = ((*in) >> 17) % (1U << 9); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 3)) << (9 - 3); + out++; + *out = ((*in) >> 3) % (1U << 9); + out++; + *out = ((*in) >> 12) % (1U << 9); + out++; + *out = ((*in) >> 21) % (1U << 9); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 7)) << (9 - 7); + out++; + *out = ((*in) >> 7) % (1U << 9); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack10_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 10); + out++; + *out = ((*in) >> 10) % (1U << 10); + out++; + *out = ((*in) >> 20) % (1U << 10); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 8)) << (10 - 8); + out++; + *out = ((*in) >> 8) % (1U << 10); + out++; + *out = ((*in) >> 18) % (1U << 10); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 6)) << 
(10 - 6); + out++; + *out = ((*in) >> 6) % (1U << 10); + out++; + *out = ((*in) >> 16) % (1U << 10); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 4)) << (10 - 4); + out++; + *out = ((*in) >> 4) % (1U << 10); + out++; + *out = ((*in) >> 14) % (1U << 10); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 2)) << (10 - 2); + out++; + *out = ((*in) >> 2) % (1U << 10); + out++; + *out = ((*in) >> 12) % (1U << 10); + out++; + *out = ((*in) >> 22); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack11_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 11); + out++; + *out = ((*in) >> 11) % (1U << 11); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 1)) << (11 - 1); + out++; + *out = ((*in) >> 1) % (1U << 11); + out++; + *out = ((*in) >> 12) % (1U << 11); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 2)) << (11 - 2); + out++; + *out = ((*in) >> 2) % (1U << 11); + out++; + *out = ((*in) >> 13) % (1U << 11); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 3)) << (11 - 3); + out++; + *out = ((*in) >> 3) % (1U << 11); + out++; + *out = ((*in) >> 14) % (1U << 11); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 4)) << (11 - 4); + out++; + *out = ((*in) >> 4) % (1U << 11); + out++; + *out = ((*in) >> 15) % (1U << 11); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 5)) << (11 - 5); + out++; + *out = ((*in) >> 5) % (1U << 11); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack12_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 12); + out++; + *out = ((*in) >> 12) % (1U << 12); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 4)) << (12 - 4); + out++; + *out = ((*in) >> 4) % (1U << 12); + out++; + *out = ((*in) >> 16) % (1U << 12); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 8)) << (12 - 8); + out++; + *out = ((*in) >> 8) % (1U << 12); + out++; + *out = ((*in) >> 20); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack13_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 13); + out++; + *out = ((*in) >> 13) % (1U << 13); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 7)) << (13 - 7); + out++; + *out = ((*in) >> 7) % (1U << 13); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 1)) << (13 - 1); + out++; + *out = ((*in) >> 1) % (1U << 13); + out++; + *out = ((*in) >> 14) % (1U << 13); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 8)) << (13 - 8); + out++; + *out = ((*in) >> 8) % (1U << 13); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 2)) << (13 - 2); + out++; + *out = ((*in) >> 2) % (1U << 13); + out++; + *out = ((*in) >> 15) % (1U << 13); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 9)) << (13 - 9); + out++; + *out = ((*in) >> 9) % (1U << 13); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 3)) 
<< (13 - 3); + out++; + *out = ((*in) >> 3) % (1U << 13); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack14_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 14); + out++; + *out = ((*in) >> 14) % (1U << 14); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 10)) << (14 - 10); + out++; + *out = ((*in) >> 10) % (1U << 14); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 6)) << (14 - 6); + out++; + *out = ((*in) >> 6) % (1U << 14); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 2)) << (14 - 2); + out++; + *out = ((*in) >> 2) % (1U << 14); + out++; + *out = ((*in) >> 16) % (1U << 14); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 12)) << (14 - 12); + out++; + *out = ((*in) >> 12) % (1U << 14); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 8)) << (14 - 8); + out++; + *out = ((*in) >> 8) % (1U << 14); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 4)) << (14 - 4); + out++; + *out = ((*in) >> 4) % (1U << 14); + out++; + *out = ((*in) >> 18); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack15_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 15); + out++; + *out = ((*in) >> 15) % (1U << 15); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 13)) << (15 - 13); + out++; + *out = ((*in) >> 13) % (1U << 15); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 11)) << (15 - 11); + out++; + *out = ((*in) >> 11) % (1U << 15); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 9)) << (15 - 9); + out++; + *out = ((*in) >> 9) % (1U << 15); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 7)) << (15 - 7); + out++; + *out = ((*in) >> 7) % (1U << 15); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 5)) << (15 - 5); + out++; + *out = ((*in) >> 5) % (1U << 15); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 3)) << (15 - 3); + out++; + *out = ((*in) >> 3) % (1U << 15); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 1)) << (15 - 1); + out++; + *out = ((*in) >> 1) % (1U << 15); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack16_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 16); + out++; + *out = ((*in) >> 16); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack17_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 2)) << (17 - 2); + out++; + *out = ((*in) >> 2) % (1U << 17); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 4)) << (17 - 4); + out++; + *out = ((*in) >> 4) % (1U << 17); + out++; + *out = ((*in) >> 21); + ++in; + *out |= 
((*in) % (1U << 6)) << (17 - 6); + out++; + *out = ((*in) >> 6) % (1U << 17); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 8)) << (17 - 8); + out++; + *out = ((*in) >> 8) % (1U << 17); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 10)) << (17 - 10); + out++; + *out = ((*in) >> 10) % (1U << 17); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 12)) << (17 - 12); + out++; + *out = ((*in) >> 12) % (1U << 17); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 14)) << (17 - 14); + out++; + *out = ((*in) >> 14) % (1U << 17); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 16)) << (17 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack18_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 4)) << (18 - 4); + out++; + *out = ((*in) >> 4) % (1U << 18); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 8)) << (18 - 8); + out++; + *out = ((*in) >> 8) % (1U << 18); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 12)) << (18 - 12); + out++; + *out = ((*in) >> 12) % (1U << 18); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 16)) << (18 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 2)) << (18 - 2); + out++; + *out = ((*in) >> 2) % (1U << 18); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 6)) << (18 - 6); + out++; + *out = ((*in) >> 6) % (1U << 18); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 10)) << (18 - 10); + out++; + *out = ((*in) >> 10) % (1U << 18); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 14)) << (18 - 14); + out++; + *out = ((*in) >> 14); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack19_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 6)) << (19 - 6); + out++; + *out = ((*in) >> 6) % (1U << 19); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 12)) << (19 - 12); + out++; + *out = ((*in) >> 12) % (1U << 19); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 18)) << (19 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 5)) << (19 - 5); + out++; + *out = ((*in) >> 5) % (1U << 19); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 11)) << (19 - 11); + out++; + *out = ((*in) >> 11) % (1U << 19); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 17)) << (19 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 4)) << (19 - 4); + out++; + *out = ((*in) >> 4) % (1U << 19); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 10)) << (19 - 10); + out++; + *out = ((*in) >> 10) % (1U << 19); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 16)) << (19 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack20_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = 
((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 8)) << (20 - 8); + out++; + *out = ((*in) >> 8) % (1U << 20); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 16)) << (20 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 4)) << (20 - 4); + out++; + *out = ((*in) >> 4) % (1U << 20); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 12)) << (20 - 12); + out++; + *out = ((*in) >> 12); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack21_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 10)) << (21 - 10); + out++; + *out = ((*in) >> 10) % (1U << 21); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 20)) << (21 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 9)) << (21 - 9); + out++; + *out = ((*in) >> 9) % (1U << 21); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 19)) << (21 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 8)) << (21 - 8); + out++; + *out = ((*in) >> 8) % (1U << 21); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 18)) << (21 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 7)) << (21 - 7); + out++; + *out = ((*in) >> 7) % (1U << 21); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 17)) << (21 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 6)) << (21 - 6); + out++; + *out = ((*in) >> 6) % (1U << 21); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 16)) << (21 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack22_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 12)) << (22 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 2)) << (22 - 2); + out++; + *out = ((*in) >> 2) % (1U << 22); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 14)) << (22 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 4)) << (22 - 4); + out++; + *out = ((*in) >> 4) % (1U << 22); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 16)) << (22 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 6)) << (22 - 6); + out++; + *out = ((*in) >> 6) % (1U << 22); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 18)) << (22 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 8)) << (22 - 8); + out++; + *out = ((*in) >> 8) % (1U << 22); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 20)) << (22 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 10)) << (22 - 10); + out++; + *out = ((*in) >> 10); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack23_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 14)) << (23 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 5)) << (23 - 5); + out++; + *out = ((*in) >> 5) % (1U << 23); + out++; + 
*out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 19)) << (23 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 10)) << (23 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 1)) << (23 - 1); + out++; + *out = ((*in) >> 1) % (1U << 23); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 15)) << (23 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 6)) << (23 - 6); + out++; + *out = ((*in) >> 6) % (1U << 23); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 20)) << (23 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 11)) << (23 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 2)) << (23 - 2); + out++; + *out = ((*in) >> 2) % (1U << 23); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 16)) << (23 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack24_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 16)) << (24 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 8)) << (24 - 8); + out++; + *out = ((*in) >> 8); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack25_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 18)) << (25 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 11)) << (25 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 4)) << (25 - 4); + out++; + *out = ((*in) >> 4) % (1U << 25); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 22)) << (25 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 15)) << (25 - 15); + out++; + *out = ((*in) >> 15); + ++in; + *out |= ((*in) % (1U << 8)) << (25 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 1)) << (25 - 1); + out++; + *out = ((*in) >> 1) % (1U << 25); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 19)) << (25 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 12)) << (25 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 5)) << (25 - 5); + out++; + *out = ((*in) >> 5) % (1U << 25); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 23)) << (25 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 16)) << (25 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack26_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 26); + out++; + *out = 
((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 20)) << (26 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 14)) << (26 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 8)) << (26 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 2)) << (26 - 2); + out++; + *out = ((*in) >> 2) % (1U << 26); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 22)) << (26 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 16)) << (26 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 10)) << (26 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 4)) << (26 - 4); + out++; + *out = ((*in) >> 4) % (1U << 26); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 24)) << (26 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 18)) << (26 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 12)) << (26 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 6)) << (26 - 6); + out++; + *out = ((*in) >> 6); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack27_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 22)) << (27 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 17)) << (27 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 12)) << (27 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 7)) << (27 - 7); + out++; + *out = ((*in) >> 7); + ++in; + *out |= ((*in) % (1U << 2)) << (27 - 2); + out++; + *out = ((*in) >> 2) % (1U << 27); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 24)) << (27 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 19)) << (27 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 14)) << (27 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 9)) << (27 - 9); + out++; + *out = ((*in) >> 9); + ++in; + *out |= ((*in) % (1U << 4)) << (27 - 4); + out++; + *out = ((*in) >> 4) % (1U << 27); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 26)) << (27 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 21)) << (27 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 16)) << (27 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack28_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + *out = ((*in) >> 0) % (1U << 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 24)) << (28 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 20)) << (28 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 16)) << (28 - 16); + out++; + *out = ((*in) >> 
16); + ++in; + *out |= ((*in) % (1U << 12)) << (28 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 8)) << (28 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 4)) << (28 - 4); + out++; + *out = ((*in) >> 4); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack29_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 26)) << (29 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 23)) << (29 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 20)) << (29 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 17)) << (29 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 14)) << (29 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 11)) << (29 - 11); + out++; + *out = ((*in) >> 11); + ++in; + *out |= ((*in) % (1U << 8)) << (29 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 5)) << (29 - 5); + out++; + *out = ((*in) >> 5); + ++in; + *out |= ((*in) % (1U << 2)) << (29 - 2); + out++; + *out = ((*in) >> 2) % (1U << 29); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 28)) << (29 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 25)) << (29 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 22)) << (29 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 19)) << (29 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 16)) << (29 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack30_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 28)) << (30 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= ((*in) % (1U << 26)) << (30 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 24)) << (30 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 22)) << (30 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 20)) << (30 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 18)) << (30 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 16)) << (30 - 16); + out++; + *out = ((*in) >> 16); + ++in; + *out |= ((*in) % (1U << 14)) << (30 - 14); + out++; + *out = ((*in) >> 14); + ++in; + *out |= ((*in) % (1U << 12)) << (30 - 12); + out++; + *out = ((*in) >> 12); + ++in; + *out |= ((*in) % (1U << 10)) << (30 - 10); + out++; + *out = ((*in) >> 10); + ++in; + *out |= ((*in) % (1U << 8)) << (30 - 8); + out++; + *out = ((*in) >> 8); + ++in; + *out |= ((*in) % (1U << 6)) << (30 - 6); + out++; + *out = ((*in) >> 6); + ++in; + *out |= ((*in) % (1U << 4)) << (30 - 4); + out++; + *out = ((*in) >> 4); + ++in; + *out |= ((*in) % (1U << 2)) << (30 - 2); + out++; + *out = ((*in) >> 2); + ++in; + out++; + + return in; +} + +const uint32_t *__fastunpack31_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0) % (1U << 31); + out++; + *out = ((*in) >> 31); + ++in; + *out |= ((*in) % (1U << 30)) << (31 - 30); + out++; + *out = ((*in) >> 30); + ++in; + *out |= ((*in) % (1U << 29)) << (31 - 29); + out++; + *out = ((*in) >> 29); + ++in; + *out |= ((*in) % (1U << 28)) << (31 - 28); + out++; + *out = ((*in) >> 28); + ++in; + *out |= 
((*in) % (1U << 27)) << (31 - 27); + out++; + *out = ((*in) >> 27); + ++in; + *out |= ((*in) % (1U << 26)) << (31 - 26); + out++; + *out = ((*in) >> 26); + ++in; + *out |= ((*in) % (1U << 25)) << (31 - 25); + out++; + *out = ((*in) >> 25); + ++in; + *out |= ((*in) % (1U << 24)) << (31 - 24); + out++; + *out = ((*in) >> 24); + ++in; + *out |= ((*in) % (1U << 23)) << (31 - 23); + out++; + *out = ((*in) >> 23); + ++in; + *out |= ((*in) % (1U << 22)) << (31 - 22); + out++; + *out = ((*in) >> 22); + ++in; + *out |= ((*in) % (1U << 21)) << (31 - 21); + out++; + *out = ((*in) >> 21); + ++in; + *out |= ((*in) % (1U << 20)) << (31 - 20); + out++; + *out = ((*in) >> 20); + ++in; + *out |= ((*in) % (1U << 19)) << (31 - 19); + out++; + *out = ((*in) >> 19); + ++in; + *out |= ((*in) % (1U << 18)) << (31 - 18); + out++; + *out = ((*in) >> 18); + ++in; + *out |= ((*in) % (1U << 17)) << (31 - 17); + out++; + *out = ((*in) >> 17); + ++in; + *out |= ((*in) % (1U << 16)) << (31 - 16); + out++; + + return in + 1; +} + +const uint32_t *__fastunpack32_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out) { + + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + *out = ((*in) >> 0); + ++in; + out++; + + return in; +} + +const uint32_t *fastunpack_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + switch (bit) { + case 0: + return nullunpacker16(in, out); + + case 1: + return __fastunpack1_16(in, out); + + case 2: + return __fastunpack2_16(in, out); + + case 3: + return __fastunpack3_16(in, out); + + case 4: + return __fastunpack4_16(in, out); + + case 5: + return __fastunpack5_16(in, out); + + case 6: + return __fastunpack6_16(in, out); + + case 7: + return __fastunpack7_16(in, out); + + case 8: + return __fastunpack8_16(in, out); + + case 9: + return __fastunpack9_16(in, out); + + case 10: + return __fastunpack10_16(in, out); + + case 11: + return __fastunpack11_16(in, out); + + case 12: + return __fastunpack12_16(in, out); + + case 13: + return __fastunpack13_16(in, out); + + case 14: + return __fastunpack14_16(in, out); + + case 15: + return __fastunpack15_16(in, out); + + case 16: + return __fastunpack16_16(in, out); + + case 17: + return __fastunpack17_16(in, out); + + case 18: + return __fastunpack18_16(in, out); + + case 19: + return __fastunpack19_16(in, out); + + case 20: + return __fastunpack20_16(in, out); + + case 21: + return __fastunpack21_16(in, out); + + case 22: + return __fastunpack22_16(in, out); + + case 23: + return __fastunpack23_16(in, out); + + case 24: + return __fastunpack24_16(in, out); + + case 25: + return __fastunpack25_16(in, out); + + case 26: + return __fastunpack26_16(in, out); + + case 27: + return __fastunpack27_16(in, out); + + case 28: + return __fastunpack28_16(in, out); + + case 29: + return __fastunpack29_16(in, out); + + case 30: + return __fastunpack30_16(in, out); + + case 31: + return __fastunpack31_16(in, out); + + case 32: + return __fastunpack32_16(in, out); + + default: + 
break; + } + throw std::logic_error("number of bits is unsupported"); +} + +/*assumes that integers fit in the prescribed number of bits*/ +uint32_t *fastpackwithoutmask_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit) { + switch (bit) { + case 0: + return nullpacker(in, out); + + case 1: + return __fastpackwithoutmask1_16(in, out); + + case 2: + return __fastpackwithoutmask2_16(in, out); + + case 3: + return __fastpackwithoutmask3_16(in, out); + + case 4: + return __fastpackwithoutmask4_16(in, out); + + case 5: + return __fastpackwithoutmask5_16(in, out); + + case 6: + return __fastpackwithoutmask6_16(in, out); + + case 7: + return __fastpackwithoutmask7_16(in, out); + + case 8: + return __fastpackwithoutmask8_16(in, out); + + case 9: + return __fastpackwithoutmask9_16(in, out); + + case 10: + return __fastpackwithoutmask10_16(in, out); + + case 11: + return __fastpackwithoutmask11_16(in, out); + + case 12: + return __fastpackwithoutmask12_16(in, out); + + case 13: + return __fastpackwithoutmask13_16(in, out); + + case 14: + return __fastpackwithoutmask14_16(in, out); + + case 15: + return __fastpackwithoutmask15_16(in, out); + + case 16: + return __fastpackwithoutmask16_16(in, out); + + case 17: + return __fastpackwithoutmask17_16(in, out); + + case 18: + return __fastpackwithoutmask18_16(in, out); + + case 19: + return __fastpackwithoutmask19_16(in, out); + + case 20: + return __fastpackwithoutmask20_16(in, out); + + case 21: + return __fastpackwithoutmask21_16(in, out); + + case 22: + return __fastpackwithoutmask22_16(in, out); + + case 23: + return __fastpackwithoutmask23_16(in, out); + + case 24: + return __fastpackwithoutmask24_16(in, out); + + case 25: + return __fastpackwithoutmask25_16(in, out); + + case 26: + return __fastpackwithoutmask26_16(in, out); + + case 27: + return __fastpackwithoutmask27_16(in, out); + + case 28: + return __fastpackwithoutmask28_16(in, out); + + case 29: + return __fastpackwithoutmask29_16(in, out); + + case 30: + return __fastpackwithoutmask30_16(in, out); + + case 31: + return __fastpackwithoutmask31_16(in, out); + + case 32: + return __fastpackwithoutmask32_16(in, out); + + default: + break; + } + throw std::logic_error("number of bits is unsupported"); +} + +} // namespace FastPForLib diff --git a/third_party/fastpfor/fastpfor/bitpackingaligned.h b/third_party/fastpfor/fastpfor/bitpackingaligned.h new file mode 100644 index 00000000000..2f5096c614e --- /dev/null +++ b/third_party/fastpfor/fastpfor/bitpackingaligned.h @@ -0,0 +1,27 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef BITPACKINGALIGNED_H_ +#define BITPACKINGALIGNED_H_ + +#include "common.h" + +namespace FastPForLib { + +const uint32_t *fastunpack_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit); +uint32_t *fastpackwithoutmask_8(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit); + +const uint32_t *fastunpack_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit); +uint32_t *fastpackwithoutmask_16(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit); + +} // namespace FastPForLib + +#endif /* BITPACKINGALIGNED_H_ */ diff --git a/third_party/fastpfor/fastpfor/bitpackinghelpers.h b/third_party/fastpfor/fastpfor/bitpackinghelpers.h new file mode 100644 index 00000000000..7b7dac0d603 --- /dev/null +++ b/third_party/fastpfor/fastpfor/bitpackinghelpers.h @@ -0,0 +1,982 @@ +/* + * bitpackinghelpers.h + * + * Created on: Jul 11, 2012 + * Author: lemire + */ + +#ifndef BITPACKINGHELPERS_H_ +#define BITPACKINGHELPERS_H_ + +#include "bitpacking.h" + +namespace FastPForLib { + +inline void fastunpack(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + __fastunpack0(in, out); + break; + case 1: + __fastunpack1(in, out); + break; + case 2: + __fastunpack2(in, out); + break; + case 3: + __fastunpack3(in, out); + break; + case 4: + __fastunpack4(in, out); + break; + case 5: + __fastunpack5(in, out); + break; + case 6: + __fastunpack6(in, out); + break; + case 7: + __fastunpack7(in, out); + break; + case 8: + __fastunpack8(in, out); + break; + case 9: + __fastunpack9(in, out); + break; + case 10: + __fastunpack10(in, out); + break; + case 11: + __fastunpack11(in, out); + break; + case 12: + __fastunpack12(in, out); + break; + case 13: + __fastunpack13(in, out); + break; + case 14: + __fastunpack14(in, out); + break; + case 15: + __fastunpack15(in, out); + break; + case 16: + __fastunpack16(in, out); + break; + case 17: + __fastunpack17(in, out); + break; + case 18: + __fastunpack18(in, out); + break; + case 19: + __fastunpack19(in, out); + break; + case 20: + __fastunpack20(in, out); + break; + case 21: + __fastunpack21(in, out); + break; + case 22: + __fastunpack22(in, out); + break; + case 23: + __fastunpack23(in, out); + break; + case 24: + __fastunpack24(in, out); + break; + case 25: + __fastunpack25(in, out); + break; + case 26: + __fastunpack26(in, out); + break; + case 27: + __fastunpack27(in, out); + break; + case 28: + __fastunpack28(in, out); + break; + case 29: + __fastunpack29(in, out); + break; + case 30: + __fastunpack30(in, out); + break; + case 31: + __fastunpack31(in, out); + break; + case 32: + __fastunpack32(in, out); + break; + default: + break; + } +} + +inline void fastunpack(const uint32_t *__restrict__ in, + uint64_t *__restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + __fastunpack0(in, out); + break; + case 1: + __fastunpack1(in, out); + break; + case 2: + __fastunpack2(in, out); + break; + case 3: + __fastunpack3(in, out); + break; + case 4: + __fastunpack4(in, out); + break; + case 5: + __fastunpack5(in, out); + break; + case 6: + __fastunpack6(in, out); + break; + case 7: + __fastunpack7(in, out); + break; + case 8: + __fastunpack8(in, out); + break; + case 9: + __fastunpack9(in, out); + break; + case 10: + __fastunpack10(in, out); + break; + case 11: + __fastunpack11(in, out); + break; + case 12: + __fastunpack12(in, out); + break; + case 13: + __fastunpack13(in, out); + break; + case 14: + __fastunpack14(in, out); + break; + case 15: + __fastunpack15(in, out); + break; + case 16: + __fastunpack16(in, out); + break; + case 17: + __fastunpack17(in, out); + break; + case 18: + __fastunpack18(in, out); + break; + case 19: + __fastunpack19(in, out); + break; + case 20: + __fastunpack20(in, out); + break; + case 21: + __fastunpack21(in, out); + break; + case 22: + __fastunpack22(in, out); + break; + case 23: + __fastunpack23(in, out); + break; + case 24: + __fastunpack24(in, out); + break; + case 25: + __fastunpack25(in, out); + break; + case 26: + __fastunpack26(in, out); + break; + case 27: + __fastunpack27(in, out); + break; + case 28: + __fastunpack28(in, out); + break; + case 29: + __fastunpack29(in, out); + break; + case 30: + __fastunpack30(in, out); + break; + case 31: + __fastunpack31(in, out); + break; + case 32: + __fastunpack32(in, out); + break; + case 33: + __fastunpack33(in, out); + break; + case 34: + __fastunpack34(in, out); + break; + case 35: + __fastunpack35(in, out); + break; + case 36: + __fastunpack36(in, out); + break; + case 37: + __fastunpack37(in, out); + break; + case 38: + __fastunpack38(in, out); + break; + case 39: + __fastunpack39(in, out); + break; + case 40: + __fastunpack40(in, out); + break; + case 41: + __fastunpack41(in, out); + break; + case 42: + __fastunpack42(in, out); + break; + case 43: + __fastunpack43(in, out); + break; + case 44: + __fastunpack44(in, out); + break; + case 45: + __fastunpack45(in, out); + break; + case 46: + __fastunpack46(in, out); + break; + case 47: + __fastunpack47(in, out); + break; + case 48: + __fastunpack48(in, out); + break; + case 49: + __fastunpack49(in, out); + break; + case 50: + __fastunpack50(in, out); + break; + case 51: + __fastunpack51(in, out); + break; + case 52: + __fastunpack52(in, out); + break; + case 53: + __fastunpack53(in, out); + break; + case 54: + __fastunpack54(in, out); + break; + case 55: + __fastunpack55(in, out); + break; + case 56: + __fastunpack56(in, out); + break; + case 57: + __fastunpack57(in, out); + break; + case 58: + __fastunpack58(in, out); + break; + case 59: + __fastunpack59(in, out); + break; + case 60: + __fastunpack60(in, out); + break; + case 61: + __fastunpack61(in, out); + break; + case 62: + __fastunpack62(in, out); + break; + case 63: + __fastunpack63(in, out); + break; + case 64: + __fastunpack64(in, out); + break; + default: + break; + } +} + +inline void fastpack(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) { + case 0: + __fastpack0(in, out); + break; + case 1: + __fastpack1(in, out); + break; + case 2: + __fastpack2(in, out); + break; + case 3: + __fastpack3(in, out); + break; + case 4: + __fastpack4(in, out); + break; + case 5: + __fastpack5(in, out); + break; + case 6: + __fastpack6(in, out); + break; + case 7: + __fastpack7(in, out); + break; + case 8: + __fastpack8(in, out); + break; + case 9: + __fastpack9(in, out); + break; + case 10: + __fastpack10(in, out); + break; + case 11: + __fastpack11(in, out); + break; + case 12: + __fastpack12(in, out); + break; + case 13: + __fastpack13(in, out); + break; + case 14: + __fastpack14(in, out); + break; + case 15: + __fastpack15(in, out); + break; + case 16: + __fastpack16(in, out); + break; + case 17: + __fastpack17(in, out); + break; + case 18: + __fastpack18(in, out); + break; + case 19: + __fastpack19(in, out); + break; + case 20: + __fastpack20(in, out); + break; + case 21: + __fastpack21(in, out); + break; + case 22: + __fastpack22(in, out); + break; + case 23: + __fastpack23(in, out); + break; + case 24: + __fastpack24(in, out); + break; + case 25: + __fastpack25(in, out); + break; + case 26: + __fastpack26(in, out); + break; + case 27: + __fastpack27(in, out); + break; + case 28: + __fastpack28(in, out); + break; + case 29: + __fastpack29(in, out); + break; + case 30: + __fastpack30(in, out); + break; + case 31: + __fastpack31(in, out); + break; + case 32: + __fastpack32(in, out); + break; + default: + break; + } +} + +inline void fastpack(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out, const uint32_t bit) { + switch (bit) { + case 0: + __fastpack0(in, out); + break; + case 1: + __fastpack1(in, out); + break; + case 2: + __fastpack2(in, out); + break; + case 3: + __fastpack3(in, out); + break; + case 4: + __fastpack4(in, out); + break; + case 5: + __fastpack5(in, out); + break; + case 6: + __fastpack6(in, out); + break; + case 7: + __fastpack7(in, out); + break; + case 8: + __fastpack8(in, out); + break; + case 9: + __fastpack9(in, out); + break; + case 10: + __fastpack10(in, out); + break; + case 11: + __fastpack11(in, out); + break; + case 12: + __fastpack12(in, out); + break; + case 13: + __fastpack13(in, out); + break; + case 14: + __fastpack14(in, out); + break; + case 15: + __fastpack15(in, out); + break; + case 16: + __fastpack16(in, out); + break; + case 17: + __fastpack17(in, out); + break; + case 18: + __fastpack18(in, out); + break; + case 19: + __fastpack19(in, out); + break; + case 20: + __fastpack20(in, out); + break; + case 21: + __fastpack21(in, out); + break; + case 22: + __fastpack22(in, out); + break; + case 23: + __fastpack23(in, out); + break; + case 24: + __fastpack24(in, out); + break; + case 25: + __fastpack25(in, out); + break; + case 26: + __fastpack26(in, out); + break; + case 27: + __fastpack27(in, out); + break; + case 28: + __fastpack28(in, out); + break; + case 29: + __fastpack29(in, out); + break; + case 30: + __fastpack30(in, out); + break; + case 31: + __fastpack31(in, out); + break; + case 32: + __fastpack32(in, out); + break; + case 33: + __fastpack33(in, out); + break; + case 34: + __fastpack34(in, out); + break; + case 35: + __fastpack35(in, out); + break; + case 36: + __fastpack36(in, out); + break; + case 37: + __fastpack37(in, out); + break; + case 38: + __fastpack38(in, out); + break; + case 39: + __fastpack39(in, out); + break; + case 40: + __fastpack40(in, out); + break; + case 41: + __fastpack41(in, out); + break; + case 42: + __fastpack42(in, out); + break; + 
case 43: + __fastpack43(in, out); + break; + case 44: + __fastpack44(in, out); + break; + case 45: + __fastpack45(in, out); + break; + case 46: + __fastpack46(in, out); + break; + case 47: + __fastpack47(in, out); + break; + case 48: + __fastpack48(in, out); + break; + case 49: + __fastpack49(in, out); + break; + case 50: + __fastpack50(in, out); + break; + case 51: + __fastpack51(in, out); + break; + case 52: + __fastpack52(in, out); + break; + case 53: + __fastpack53(in, out); + break; + case 54: + __fastpack54(in, out); + break; + case 55: + __fastpack55(in, out); + break; + case 56: + __fastpack56(in, out); + break; + case 57: + __fastpack57(in, out); + break; + case 58: + __fastpack58(in, out); + break; + case 59: + __fastpack59(in, out); + break; + case 60: + __fastpack60(in, out); + break; + case 61: + __fastpack61(in, out); + break; + case 62: + __fastpack62(in, out); + break; + case 63: + __fastpack63(in, out); + break; + case 64: + __fastpack64(in, out); + break; + default: + break; + } +} + +/*assumes that integers fit in the prescribed number of bits*/ +inline void fastpackwithoutmask(const uint32_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit) { + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) { + case 0: + __fastpackwithoutmask0(in, out); + break; + case 1: + __fastpackwithoutmask1(in, out); + break; + case 2: + __fastpackwithoutmask2(in, out); + break; + case 3: + __fastpackwithoutmask3(in, out); + break; + case 4: + __fastpackwithoutmask4(in, out); + break; + case 5: + __fastpackwithoutmask5(in, out); + break; + case 6: + __fastpackwithoutmask6(in, out); + break; + case 7: + __fastpackwithoutmask7(in, out); + break; + case 8: + __fastpackwithoutmask8(in, out); + break; + case 9: + __fastpackwithoutmask9(in, out); + break; + case 10: + __fastpackwithoutmask10(in, out); + break; + case 11: + __fastpackwithoutmask11(in, out); + break; + case 12: + __fastpackwithoutmask12(in, out); + break; + case 13: + __fastpackwithoutmask13(in, out); + break; + case 14: + __fastpackwithoutmask14(in, out); + break; + case 15: + __fastpackwithoutmask15(in, out); + break; + case 16: + __fastpackwithoutmask16(in, out); + break; + case 17: + __fastpackwithoutmask17(in, out); + break; + case 18: + __fastpackwithoutmask18(in, out); + break; + case 19: + __fastpackwithoutmask19(in, out); + break; + case 20: + __fastpackwithoutmask20(in, out); + break; + case 21: + __fastpackwithoutmask21(in, out); + break; + case 22: + __fastpackwithoutmask22(in, out); + break; + case 23: + __fastpackwithoutmask23(in, out); + break; + case 24: + __fastpackwithoutmask24(in, out); + break; + case 25: + __fastpackwithoutmask25(in, out); + break; + case 26: + __fastpackwithoutmask26(in, out); + break; + case 27: + __fastpackwithoutmask27(in, out); + break; + case 28: + __fastpackwithoutmask28(in, out); + break; + case 29: + __fastpackwithoutmask29(in, out); + break; + case 30: + __fastpackwithoutmask30(in, out); + break; + case 31: + __fastpackwithoutmask31(in, out); + break; + case 32: + __fastpackwithoutmask32(in, out); + break; + default: + break; + } +} + +inline void fastpackwithoutmask(const uint64_t *__restrict__ in, + uint32_t *__restrict__ out, + const uint32_t bit) { + switch (bit) { + case 0: + __fastpackwithoutmask0(in, out); + break; + case 1: + __fastpackwithoutmask1(in, out); + break; + case 2: + 
__fastpackwithoutmask2(in, out); + break; + case 3: + __fastpackwithoutmask3(in, out); + break; + case 4: + __fastpackwithoutmask4(in, out); + break; + case 5: + __fastpackwithoutmask5(in, out); + break; + case 6: + __fastpackwithoutmask6(in, out); + break; + case 7: + __fastpackwithoutmask7(in, out); + break; + case 8: + __fastpackwithoutmask8(in, out); + break; + case 9: + __fastpackwithoutmask9(in, out); + break; + case 10: + __fastpackwithoutmask10(in, out); + break; + case 11: + __fastpackwithoutmask11(in, out); + break; + case 12: + __fastpackwithoutmask12(in, out); + break; + case 13: + __fastpackwithoutmask13(in, out); + break; + case 14: + __fastpackwithoutmask14(in, out); + break; + case 15: + __fastpackwithoutmask15(in, out); + break; + case 16: + __fastpackwithoutmask16(in, out); + break; + case 17: + __fastpackwithoutmask17(in, out); + break; + case 18: + __fastpackwithoutmask18(in, out); + break; + case 19: + __fastpackwithoutmask19(in, out); + break; + case 20: + __fastpackwithoutmask20(in, out); + break; + case 21: + __fastpackwithoutmask21(in, out); + break; + case 22: + __fastpackwithoutmask22(in, out); + break; + case 23: + __fastpackwithoutmask23(in, out); + break; + case 24: + __fastpackwithoutmask24(in, out); + break; + case 25: + __fastpackwithoutmask25(in, out); + break; + case 26: + __fastpackwithoutmask26(in, out); + break; + case 27: + __fastpackwithoutmask27(in, out); + break; + case 28: + __fastpackwithoutmask28(in, out); + break; + case 29: + __fastpackwithoutmask29(in, out); + break; + case 30: + __fastpackwithoutmask30(in, out); + break; + case 31: + __fastpackwithoutmask31(in, out); + break; + case 32: + __fastpackwithoutmask32(in, out); + break; + case 33: + __fastpackwithoutmask33(in, out); + break; + case 34: + __fastpackwithoutmask34(in, out); + break; + case 35: + __fastpackwithoutmask35(in, out); + break; + case 36: + __fastpackwithoutmask36(in, out); + break; + case 37: + __fastpackwithoutmask37(in, out); + break; + case 38: + __fastpackwithoutmask38(in, out); + break; + case 39: + __fastpackwithoutmask39(in, out); + break; + case 40: + __fastpackwithoutmask40(in, out); + break; + case 41: + __fastpackwithoutmask41(in, out); + break; + case 42: + __fastpackwithoutmask42(in, out); + break; + case 43: + __fastpackwithoutmask43(in, out); + break; + case 44: + __fastpackwithoutmask44(in, out); + break; + case 45: + __fastpackwithoutmask45(in, out); + break; + case 46: + __fastpackwithoutmask46(in, out); + break; + case 47: + __fastpackwithoutmask47(in, out); + break; + case 48: + __fastpackwithoutmask48(in, out); + break; + case 49: + __fastpackwithoutmask49(in, out); + break; + case 50: + __fastpackwithoutmask50(in, out); + break; + case 51: + __fastpackwithoutmask51(in, out); + break; + case 52: + __fastpackwithoutmask52(in, out); + break; + case 53: + __fastpackwithoutmask53(in, out); + break; + case 54: + __fastpackwithoutmask54(in, out); + break; + case 55: + __fastpackwithoutmask55(in, out); + break; + case 56: + __fastpackwithoutmask56(in, out); + break; + case 57: + __fastpackwithoutmask57(in, out); + break; + case 58: + __fastpackwithoutmask58(in, out); + break; + case 59: + __fastpackwithoutmask59(in, out); + break; + case 60: + __fastpackwithoutmask60(in, out); + break; + case 61: + __fastpackwithoutmask61(in, out); + break; + case 62: + __fastpackwithoutmask62(in, out); + break; + case 63: + __fastpackwithoutmask63(in, out); + break; + case 64: + __fastpackwithoutmask64(in, out); + break; + default: + break; + } +} + +template <uint32_t BlockSize, typename IntType> + uint32_t *packblockup(const IntType * source, uint32_t *out, + const uint32_t bit) { + for (uint32_t j = 0; j != BlockSize; j += 32) { + fastpack(source + j, out, bit); + out += bit; + } + return out; +} + +template <uint32_t BlockSize, typename IntType> +const uint32_t *unpackblock(const uint32_t *source, IntType *out, + const uint32_t bit) { + for (uint32_t j = 0; j != BlockSize; j += 32) { + fastunpack(source, out + j, bit); + source += bit; + } + return source; +} + +} // namespace FastPForLib + +#endif /* BITPACKINGHELPERS_H_ */ diff --git a/third_party/fastpfor/fastpfor/common.h b/third_party/fastpfor/fastpfor/common.h new file mode 100644 index 00000000000..2ac0dcb61ec --- /dev/null +++ b/third_party/fastpfor/fastpfor/common.h @@ -0,0 +1,14 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef COMMON_H_ +#define COMMON_H_ + +#include +#include +#include + +#endif /* COMMON_H_ */
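To make the intended use of the helpers above concrete, here is a minimal round-trip sketch (illustration only, not part of the patch). It uses the 32-value FastPForLib::fastpackwithoutmask / fastunpack overloads declared in bitpackinghelpers.h; as the packblockup/unpackblock templates show, each call consumes or produces one block of 32 values and advances the packed buffer by `bit` 32-bit words. The include path and the standalone main() harness are assumptions made for the sketch.

#include <cassert>
#include <cstdint>
#include <vector>

// Assumed include path for the vendored library under third_party/fastpfor.
#include "fastpfor/bitpackinghelpers.h"

int main() {
    const uint32_t bit = 9; // chosen bit width
    std::vector<uint32_t> values(32);
    for (uint32_t i = 0; i < 32; i++) {
        values[i] = i; // 0..31 all fit in 9 bits, as fastpackwithoutmask requires
    }

    // One block of 32 values packed at `bit` bits occupies exactly `bit` 32-bit words.
    std::vector<uint32_t> packed(bit, 0);
    FastPForLib::fastpackwithoutmask(values.data(), packed.data(), bit);

    std::vector<uint32_t> unpacked(32, 0);
    FastPForLib::fastunpack(packed.data(), unpacked.data(), bit);

    for (uint32_t i = 0; i < 32; i++) {
        assert(unpacked[i] == values[i]); // lossless round trip
    }
    return 0;
}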
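The word-aligned variants added in bitpackingaligned.cpp/.h work the same way but on 16-value blocks, and they return the advanced word pointer rather than using a fixed stride. A second sketch under the same assumptions (illustrative include path and harness; the ceil(16 * bit / 32) block size is inferred from the pointers the generated unpack functions return):

#include <cassert>
#include <cstdint>
#include <vector>

// Assumed include path for the vendored library under third_party/fastpfor.
#include "fastpfor/bitpackingaligned.h"

int main() {
    const uint32_t bit = 9;
    const uint32_t wordsPerBlock = (16 * bit + 31) / 32; // 5 words for bit == 9

    std::vector<uint32_t> values(16);
    for (uint32_t i = 0; i < 16; i++) {
        values[i] = i; // fits in 9 bits, as fastpackwithoutmask_16 requires
    }

    std::vector<uint32_t> packed(wordsPerBlock, 0);
    // The return value is the output cursor where the next block would be packed
    // (assumed to mirror the cursor returned by fastunpack_16).
    uint32_t* nextBlockOut = FastPForLib::fastpackwithoutmask_16(values.data(), packed.data(), bit);
    (void)nextBlockOut;

    std::vector<uint32_t> unpacked(16, 0);
    const uint32_t* readEnd = FastPForLib::fastunpack_16(packed.data(), unpacked.data(), bit);
    assert(readEnd == packed.data() + wordsPerBlock); // e.g. __fastunpack9_16 consumes 5 words

    for (uint32_t i = 0; i < 16; i++) {
        assert(unpacked[i] == values[i]); // lossless round trip
    }
    return 0;
}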