Skip to content

Commit

Permalink
Merge pull request #2994 from kuzudb/string-column-chunk-index
Browse files Browse the repository at this point in the history
More efficient ColumnChunk string dictionary caching
  • Loading branch information
benjaminwinger committed Mar 5, 2024
2 parents 1131621 + 3415ff1 commit 93e6b3e
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 7 deletions.
36 changes: 32 additions & 4 deletions src/include/storage/store/dictionary_chunk.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#pragma once

#include "common/string_utils.h"
#include "storage/store/column_chunk.h"

namespace kuzu {
Expand Down Expand Up @@ -32,9 +31,38 @@ class DictionaryChunk {
// of characters stored.
std::unique_ptr<ColumnChunk> stringDataChunk;
std::unique_ptr<ColumnChunk> offsetChunk;
std::unordered_map<std::string, string_index_t, common::StringUtils::string_hash,
std::equal_to<>>
indexTable;

// Lightweight handle for one string held by a DictionaryChunk. Only the string's index is kept
// here; the characters themselves are looked up in the dictionary whenever they are needed.
struct DictionaryEntry {
    // Index of the referenced string within the dictionary.
    string_index_t index;

    // Resolves this entry against `dict`, returning whatever dict.getString(index) yields.
    std::string_view get(const DictionaryChunk& dict) const {
        return dict.getString(index);
    }
};

// Combined hash / equality functor for containers keyed by DictionaryEntry.
//
// Entries are hashed and compared through the string they refer to in `dict`, not through their
// numeric index. The unary overloads compute hashes and the binary overloads test equality, so the
// same type can serve as both the hasher and the key-equality predicate of a container.
//
// The functor keeps a raw pointer to the dictionary and dereferences it on every entry lookup, so
// the pointer has to stay valid for as long as the functor is in use.
// NOTE(review): the owning DictionaryChunk passes `this` when building its index table; confirm
// that a chunk with a populated table is never copied or moved, otherwise this pointer would
// refer to the old object.
struct StringOps {
    using hash_type = std::hash<std::string_view>;
    // Opts in to transparent (heterogeneous) use: string-like arguments can be hashed and compared
    // directly, without first being turned into a DictionaryEntry.
    using is_transparent = void;

    // Dictionary used to resolve entries into their strings.
    const DictionaryChunk* dict;

    explicit StringOps(const DictionaryChunk* dict) : dict{dict} {}

    // ---- Hashing: every overload ends up hashing a std::string_view of the characters ----

    std::size_t operator()(std::string_view str) const { return hash_type{}(str); }
    std::size_t operator()(const char* str) const { return hash_type{}(std::string_view{str}); }
    std::size_t operator()(std::string const& str) const {
        return hash_type{}(std::string_view{str});
    }
    std::size_t operator()(const DictionaryEntry& entry) const {
        // Hash the referenced string so an entry and an equal plain string share a hash value.
        return hash_type{}(entry.get(*dict));
    }

    // ---- Equality: entries are equal when the strings they refer to are equal ----

    bool operator()(const DictionaryEntry& lhs, const DictionaryEntry& rhs) const {
        const std::string_view left = lhs.get(*dict);
        const std::string_view right = rhs.get(*dict);
        return left == right;
    }
    bool operator()(const DictionaryEntry& lhs, std::string_view rhs) const {
        const std::string_view stored = lhs.get(*dict);
        return stored == rhs;
    }
    bool operator()(std::string_view lhs, const DictionaryEntry& rhs) const {
        // String equality is symmetric, so reuse the (entry, view) overload.
        return (*this)(rhs, lhs);
    }
};

// Index of the strings already present in this dictionary, used to find an existing copy of a
// string before appending a new one. Each element stores only a string index; StringOps hashes and
// compares elements by the string they refer to, and its transparent overloads allow the set to be
// probed directly with a std::string_view.
std::unordered_set<DictionaryEntry, StringOps /*hash*/, StringOps /*equals*/> indexTable;
};
} // namespace storage
} // namespace kuzu
9 changes: 6 additions & 3 deletions src/storage/store/dictionary_chunk.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "storage/store/dictionary_chunk.h"

#include <cstdint>

#include <bit>

using namespace kuzu::common;
Expand All @@ -17,7 +19,8 @@ namespace storage {
// Three quarters of StorageConstants::NODE_GROUP_SIZE; the floating-point product is converted
// back to an unsigned integer, discarding any fractional part.
// NOTE(review): presumably the starting capacity of the offset chunk -- the use site is not
// visible in this hunk, confirm against the constructor.
static const uint64_t OFFSET_CHUNK_INITIAL_CAPACITY =
    static_cast<uint64_t>(StorageConstants::NODE_GROUP_SIZE * 0.75);

DictionaryChunk::DictionaryChunk(uint64_t capacity, bool enableCompression)
: enableCompression{enableCompression} {
: enableCompression{enableCompression},
indexTable(0, StringOps(this) /*hash*/, StringOps(this) /*equals*/) {
// Bitpacking might save 1 bit per value with regular ascii compared to UTF-8
stringDataChunk = ColumnChunkFactory::createColumnChunk(
*LogicalType::UINT8(), false /*enableCompression*/, capacity);
Expand Down Expand Up @@ -45,7 +48,7 @@ DictionaryChunk::string_index_t DictionaryChunk::appendString(std::string_view v
auto found = indexTable.find(val);
// If the string already exists in the dictionary, skip it and refer to the existing string
if (enableCompression && found != indexTable.end()) {
return found->second;
return found->index;
}
auto leftSpace = stringDataChunk->getCapacity() - stringDataChunk->getNumValues();
if (leftSpace < val.size()) {
Expand All @@ -61,7 +64,7 @@ DictionaryChunk::string_index_t DictionaryChunk::appendString(std::string_view v
offsetChunk->setValue<string_offset_t>(startOffset, index);
offsetChunk->setNumValues(index + 1);
if (enableCompression) {
indexTable.insert({std::string{val}, index});
indexTable.insert({static_cast<string_index_t>(index)});
}
return index;
}
Expand Down

0 comments on commit 93e6b3e

Please sign in to comment.