Skip to content

Commit

Permalink
[improvement](agg) iterate aggregation data in memory written order (a…
Browse files Browse the repository at this point in the history
…pache#12704)

Following the iteration order of the hash table will result in out-of-order access to aggregate states, which is very inefficient.
Traversing aggregate states in memory write order can significantly improve memory read efficiency.

Test
hash table items count: 3.35M

Before this optimization, inserting the keys into the column takes 500 ms.
With this optimization, it takes only 80 ms.
  • Loading branch information
mrhhsg authored and Yijia Su committed Oct 8, 2022
1 parent efac836 commit 79e5c04
Show file tree
Hide file tree
Showing 12 changed files with 522 additions and 50 deletions.
42 changes: 42 additions & 0 deletions be/src/vec/common/columns_hashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,37 @@ struct HashMethodSingleLowNullableColumn : public SingleColumnMethod {
return EmplaceResult(inserted);
}

/// Looks up the key found at `row` and returns a reference to its mapped value,
/// creating the entry lazily when it does not exist yet.
///
/// - A NULL key (first key column reports null at `row`) is routed to the table's
///   dedicated null-key slot; `null_creator` is invoked on that slot only the first
///   time a NULL key is seen.
/// - Any other key goes through `data.lazy_emplace`, with `f` forwarded to it.
template <typename Data, typename Func, typename CreatorForNull>
ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_key(
        Data& data, size_t row, Arena& pool, Func&& f, CreatorForNull&& null_creator) {
    if (!key_columns[0]->is_null_at(row)) {
        auto holder = Base::get_key_holder(row, pool);
        typename Data::LookupResult found;
        data.lazy_emplace(holder, found, std::forward<Func>(f));
        return *lookup_result_get_mapped(found);
    }

    // Remember whether the null slot already existed before flagging it as present.
    const bool null_key_seen = data.has_null_key_data();
    data.has_null_key_data() = true;
    if (!null_key_seen) {
        std::forward<CreatorForNull>(null_creator)(data.get_null_key_data());
    }
    return data.get_null_key_data();
}

/// Same as the overload without `hash_value`, but passes a caller-supplied hash
/// of the key on to `data.lazy_emplace`.
///
/// NULL keys never touch the hash: they are served from the table's null-key
/// slot, which `null_creator` initialises on first use.
///
/// Note the argument order here is (data, row, pool, hash_value, ...), which is
/// not the order used by HashMethodBase's hashed overload in columns_hashing_impl.h.
template <typename Data, typename Func, typename CreatorForNull>
ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_key(
        Data& data, size_t row, Arena& pool, size_t hash_value, Func&& f,
        CreatorForNull&& null_creator) {
    if (!key_columns[0]->is_null_at(row)) {
        auto holder = Base::get_key_holder(row, pool);
        typename Data::LookupResult found;
        data.lazy_emplace(holder, found, hash_value, std::forward<Func>(f));
        return *lookup_result_get_mapped(found);
    }

    // Remember whether the null slot already existed before flagging it as present.
    const bool null_key_seen = data.has_null_key_data();
    data.has_null_key_data() = true;
    if (!null_key_seen) {
        std::forward<CreatorForNull>(null_creator)(data.get_null_key_data());
    }
    return data.get_null_key_data();
}

template <typename Data>
ALWAYS_INLINE FindResult find_key(Data& data, size_t row, Arena& pool) {
if (key_columns[0]->is_null_at(row)) {
Expand All @@ -276,5 +307,16 @@ struct HashMethodSingleLowNullableColumn : public SingleColumnMethod {
}
};

/// Compile-time predicate over hash-method types.
/// This primary template is the fallback for every type and reports `false`.
template <typename HashMethod>
struct IsSingleNullableColumnMethod {
    constexpr static bool value {false};
};

/// Specialization matching any instantiation of HashMethodSingleLowNullableColumn;
/// for those types the predicate reports `true`.
template <typename SingleColumnMethod, typename Mapped, bool use_cache>
struct IsSingleNullableColumnMethod<
        HashMethodSingleLowNullableColumn<SingleColumnMethod, Mapped, use_cache>> {
    constexpr static bool value {true};
};

} // namespace ColumnsHashing
} // namespace doris::vectorized
33 changes: 33 additions & 0 deletions be/src/vec/common/columns_hashing_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,22 @@ class HashMethodBase {
return emplaceImpl(key_holder, hash_value, data);
}

/// Obtains the key holder for `row` from the derived hash method and lazily
/// emplaces it into `data`, forwarding `f` along.
/// Returns a reference to the mapped value of the resulting entry.
template <typename Data, typename Func>
ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_key(Data& data,
                                                                              size_t row,
                                                                              Arena& pool,
                                                                              Func&& f) {
    auto& self = static_cast<Derived&>(*this);
    auto holder = self.get_key_holder(row, pool);
    return lazy_emplace_impl(holder, data, std::forward<Func>(f));
}

/// Hashed variant: obtains the key holder for `row` from the derived hash method
/// and lazily emplaces it into `data` using the caller-supplied `hash_value`.
/// Returns a reference to the mapped value of the resulting entry.
template <typename Data, typename Func>
ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_key(
        Data& data, size_t hash_value, size_t row, Arena& pool, Func&& f) {
    auto& self = static_cast<Derived&>(*this);
    auto holder = self.get_key_holder(row, pool);
    return lazy_emplace_impl(holder, hash_value, data, std::forward<Func>(f));
}

template <typename Data>
ALWAYS_INLINE FindResult find_key(Data& data, size_t row, Arena& pool) {
auto key_holder = static_cast<Derived&>(*this).get_key_holder(row, pool);
Expand Down Expand Up @@ -264,6 +280,23 @@ class HashMethodBase {
return EmplaceResult(inserted);
}

/// Calls `data.lazy_emplace` for `key_holder`, forwarding `f`, and returns a
/// reference to the mapped value of the entry the table reports back.
template <typename Data, typename KeyHolder, typename Func>
ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_impl(
        KeyHolder& key_holder, Data& data, Func&& f) {
    typename Data::LookupResult slot;
    data.lazy_emplace(key_holder, slot, std::forward<Func>(f));
    auto mapped = lookup_result_get_mapped(slot);
    return *mapped;
}

/// Hashed variant: calls `data.lazy_emplace` for `key_holder` with the
/// caller-supplied `hash_value`, forwarding `f`, and returns a reference to the
/// mapped value of the entry the table reports back.
template <typename Data, typename KeyHolder, typename Func>
ALWAYS_INLINE typename std::enable_if_t<has_mapped, Mapped>& lazy_emplace_impl(
        KeyHolder& key_holder, size_t hash_value, Data& data, Func&& f) {
    typename Data::LookupResult slot;
    data.lazy_emplace(key_holder, slot, hash_value, std::forward<Func>(f));
    auto mapped = lookup_result_get_mapped(slot);
    return *mapped;
}

template <typename Data, typename Key>
ALWAYS_INLINE FindResult find_key_impl(Key key, Data& data) {
if constexpr (Cache::consecutive_keys_optimization) {
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/common/hash_table/fixed_hash_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ struct FixedHashMapImplicitZeroCell {

FixedHashMapImplicitZeroCell() {}
FixedHashMapImplicitZeroCell(const Key&, const State&) {}
FixedHashMapImplicitZeroCell(const Key&, const Mapped& mapped_) : mapped(mapped_) {}
FixedHashMapImplicitZeroCell(const value_type& value_, const State&) : mapped(value_.second) {}

const VoidKey get_first() const { return {}; }
Expand Down
25 changes: 25 additions & 0 deletions be/src/vec/common/hash_table/fixed_hash_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,31 @@ class FixedHashTable : private boost::noncopyable,
this->increase_size();
}

/// Small callable that `lazy_emplace` hands to the user functor: invoking it
/// placement-constructs a `Cell` in the wrapped slot from the given arguments.
/// The constructor is private, so only FixedHashTable (a friend) can create one.
class Constructor {
    friend class FixedHashTable;

    Cell* _cell;

    Constructor(Cell* cell) : _cell(cell) {}

public:
    template <typename... Args>
    void operator()(Args&&... args) const {
        new (_cell) Cell(std::forward<Args>(args)...);
    }
};

/// Points `it` at the cell indexed by `x`. If that cell is still zero (empty),
/// `f` is called with a Constructor for the cell and the key, after which the
/// table size is increased; an occupied cell is left untouched.
template <typename Func>
void ALWAYS_INLINE lazy_emplace(const Key& x, LookupResult& it, Func&& f) {
    auto* slot = &buf[x];
    it = slot;

    if (slot->is_zero(*this)) {
        f(Constructor(slot), x);
        this->increase_size();
    }
}

std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type& x) {
std::pair<LookupResult, bool> res;
emplace(Cell::get_key(x), res.first, res.second);
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/common/hash_table/hash_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ struct HashMapCell {

HashMapCell() {}
HashMapCell(const Key& key_, const State&) : value(key_, NoInitTag()) {}
HashMapCell(const Key& key_, const Mapped& mapped_) : value(key_, mapped_) {}
HashMapCell(const value_type& value_, const State&) : value(value_) {}

const Key& get_first() const { return value.first; }
Expand Down
93 changes: 93 additions & 0 deletions be/src/vec/common/hash_table/hash_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,27 @@ class HashTable : private boost::noncopyable,
return false;
}

/// Handles the zero key, which is kept in dedicated storage rather than in `buf`.
///
/// Returns false when `x` is not the zero key, or when the cell type declares it
/// needs no zero-value storage; the caller must then use the regular insert path.
/// Returns true when `x` is the zero key: `it` is set to the zero cell and, if the
/// table did not hold the zero key yet, the size is bumped, the has-zero flag is
/// set, `f` is invoked to construct the cell and the hash is stored in it.
template <typename Func>
bool ALWAYS_INLINE lazy_emplace_if_zero(const Key& x, LookupResult& it, size_t hash_value,
                                        Func&& f) {
    /// If it is claimed that the zero key can not be inserted into the table.
    if (!Cell::need_zero_value_storage) {
        return false;
    }
    if (!Cell::is_zero(x, *this)) {
        return false;
    }

    it = this->zero_value();
    if (this->get_has_zero()) {
        // Zero key already present: nothing to construct.
        return true;
    }

    ++m_size;
    this->set_get_has_zero();
    std::forward<Func>(f)(Constructor(it), x);
    this->zero_value()->set_hash(hash_value);
    return true;
}

template <typename KeyHolder>
void ALWAYS_INLINE emplace_non_zero_impl(size_t place_value, KeyHolder&& key_holder,
LookupResult& it, bool& inserted, size_t hash_value) {
Expand Down Expand Up @@ -804,6 +825,43 @@ class HashTable : private boost::noncopyable,
}
}

/// Lazy insert of a non-zero key into the cell at `place_value`.
///
/// `it` is pointed at that cell. If the cell is already occupied the key held by
/// `key_holder` is discarded and nothing else happens. Otherwise the key is
/// persisted, `f` is called with a Constructor for the cell and the key, the hash
/// is stored in the cell and the size is incremented. When the grower reports an
/// overflow the table is resized and `it` is re-pointed at the key's new cell.
template <typename KeyHolder, typename Func>
void ALWAYS_INLINE lazy_emplace_non_zero_impl(size_t place_value, KeyHolder&& key_holder,
LookupResult& it, size_t hash_value, Func&& f) {
it = &buf[place_value];

// Occupied cell: the key is already in the table, so the temporary key is dropped.
if (!buf[place_value].is_zero(*this)) {
key_holder_discard_key(key_holder);
return;
}

// Persist first, then re-read the key so that `key` refers to the persisted key.
key_holder_persist_key(key_holder);
const auto& key = key_holder_get_key(key_holder);

f(Constructor(&buf[place_value]), key);
buf[place_value].set_hash(hash_value);
++m_size;

if (UNLIKELY(grower.overflow(m_size))) {
try {
resize();
} catch (...) {
/** If we have not resized successfully, then there will be problems.
* There remains a key, but uninitialized mapped-value,
* which, perhaps, can not even be called a destructor.
*
* NOTE(review): in this lazy variant `f` has already been handed a
* Constructor for the cell, so the mapped value may be initialized by
* the time set_zero() discards it -- confirm nothing is leaked here.
*/
// Roll back the insertion before propagating the exception.
--m_size;
buf[place_value].set_zero();
throw;
}

// The hash table was rehashed, so we have to re-find the key.
size_t new_place = find_cell(key, hash_value, grower.place(hash_value));
assert(!buf[new_place].is_zero(*this));
it = &buf[new_place];
}
}

/// Only for non-zero keys. Find the right place, insert the key there, if it does not already exist. Set iterator to the cell in output parameter.
template <typename KeyHolder>
void ALWAYS_INLINE emplace_non_zero(KeyHolder&& key_holder, LookupResult& it, bool& inserted,
Expand All @@ -813,6 +871,14 @@ class HashTable : private boost::noncopyable,
emplace_non_zero_impl(place_value, key_holder, it, inserted, hash_value);
}

/// Only for non-zero keys. Locates the cell for the key, starting from the
/// position the grower derives from `hash_value`, and delegates the lazy insert
/// to `lazy_emplace_non_zero_impl`.
template <typename KeyHolder, typename Func>
void ALWAYS_INLINE lazy_emplace_non_zero(KeyHolder&& key_holder, LookupResult& it,
                                         size_t hash_value, Func&& f) {
    const auto& probe_key = key_holder_get_key(key_holder);
    const size_t start = grower.place(hash_value);
    size_t slot = find_cell(probe_key, hash_value, start);
    lazy_emplace_non_zero_impl(slot, key_holder, it, hash_value, std::forward<Func>(f));
}

public:
void expanse_for_add_elem(size_t num_elem) {
if (add_elem_size_overflow(num_elem)) {
Expand Down Expand Up @@ -847,6 +913,19 @@ class HashTable : private boost::noncopyable,
reinsert(*it.get_ptr(), hash_value);
}

/// Small callable that the lazy-emplace paths hand to the user functor: invoking
/// it placement-constructs a `Cell` in the wrapped slot from the given arguments.
/// The constructor is private, so only HashTable (a friend) can create one.
class Constructor {
    friend class HashTable;

    Cell* _cell;

    Constructor(Cell* cell) : _cell(cell) {}

public:
    template <typename... Args>
    void operator()(Args&&... args) const {
        new (_cell) Cell(std::forward<Args>(args)...);
    }
};

/** Insert the key.
* Return values:
* 'it' -- a LookupResult pointing to the corresponding key/mapped pair.
Expand Down Expand Up @@ -877,6 +956,20 @@ class HashTable : private boost::noncopyable,
emplace_non_zero(key_holder, it, inserted, hash_value);
}

/// Lazy insert that computes the hash of the key itself and then defers to the
/// overload taking an explicit hash value.
template <typename KeyHolder, typename Func>
void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) {
    const size_t hash_value = hash(key_holder_get_key(key_holder));
    lazy_emplace(key_holder, it, hash_value, std::forward<Func>(f));
}

/// Lazy insert with a precomputed hash. The zero key is tried first through
/// `lazy_emplace_if_zero`; only when that reports the key was not handled does
/// the regular non-zero path run.
template <typename KeyHolder, typename Func>
void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value,
                                Func&& f) {
    const auto& key = key_holder_get_key(key_holder);
    if (lazy_emplace_if_zero(key, it, hash_value, std::forward<Func>(f))) {
        return;
    }
    lazy_emplace_non_zero(key_holder, it, hash_value, std::forward<Func>(f));
}

/// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet.
void ALWAYS_INLINE insert_unique_non_zero(const Cell* cell, size_t hash_value) {
size_t place_value = find_empty_cell(grower.place(hash_value));
Expand Down
1 change: 1 addition & 0 deletions be/src/vec/common/hash_table/hash_table_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ template <typename T>
// Primary traits template: the default for any hash table type.
// Every flag is false here; specializations for particular table types
// override the flags that apply to them.
struct HashTableTraits {
static constexpr bool is_phmap = false;
static constexpr bool is_parallel_phmap = false;
static constexpr bool is_string_hash_table = false;
};
30 changes: 30 additions & 0 deletions be/src/vec/common/hash_table/ph_hash_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,16 @@ class PHHashMap : private boost::noncopyable {
it = &*it_;
}

/// Lazy insert into the wrapped map. The callback given to `_hash_map.lazy_emplace`
/// persists the key held by `key_holder` and then calls `f` with the map's
/// constructor object and the key. `it` receives the address of the entry the
/// returned iterator refers to.
template <typename KeyHolder, typename Func>
void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, Func&& f) {
    const auto& key = key_holder_get_key(key_holder);
    auto on_missing = [&](const auto& ctor) {
        key_holder_persist_key(key_holder);
        f(ctor, key);
    };
    auto pos = _hash_map.lazy_emplace(key, on_missing);
    it = &*pos;
}

template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value,
bool& inserted) {
Expand All @@ -140,6 +150,25 @@ class PHHashMap : private boost::noncopyable {
}
}

/// Lazy insert into the wrapped map with a precomputed hash. The callback given
/// to the map persists the key held by `key_holder` and then calls `f` with the
/// map's constructor object and the key. `it` receives the address of the entry
/// the returned iterator refers to.
///
/// The two map flavours are called with different argument orders:
/// (hash, key, callback) when `use_parallel` is set, (key, hash, callback) otherwise.
template <typename KeyHolder, typename Func>
void ALWAYS_INLINE lazy_emplace(KeyHolder&& key_holder, LookupResult& it, size_t hash_value,
                                Func&& f) {
    const auto& key = key_holder_get_key(key_holder);
    auto on_missing = [&](const auto& ctor) {
        key_holder_persist_key(key_holder);
        f(ctor, key);
    };

    if constexpr (use_parallel) {
        auto pos = _hash_map.lazy_emplace_with_hash(hash_value, key, on_missing);
        it = &*pos;
    } else {
        auto pos = _hash_map.lazy_emplace_with_hash(key, hash_value, on_missing);
        it = &*pos;
    }
}

template <typename KeyHolder>
LookupResult ALWAYS_INLINE find(KeyHolder&& key_holder) {
const auto& key = key_holder_get_key(key_holder);
Expand Down Expand Up @@ -197,4 +226,5 @@ template <typename Key, typename Mapped, typename Hash, bool use_parallel>
// Traits for PHHashMap: always a phmap, parallel exactly when `use_parallel`
// is set, and never a string hash table.
struct HashTableTraits<PHHashMap<Key, Mapped, Hash, use_parallel>> {
static constexpr bool is_phmap = true;
static constexpr bool is_parallel_phmap = use_parallel;
static constexpr bool is_string_hash_table = false;
};
14 changes: 14 additions & 0 deletions be/src/vec/common/hash_table/string_hash_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,17 @@ class StringHashMap : public StringHashTable<StringHashMapSubMaps<TMapped, Alloc
char* get_null_key_data() { return nullptr; }
bool has_null_key_data() const { return false; }
};

/// Traits for StringHashMap itself: a string hash table, not a phmap of any kind.
template <typename TMapped, typename Allocator>
struct HashTableTraits<StringHashMap<TMapped, Allocator>> {
    constexpr static bool is_phmap {false};
    constexpr static bool is_parallel_phmap {false};
    constexpr static bool is_string_hash_table {true};
};

/// Traits for any single-parameter class template instantiated over a
/// StringHashMap: such a type is reported as a string hash table as well.
template <template <typename> class Derived, typename TMapped, typename Allocator>
struct HashTableTraits<Derived<StringHashMap<TMapped, Allocator>>> {
    constexpr static bool is_phmap {false};
    constexpr static bool is_parallel_phmap {false};
    constexpr static bool is_string_hash_table {true};
};
Loading

0 comments on commit 79e5c04

Please sign in to comment.