Skip to content

Commit

Permalink
Add support to compute hash on list of struct (#3157)
Browse files Browse the repository at this point in the history
  • Loading branch information
acquamarin committed Mar 27, 2024
1 parent 677d35e commit c747899
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 48 deletions.
92 changes: 70 additions & 22 deletions src/function/vector_hash_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,74 @@ using namespace kuzu::common;
namespace kuzu {
namespace function {

/**
 * Hashes every element stored in the data vector of a list vector.
 *
 * The data vector can hold more than DEFAULT_VECTOR_CAPACITY elements, so the elements are
 * hashed in batches, each batch selecting at most DEFAULT_VECTOR_CAPACITY consecutive positions.
 *
 * Side effect: the state of operand's data vector is replaced with a selection state created
 * here (see the setState call below).
 *
 * @param operand list vector whose data vector elements are hashed.
 * @return a list-of-hash vector whose data vector holds, at position i, the hash computed for
 *         position i of operand's data vector.
 */
static std::unique_ptr<ValueVector> computeDataVecHash(ValueVector* operand) {
    auto hashVector = std::make_unique<ValueVector>(*LogicalType::VAR_LIST(LogicalType::HASH()));
    auto numValuesInDataVec = ListVector::getDataVectorSize(operand);
    ListVector::resizeDataVector(hashVector.get(), numValuesInDataVec);
    auto selectionState = std::make_shared<DataChunkState>();
    selectionState->selVector->resetSelectorToValuePosBuffer();
    auto dataVector = ListVector::getDataVector(operand);
    auto hashDataVector = ListVector::getDataVector(hashVector.get());
    dataVector->setState(selectionState);
    auto numValuesComputed = 0u;
    while (numValuesComputed < numValuesInDataVec) {
        // Clamp the batch to the remaining elements so the final batch never selects positions
        // beyond the end of the data vector (and of the equally sized hash data vector).
        auto numValuesLeft = numValuesInDataVec - numValuesComputed;
        auto numValuesInBatch =
            numValuesLeft < DEFAULT_VECTOR_CAPACITY ? numValuesLeft : DEFAULT_VECTOR_CAPACITY;
        for (auto i = 0u; i < numValuesInBatch; i++) {
            selectionState->selVector->selectedPositions[i] = numValuesComputed;
            numValuesComputed++;
        }
        // Tell the hash executor how many of the selected positions are valid for this batch.
        // NOTE(review): relies on SelectionVector exposing a writable `selectedSize` member.
        selectionState->selVector->selectedSize = numValuesInBatch;
        VectorHashFunction::computeHash(dataVector, hashDataVector);
    }
    return hashVector;
}

/**
 * Folds the per-element hashes of each selected list into one hash per list.
 *
 * @param operand list vector being hashed; supplies the selected positions, null flags and
 *                list entries (offset/size into the data vector).
 * @param result  vector receiving one hash per selected position of operand.
 * @param hashVec list vector whose data vector holds the per-element hashes.
 */
static void finalizeDataVecHash(ValueVector* operand, ValueVector* result, ValueVector* hashVec) {
    auto elementHashes = ListVector::getDataVector(hashVec);
    const auto numSelected = result->state->getNumSelectedValues();
    for (auto selIdx = 0u; selIdx < numSelected; selIdx++) {
        const auto listPos = operand->state->selVector->selectedPositions[selIdx];
        // A null list hashes to NULL_HASH; its entry is never inspected.
        if (operand->isNull(listPos)) {
            result->setValue(listPos, NULL_HASH);
            continue;
        }
        const auto listEntry = operand->getValue<list_entry_t>(listPos);
        const auto endOffset = listEntry.offset + listEntry.size;
        // Start from NULL_HASH so an empty list also produces NULL_HASH.
        auto listHash = NULL_HASH;
        for (auto elemPos = listEntry.offset; elemPos < endOffset; elemPos++) {
            listHash = combineHashScalar(listHash, elementHashes->getValue<hash_t>(elemPos));
        }
        result->setValue(listPos, listHash);
    }
}

/**
 * Computes one hash per list in operand and writes it into result.
 *
 * Works in two phases: first hash every element of the list data vector, then combine the
 * element hashes belonging to each list.
 */
static void computeListVectorHash(ValueVector* operand, ValueVector* result) {
    // Phase 1: per-element hashes, owned for the duration of this call.
    const std::unique_ptr<ValueVector> elementHashVector = computeDataVecHash(operand);
    // Phase 2: fold the element hashes of each list into result.
    finalizeDataVecHash(operand, result, elementHashVector.get());
}

/**
 * Computes the hash of a struct-typed vector (NODE, REL or plain STRUCT) into result.
 *
 * NODE and REL values are hashed by their internal ID field only; a plain STRUCT is hashed by
 * combining the hashes of all of its fields in field order.
 */
static void computeStructVecHash(ValueVector* operand, ValueVector* result) {
    const auto logicalTypeID = operand->dataType.getLogicalTypeID();

    if (logicalTypeID == LogicalTypeID::NODE) {
        // The internal ID is expected to be field 0 of a NODE.
        KU_ASSERT(0 == common::StructType::getFieldIdx(&operand->dataType, InternalKeyword::ID));
        UnaryHashFunctionExecutor::execute<internalID_t, hash_t>(
            *StructVector::getFieldVector(operand, 0), *result);
        return;
    }

    if (logicalTypeID == LogicalTypeID::REL) {
        // The internal ID is expected to be field 3 of a REL.
        KU_ASSERT(3 == StructType::getFieldIdx(&operand->dataType, InternalKeyword::ID));
        UnaryHashFunctionExecutor::execute<internalID_t, hash_t>(
            *StructVector::getFieldVector(operand, 3), *result);
        return;
    }

    if (logicalTypeID != LogicalTypeID::STRUCT) {
        KU_UNREACHABLE;
    }

    // Plain STRUCT: seed result with the hash of the first field, then fold in the rest.
    const auto numFields = StructType::getNumFields(&operand->dataType);
    VectorHashFunction::computeHash(
        StructVector::getFieldVector(operand, 0 /* idx */).get(), result);
    auto fieldHashVector = std::make_unique<ValueVector>(LogicalTypeID::INT64);
    for (auto fieldIdx = 1u; fieldIdx < numFields; fieldIdx++) {
        VectorHashFunction::computeHash(
            StructVector::getFieldVector(operand, fieldIdx).get(), fieldHashVector.get());
        VectorHashFunction::combineHash(fieldHashVector.get(), result, result);
    }
}

void VectorHashFunction::computeHash(ValueVector* operand, ValueVector* result) {
result->state = operand->state;
KU_ASSERT(result->dataType.getLogicalTypeID() == LogicalTypeID::INT64);
Expand Down Expand Up @@ -57,30 +125,10 @@ void VectorHashFunction::computeHash(ValueVector* operand, ValueVector* result)
UnaryHashFunctionExecutor::execute<interval_t, hash_t>(*operand, *result);
} break;
case PhysicalTypeID::STRUCT: {
if (operand->dataType.getLogicalTypeID() == LogicalTypeID::NODE) {
KU_ASSERT(
0 == common::StructType::getFieldIdx(&operand->dataType, InternalKeyword::ID));
UnaryHashFunctionExecutor::execute<internalID_t, hash_t>(
*StructVector::getFieldVector(operand, 0), *result);
} else if (operand->dataType.getLogicalTypeID() == LogicalTypeID::REL) {
KU_ASSERT(3 == StructType::getFieldIdx(&operand->dataType, InternalKeyword::ID));
UnaryHashFunctionExecutor::execute<internalID_t, hash_t>(
*StructVector::getFieldVector(operand, 3), *result);
} else {
VectorHashFunction::computeHash(
StructVector::getFieldVector(operand, 0 /* idx */).get(), result);
auto tmpHashVector = std::make_unique<ValueVector>(LogicalTypeID::INT64);
for (auto i = 1u; i < StructType::getNumFields(&operand->dataType); i++) {
auto fieldVector = StructVector::getFieldVector(operand, i);
VectorHashFunction::computeHash(fieldVector.get(), tmpHashVector.get());
VectorHashFunction::combineHash(tmpHashVector.get(), result, result);
}
}
computeStructVecHash(operand, result);
} break;
case PhysicalTypeID::VAR_LIST: {
// TODO(Ziyi): We should pass in the selection state here, and do vectorized hash
// computation.
UnaryHashFunctionExecutor::execute<list_entry_t, hash_t>(*operand, *result);
computeListVectorHash(operand, result);
} break;
// LCOV_EXCL_START
default: {
Expand Down
3 changes: 3 additions & 0 deletions src/include/common/types/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,9 @@ class LogicalType {
// Creates a newly allocated LogicalType describing BOOL.
static std::unique_ptr<LogicalType> BOOL() {
    auto boolType = std::make_unique<LogicalType>(LogicalTypeID::BOOL);
    return boolType;
}
// Creates a newly allocated LogicalType for hash values; hashes are represented as INT64.
static std::unique_ptr<LogicalType> HASH() {
    auto hashType = std::make_unique<LogicalType>(LogicalTypeID::INT64);
    return hashType;
}
// Creates a newly allocated LogicalType describing INT64.
static std::unique_ptr<LogicalType> INT64() {
    auto int64Type = std::make_unique<LogicalType>(LogicalTypeID::INT64);
    return int64Type;
}
Expand Down
26 changes: 0 additions & 26 deletions src/include/function/hash/hash_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,32 +189,6 @@ inline void Hash::operation(
combineHashScalar(murmurhash64(key.days), murmurhash64(key.micros)));
}

/**
 * Hash specialization for list entries.
 *
 * Walks the elements of the list inside keyVector's data vector and combines their hashes in
 * order, starting from NULL_HASH. A null element contributes NULL_HASH. Lists of structs are
 * rejected with a RuntimeException.
 *
 * @param key       offset/size of the list inside the data vector.
 * @param result    receives the combined hash of the list.
 * @param keyVector list vector that owns the data vector holding the elements.
 */
template<>
inline void Hash::operation(
    const common::list_entry_t& key, common::hash_t& result, common::ValueVector* keyVector) {
    auto elementVector = common::ListVector::getDataVector(keyVector);
    const auto endPos = key.offset + key.size;
    result = NULL_HASH;
    for (auto elemPos = key.offset; elemPos < endPos; elemPos++) {
        // Null elements keep the NULL_HASH default; everything else is hashed by type.
        common::hash_t elementHash = NULL_HASH;
        if (!elementVector->isNull(elemPos)) {
            common::TypeUtils::visit(
                elementVector->dataType.getPhysicalType(),
                [&]<HashableTypes T>(T) {
                    operation(elementVector->getValue<T>(elemPos), elementHash, elementVector);
                },
                [&](common::struct_entry_t) {
                    // LCOV_EXCL_START
                    throw common::RuntimeException{"Hash on list of struct is not supported yet."};
                    // LCOV_EXCL_STOP
                },
                [](auto) { KU_UNREACHABLE; });
        }
        result = combineHashScalar(result, elementHash);
    }
}

template<>
inline void Hash::operation(const std::unordered_set<std::string>& key, common::hash_t& result,
common::ValueVector* /*keyVector*/) {
Expand Down
27 changes: 27 additions & 0 deletions test/test_files/tinysnb/agg/hash.test
Original file line number Diff line number Diff line change
Expand Up @@ -546,3 +546,30 @@ True|2
{rating: 7.000000, stars: 10, views: 982, release: 2018-11-13 13:33:11, release_ns: 2018-11-13 13:33:11.123456, release_ms: 2018-11-13 13:33:11.123, release_sec: 2018-11-13 13:33:11, release_tz: 2018-11-13 13:33:11.123456+00, film: 2014-09-12, u8: 12, u16: 120, u32: 55, u64: 1, hugedata: -1844674407370955161511}|2
{rating: 1223.000000, stars: 100, views: 10003, release: 2011-02-11 16:44:22, release_ns: 2011-02-11 16:44:22.123456, release_ms: 2011-02-11 16:44:22.123, release_sec: 2011-02-11 16:44:22, release_tz: 2011-02-11 16:44:22.123456+00, film: 2013-02-22, u8: 1, u16: 15, u32: 200, u64: 4, hugedata: -15}|2
{rating: 55.000000, stars: 2, views: 88, release: 2022-01-22 00:00:00, release_ns: 2025-01-13 13:33:11.123456, release_ms: 2018-11-13 13:33:11.123, release_sec: 2011-01-11 00:00:00, release_tz: 2011-11-11 00:00:00+00, film: 2022-01-11, u8: 3, u16: 22, u32: 22, u64: 56, hugedata: 999999}|1

-LOG HashOnListOfStruct
-STATEMENT CREATE (:organisation {ID: 22})
---- ok
-STATEMENT MATCH (o:organisation) RETURN [o.state], count(*)
---- 5
[{revenue: 138, location: ['toronto','montr,eal'], stock: {price: [96,56], volume: 1000}}]|2
[{revenue: 152, location: ["vanco,uver north area"], stock: {price: [15,78,671], volume: 432}}]|1
[{revenue: 55, location: ['toronto'], stock: {price: [22,33], volume: 28}}]|1
[{revenue: 558, location: ['very long city name','new york'], stock: {price: [22], volume: 99}}]|2
[]|1
-STATEMENT MATCH (m:movies) RETURN [m.description], count(*)
---- 4
[{rating: 5.300000, stars: 2, views: 152, release: 2011-08-20 11:25:30, release_ns: 2011-08-20 11:25:30.123456, release_ms: 2011-08-20 11:25:30.123, release_sec: 2011-08-20 11:25:30, release_tz: 2011-08-20 11:25:30.123456+00, film: 2012-05-11, u8: 220, u16: 20, u32: 1, u64: 180, hugedata: 1844674407370955161811111111}]|1
[{rating: 7.000000, stars: 10, views: 982, release: 2018-11-13 13:33:11, release_ns: 2018-11-13 13:33:11.123456, release_ms: 2018-11-13 13:33:11.123, release_sec: 2018-11-13 13:33:11, release_tz: 2018-11-13 13:33:11.123456+00, film: 2014-09-12, u8: 12, u16: 120, u32: 55, u64: 1, hugedata: -1844674407370955161511}]|2
[{rating: 1223.000000, stars: 100, views: 10003, release: 2011-02-11 16:44:22, release_ns: 2011-02-11 16:44:22.123456, release_ms: 2011-02-11 16:44:22.123, release_sec: 2011-02-11 16:44:22, release_tz: 2011-02-11 16:44:22.123456+00, film: 2013-02-22, u8: 1, u16: 15, u32: 200, u64: 4, hugedata: -15}]|2
[{rating: 55.000000, stars: 2, views: 88, release: 2022-01-22 00:00:00, release_ns: 2025-01-13 13:33:11.123456, release_ms: 2018-11-13 13:33:11.123, release_sec: 2011-01-11 00:00:00, release_tz: 2011-11-11 00:00:00+00, film: 2022-01-11, u8: 3, u16: 22, u32: 22, u64: 56, hugedata: 999999}]|1
-STATEMENT MATCH (o:organisation) RETURN distinct [o.state];
---- 5
[{revenue: 138, location: ['toronto','montr,eal'], stock: {price: [96,56], volume: 1000}}]
[{revenue: 152, location: ["vanco,uver north area"], stock: {price: [15,78,671], volume: 432}}]
[{revenue: 55, location: ['toronto'], stock: {price: [22,33], volume: 28}}]
[{revenue: 558, location: ['very long city name','new york'], stock: {price: [22], volume: 99}}]
[]
-STATEMENT MATCH (p:person) return distinct collect(p);
---- 1
[{_ID: 0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]], grades: [96,54,86,92], height: 1.731000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11},{_ID: 0:1, _LABEL: person, ID: 2, fName: Bob, gender: 2, isStudent: True, isWorker: False, age: 30, eyeSight: 5.100000, birthdate: 1900-01-01, registerTime: 2008-11-03 15:25:30.000526, lastJobDuration: 10 years 5 months 13:00:00.000024, workedHours: [12,8], usedNames: [Bobby], courseScoresPerTerm: [[8,9],[9,10]], grades: [98,42,93,88], height: 0.990000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a12},{_ID: 0:2, _LABEL: person, ID: 3, fName: Carol, gender: 1, isStudent: False, isWorker: True, age: 45, eyeSight: 5.000000, birthdate: 1940-06-22, registerTime: 1911-08-20 02:32:21, lastJobDuration: 48:24:11, workedHours: [4,5], usedNames: [Carmen,Fred], courseScoresPerTerm: [[8,10]], grades: [91,75,21,95], height: 1.000000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a13},{_ID: 0:3, _LABEL: person, ID: 5, fName: Dan, gender: 2, isStudent: False, isWorker: True, age: 20, eyeSight: 4.800000, birthdate: 1950-07-23, registerTime: 2031-11-30 12:25:30, lastJobDuration: 10 years 5 months 13:00:00.000024, workedHours: [1,9], usedNames: [Wolfeschlegelstein,Daniel], courseScoresPerTerm: [[7,4],[8,8],[9]], grades: [76,88,99,89], height: 1.300000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a14},{_ID: 0:4, _LABEL: person, ID: 7, fName: Elizabeth, gender: 1, isStudent: False, isWorker: True, age: 20, eyeSight: 4.700000, birthdate: 1980-10-26, registerTime: 1976-12-23 11:21:42, lastJobDuration: 48:24:11, workedHours: [2], usedNames: [Ein], courseScoresPerTerm: [[6],[7],[8]], grades: [96,59,65,88], height: 1.463000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a15},{_ID: 0:5, _LABEL: person, ID: 8, fName: Farooq, gender: 2, isStudent: 
True, isWorker: False, age: 25, eyeSight: 4.500000, birthdate: 1980-10-26, registerTime: 1972-07-31 13:22:30.678559, lastJobDuration: 00:18:00.024, workedHours: [3,4,5,6,7], usedNames: [Fesdwe], courseScoresPerTerm: [[8]], grades: [80,78,34,83], height: 1.510000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a16},{_ID: 0:6, _LABEL: person, ID: 9, fName: Greg, gender: 2, isStudent: False, isWorker: False, age: 40, eyeSight: 4.900000, birthdate: 1980-10-26, registerTime: 1976-12-23 04:41:42, lastJobDuration: 10 years 5 months 13:00:00.000024, workedHours: [1], usedNames: [Grad], courseScoresPerTerm: [[10]], grades: [43,83,67,43], height: 1.600000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a17},{_ID: 0:7, _LABEL: person, ID: 10, fName: Hubert Blaine Wolfeschlegelsteinhausenbergerdorff, gender: 2, isStudent: False, isWorker: True, age: 83, eyeSight: 4.900000, birthdate: 1990-11-27, registerTime: 2023-02-21 13:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,11,12,3,4,5,6,7], usedNames: [Ad,De,Hi,Kye,Orlan], courseScoresPerTerm: [[7],[10],[6,7]], grades: [77,64,100,54], height: 1.323000, u: a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a18}]

0 comments on commit c747899

Please sign in to comment.