Skip to content

Commit

Permalink
Add utility hash functions
Browse files Browse the repository at this point in the history
  • Loading branch information
manh9203 committed Apr 8, 2024
1 parent 9135eda commit 898c111
Show file tree
Hide file tree
Showing 81 changed files with 336 additions and 53 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ include_directories(src/include)
include_directories(third_party/antlr4_cypher/include)
include_directories(third_party/antlr4_runtime/src)
include_directories(third_party/fast_float/include)
include_directories(third_party/mbedtls/include)
include_directories(third_party/miniparquet/src)
include_directories(third_party/miniz)
include_directories(third_party/nlohmann_json)
Expand Down
6 changes: 2 additions & 4 deletions extension/httpfs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@ find_package(OpenSSL REQUIRED)
include_directories(${OPENSSL_INCLUDE_DIR})
add_compile_definitions(CPPHTTPLIB_OPENSSL_SUPPORT)

add_subdirectory(third_party/mbedtls)

include_directories(
${PROJECT_SOURCE_DIR}/src/include
${PROJECT_SOURCE_DIR}/third_party/httplib
src/include
third_party/mbedtls/include)
${PROJECT_SOURCE_DIR}/third_party/mbedtls/include
src/include)

add_library(httpfs
SHARED
Expand Down
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ add_subdirectory(extension)
add_library(kuzu STATIC ${ALL_OBJECT_FILES})
add_library(kuzu_shared SHARED ${ALL_OBJECT_FILES})

set(KUZU_LIBRARIES antlr4_cypher antlr4_runtime fast_float utf8proc re2 serd Threads::Threads fastpfor miniparquet zstd miniz)
set(KUZU_LIBRARIES antlr4_cypher antlr4_runtime fast_float utf8proc re2 serd Threads::Threads fastpfor miniparquet zstd miniz mbedtls)
if(NOT WIN32)
set(KUZU_LIBRARIES dl ${KUZU_LIBRARIES})
endif()
Expand Down
1 change: 1 addition & 0 deletions src/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ add_library(kuzu_common
profiler.cpp
type_utils.cpp
utils.cpp
sha256.cpp
string_utils.cpp
system_message.cpp
random_engine.cpp
Expand Down
52 changes: 52 additions & 0 deletions src/common/sha256.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "common/sha256.h"

#include "common/exception/runtime.h"

namespace kuzu {
namespace common {

/**
 * Initialises the wrapped mbedtls context and starts a new SHA-256 computation.
 *
 * @throws RuntimeException if mbedtls fails to start the computation.
 */
SHA256::SHA256() {
    mbedtls_sha256_init(&shaContext);

    // A failure here can only come from a problem with shaContext (which is wrapped inside
    // SHA256) or from the mbedtls library itself.
    // The second argument is mbedtls' is224 flag: 0 selects SHA-256 rather than SHA-224.
    if (mbedtls_sha256_starts(&shaContext, 0 /* is224 */) != 0) {
        // The destructor does not run for an object whose constructor throws, so the context
        // has to be released here before the exception leaves.
        mbedtls_sha256_free(&shaContext);
        throw RuntimeException{"SHA256 Error"};
    }
}

// Releases the mbedtls context that the constructor initialised.
SHA256::~SHA256() {
mbedtls_sha256_free(&shaContext);
}

/**
 * Feeds the bytes of `str` into the running hash.
 *
 * @param str data to append; all str.size() bytes are passed to mbedtls.
 * @throws RuntimeException if mbedtls reports a failure.
 */
void SHA256::addString(const std::string& str) {
    const auto* bytes = reinterpret_cast<const unsigned char*>(str.data());
    const auto status = mbedtls_sha256_update(&shaContext, bytes, str.size());
    if (status != 0) {
        throw RuntimeException{"SHA256 Error"};
    }
}

void SHA256::finishSHA256(char* out) {
std::string hash;
hash.resize(SHA256_HASH_LENGTH_BYTES);

if (mbedtls_sha256_finish(&shaContext, reinterpret_cast<unsigned char*>(hash.data()))) {
throw RuntimeException{"SHA256 Error"};
}

toBase16(hash.c_str(), out, SHA256_HASH_LENGTH_BYTES);
}

/**
 * Encodes `len` bytes from `in` as lowercase hexadecimal text.
 *
 * @param in  source bytes.
 * @param out destination buffer; receives exactly 2 * len characters, high nibble first,
 *            without a NUL terminator.
 * @param len number of bytes to read from `in`.
 */
void SHA256::toBase16(const char* in, char* out, size_t len) {
    static constexpr char HEX_DIGITS[] = "0123456789abcdef";

    char* cursor = out;
    for (const char* src = in; src != in + len; ++src) {
        // Work on the unsigned byte value so each nibble can be extracted directly.
        const auto byte = static_cast<unsigned char>(*src);
        *cursor++ = HEX_DIGITS[byte >> 4];
        *cursor++ = HEX_DIGITS[byte & 0x0f];
    }
}

} // namespace common
} // namespace kuzu
5 changes: 5 additions & 0 deletions src/function/function_collection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "function/cast/vector_cast_functions.h"
#include "function/comparison/vector_comparison_functions.h"
#include "function/date/vector_date_functions.h"
#include "function/hash/vector_hash_functions.h"
#include "function/interval/vector_interval_functions.h"
#include "function/list/vector_list_functions.h"
#include "function/map/vector_map_functions.h"
Expand Down Expand Up @@ -178,6 +179,10 @@ FunctionCollection* FunctionCollection::getFunctions() {
// Rdf functions
SCALAR_FUNCTION(RDFTypeFunction), SCALAR_FUNCTION(ValidatePredicateFunction),

// Hash functions
SCALAR_FUNCTION(MD5Function), SCALAR_FUNCTION(SHA256Function),
SCALAR_FUNCTION(HashFunction),

// Aggregate functions
AGGREGATE_FUNCTION(CountStarFunction), AGGREGATE_FUNCTION(CountFunction),
AGGREGATE_FUNCTION(AggregateSumFunction), AGGREGATE_FUNCTION(AggregateAvgFunction),
Expand Down
84 changes: 84 additions & 0 deletions src/function/vector_hash_functions.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,64 @@
#include "function/hash/vector_hash_functions.h"

#include "function/binary_function_executor.h"
#include "function/hash/functions/md5_function.h"
#include "function/hash/functions/sha256_function.h"
#include "function/hash/hash_functions.h"
#include "function/scalar_function.h"

using namespace kuzu::common;

namespace kuzu {
namespace function {

/**
 * Hashes the selected values of `operand` into the matching slots of `result`.
 *
 * Non-null values go through Hash::operation; null values receive NULL_HASH. A flat operand
 * only has its first selected position processed. For an unflat operand the slot index is the
 * loop counter when the selection is unfiltered, and selectedPositions[i] otherwise.
 */
template<typename OPERAND_TYPE, typename RESULT_TYPE>
void UnaryHashFunctionExecutor::execute(ValueVector& operand, ValueVector& result) {
    auto hashes = (RESULT_TYPE*)result.getData();
    auto& selection = *operand.state->selVector;

    // Hashes the (non-null) operand value at `slot` into the same slot of the result buffer.
    const auto hashSlot = [&](auto slot) {
        Hash::operation(operand.getValue<OPERAND_TYPE>(slot), hashes[slot], &operand);
    };

    if (operand.state->isFlat()) {
        auto pos = selection.selectedPositions[0];
        if (operand.isNull(pos)) {
            result.setValue(pos, NULL_HASH);
        } else {
            hashSlot(pos);
        }
        return;
    }

    if (operand.hasNoNullsGuarantee()) {
        // No null checks needed on this path.
        if (selection.isUnfiltered()) {
            for (auto idx = 0u; idx < selection.selectedSize; idx++) {
                hashSlot(idx);
            }
        } else {
            for (auto idx = 0u; idx < selection.selectedSize; idx++) {
                hashSlot(selection.selectedPositions[idx]);
            }
        }
        return;
    }

    if (selection.isUnfiltered()) {
        for (auto idx = 0u; idx < selection.selectedSize; idx++) {
            if (operand.isNull(idx)) {
                result.setValue(idx, NULL_HASH);
            } else {
                hashSlot(idx);
            }
        }
        return;
    }

    for (auto idx = 0u; idx < selection.selectedSize; idx++) {
        auto pos = selection.selectedPositions[idx];
        if (operand.isNull(pos)) {
            hashes[pos] = NULL_HASH;
        } else {
            hashSlot(pos);
        }
    }
}

static std::unique_ptr<ValueVector> computeDataVecHash(ValueVector* operand) {
auto hashVector = std::make_unique<ValueVector>(*LogicalType::LIST(LogicalType::HASH()));
auto numValuesInDataVec = ListVector::getDataVectorSize(operand);
Expand Down Expand Up @@ -147,5 +199,37 @@ void VectorHashFunction::combineHash(ValueVector* left, ValueVector* right, Valu
BinaryFunctionExecutor::execute<hash_t, hash_t, hash_t, CombineHash>(*left, *right, *result);
}

/**
 * Builds the overload set for the MD5 scalar function.
 *
 * @return a set with a single STRING -> STRING overload executed through MD5Operator.
 */
function_set MD5Function::getFunctionSet() {
    auto md5 = std::make_unique<ScalarFunction>(name,
        std::vector<LogicalTypeID>{LogicalTypeID::STRING}, LogicalTypeID::STRING,
        ScalarFunction::UnaryStringExecFunction<ku_string_t, ku_string_t, MD5Operator>,
        false /* isVarLength */);

    function_set functions;
    functions.push_back(std::move(md5));
    return functions;
}

/**
 * Builds the overload set for the SHA256 scalar function.
 *
 * @return a set with a single STRING -> STRING overload executed through SHA256Operator.
 */
function_set SHA256Function::getFunctionSet() {
    auto sha256 = std::make_unique<ScalarFunction>(name,
        std::vector<LogicalTypeID>{LogicalTypeID::STRING}, LogicalTypeID::STRING,
        ScalarFunction::UnaryStringExecFunction<ku_string_t, ku_string_t, SHA256Operator>,
        false /* isVarLength */);

    function_set functions;
    functions.push_back(std::move(sha256));
    return functions;
}

// Exec callback for the HASH scalar function: expects exactly one parameter vector and
// forwards it, together with the result vector, to VectorHashFunction::computeHash.
// The trailing data pointer is unused.
static void HashExecFunc(const std::vector<std::shared_ptr<common::ValueVector>>& params,
common::ValueVector& result, void* /*dataPtr*/ = nullptr) {
KU_ASSERT(params.size() == 1);
VectorHashFunction::computeHash(params[0].get(), &result);
}

/**
 * Builds the overload set for the HASH scalar function.
 *
 * @return a set with a single ANY -> INT64 overload executed through HashExecFunc.
 */
function_set HashFunction::getFunctionSet() {
    auto hash = std::make_unique<ScalarFunction>(name,
        std::vector<LogicalTypeID>{LogicalTypeID::ANY}, LogicalTypeID::INT64, HashExecFunc,
        false /* isVarLength */);

    function_set functions;
    functions.push_back(std::move(hash));
    return functions;
}

} // namespace function
} // namespace kuzu
29 changes: 29 additions & 0 deletions src/include/common/sha256.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <string>

#include "mbedtls/sha256.h"

namespace kuzu {
namespace common {

/**
 * Incremental SHA-256 hasher wrapping an mbedtls context.
 *
 * Usage: construct, call addString() any number of times, then call finishSHA256() to obtain
 * the digest as hexadecimal text. The constructor and both hashing calls throw
 * RuntimeException when mbedtls reports a failure.
 */
class SHA256 {
public:
    // Size of the raw digest in bytes.
    static constexpr size_t SHA256_HASH_LENGTH_BYTES = 32;
    // Size of the digest once hex-encoded (two characters per byte).
    static constexpr size_t SHA256_HASH_LENGTH_TEXT = 2 * SHA256_HASH_LENGTH_BYTES;

public:
    SHA256();
    ~SHA256();
    // The object owns a C library context that its destructor frees, so copying is disabled
    // rather than left to the implicitly generated member-wise copy.
    SHA256(const SHA256&) = delete;
    SHA256& operator=(const SHA256&) = delete;

    // Appends the bytes of `str` to the data being hashed.
    void addString(const std::string& str);
    // Finishes the hash and writes SHA256_HASH_LENGTH_TEXT lowercase hex characters to `out`.
    // No NUL terminator is written; `out` must have room for all of the characters.
    void finishSHA256(char* out);
    // Hex-encodes `len` bytes from `in` into `out` (2 * len characters, no NUL terminator).
    static void toBase16(const char* in, char* out, size_t len);

private:
    using SHA256Context = mbedtls_sha256_context;

    SHA256Context shaContext;
};

} // namespace common
} // namespace kuzu
20 changes: 20 additions & 0 deletions src/include/function/hash/functions/md5_function.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once

#include "common/md5.h"
#include "common/vector/value_vector.h"

using namespace kuzu::common;

namespace kuzu {
namespace function {

struct MD5Operator {
static void operation(ku_string_t& operand, ku_string_t& result, ValueVector& resultVector) {
MD5 hasher;
hasher.addToMD5(reinterpret_cast<const char*>(operand.getData()));
StringVector::addString(&resultVector, result, std::string(hasher.finishMD5()));
}
};

} // namespace function
} // namespace kuzu
21 changes: 21 additions & 0 deletions src/include/function/hash/functions/sha256_function.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include "common/sha256.h"
#include "common/vector/value_vector.h"

using namespace kuzu::common;

namespace kuzu {
namespace function {

// Scalar operator computing the SHA-256 digest of a string value as hexadecimal text.
struct SHA256Operator {
    /**
     * Hashes `operand` with SHA-256 and writes the hex digest into `result`.
     *
     * @param operand      input string.
     * @param result       output string entry; space for the digest text is reserved in
     *                     `resultVector` and then filled in place.
     * @param resultVector vector that owns the storage behind `result`.
     */
    static void operation(ku_string_t& operand, ku_string_t& result, ValueVector& resultVector) {
        // Reserve the output space before asking for the writable pointer into it.
        StringVector::reserveString(&resultVector, result, SHA256::SHA256_HASH_LENGTH_TEXT);
        auto* hexOut = reinterpret_cast<char*>(result.getDataUnsafe());

        const auto input = operand.getAsString();
        SHA256 hasher;
        hasher.addString(input);
        hasher.finishSHA256(hexOut);
    }
};

} // namespace function
} // namespace kuzu
68 changes: 20 additions & 48 deletions src/include/function/hash/vector_hash_functions.h
Original file line number Diff line number Diff line change
@@ -1,60 +1,14 @@
#pragma once

#include "common/vector/value_vector.h"
#include "hash_functions.h"
#include "function/function.h"

namespace kuzu {
namespace function {

// Executor for unary hash computation over a ValueVector: every selected, non-null operand
// value is hashed with Hash::operation into the matching slot of the result vector, and null
// operand values receive NULL_HASH.
struct UnaryHashFunctionExecutor {
template<typename OPERAND_TYPE, typename RESULT_TYPE>
static void execute(common::ValueVector& operand, common::ValueVector& result) {
auto resultValues = (RESULT_TYPE*)result.getData();
if (operand.state->isFlat()) {
auto pos = operand.state->selVector->selectedPositions[0];
if (!operand.isNull(pos)) {
Hash::operation(operand.getValue<OPERAND_TYPE>(pos), resultValues[pos], &operand);
} else {
result.setValue(pos, NULL_HASH);
}
} else {
if (operand.hasNoNullsGuarantee()) {
if (operand.state->selVector->isUnfiltered()) {
for (auto i = 0u; i < operand.state->selVector->selectedSize; i++) {
Hash::operation(operand.getValue<OPERAND_TYPE>(i), resultValues[i],
&operand);
}
} else {
for (auto i = 0u; i < operand.state->selVector->selectedSize; i++) {
auto pos = operand.state->selVector->selectedPositions[i];
Hash::operation(operand.getValue<OPERAND_TYPE>(pos), resultValues[pos],
&operand);
}
}
} else {
if (operand.state->selVector->isUnfiltered()) {
for (auto i = 0u; i < operand.state->selVector->selectedSize; i++) {
if (!operand.isNull(i)) {
Hash::operation(operand.getValue<OPERAND_TYPE>(i), resultValues[i],
&operand);
} else {
result.setValue(i, NULL_HASH);
}
}
} else {
for (auto i = 0u; i < operand.state->selVector->selectedSize; i++) {
auto pos = operand.state->selVector->selectedPositions[i];
if (!operand.isNull(pos)) {
Hash::operation(operand.getValue<OPERAND_TYPE>(pos), resultValues[pos],
&operand);
} else {
resultValues[pos] = NULL_HASH;
}
}
}
}
}
}
static void execute(common::ValueVector& operand, common::ValueVector& result);
};

struct VectorHashFunction {
Expand All @@ -64,5 +18,23 @@ struct VectorHashFunction {
common::ValueVector* result);
};

// Descriptor for the scalar function named "MD5"; getFunctionSet() returns its overloads.
struct MD5Function {
static constexpr const char* name = "MD5";

static function_set getFunctionSet();
};

// Descriptor for the scalar function named "SHA256"; getFunctionSet() returns its overloads.
struct SHA256Function {
static constexpr const char* name = "SHA256";

static function_set getFunctionSet();
};

// Descriptor for the scalar function named "HASH"; getFunctionSet() returns its overloads.
struct HashFunction {
static constexpr const char* name = "HASH";

static function_set getFunctionSet();
};

} // namespace function
} // namespace kuzu
1 change: 1 addition & 0 deletions src/processor/result/base_hash_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "math.h"

#include "common/null_buffer.h"
#include "common/type_utils.h"
#include "common/utils.h"
#include "function/comparison/comparison_functions.h"
#include "function/hash/vector_hash_functions.h"
Expand Down
Loading

0 comments on commit 898c111

Please sign in to comment.