Skip to content

Commit

Permalink
Add string regex functions
Browse files Browse the repository at this point in the history
Functions added:

1. regexp_matches(string, regex)
Returns true if a part of string matches the
regex.

2. regexp_replace(string, regex, replacement)
Replaces the first occurrence of regex with the
replacement.

3. regexp_extract(string, regex[, group = 0])
Split the string along the regex and extract
first occurrence of group.

4. regexp_extract_all(string, regex[, group = 0])
Split the string along the regex and extract
all occurrences of group.
  • Loading branch information
gaurav8297 committed May 5, 2023
1 parent c7b29cf commit d4f9833
Show file tree
Hide file tree
Showing 9 changed files with 285 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/function/built_in_vector_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,10 @@ void BuiltInVectorOperations::registerStringOperations() {
vectorOperations.insert({TRIM_FUNC_NAME, TrimVectorOperation::getDefinitions()});
vectorOperations.insert({UCASE_FUNC_NAME, UpperVectorOperation::getDefinitions()});
vectorOperations.insert({UPPER_FUNC_NAME, UpperVectorOperation::getDefinitions()});
vectorOperations.insert({REGEXP_MATCHES_FUNC_NAME, RegexpMatchesOperation::getDefinitions()});
vectorOperations.insert({REGEXP_REPLACE_FUNC_NAME, RegexpReplaceOperation::getDefinitions()});
vectorOperations.insert({REGEXP_EXTRACT_FUNC_NAME, RegexpExtractOperation::getDefinitions()});
vectorOperations.insert({REGEXP_EXTRACT_ALL_FUNC_NAME, RegexpExtractAllOperation::getDefinitions()});
}

void BuiltInVectorOperations::registerCastOperations() {
Expand Down
58 changes: 58 additions & 0 deletions src/function/vector_string_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
#include "function/string/operations/length_operation.h"
#include "function/string/operations/lpad_operation.h"
#include "function/string/operations/regexp_full_match_operation.h"
#include "function/string/operations/regexp_matches_operation.h"
#include "function/string/operations/repeat_operation.h"
#include "function/string/operations/right_operation.h"
#include "function/string/operations/rpad_operation.h"
#include "function/string/operations/starts_with_operation.h"
#include "function/string/operations/substr_operation.h"
#include "function/string/operations/regexp_replace_operation.h"
#include "function/string/operations/regexp_extract_operation.h"
#include "function/string/operations/regexp_extract_all_operation.h"

using namespace kuzu::common;

Expand Down Expand Up @@ -141,5 +145,59 @@ std::vector<std::unique_ptr<VectorOperationDefinition>> SubStrVectorOperation::g
return definitions;
}

std::vector<std::unique_ptr<VectorOperationDefinition>> RegexpMatchesOperation::getDefinitions() {
std::vector<std::unique_ptr<VectorOperationDefinition>> definitions;
definitions.emplace_back(make_unique<VectorOperationDefinition>(REGEXP_MATCHES_FUNC_NAME,
std::vector<DataTypeID>{STRING, STRING}, BOOL,
BinaryExecFunction<ku_string_t, ku_string_t, uint8_t, operation::RegexpMatches>,
BinarySelectFunction<ku_string_t, ku_string_t, operation::RegexpMatches>,
false /* isVarLength */));
return definitions;
}

std::vector<std::unique_ptr<VectorOperationDefinition>> RegexpReplaceOperation::getDefinitions() {
std::vector<std::unique_ptr<VectorOperationDefinition>> definitions;
definitions.emplace_back(make_unique<VectorOperationDefinition>(REGEXP_REPLACE_FUNC_NAME,
std::vector<DataTypeID>{STRING, STRING, STRING}, STRING,
TernaryStringExecFunction<ku_string_t, ku_string_t, ku_string_t, ku_string_t, operation::RegexpReplace>,
false /* isVarLength */));
return definitions;
}

std::vector<std::unique_ptr<VectorOperationDefinition>> RegexpExtractOperation::getDefinitions() {
std::vector<std::unique_ptr<VectorOperationDefinition>> definitions;
definitions.emplace_back(make_unique<VectorOperationDefinition>(REGEXP_EXTRACT_FUNC_NAME,
std::vector<DataTypeID>{STRING, STRING}, STRING,
BinaryStringExecFunction<ku_string_t, ku_string_t, ku_string_t, operation::RegexpExtract>,
false /* isVarLength */));
definitions.emplace_back(make_unique<VectorOperationDefinition>(REGEXP_EXTRACT_FUNC_NAME,
std::vector<DataTypeID>{STRING, STRING, INT64}, STRING,
TernaryStringExecFunction<ku_string_t, ku_string_t, int64_t, ku_string_t, operation::RegexpExtract>,
false /* isVarLength */));
return definitions;
}

std::vector<std::unique_ptr<VectorOperationDefinition>> RegexpExtractAllOperation::getDefinitions() {
std::vector<std::unique_ptr<VectorOperationDefinition>> definitions;
definitions.emplace_back(make_unique<VectorOperationDefinition>(REGEXP_EXTRACT_FUNC_NAME,
std::vector<DataTypeID>{STRING, STRING}, VAR_LIST,
BinaryStringExecFunction<ku_string_t, ku_string_t, list_entry_t, operation::RegexpExtractAll>,
nullptr,
bindFunc,
false /* isVarLength */));
definitions.emplace_back(make_unique<VectorOperationDefinition>(REGEXP_EXTRACT_FUNC_NAME,
std::vector<DataTypeID>{STRING, STRING, INT64}, VAR_LIST,
TernaryStringExecFunction<ku_string_t, ku_string_t, int64_t, list_entry_t, operation::RegexpExtractAll>,
nullptr,
bindFunc,
false /* isVarLength */));
return definitions;
}

// Bind callback for REGEXP_EXTRACT_ALL. Ignores both parameters and always returns bind data
// wrapping a DataType constructed from a STRING child type.
// NOTE(review): presumably this DataType constructor yields a list-of-STRING type; confirm.
std::unique_ptr<FunctionBindData> RegexpExtractAllOperation::bindFunc(
    const binder::expression_vector& arguments, FunctionDefinition* definition) {
    auto elementType = std::make_unique<DataType>(STRING);
    DataType resultType(std::move(elementType));
    return std::make_unique<FunctionBindData>(std::move(resultType));
}

} // namespace function
} // namespace kuzu
4 changes: 4 additions & 0 deletions src/include/common/expression_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,10 @@ const std::string TRIM_FUNC_NAME = "TRIM";
const std::string UCASE_FUNC_NAME = "UCASE";
const std::string UPPER_FUNC_NAME = "UPPER";
const std::string REGEXP_FULL_MATCH_FUNC_NAME = "REGEXP_FULL_MATCH";
// Names of the regular-expression string functions: match, replace, extract and extract-all.
const std::string REGEXP_MATCHES_FUNC_NAME = "REGEXP_MATCHES";
const std::string REGEXP_REPLACE_FUNC_NAME = "REGEXP_REPLACE";
const std::string REGEXP_EXTRACT_FUNC_NAME = "REGEXP_EXTRACT";
const std::string REGEXP_EXTRACT_ALL_FUNC_NAME = "REGEXP_EXTRACT_ALL";

// Date functions.
const std::string DATE_PART_FUNC_NAME = "DATE_PART";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#pragma once

#include <regex>

#include "common/re2_regex.h"
#include "common/types/ku_list.h"
#include "common/types/ku_string.h"
#include "common/vector/value_vector.h"
#include "common/vector/value_vector_utils.h"

namespace kuzu::function::operation {
// Row-level implementation of REGEXP_EXTRACT_ALL: collects the requested group of every
// match of `pattern` in `value` into a list of strings stored in the result vector.
struct RegexpExtractAll {
    // Extracts group `group` from each match returned by common::RegexExtractAll.
    //   value        - input string to search.
    //   pattern      - regular expression as written in the query (see escaping note below).
    //   group        - index passed to Match::str for every match.
    //   result       - receives the list entry allocated in resultVector.
    //   resultVector - list vector that owns the output list and its string data.
    static inline void operation(
        common::ku_string_t& value,
        common::ku_string_t& pattern,
        std::int64_t& group,
        common::list_entry_t& result,
        common::ValueVector& resultVector) {
        // Cypher parses escape characters with 2 backslash eg. for expressing '.' requires '\\.'
        // Since Regular Expression requires only 1 backslash '\.' we need to replace double slash
        // with single.
        // The escaping regex never changes, so build it once instead of once per row.
        static const std::regex doubleBackslash(R"(\\\\)");
        std::string sanitizedPattern =
            std::regex_replace(pattern.getAsString(), doubleBackslash, "\\");
        std::vector<common::Match> matches =
            common::RegexExtractAll(value.getAsString(), sanitizedPattern);

        result = common::ListVector::addList(&resultVector, matches.size());
        auto resultValues = common::ListVector::getListValues(&resultVector, result);
        auto resultDataVector = common::ListVector::getDataVector(&resultVector);
        auto numBytesPerValue = resultDataVector->getNumBytesPerValue();

        // Iterate by reference: copying every Match (and the strings it holds) is unnecessary.
        for (auto& match : matches) {
            common::ku_string_t kuString;
            copyToKuzuString(match.str(group), kuString, *resultDataVector);
            common::ValueVectorUtils::copyValue(resultValues, *resultDataVector,
                reinterpret_cast<uint8_t*>(&kuString), *resultDataVector);
            // Advance to the slot of the next list element.
            resultValues += numBytesPerValue;
        }
    }

    // Two-argument form: same as above with group 0.
    static inline void operation(
        common::ku_string_t& value,
        common::ku_string_t& pattern,
        common::list_entry_t& result,
        common::ValueVector& resultVector) {
        int64_t defaultGroup = 0;
        operation(value, pattern, defaultGroup, result, resultVector);
    }

    // Writes `value` into `kuString`. When the string is not short (per
    // ku_string_t::isShortString), value.length() bytes are first allocated from
    // valueVector's in-memory overflow buffer and recorded in overflowPtr.
    static void copyToKuzuString(const std::string& value, common::ku_string_t& kuString,
        common::ValueVector& valueVector) {
        if (!common::ku_string_t::isShortString(value.length())) {
            kuString.overflowPtr = reinterpret_cast<uint64_t>(
                common::StringVector::getInMemOverflowBuffer(&valueVector)
                    ->allocateSpace(value.length()));
        }
        kuString.set(value);
    }
};
} // namespace kuzu::function::operation
45 changes: 45 additions & 0 deletions src/include/function/string/operations/regexp_extract_operation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#pragma once

#include <regex>

#include "common/re2_regex.h"
#include "common/types/ku_string.h"
#include "common/vector/value_vector.h"

namespace kuzu::function::operation {
// Row-level implementation of REGEXP_EXTRACT: writes the requested group of the match of
// `pattern` in `value` to `result`, or an empty string when there is no match.
struct RegexpExtract {
    // Extracts group `group` from the match found by common::RegexExtract.
    //   value             - input string to search.
    //   pattern           - regular expression as written in the query (see escaping note).
    //   group             - index passed to Match::str.
    //   result            - output string slot; always written by this function.
    //   resultValueVector - vector whose overflow buffer backs long result strings.
    static inline void operation(
        common::ku_string_t& value,
        common::ku_string_t& pattern,
        std::int64_t& group,
        common::ku_string_t& result,
        common::ValueVector& resultValueVector) {
        // Cypher parses escape characters with 2 backslash eg. for expressing '.' requires '\\.'
        // Since Regular Expression requires only 1 backslash '\.' we need to replace double slash
        // with single.
        // The escaping regex never changes, so build it once instead of once per row.
        static const std::regex doubleBackslash(R"(\\\\)");
        std::string sanitizedPattern =
            std::regex_replace(pattern.getAsString(), doubleBackslash, "\\");
        common::Match match;
        if (common::RegexExtract(value.getAsString(), match, sanitizedPattern)) {
            copyToKuzuString(match.str(group), result, resultValueVector);
        } else {
            // No match: still write the output slot so it does not keep whatever bytes were
            // there before this call.
            copyToKuzuString(std::string(), result, resultValueVector);
        }
    }

    // Two-argument form: same as above with group 0.
    static inline void operation(
        common::ku_string_t& value,
        common::ku_string_t& pattern,
        common::ku_string_t& result,
        common::ValueVector& resultValueVector) {
        int64_t defaultGroup = 0;
        operation(value, pattern, defaultGroup, result, resultValueVector);
    }

    // Writes `value` into `kuString`. When the string is not short (per
    // ku_string_t::isShortString), value.length() bytes are first allocated from
    // valueVector's in-memory overflow buffer and recorded in overflowPtr.
    static void copyToKuzuString(const std::string& value, common::ku_string_t& kuString,
        common::ValueVector& valueVector) {
        if (!common::ku_string_t::isShortString(value.length())) {
            kuString.overflowPtr = reinterpret_cast<uint64_t>(
                common::StringVector::getInMemOverflowBuffer(&valueVector)
                    ->allocateSpace(value.length()));
        }
        kuString.set(value);
    }
};
} // namespace kuzu::function::operation
21 changes: 21 additions & 0 deletions src/include/function/string/operations/regexp_matches_operation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include <regex>

#include "common/types/ku_string.h"
#include "common/re2_regex.h"
#include "function/string/operations/find_operation.h"

namespace kuzu::function::operation {

// Row-level implementation of REGEXP_MATCHES.
struct RegexpMatches {
    // Sets `result` to the outcome of common::RegexPartialMatch(left, pattern).
    //   left   - input string to search.
    //   right  - regular expression as written in the query (see escaping note below).
    //   result - receives the boolean outcome as a uint8_t.
    static inline void operation(
        common::ku_string_t& left, common::ku_string_t& right, uint8_t& result) {
        // Cypher parses escape characters with 2 backslash eg. for expressing '.' requires '\\.'
        // Since Regular Expression requires only 1 backslash '\.' we need to replace double slash
        // with single.
        // The escaping regex never changes, so build it once instead of once per row.
        static const std::regex doubleBackslash(R"(\\\\)");
        std::string pattern = std::regex_replace(right.getAsString(), doubleBackslash, "\\");
        result = common::RegexPartialMatch(left.getAsString(), pattern);
    }
};

} // namespace kuzu::function::operation
35 changes: 35 additions & 0 deletions src/include/function/string/operations/regexp_replace_operation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once

#include <regex>

#include "common/types/ku_string.h"
#include "common/re2_regex.h"
#include "function/string/operations/find_operation.h"

namespace kuzu::function::operation {
struct RegexpReplace {
static inline void operation(
common::ku_string_t& value,
common::ku_string_t& pattern,
common::ku_string_t& replacement,
common::ku_string_t& result,
common::ValueVector& resultValueVector) {
// Cypher parses escape characters with 2 backslash eg. for expressing '.' requires '\\.'
// Since Regular Expression requires only 1 backslash '\.' we need to replace double slash
// with single
std::string sanitized_pattern = std::regex_replace(pattern.getAsString(), std::regex(R"(\\\\)"), "\\");
std::string result_str = common::RegexReplace(value.getAsString(), sanitized_pattern, replacement.getAsString());
copyToKuzuString(result_str, result, resultValueVector);
}

static void copyToKuzuString(const std::string& value, common::ku_string_t& kuString, common::ValueVector& valueVector) {
if (!common::ku_string_t::isShortString(value.length())) {
kuString.overflowPtr = reinterpret_cast<uint64_t>(
common::StringVector::getInMemOverflowBuffer(&valueVector)
->allocateSpace(value.length()));
}
kuString.set(value);
}
};

} // namespace kuzu::function::operation
18 changes: 18 additions & 0 deletions src/include/function/string/vector_string_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,5 +139,23 @@ struct UpperVectorOperation : public VectorStringOperations {
}
};

// Supplies the vector-operation definitions for the REGEXP_MATCHES string function.
struct RegexpMatchesOperation : public VectorStringOperations {
// Returns the list of overload definitions for this function.
static std::vector<std::unique_ptr<VectorOperationDefinition>> getDefinitions();
};

// Supplies the vector-operation definitions for the REGEXP_REPLACE string function.
struct RegexpReplaceOperation : public VectorStringOperations {
// Returns the list of overload definitions for this function.
static std::vector<std::unique_ptr<VectorOperationDefinition>> getDefinitions();
};

// Supplies the vector-operation definitions for the REGEXP_EXTRACT string function.
struct RegexpExtractOperation : public VectorStringOperations {
// Returns the list of overload definitions for this function.
static std::vector<std::unique_ptr<VectorOperationDefinition>> getDefinitions();
};

// Supplies the vector-operation definitions for the REGEXP_EXTRACT_ALL string function.
struct RegexpExtractAllOperation : public VectorStringOperations {
// Returns the list of overload definitions for this function.
static std::vector<std::unique_ptr<VectorOperationDefinition>> getDefinitions();
// Bind callback attached to the definitions; produces the FunctionBindData for a call.
static std::unique_ptr<FunctionBindData> bindFunc(
const binder::expression_vector& arguments, FunctionDefinition* definition);
};

} // namespace function
} // namespace kuzu
43 changes: 43 additions & 0 deletions test/test_files/tinysnb/function/string.test
Original file line number Diff line number Diff line change
Expand Up @@ -529,3 +529,46 @@ True
-QUERY MATCH (p:person) WHERE suffix(p.fName, "l") RETURN p.fName
---- 1
Carol

## Test REGEXP_MATCHES function

-NAME RegexpMatchesString
-QUERY MATCH (a:person) WHERE REGEXP_MATCHES(a.fName, 'Ali.*') = true RETURN a.ID
---- 1
0

-NAME RegexpMatchesSeq1
-QUERY Return REGEXP_MATCHES('anabanana', '(an)*');
---- 1
True

## Test REGEXP_REPLACE function

-NAME RegexpReplaceSeq1
-QUERY Return REGEXP_REPLACE('hello', '[lo]', '-');
---- 1
he-lo

## Test REGEXP_EXTRACT function

-NAME RegexpExtractSeq1
-QUERY Return regexp_extract('hello_world', '([a-z ]+)_?');
---- 1
hello_

-NAME RegexpExtractSeq2
-QUERY Return regexp_extract('hello_world', '([a-z ]+)_?', 1);
---- 1
hello

## Test REGEXP_EXTRACT_ALL function

-NAME RegexpExtractAllSeq1
-QUERY Return regexp_extract_all('hello_world', '([a-z ]+)_?');
---- 1
[hello_,world]

-NAME RegexpExtractAllSeq2
-QUERY Return regexp_extract_all('hello_world', '([a-z ]+)_?', 1);
---- 1
[hello,world]

0 comments on commit d4f9833

Please sign in to comment.