Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regex match #1208

Merged
merged 1 commit into from
Jan 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ include_directories(third_party/spdlog)
include_directories(third_party/nlohmann_json)
include_directories(third_party/utf8proc/include)
include_directories(third_party/pybind11/include)
# Expose the bundled RE2 headers ("re2.h") to the project's targets.
include_directories(third_party/re2/include)

add_subdirectory(third_party)
add_subdirectory(src)
Expand Down
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ add_subdirectory(storage)
add_subdirectory(transaction)

add_library(kuzu STATIC ${ALL_OBJECT_FILES})
target_link_libraries(kuzu PUBLIC antlr4_cypher antlr4_runtime utf8proc
target_link_libraries(kuzu PUBLIC antlr4_cypher antlr4_runtime utf8proc re2
parquet_lib arrow_lib arrow_deps Threads::Threads)
target_include_directories(
kuzu PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
Expand Down
5 changes: 4 additions & 1 deletion src/antlr4/Cypher.g4
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,10 @@ kU_ListSliceOperatorExpression
: SP ? '[' oC_Expression? ':' oC_Expression? ']' ;

oC_StringOperatorExpression
: ( ( SP STARTS SP WITH ) | ( SP ENDS SP WITH ) | ( SP CONTAINS ) ) SP? oC_PropertyOrLabelsExpression ;
: ( oC_RegularExpression | ( SP STARTS SP WITH ) | ( SP ENDS SP WITH ) | ( SP CONTAINS ) ) SP? oC_PropertyOrLabelsExpression ;

// Regular-expression match operator token sequence: optional whitespace followed by '=~'.
// Used as one of the alternatives of oC_StringOperatorExpression.
oC_RegularExpression
: SP? '=~' ;

STARTS : ( 'S' | 's' ) ( 'T' | 't' ) ( 'A' | 'a' ) ( 'R' | 'r' ) ( 'T' | 't' ) ( 'S' | 's' ) ;

Expand Down
1 change: 1 addition & 0 deletions src/function/built_in_vector_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ void BuiltInVectorOperations::registerStringOperations() {
vectorOperations.insert({CONCAT_FUNC_NAME, ConcatVectorOperation::getDefinitions()});
vectorOperations.insert({CONTAINS_FUNC_NAME, ContainsVectorOperation::getDefinitions()});
vectorOperations.insert({ENDS_WITH_FUNC_NAME, EndsWithVectorOperation::getDefinitions()});
vectorOperations.insert({RE_MATCH_FUNC_NAME, REMatchVectorOperation::getDefinitions()});
vectorOperations.insert({LCASE_FUNC_NAME, LowerVectorOperation::getDefinitions()});
vectorOperations.insert({LEFT_FUNC_NAME, LeftVectorOperation::getDefinitions()});
vectorOperations.insert({LENGTH_FUNC_NAME, LengthVectorOperation::getDefinitions()});
Expand Down
11 changes: 11 additions & 0 deletions src/function/vector_string_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "function/string/operations/left_operation.h"
#include "function/string/operations/length_operation.h"
#include "function/string/operations/lpad_operation.h"
#include "function/string/operations/reg_expr_operation.h"
#include "function/string/operations/repeat_operation.h"
#include "function/string/operations/right_operation.h"
#include "function/string/operations/rpad_operation.h"
Expand Down Expand Up @@ -54,6 +55,16 @@ vector<unique_ptr<VectorOperationDefinition>> EndsWithVectorOperation::getDefini
return definitions;
}

// Builds the list of overloads registered under RE_MATCH_FUNC_NAME.
// There is exactly one: (STRING, STRING) -> BOOL, backed by operation::REMatch for both the
// exec and the select code paths.
vector<unique_ptr<VectorOperationDefinition>> REMatchVectorOperation::getDefinitions() {
    auto reMatchDefinition = make_unique<VectorOperationDefinition>(RE_MATCH_FUNC_NAME,
        vector<DataTypeID>{STRING, STRING}, BOOL,
        BinaryExecFunction<ku_string_t, ku_string_t, uint8_t, operation::REMatch>,
        BinarySelectFunction<ku_string_t, ku_string_t, operation::REMatch>,
        false /* isVarLength */);

    vector<unique_ptr<VectorOperationDefinition>> result;
    result.push_back(std::move(reMatchDefinition));
    return result;
}

vector<unique_ptr<VectorOperationDefinition>> LeftVectorOperation::getDefinitions() {
vector<unique_ptr<VectorOperationDefinition>> definitions;
definitions.emplace_back(
Expand Down
1 change: 1 addition & 0 deletions src/include/common/expression_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ const string ARRAY_EXTRACT_FUNC_NAME = "ARRAY_EXTRACT";
const string CONCAT_FUNC_NAME = "CONCAT";
const string CONTAINS_FUNC_NAME = "CONTAINS";
const string ENDS_WITH_FUNC_NAME = "ENDS_WITH";
// Function name for regular-expression matching; the parser also maps the `=~` operator to it.
const string RE_MATCH_FUNC_NAME = "RE_MATCH";
const string LCASE_FUNC_NAME = "LCASE";
const string LEFT_FUNC_NAME = "LEFT";
const string LENGTH_FUNC_NAME = "LENGTH";
Expand Down
28 changes: 28 additions & 0 deletions src/include/function/string/operations/reg_expr_operation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#pragma once

#include <regex>

#include "common/types/ku_string.h"
#include "function/string/operations/find_operation.h"
#include "re2.h"

using namespace kuzu::common;
using namespace kuzu::regex;

namespace kuzu {
namespace function {
namespace operation {

struct REMatch {
static inline void operation(ku_string_t& left, ku_string_t& right, uint8_t& result) {
// Cypher parses escape characters with 2 backslash eg. for expressing '.' requires '\\.'
// Since Regular Expression requires only 1 backslash '\.' we need to replace double slash
// with single
string pattern = std::regex_replace(right.getAsString(), std::regex(R"(\\\\)"), "\\");
result = RE2::FullMatch(left.getAsString(), pattern);
}
};

} // namespace operation
} // namespace function
} // namespace kuzu
4 changes: 4 additions & 0 deletions src/include/function/string/vector_string_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ struct EndsWithVectorOperation : public VectorStringOperations {
static vector<unique_ptr<VectorOperationDefinition>> getDefinitions();
};

// Vector operation for regular-expression matching (registered under RE_MATCH_FUNC_NAME).
struct REMatchVectorOperation : public VectorStringOperations {
// Returns the operation definitions (overloads) for this function.
static vector<unique_ptr<VectorOperationDefinition>> getDefinitions();
};

// Vector operation whose definitions are registered under LEFT_FUNC_NAME.
struct LeftVectorOperation : public VectorStringOperations {
// Returns the operation definitions (overloads) for this function.
static vector<unique_ptr<VectorOperationDefinition>> getDefinitions();
};
Expand Down
7 changes: 5 additions & 2 deletions src/parser/transformer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,10 +591,13 @@ unique_ptr<ParsedExpression> Transformer::transformStringOperatorExpression(
} else if (ctx.ENDS()) {
return make_unique<ParsedFunctionExpression>(
ENDS_WITH_FUNC_NAME, std::move(propertyExpression), std::move(right), rawExpression);
} else {
assert(ctx.CONTAINS());
} else if (ctx.CONTAINS()) {
return make_unique<ParsedFunctionExpression>(
CONTAINS_FUNC_NAME, std::move(propertyExpression), std::move(right), rawExpression);
} else {
assert(ctx.oC_RegularExpression());
return make_unique<ParsedFunctionExpression>(
RE_MATCH_FUNC_NAME, std::move(propertyExpression), std::move(right), rawExpression);
}
}

Expand Down
31 changes: 31 additions & 0 deletions test/test_files/tinysnb/function/string.test
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,37 @@ False
---- 1
Carol

-NAME RegExprMatchString
-QUERY MATCH (a:person) WHERE a.fName =~ 'Ali.*' RETURN a.ID
---- 1
0

-NAME RegExprAnchorMatchString
-QUERY MATCH (a:person) UNWIND a.usedNames as x WITH x AS name WHERE name =~ '^A.*$' RETURN name
---- 2
Ad
Aida

-NAME RegExprUTF8MatchString
-QUERY MATCH (m:movies) WHERE m.name =~ '.*â.*' RETURN m.name
---- 1
Sóló cón tu párejâ

anuchak marked this conversation as resolved.
Show resolved Hide resolved
-NAME RegExprEscapeSeq1
-QUERY Return RE_MATCH("peter_n@example.com", ".*\\.com");
---- 1
True

-NAME RegExprEscapeSeq2
-QUERY Return RE_MATCH("Alice.*^Alice", ".*\\.\\*.*");
---- 1
True

-NAME RegExprEscapeSeq3
-QUERY Return RE_MATCH("peter_n@examplecom", ".*\\.com");
---- 1
False

-NAME LowerStructuredStr
-QUERY MATCH (o:organisation) RETURN lower(o.name)
---- 3
Expand Down
1 change: 1 addition & 0 deletions third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ add_subdirectory(antlr4_runtime)
add_subdirectory(antlr4_cypher)
add_subdirectory(utf8proc)
add_subdirectory(pybind11)
# Build the bundled RE2 library; the kuzu target links against `re2`.
add_subdirectory(re2)
Loading