From e91be1c5128245cce561c13275b1dabe523e423f Mon Sep 17 00:00:00 2001 From: hououou Date: Tue, 16 Apr 2024 01:53:00 -0400 Subject: [PATCH 1/4] str utf8 test --- test/include/test_runner/test_parser.h | 1 + .../function/list_of_string_utf8.test | 121 ++++++++++++ test/test_files/function/string_utf8.test | 172 ++++++++++++++++++ test/test_runner/test_parser.cpp | 14 +- 4 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 test/test_files/function/list_of_string_utf8.test create mode 100644 test/test_files/function/string_utf8.test diff --git a/test/include/test_runner/test_parser.h b/test/include/test_runner/test_parser.h index 297d54cb6b..123e090b54 100644 --- a/test/include/test_runner/test_parser.h +++ b/test/include/test_runner/test_parser.h @@ -90,6 +90,7 @@ class TestParser { void openFile(); void tokenize(); + std::vector splitString(); void parseHeader(); void parseBody(); void extractExpectedResult(TestStatement* statement); diff --git a/test/test_files/function/list_of_string_utf8.test b/test/test_files/function/list_of_string_utf8.test new file mode 100644 index 0000000000..7c12fb4bd0 --- /dev/null +++ b/test/test_files/function/list_of_string_utf8.test @@ -0,0 +1,121 @@ +-GROUP TinySnbReadTest2 +-DATASET CSV empty + +-- + +-CASE ListOfStringFunctionUTF8 +-STATEMENT RETURN list_extract(["成績評価","の甘","業が"], 2) +---- 1 +の甘 + +-STATEMENT RETURN list_element(["成績評価","の甘","業が"], 1) +---- 1 +成績評価 + +-STATEMENT RETURN list_concat(["成績評価","の甘","業が"], ["这是中文","的语句"]) +---- 1 +[成績評価,の甘,業が,这是中文,的语句] + +-STATEMENT RETURN list_cat(["成績評価","の甘","業が"], ["这是中文","的语句"]) +---- 1 +[成績評価,の甘,業が,这是中文,的语句] + +-STATEMENT RETURN array_cat(["成績評価","の甘","業が"], ["这是中文","的语句"]) +---- 1 +[成績評価,の甘,業が,这是中文,的语句] + +-STATEMENT RETURN array_concat(["成績評価","の甘","業が"], ["这是中文","的语句"]) +---- 1 +[成績評価,の甘,業が,这是中文,的语句] + +-STATEMENT RETURN list_append(["成績評価","の甘","業が", "这是中文"], "的语句") +---- 1 +[成績評価,の甘,業が,这是中文,的语句] + +-STATEMENT RETURN array_append(["成績評価","の甘","業が", "这是中文"], "的语句") +---- 1 +[成績評価,の甘,業が,这是中文,的语句] + +-STATEMENT RETURN array_push_back(["成績評価","の甘","業が", "这是中文"], "的语句") +---- 1 +[成績評価,の甘,業が,这是中文,的语句] + +-STATEMENT RETURN list_prepend(["成績評価","の甘","業が", "这是中文"], "的语句") +---- 1 +[的语句,成績評価,の甘,業が,这是中文] + +-STATEMENT RETURN array_prepend(["成績評価","の甘","業が", "这是中文"], "的语句") +---- 1 +[的语句,成績評価,の甘,業が,这是中文] + +-STATEMENT RETURN array_push_front(["成績評価","の甘","業が", "这是中文"], "的语句") +---- 1 +[的语句,成績評価,の甘,業が,这是中文] + +-STATEMENT RETURN list_position(["成績評価","の甘","業が", "这是中文"], "这是中文") +---- 1 +4 +-STATEMENT RETURN list_indexof(["成績評価","の甘","業が", "这是中文"], "这是中文") +---- 1 +4 + +-STATEMENT RETURN array_position(["成績評価","の甘","業が", "这是中文"], "这是中文") +---- 1 +4 +-STATEMENT RETURN array_indexof(["成績評価","の甘","業が", "这是中文"], "这是中文") +---- 1 +4 + +-STATEMENT RETURN list_contains(["成績評価","の甘","業が", "这是中文"], "这是中文") +---- 1 +True + +-STATEMENT RETURN array_has(["成績評価","の甘","業が", "这是中文"], "这是中文吗") +---- 1 +False + +-STATEMENT RETURN list_slice(["成績評価","の甘","業が", "这是中文"], 1, 2) +---- 1 +[成績評価] + +-STATEMENT RETURN array_slice(["成績評価","の甘","業が", "这是中文"], 1, 4) +---- 1 +[成績評価,の甘,業が] + +-STATEMENT RETURN list_reverse(["成績評価","の甘","業が", "这是中文"]) +---- 1 +[这是中文,業が,の甘,成績評価] + +-STATEMENT RETURN list_sort(["成績評価","の甘","業が", "这是中文"]) +---- 1 +[の甘,成績評価,業が,这是中文] + +-STATEMENT RETURN list_reverse_sort(["成績評価","の甘","業が", "这是中文"]) +---- 1 +[这是中文,業が,成績評価,の甘] + + +-STATEMENT RETURN list_sum(["成績評価","の甘","業が", "这是中文"]) +---- error +Binder exception: Unsupported inner data type for LIST_SUM: STRING + + +-STATEMENT RETURN list_sum(["toronto","waterloo"]) +---- error +Binder exception: Unsupported inner data type for LIST_SUM: STRING + +-STATEMENT RETURN list_product(["成績評価","の甘","業が", "这是中文"]) +---- error +Binder exception: Unsupported inner data type for LIST_PRODUCT: STRING + +-STATEMENT RETURN list_distinct(["成績評価","成績評価","成績評価", "这是中文"]) +---- 1 +[成績評価,这是中文] + +-STATEMENT RETURN list_unique(["成績評価","成績評価","成績評価", "这是中文"]) +---- 1 +2 + +-STATEMENT RETURN list_any_value([null, "成績評価","成績評価","成績評価", "这是中文"]) +---- 1 +成績評価 diff --git a/test/test_files/function/string_utf8.test b/test/test_files/function/string_utf8.test new file mode 100644 index 0000000000..952e01a2c0 --- /dev/null +++ b/test/test_files/function/string_utf8.test @@ -0,0 +1,172 @@ +-GROUP TinySnbReadTest +-DATASET CSV tinysnb + +-- + +-CASE StringFunctionUTF8 +-LOG StrAddOperation +-STATEMENT MATCH (a:movies) RETURN a.name + "suffix" +---- 3 +Sóló cón tu párejâsuffix +The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 moviesuffix +Romasuffix + +-LOG StrAdd +-STATEMENT return string("The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movies") + string("成績評価の甘い授業が高く評価"); +---- 1 +The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movies成績評価の甘い授業が高く評価 + +-LOG StrConcat +-STATEMENT return concat(string("The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movies"),string("成績評価の甘い授業が高く評価")); +---- 1 +The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movies成績評価の甘い授業が高く評価 + +-LOG StrEndsWith +-STATEMENT return ends_with(string("The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movies"),string("🍞🚗 movies")); +---- 1 +True +-STATEMENT return ends_with(string("The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movies"),string("成績評価の甘い授業が高く評価")); +---- 1 +False + +-LOG StrLower +-STATEMENT MATCH (m:movies) RETURN lower(m.name) +---- 3 +sóló cón tu párejâ +the 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie +roma + +-LOG StrLcase +-STATEMENT MATCH (m:movies) RETURN lcase(m.name) +---- 3 +sóló cón tu párejâ +the 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie +roma + +-LOG StrLeft +-STATEMENT MATCH (m:movies) RETURN left(m.name, 6) +---- 3 +Sóló c +The 😂😃 +Roma + +-LOG StrLevenshtein +-STATEMENT return levenshtein('成績評価の甘い授業が高く評価', '成績評価の甘い授業が高く‍'); +---- 1 +6 + +-LOG StrSize +-STATEMENT return size('abc'); +---- 1 +3 +-STATEMENT return size('成績評価の甘い授業が高く評価'); +---- 1 +14 + +-LOG StrLpad +-STATEMENT RETURN lpad(string('成績評価'), 10, "<") +---- 1 +<<<<<<成績評価 + +-LOG strReverse +-STATEMENT RETURN reverse('成績評価の甘い授業が高く評価') +---- 1 +価評く高が業授い甘の価評績成 + +-LOG strltrim +-STATEMENT RETURN ltrim(' 😃🧘🏻‍♂️🌍🌦️🍞🚗') +---- 1 +😃🧘🏻‍♂️🌍🌦️🍞🚗 + +-LOG strprefix +-STATEMENT RETURN prefix('😃🧘🏻‍♂️🌍🌦️🍞🚗','😃🧘🏻‍') +---- 1 +True + +-LOG strrepeat +-STATEMENT RETURN repeat('😃🧘🏻‍♂️🌍🌦️🍞🚗',3) +---- 1 +😃🧘🏻‍♂️🌍🌦️🍞🚗😃🧘🏻‍♂️🌍🌦️🍞🚗😃🧘🏻‍♂️🌍🌦️🍞🚗 + +-LOG strRight +-STATEMENT RETURN right('😃🧘🏻‍♂️🌍🌦️🍞🚗', 3) +---- 1 +🌦️🍞🚗 + +-LOG strRpad +-STATEMENT RETURN rpad('😃🌍♂', 5,'<') +---- 1 +😃🌍♂<< +-STATEMENT RETURN rpad('😃🌍♂🍞🚗', 7,'<') +---- 1 +😃🌍♂🍞🚗<< +-STATEMENT RETURN rpad('😃🌍♂🌦🍞🚗', 7,'<') +---- 1 +😃🌍♂🌦🍞🚗< + +-LOG strstartwith +-STATEMENT RETURN starts_with('😃🧘🏻‍♂️🌍🌦️🍞🚗', '😃🧘🏻‍') +---- 1 +True +-STATEMENT RETURN starts_with('成績評価の甘い授業が高く評価', '成績') +---- 1 +True + +-LOG strsubstring +-STATEMENT RETURN substring('😃🌍🌦️🍞🚗', 1,3) +---- 1 +😃🌍🌦️ +-STATEMENT RETURN substring('成績評価の甘い授業が高く評価', 1,3) +---- 1 +成績評 +-STATEMENT RETURN substring('😃🧘♂🌍', 1,3) +---- 1 +😃🧘♂ + +-LOG strsubstr +-STATEMENT RETURN substr('成績評価の甘い授業が高く評価', 1,3) +---- 1 +成績評 + +-LOG strsuffix +-STATEMENT RETURN suffix('成績評価の甘い授業が高く評価', '高く評価') +---- 1 +True + +-LOG strtrim +-STATEMENT RETURN trim(' 成績評価の甘い授業が高く評価') +---- 1 +成績評価の甘い授業が高く評価 +-STATEMENT RETURN trim(' 成 績 評価の甘い授業が高く評価') +---- 1 +成 績 評価の甘い授業が高く評価 + + +-LOG strupper +-STATEMENT MATCH (m:movies) RETURN upper(m.name) +---- 3 +SÓLÓ CÓN TU PÁREJÂ +THE 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 MOVIE +ROMA + +-LOG strlower +-STATEMENT MATCH (m:movies) RETURN lower(m.name) +---- 3 +sóló cón tu párejâ +the 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie +roma + + +-LOG listfunc +-STATEMENT RETURN list_element("成績評価の甘い授業が高く評価", 3) +---- 1 +評 +-STATEMENT RETURN list_extract("成績評価の甘い授業が高く評価", 4) +---- 1 +価 +-STATEMENT RETURN array_slice("成績評価の甘い授業が高く評価", 1, 3) +---- 1 +成績評 +-STATEMENT RETURN array_extract("成績評価の甘い授業が高く評価", 4) +---- 1 +価 diff --git a/test/test_runner/test_parser.cpp b/test/test_runner/test_parser.cpp index e5f4fd4bc5..bdc19be285 100644 --- a/test/test_runner/test_parser.cpp +++ b/test/test_runner/test_parser.cpp @@ -422,8 +422,20 @@ void TestParser::openFile() { fileStream.open(path); } +std::vector TestParser::splitString(){ + std::vector matches; + std::regex re(R"((?:[^'"\s\\]+|'[^'\\]*(?:\\.[^'\\]*)*'|"[^"\\]*(?:\\.[^"\\]*)*"|\S+)+)"); + auto wordsBegin = std::sregex_iterator(line.begin(), line.end(), re); + auto wordsEnd = std::sregex_iterator(); + for (std::sregex_iterator i = wordsBegin; i != wordsEnd; ++i) { + std::smatch match = *i; + matches.push_back(match.str()); + } + return matches; +} + void TestParser::tokenize() { - currentToken.params = StringUtils::splitBySpace(line); + currentToken.params = splitString(); if ((currentToken.params.size() == 0) || (currentToken.params[0][0] == '#')) { currentToken.type = TokenType::EMPTY; } else { From fc811ad25ac201090d542f44954c238e8eca2817 Mon Sep 17 00:00:00 2001 From: CI Bot Date: Tue, 16 Apr 2024 05:55:09 +0000 Subject: [PATCH 2/4] Run clang-format --- test/test_runner/test_parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_runner/test_parser.cpp b/test/test_runner/test_parser.cpp index bdc19be285..8ac679e41b 100644 --- a/test/test_runner/test_parser.cpp +++ b/test/test_runner/test_parser.cpp @@ -422,7 +422,7 @@ void TestParser::openFile() { fileStream.open(path); } -std::vector TestParser::splitString(){ +std::vector TestParser::splitString() { std::vector matches; std::regex re(R"((?:[^'"\s\\]+|'[^'\\]*(?:\\.[^'\\]*)*'|"[^"\\]*(?:\\.[^"\\]*)*"|\S+)+)"); auto wordsBegin = std::sregex_iterator(line.begin(), line.end(), re); From 18ee5229f92813b513a64b89e565b98c12315aa4 Mon Sep 17 00:00:00 2001 From: hououou Date: Tue, 16 Apr 2024 11:06:39 -0400 Subject: [PATCH 3/4] small fix --- src/include/function/string/functions/ltrim_function.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/function/string/functions/ltrim_function.h b/src/include/function/string/functions/ltrim_function.h index 173b0dea9f..cf292a9707 100644 --- a/src/include/function/string/functions/ltrim_function.h +++ b/src/include/function/string/functions/ltrim_function.h @@ -21,7 +21,7 @@ struct Ltrim { break; } } - memcpy(data, data + counter, len - counter); + memmove(data, data + counter, len - counter); return len - counter; } }; From e54f68eac13ad3328cd6a04fbe12c81484797e5d Mon Sep 17 00:00:00 2001 From: hououou Date: Tue, 16 Apr 2024 13:40:29 -0400 Subject: [PATCH 4/4] solve comments --- src/include/function/string/functions/ltrim_function.h | 4 +++- test/include/test_runner/test_parser.h | 1 - test/test_runner/test_parser.cpp | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/include/function/string/functions/ltrim_function.h b/src/include/function/string/functions/ltrim_function.h index cf292a9707..5cd121613b 100644 --- a/src/include/function/string/functions/ltrim_function.h +++ b/src/include/function/string/functions/ltrim_function.h @@ -21,7 +21,9 @@ struct Ltrim { break; } } - memmove(data, data + counter, len - counter); + for (uint32_t i = 0; i < len - counter; i++) { + data[i] = data[i + counter]; + } return len - counter; } }; diff --git a/test/include/test_runner/test_parser.h b/test/include/test_runner/test_parser.h index 123e090b54..297d54cb6b 100644 --- a/test/include/test_runner/test_parser.h +++ b/test/include/test_runner/test_parser.h @@ -90,7 +90,6 @@ class TestParser { void openFile(); void tokenize(); - std::vector splitString(); void parseHeader(); void parseBody(); void extractExpectedResult(TestStatement* statement); diff --git a/test/test_runner/test_parser.cpp b/test/test_runner/test_parser.cpp index 8ac679e41b..e76e3cb0df 100644 --- a/test/test_runner/test_parser.cpp +++ b/test/test_runner/test_parser.cpp @@ -422,7 +422,7 @@ void TestParser::openFile() { fileStream.open(path); } -std::vector TestParser::splitString() { +static std::vector extractToken(std::string& line) { std::vector matches; std::regex re(R"((?:[^'"\s\\]+|'[^'\\]*(?:\\.[^'\\]*)*'|"[^"\\]*(?:\\.[^"\\]*)*"|\S+)+)"); auto wordsBegin = std::sregex_iterator(line.begin(), line.end(), re); @@ -435,7 +435,7 @@ std::vector TestParser::splitString() { } void TestParser::tokenize() { - currentToken.params = splitString(); + currentToken.params = extractToken(line); if ((currentToken.params.size() == 0) || (currentToken.params[0][0] == '#')) { currentToken.type = TokenType::EMPTY; } else {