merge utf8 to master

kuzudb · Nov 16, 2022 · 8926807 · 8926807
1 parent 1eb39c9
commit 8926807
Show file tree

Hide file tree

Showing 30 changed files with 80,319 additions and 118 deletions.
diff --git a/dataset/tinysnb/copy_csv.cypher b/dataset/tinysnb/copy_csv.cypher
@@ -1,5 +1,6 @@
 COPY person FROM "dataset/tinysnb/vPerson.csv" (HEADER=true);
 COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
+COPY movies FROM "dataset/tinysnb/vMovies.csv";
 COPY knows FROM "dataset/tinysnb/eKnows.csv";
 COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADER=true);
 COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"

diff --git a/dataset/tinysnb/schema.cypher b/dataset/tinysnb/schema.cypher
@@ -1,5 +1,6 @@
 create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], PRIMARY KEY (ID));
 create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
+create node table movies (ID INT64, name STRING, PRIMARY KEY (ID));
 create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
 create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], MANY_ONE);
 create rel table workAt (FROM person TO organisation, year INT64, MANY_ONE);

diff --git a/dataset/tinysnb/vMovies.csv b/dataset/tinysnb/vMovies.csv
@@ -0,0 +1,3 @@
+1,Sóló cón tu párejâ
+2,The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie
+3,Roma
diff --git a/src/common/BUILD.bazel b/src/common/BUILD.bazel
@@ -103,6 +103,7 @@ cc_library(
         "configs",
         "type_utils",
         "//src/common/types",
+        "//third_party/utf8proc:utf8proc"
     ],
 )
 

diff --git a/src/common/csv_reader/csv_reader.cpp b/src/common/csv_reader/csv_reader.cpp
@@ -1,11 +1,14 @@
 #include "src/common/include/csv_reader/csv_reader.h"
 
 #include "spdlog/spdlog.h"
+#include "third_party/utf8proc/include/utf8proc_wrapper.h"
 
 #include "src/common/include/configs.h"
 #include "src/common/include/type_utils.h"
 #include "src/common/include/utils.h"
 
+using namespace kuzu::utf8proc;
+
 namespace kuzu {
 namespace common {
 
@@ -232,7 +235,14 @@ char* CSVReader::getString() {
         // If the string is too long, truncate it.
         strVal[DEFAULT_PAGE_SIZE] = '\0';
     }
-    return strVal;
+    auto unicodeType = Utf8Proc::analyze(strVal, strlen(strVal));
+    if (unicodeType == UnicodeType::ASCII) {
+        return strVal;
+    } else if (unicodeType == UnicodeType::UNICODE) {
+        return Utf8Proc::normalize(strVal, strlen(strVal));
+    } else {
+        throw CSVReaderException("Invalid UTF-8 character encountered.");
+    }
 }
 
 date_t CSVReader::getDate() {

diff --git a/src/function/string/operations/BUILD.bazel b/src/function/string/operations/BUILD.bazel
@@ -12,5 +12,7 @@ cc_library(
     ],
     deps = [
         "//src/common/types",
+        "//third_party/utf8proc:utf8proc",
+        "//src/common:vector"
     ],
 )
diff --git a/src/function/string/operations/base_lower_upper_operation.cpp b/src/function/string/operations/base_lower_upper_operation.cpp
@@ -0,0 +1,50 @@
+#include "include/base_lower_upper_operation.h"
+
+namespace kuzu {
+namespace function {
+namespace operation {
+
+// Returns the number of bytes the first `inputLen` bytes of `inputStr` occupy after case
+// conversion (upper case when `isUpper` is true, lower case otherwise).
+uint32_t BaseLowerUpperOperation::getResultLen(char* inputStr, uint32_t inputLen, bool isUpper) {
+    uint32_t numResultBytes = 0;
+    uint32_t pos = 0;
+    while (pos < inputLen) {
+        if (!(inputStr[pos] & 0x80)) {
+            // High bit clear: counted as one byte in, one byte out.
+            numResultBytes++;
+            pos++;
+            continue;
+        }
+        // High bit set: decode the character, because its converted codepoint may need a
+        // different number of bytes than the original.
+        int numInputBytes = 0;
+        int codepoint = utf8proc_codepoint(inputStr + pos, numInputBytes);
+        int convertedCodepoint =
+            isUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
+        int numConvertedBytes = utf8proc_codepoint_length(convertedCodepoint);
+        assert(numConvertedBytes >= 0);
+        numResultBytes += numConvertedBytes;
+        pos += numInputBytes;
+    }
+    return numResultBytes;
+}
+
+// Writes the case-converted form of the first `len` bytes of `input` to `result` (upper case when
+// `toUpper` is true, lower case otherwise). No bounds check is made on `result` and no
+// terminator is written.
+void BaseLowerUpperOperation::convertCase(char* result, uint32_t len, char* input, bool toUpper) {
+    char* out = result;
+    uint32_t pos = 0;
+    while (pos < len) {
+        if (!(input[pos] & 0x80)) {
+            // High bit clear: convert the single byte with the C library routines.
+            *out = toUpper ? toupper(input[pos]) : tolower(input[pos]);
+            out++;
+            pos++;
+            continue;
+        }
+        // High bit set: decode the character, convert its codepoint and re-encode it at `out`.
+        int numInputBytes = 0;
+        int numOutputBytes = 0;
+        int codepoint = utf8proc_codepoint(input + pos, numInputBytes);
+        int convertedCodepoint =
+            toUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
+        auto success = utf8proc_codepoint_to_utf8(convertedCodepoint, numOutputBytes, out);
+        assert(success);
+        out += numOutputBytes;
+        pos += numInputBytes;
+    }
+}
+
+} // namespace operation
+} // namespace function
+} // namespace kuzu
diff --git a/src/function/string/operations/include/base_lower_upper_operation.h b/src/function/string/operations/include/base_lower_upper_operation.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+
+#include "third_party/utf8proc/include/utf8proc.h"
+
+#include "src/common/include/vector/value_vector.h"
+#include "src/common/types/include/ku_string.h"
+
+using namespace kuzu::common;
+using namespace kuzu::utf8proc;
+
+namespace kuzu {
+namespace function {
+namespace operation {
+
+// Case conversion for ku_string_t values; `isUpper` selects upper case, otherwise lower case.
+struct BaseLowerUpperOperation {
+
+    // Converts `input` and stores the outcome in `result`. The converted byte length is computed
+    // first because it can differ from `input.len`. Results longer than SHORT_STR_LENGTH are
+    // written to space taken from `resultValueVector`'s overflow buffer, with their first
+    // PREFIX_LENGTH bytes copied into `result.prefix`; shorter ones are written starting at
+    // `result.prefix`.
+    static inline void operation(
+        ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector, bool isUpper) {
+        result.len = getResultLen((char*)input.getData(), input.len, isUpper);
+        if (result.len > ku_string_t::SHORT_STR_LENGTH) {
+            result.overflowPtr = reinterpret_cast<uint64_t>(
+                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
+            auto overflowData = reinterpret_cast<char*>(result.overflowPtr);
+            convertCase(overflowData, input.len, (char*)input.getData(), isUpper);
+            memcpy(result.prefix, overflowData, ku_string_t::PREFIX_LENGTH);
+            return;
+        }
+        convertCase((char*)result.prefix, input.len, (char*)input.getData(), isUpper);
+    }
+
+private:
+    // Byte length of the converted form of the first `inputLen` bytes of `inputStr`.
+    static uint32_t getResultLen(char* inputStr, uint32_t inputLen, bool isUpper);
+    // Writes the converted form of the first `len` bytes of `input` to `result`.
+    static void convertCase(char* result, uint32_t len, char* input, bool toUpper);
+};
+} // namespace operation
+} // namespace function
+} // namespace kuzu
diff --git a/src/function/string/operations/include/base_pad_operation.h b/src/function/string/operations/include/base_pad_operation.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+
+#include "third_party/utf8proc/include/utf8proc.h"
+
+#include "src/common/include/vector/value_vector.h"
+#include "src/common/types/include/ku_string.h"
+
+using namespace std;
+using namespace kuzu::common;
+using namespace kuzu::utf8proc;
+
+namespace kuzu {
+namespace function {
+namespace operation {
+
+// Padding logic has been taken from DuckDB:
+// https://github.com/duckdb/duckdb/blob/master/src/function/scalar/string/pad.cpp
+// Shared helpers for the string padding operations. Counts are in UTF-8 characters as delimited
+// by utf8proc_iterate, not in bytes.
+struct BasePadOperation {
+public:
+    // Runs `padOperation` to build the padded text in a temporary string, then stores that text
+    // in `result`: starting at `result.prefix` when it is a short string, otherwise in space
+    // taken from `resultValueVector`'s overflow buffer with the first PREFIX_LENGTH bytes copied
+    // into `result.prefix`.
+    static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
+        ku_string_t& result, ValueVector& resultValueVector,
+        void (*padOperation)(
+            ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult)) {
+        string paddedResult;
+        padOperation(src, count, characterToPad, paddedResult);
+        result.len = paddedResult.size();
+        if (ku_string_t::isShortString(result.len)) {
+            memcpy(result.prefix, paddedResult.data(), result.len);
+        } else {
+            result.overflowPtr = reinterpret_cast<uint64_t>(
+                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
+            auto buffer = reinterpret_cast<char*>(result.overflowPtr);
+            memcpy(buffer, paddedResult.data(), result.len);
+            memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
+        }
+    }
+
+    // Walks at most `count` characters of the `size`-byte buffer `data` and returns
+    // {bytes consumed, characters consumed}.
+    static pair<uint32_t, uint32_t> padCountChars(
+        const uint32_t count, const char* data, const uint32_t size) {
+        auto str = reinterpret_cast<const utf8proc_uint8_t*>(data);
+        uint32_t byteCount = 0, charCount = 0;
+        for (; charCount < count && byteCount < size; charCount++) {
+            utf8proc_int32_t codepoint;
+            auto bytes = utf8proc_iterate(str + byteCount, size - byteCount, &codepoint);
+            // A non-positive return means the bytes could not be decoded; adding it to the
+            // unsigned counter would wrap or stall it.
+            assert(bytes > 0);
+            byteCount += bytes;
+        }
+        return {byteCount, charCount};
+    }
+
+    // Appends `charCount` characters to `result`, taken from `pad` and cycling back to its start
+    // whenever the end of `pad` is reached.
+    static void insertPadding(uint32_t charCount, ku_string_t pad, string& result) {
+        auto padData = pad.getData();
+        auto padSize = pad.len;
+        // An empty pad contributes nothing to the result, so there is nothing to decode.
+        if (padSize == 0) {
+            return;
+        }
+        uint32_t padByteCount = 0;
+        for (uint32_t i = 0; i < charCount; i++) {
+            if (padByteCount >= padSize) {
+                // The whole pad has been consumed: emit it and restart from its first byte.
+                result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
+                padByteCount = 0;
+            }
+            utf8proc_int32_t codepoint;
+            auto bytes =
+                utf8proc_iterate(padData + padByteCount, padSize - padByteCount, &codepoint);
+            // Same guard as in padCountChars: never fold an error code into the byte counter.
+            assert(bytes > 0);
+            padByteCount += bytes;
+        }
+        // Emit the final, possibly partial, repetition of the pad.
+        result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
+    }
+};
+
+} // namespace operation
+} // namespace function
+} // namespace kuzu
diff --git a/src/function/string/operations/include/left_operation.h b/src/function/string/operations/include/left_operation.h
@@ -3,12 +3,14 @@
 #include <cassert>
 #include <cstring>
 
+#include "length_operation.h"
 #include "substr_operation.h"
 
 #include "src/common/types/include/ku_string.h"
 
 using namespace std;
 using namespace kuzu::common;
+using namespace kuzu::utf8proc;
 
 namespace kuzu {
 namespace function {
@@ -18,8 +20,9 @@ struct Left {
 public:
     static inline void operation(
         ku_string_t& left, int64_t& right, ku_string_t& result, ValueVector& resultValueVector) {
-        auto len = right > 0 ? min(left.len, (uint32_t)right) :
-                               max(left.len + (uint32_t)right, (uint32_t)0u);
+        int64_t leftLen; // signed, so adding a negative `right` below cannot wrap around
+        Length::operation(left, leftLen); // fills leftLen with the length of `left`
+        int64_t len = (right > 0) ? min(leftLen, right) : max(leftLen + right, (int64_t)0);
         SubStr::operation(left, 1, len, result, resultValueVector); // take `len` from position 1
     }
 };

diff --git a/src/function/string/operations/include/length_operation.h b/src/function/string/operations/include/length_operation.h
@@ -3,17 +3,38 @@
 #include <cassert>
 #include <cstring>
 
+#include "third_party/utf8proc/include/utf8proc.h"
+
 #include "src/common/types/include/ku_string.h"
 
 using namespace std;
 using namespace kuzu::common;
+using namespace kuzu::utf8proc;
 
 namespace kuzu {
 namespace function {
 namespace operation {
 
 struct Length {
-    static inline void operation(ku_string_t& input, int64_t& result) { result = input.len; }
+    // Sets `result` to the length of `input`: its byte length when no byte has the high bit
+    // set, otherwise the number of times utf8proc_grapheme_callback invokes its callback (once
+    // per character).
+    static inline void operation(ku_string_t& input, int64_t& result) {
+        auto numBytes = input.len;
+        auto str = input.getAsString();
+        // Find the first byte with its high bit set, if there is one.
+        uint32_t pos = 0;
+        while (pos < numBytes && !(str[pos] & 0x80)) {
+            pos++;
+        }
+        if (pos == numBytes) {
+            // Every byte is ASCII, so bytes and characters coincide.
+            result = numBytes;
+            return;
+        }
+        int64_t numChars = 0;
+        utf8proc_grapheme_callback(str.c_str(), numBytes, [&](size_t start, size_t end) {
+            numChars++;
+            return true;
+        });
+        result = numChars;
+    }
 };
 
 } // namespace operation

diff --git a/src/function/string/operations/include/lower_operation.h b/src/function/string/operations/include/lower_operation.h
@@ -3,12 +3,13 @@
 #include <cassert>
 #include <cstring>
 
-#include "base_str_operation.h"
+#include "base_lower_upper_operation.h"
 
 #include "src/common/types/include/ku_string.h"
 
 using namespace std;
 using namespace kuzu::common;
+using namespace kuzu::function::operation;
 
 namespace kuzu {
 namespace function {
@@ -18,15 +19,7 @@ struct Lower {
 public:
     static inline void operation(
         ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector) { // to lower case
-        BaseStrOperation::operation(input, result, resultValueVector, lowerStr);
-    }
-
-private:
-    static uint32_t lowerStr(char* str, uint32_t len) {
-        for (auto i = 0u; i < len; i++) {
-            str[i] = tolower(str[i]);
-        }
-        return len;
+        BaseLowerUpperOperation::operation(input, result, resultValueVector, false /* isUpper */);
     }
 };
 

diff --git a/src/function/string/operations/include/lpad_operation.h b/src/function/string/operations/include/lpad_operation.h
@@ -3,33 +3,32 @@
 #include <cassert>
 #include <cstring>
 
-#include "pad_operation.h"
+#include "base_pad_operation.h"
 
 #include "src/common/types/include/ku_string.h"
 
 using namespace std;
-using namespace kuzu::common;
+using namespace kuzu::function::operation;
 
 namespace kuzu {
 namespace function {
 namespace operation {
 
-struct Lpad : PadOperation {
+struct Lpad : BasePadOperation {
 public:
+    // Left-pads `src` to `count` characters using `characterToPad` and stores the outcome in
+    // `result`; storage handling is delegated to BasePadOperation::operation.
     static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
         ku_string_t& result, ValueVector& resultValueVector) {
-        PadOperation::operation(
+        BasePadOperation::operation(
             src, count, characterToPad, result, resultValueVector, lpadOperation);
     }
 
-    static void lpadOperation(ku_string_t& result, ku_string_t& src, ku_string_t& characterToPad) {
-        auto offset = 0u;
-        if (result.len > src.len) {
-            for (; offset < result.len - src.len; offset++) {
-                memcpy((uint8_t*)result.getData() + offset, characterToPad.getData(), 1);
-            }
-        }
-        memcpy((uint8_t*)result.getData() + offset, src.getData(), src.len);
+    // Appends to `paddedResult` the padding followed by the leading characters of `src`: `src`
+    // is cut to at most `count` characters and the remaining character count is filled from
+    // `characterToPad`. A non-positive `count` appends nothing.
+    static void lpadOperation(
+        ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult) {
+        // The helpers take unsigned 32-bit counts, so a negative `count` would be narrowed into
+        // a huge positive one and request billions of pad characters.
+        if (count <= 0) {
+            return;
+        }
+        auto srcData = (const char*)src.getData();
+        auto srcPadInfo = BasePadOperation::padCountChars(count, srcData, src.len);
+        BasePadOperation::insertPadding(count - srcPadInfo.second, characterToPad, paddedResult);
+        paddedResult.insert(paddedResult.end(), srcData, srcData + srcPadInfo.first);
     }
 };