merge utf8 to master

kuzudb · Nov 20, 2022 · 20fb489 · 20fb489
1 parent 3aff141
commit 20fb489
Show file tree

Hide file tree

Showing 33 changed files with 80,432 additions and 127 deletions.
diff --git a/dataset/tinysnb/copy_csv.cypher b/dataset/tinysnb/copy_csv.cypher
@@ -1,5 +1,6 @@
 COPY person FROM "dataset/tinysnb/vPerson.csv" (HEADER=true);
 COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
+COPY movies FROM "dataset/tinysnb/vMovies.csv";
 COPY knows FROM "dataset/tinysnb/eKnows.csv";
 COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADER=true);
 COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"

diff --git a/dataset/tinysnb/schema.cypher b/dataset/tinysnb/schema.cypher
@@ -1,5 +1,6 @@
 create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], PRIMARY KEY (ID));
 create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
+create node table movies (name STRING, PRIMARY KEY (name));
 create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
 create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], MANY_ONE);
 create rel table workAt (FROM person TO organisation, year INT64, MANY_ONE);

diff --git a/dataset/tinysnb/vMovies.csv b/dataset/tinysnb/vMovies.csv
@@ -0,0 +1,3 @@
+Sóló cón tu párejâ
+The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie
+Roma
diff --git a/src/common/BUILD.bazel b/src/common/BUILD.bazel
@@ -103,6 +103,7 @@ cc_library(
         "configs",
         "type_utils",
         "//src/common/types",
+        "//third_party/utf8proc:utf8proc"
     ],
 )
 

diff --git a/src/common/csv_reader/csv_reader.cpp b/src/common/csv_reader/csv_reader.cpp
@@ -1,11 +1,14 @@
 #include "src/common/include/csv_reader/csv_reader.h"
 
 #include "spdlog/spdlog.h"
+#include "third_party/utf8proc/include/utf8proc_wrapper.h"
 
 #include "src/common/include/configs.h"
 #include "src/common/include/type_utils.h"
 #include "src/common/include/utils.h"
 
+using namespace kuzu::utf8proc;
+
 namespace kuzu {
 namespace common {
 
@@ -232,7 +235,14 @@ char* CSVReader::getString() {
         // If the string is too long, truncate it.
         strVal[DEFAULT_PAGE_SIZE] = '\0';
     }
-    return strVal;
+    auto unicodeType = Utf8Proc::analyze(strVal, strlen(strVal));
+    if (unicodeType == UnicodeType::ASCII) {
+        return strVal;
+    } else if (unicodeType == UnicodeType::UNICODE) {
+        return Utf8Proc::normalize(strVal, strlen(strVal));
+    } else {
+        throw CSVReaderException("Invalid UTF-8 character encountered.");
+    }
 }
 
 date_t CSVReader::getDate() {

diff --git a/src/function/list/operations/BUILD.bazel b/src/function/list/operations/BUILD.bazel
@@ -11,5 +11,6 @@ cc_library(
     ],
     deps = [
         "//src/common/types",
+        "//src/function/string/operations:string_operations"
     ],
 )
diff --git a/src/function/list/operations/include/list_extract_operation.h b/src/function/list/operations/include/list_extract_operation.h
@@ -5,6 +5,7 @@
 
 #include "src/common/types/include/ku_list.h"
 #include "src/common/types/include/ku_string.h"
+#include "src/function/string/operations/include/array_extract_operation.h"
 
 using namespace std;
 using namespace kuzu::common;
@@ -39,8 +40,7 @@ struct ListExtract {
         if (str.len < idx) {
             result.set("", 0);
         } else {
-            auto pos = idx > 0 ? min(idx, (int64_t)str.len) : max(str.len + idx, (int64_t)0) + 1;
-            result.set((char*)(str.getData() + pos - 1), 1 /* length */);
+            ArrayExtract::operation(str, idx, result);
         }
     }
 

diff --git a/src/function/string/operations/BUILD.bazel b/src/function/string/operations/BUILD.bazel
@@ -12,5 +12,7 @@ cc_library(
     ],
     deps = [
         "//src/common/types",
+        "//third_party/utf8proc:utf8proc",
+        "//src/common:vector"
     ],
 )
diff --git a/src/function/string/operations/base_lower_upper_operation.cpp b/src/function/string/operations/base_lower_upper_operation.cpp
@@ -0,0 +1,50 @@
+#include "include/base_lower_upper_operation.h"
+
+namespace kuzu {
+namespace function {
+namespace operation {
+/* Computes how many bytes the case-converted form of a UTF-8 byte sequence occupies.
+ *
+ * inputStr: first byte of the input; inputLen: number of input bytes to scan;
+ * isUpper: true converts with utf8proc_toupper, false with utf8proc_tolower.
+ * Returns the total encoded length, in bytes, of all converted characters.
+ */
+uint32_t BaseLowerUpperOperation::getResultLen(char* inputStr, uint32_t inputLen, bool isUpper) {
+    uint32_t outputLength = 0;
+    for (uint32_t i = 0; i < inputLen;) {
+        // Byte with the high bit set: decode one codepoint, convert it (one codepoint in, one
+        // codepoint out) and add the encoded length of the result, which may differ from the input.
+        if (inputStr[i] & 0x80) {
+            int size = 0; // Handed to utf8proc_codepoint; used below as the byte step for i.
+            int codepoint = utf8proc_codepoint(inputStr + i, size);
+            int convertedCodepoint =
+                isUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
+            int newSize = utf8proc_codepoint_length(convertedCodepoint);
+            assert(newSize >= 0);
+            outputLength += newSize;
+            i += size; // NOTE(review): assumes the decode set size > 0; otherwise no progress.
+        } else {
+            outputLength++; // High bit clear: counted as exactly one output byte.
+            i++;
+        }
+    }
+    return outputLength;
+}
+/* Writes the case-converted form of `input` into `result`.
+ *
+ * result: destination buffer, advanced as bytes are written; it is not bounds-checked here, so
+ *         the caller must size it (the visible caller uses getResultLen for that).
+ * len:    number of INPUT bytes to process (not the output length).
+ * input:  first byte of the input; toUpper: true for upper case, false for lower case.
+ */
+void BaseLowerUpperOperation::convertCase(char* result, uint32_t len, char* input, bool toUpper) {
+    for (auto i = 0u; i < len;) {
+        if (input[i] & 0x80) { // High bit set: handle a whole codepoint through utf8proc.
+            int size = 0, newSize = 0; // size: input byte step; newSize: output byte step.
+            int codepoint = utf8proc_codepoint(input + i, size);
+            int convertedCodepoint =
+                toUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
+            auto success = utf8proc_codepoint_to_utf8(convertedCodepoint, newSize, result);
+            assert(success); // NOTE(review): the only check on the encode result.
+            result += newSize;
+            i += size;
+        } else {
+            *result = toUpper ? toupper(input[i]) : tolower(input[i]); // Single-byte path.
+            i++;
+            result++;
+        }
+    }
+}
+
+} // namespace operation
+} // namespace function
+} // namespace kuzu
diff --git a/src/function/string/operations/include/array_extract_operation.h b/src/function/string/operations/include/array_extract_operation.h
@@ -3,6 +3,9 @@
 #include <cassert>
 #include <cstring>
 
+#include "length_operation.h"
+#include "substr_operation.h"
+
 #include "src/common/types/include/ku_string.h"
 
 using namespace kuzu::common;
@@ -13,8 +16,53 @@ namespace operation {
 
 struct ArrayExtract { // Extracts one character of a string by index: 1-based, negative = from end.
     static inline void operation(ku_string_t& str, int64_t& idx, ku_string_t& result) {
-        auto pos = idx > 0 ? min(idx, (int64_t)str.len) : max(str.len + idx, (int64_t)0) + 1;
-        result.set((char*)(str.getData() + pos - 1), 1 /* length */);
+        if (idx == 0) { // Index 0 selects nothing: the result is left empty.
+            result.len = 0;
+            return;
+        }
+        auto stringVal = str.getAsString();
+        int64_t strLen;
+        Length::operation(str, strLen); // Length as reported by Length::operation, not str.len.
+        auto idxPos = idx > 0 ? min(idx, strLen) : max(strLen + idx, (int64_t)0) + 1;
+        auto startPos = idxPos - 1; // 0-based index of the wanted character.
+        auto endPos = startPos + 1; // 0-based index of the character right after it.
+        bool isAscii = true;
+        for (auto i = 0u; i < min((uint64_t)idxPos + 1, stringVal.size()); i++) { // Prefix scan only.
+            if (stringVal[i] & 0x80) {
+                isAscii = false;
+                break;
+            }
+        }
+        if (isAscii) { // No high-bit byte in the scanned prefix: copy one byte at position idxPos.
+            copySubstr(str, idxPos, 1 /* length */, result, isAscii);
+        } else {
+            int64_t characterCount = 0, startBytePos = 0, endBytePos = 0;
+            kuzu::utf8proc::utf8proc_grapheme_callback(
+                stringVal.c_str(), stringVal.size(), [&](int64_t gstart, int64_t gend) {
+                    if (characterCount == startPos) {
+                        startBytePos = gstart; // Byte offset where the wanted character begins.
+                    } else if (characterCount == endPos) {
+                        endBytePos = gstart; // Byte offset where the following character begins.
+                        return false; // Presumably stops the iteration early -- TODO confirm.
+                    }
+                    characterCount++;
+                    return true;
+                });
+            if (endBytePos == 0) { // Never set by the callback: take everything up to the end.
+                endBytePos = str.len;
+            }
+            copySubstr(str, startBytePos, endBytePos - startBytePos, result, isAscii);
+        }
+    }
+    /* Copies `len` bytes of src into result, capped at src.len - start + 1 bytes.
+     * isAscii selects how `start` is read: a 1-based position when true, a 0-based byte offset
+     * when false (so the + 1 in the cap only matches the 1-based case).
+     * NOTE(review): nothing is allocated here, so result.getData() must already be able to hold
+     * result.len bytes -- confirm for characters that are encoded with many bytes.
+     */
+    static inline void copySubstr(
+        ku_string_t& src, int64_t start, int64_t len, ku_string_t& result, bool isAscii) {
+        result.len = min(len, src.len - start + 1);
+        if (isAscii) {
+            memcpy((uint8_t*)result.getData(), src.getData() + start - 1, result.len);
+        } else {
+            memcpy((uint8_t*)result.getData(), src.getData() + start, result.len);
+        }
     }
 };
 

diff --git a/src/function/string/operations/include/base_lower_upper_operation.h b/src/function/string/operations/include/base_lower_upper_operation.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+
+#include "third_party/utf8proc/include/utf8proc.h"
+
+#include "src/common/include/vector/value_vector.h"
+#include "src/common/types/include/ku_string.h"
+
+using namespace kuzu::common;
+using namespace kuzu::utf8proc;
+
+namespace kuzu {
+namespace function {
+namespace operation {
+
+struct BaseLowerUpperOperation { // Shared upper/lower case conversion for ku_string_t values.
+    /* Converts `input` to upper case (isUpper == true) or lower case and stores it in `result`.
+     *
+     * The output byte length is computed first with getResultLen. Results of at most
+     * SHORT_STR_LENGTH bytes are written starting at result.prefix; longer results are written
+     * to space allocated from resultValueVector's overflow buffer, and the first PREFIX_LENGTH
+     * bytes are then copied into result.prefix.
+     */
+    static inline void operation(
+        ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector, bool isUpper) {
+        uint32_t resultLen = getResultLen((char*)input.getData(), input.len, isUpper);
+        result.len = resultLen;
+        if (resultLen <= ku_string_t::SHORT_STR_LENGTH) {
+            convertCase((char*)result.prefix, input.len, (char*)input.getData(), isUpper);
+        } else {
+            result.overflowPtr = reinterpret_cast<uint64_t>(
+                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
+            auto buffer = reinterpret_cast<char*>(result.overflowPtr);
+            convertCase(buffer, input.len, (char*)input.getData(), isUpper); // len = INPUT bytes.
+            memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
+        }
+    }
+    /* Helpers declared here; their definitions are not part of this header. */
+private:
+    static uint32_t getResultLen(char* inputStr, uint32_t inputLen, bool isUpper);
+    static void convertCase(char* result, uint32_t len, char* input, bool toUpper);
+};
+
+} // namespace operation
+} // namespace function
+} // namespace kuzu
diff --git a/src/function/string/operations/include/base_pad_operation.h b/src/function/string/operations/include/base_pad_operation.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include <cassert>
+#include <cstring>
+
+#include "third_party/utf8proc/include/utf8proc.h"
+
+#include "src/common/include/vector/value_vector.h"
+#include "src/common/types/include/ku_string.h"
+
+using namespace std;
+using namespace kuzu::common;
+using namespace kuzu::utf8proc;
+
+namespace kuzu {
+namespace function {
+namespace operation {
+
+// Padding logic has been taken from DuckDB:
+// https://github.com/duckdb/duckdb/blob/master/src/function/scalar/string/pad.cpp
+struct BasePadOperation { // Building blocks shared by the string padding operations.
+public: /* Runs `padOperation` to build the padded text, then stores that text in `result`.
+         * A negative `count` is treated as 0. Text that passes ku_string_t::isShortString is
+         * copied into result.prefix; otherwise space is allocated from resultValueVector's
+         * overflow buffer, the text is copied there, and its first PREFIX_LENGTH bytes are
+         * copied into result.prefix.
+         */
+    static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
+        ku_string_t& result, ValueVector& resultValueVector,
+        void (*padOperation)(
+            ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult)) {
+        if (count < 0) {
+            count = 0;
+        }
+        string paddedResult;
+        padOperation(src, count, characterToPad, paddedResult);
+        result.len = paddedResult.size();
+        if (ku_string_t::isShortString(result.len)) {
+            memcpy(result.prefix, paddedResult.data(), result.len);
+        } else {
+            result.overflowPtr = reinterpret_cast<uint64_t>(
+                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
+            auto buffer = reinterpret_cast<char*>(result.overflowPtr);
+            memcpy(buffer, paddedResult.data(), result.len);
+            memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
+        }
+    }
+    /* Walks `data` one utf8proc_iterate step at a time until `count` steps were taken or `size`
+     * bytes were consumed. Returns {bytes consumed, steps taken}.
+     * NOTE(review): the value returned by utf8proc_iterate is added to an unsigned counter
+     * without a check -- confirm what it returns for invalid input.
+     */
+    static pair<uint32_t, uint32_t> padCountChars(
+        const uint32_t count, const char* data, const uint32_t size) {
+        auto str = reinterpret_cast<const utf8proc_uint8_t*>(data);
+        uint32_t byteCount = 0, charCount = 0;
+        for (; charCount < count && byteCount < size; charCount++) {
+            utf8proc_int32_t codepoint;
+            auto bytes = utf8proc_iterate(str + byteCount, size - byteCount, &codepoint);
+            byteCount += bytes;
+        }
+        return {byteCount, charCount};
+    }
+    /* Appends `charCount` characters of padding to `result`, taken from `pad` and starting
+     * over at the beginning of `pad` every time all of its bytes have been used.
+     */
+    static void insertPadding(uint32_t charCount, ku_string_t pad, string& result) {
+        auto padData = pad.getData();
+        auto padSize = pad.len;
+        uint32_t padByteCount = 0; // Bytes of `pad` consumed since the last flush to result.
+        for (auto i = 0; i < charCount; i++) { // NOTE(review): i is int, charCount is uint32_t.
+            if (padByteCount >= padSize) { // Whole pad consumed: flush it and start over.
+                result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
+                padByteCount = 0;
+            }
+            utf8proc_int32_t codepoint;
+            auto bytes =
+                utf8proc_iterate(padData + padByteCount, padSize - padByteCount, &codepoint);
+            padByteCount += bytes;
+        }
+        result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount)); // Remainder.
+    }
+};
+
+} // namespace operation
+} // namespace function
+} // namespace kuzu
diff --git a/src/function/string/operations/include/left_operation.h b/src/function/string/operations/include/left_operation.h
@@ -3,12 +3,14 @@
 #include <cassert>
 #include <cstring>
 
+#include "length_operation.h"
 #include "substr_operation.h"
 
 #include "src/common/types/include/ku_string.h"
 
 using namespace std;
 using namespace kuzu::common;
+using namespace kuzu::utf8proc;
 
 namespace kuzu {
 namespace function {
@@ -18,8 +20,9 @@ struct Left {
 public:
     static inline void operation( // Keeps a leading part of `left`, measured via Length::operation.
         ku_string_t& left, int64_t& right, ku_string_t& result, ValueVector& resultValueVector) {
-        auto len = right >= 0 ? min(left.len, (uint32_t)right) :
-                                ((uint32_t)max(left.len + right, (int64_t)0));
+        int64_t leftLen;
+        Length::operation(left, leftLen); // Length as reported by Length::operation, not left.len.
+        int64_t len = (right > -1) ? min(leftLen, right) : max(leftLen + right, (int64_t)0);
         SubStr::operation(left, 1, len, result, resultValueVector); // right < 0 drops |right| from the end.
     }
 };

diff --git a/src/function/string/operations/include/length_operation.h b/src/function/string/operations/include/length_operation.h
@@ -3,17 +3,38 @@
 #include <cassert>
 #include <cstring>
 
+#include "third_party/utf8proc/include/utf8proc.h"
+
 #include "src/common/types/include/ku_string.h"
 
 using namespace std;
 using namespace kuzu::common;
+using namespace kuzu::utf8proc;
 
 namespace kuzu {
 namespace function {
 namespace operation {
 
 struct Length { // Reports string length: bytes when all-ASCII, otherwise counted via utf8proc.
-    static inline void operation(ku_string_t& input, int64_t& result) { result = input.len; }
+    static inline void operation(ku_string_t& input, int64_t& result) {
+        auto totalByteLength = input.len;
+        auto inputString = input.getAsString();
+        for (auto i = 0; i < totalByteLength; i++) { // NOTE(review): i is int; check type of len.
+            if (inputString[i] & 0x80) { // First byte with the high bit set: count via utf8proc.
+                int64_t length = 0;
+                // The callback below is invoked once per unit found by the grapheme iterator over
+                // the whole string; each invocation adds one to the reported length.
+                utf8proc_grapheme_callback(
+                    inputString.c_str(), totalByteLength, [&](size_t start, size_t end) {
+                        length++;
+                        return true;
+                    });
+                result = length;
+                return;
+            }
+        }
+        result = totalByteLength; // No high-bit byte found: length equals the byte count.
+    }
 };
 
 } // namespace operation