Skip to content

Commit

Permalink
merge utf8 to master
Browse files Browse the repository at this point in the history
  • Loading branch information
anuchak committed Nov 20, 2022
1 parent 3aff141 commit 20fb489
Show file tree
Hide file tree
Showing 33 changed files with 80,432 additions and 127 deletions.
1 change: 1 addition & 0 deletions dataset/tinysnb/copy_csv.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
COPY person FROM "dataset/tinysnb/vPerson.csv" (HEADER=true);
COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
COPY movies FROM "dataset/tinysnb/vMovies.csv";
COPY knows FROM "dataset/tinysnb/eKnows.csv";
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADER=true);
COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"
Expand Down
1 change: 1 addition & 0 deletions dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table movies (name STRING, PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, MANY_ONE);
Expand Down
3 changes: 3 additions & 0 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Sóló cón tu párejâ
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie
Roma
1 change: 1 addition & 0 deletions src/common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ cc_library(
"configs",
"type_utils",
"//src/common/types",
"//third_party/utf8proc:utf8proc"
],
)

Expand Down
12 changes: 11 additions & 1 deletion src/common/csv_reader/csv_reader.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
#include "src/common/include/csv_reader/csv_reader.h"

#include "spdlog/spdlog.h"
#include "third_party/utf8proc/include/utf8proc_wrapper.h"

#include "src/common/include/configs.h"
#include "src/common/include/type_utils.h"
#include "src/common/include/utils.h"

using namespace kuzu::utf8proc;

namespace kuzu {
namespace common {

Expand Down Expand Up @@ -232,7 +235,14 @@ char* CSVReader::getString() {
// If the string is too long, truncate it.
strVal[DEFAULT_PAGE_SIZE] = '\0';
}
return strVal;
auto unicodeType = Utf8Proc::analyze(strVal, strlen(strVal));
if (unicodeType == UnicodeType::ASCII) {
return strVal;
} else if (unicodeType == UnicodeType::UNICODE) {
return Utf8Proc::normalize(strVal, strlen(strVal));
} else {
throw CSVReaderException("Invalid UTF-8 character encountered.");
}
}

date_t CSVReader::getDate() {
Expand Down
1 change: 1 addition & 0 deletions src/function/list/operations/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ cc_library(
],
deps = [
"//src/common/types",
"//src/function/string/operations:string_operations"
],
)
4 changes: 2 additions & 2 deletions src/function/list/operations/include/list_extract_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "src/common/types/include/ku_list.h"
#include "src/common/types/include/ku_string.h"
#include "src/function/string/operations/include/array_extract_operation.h"

using namespace std;
using namespace kuzu::common;
Expand Down Expand Up @@ -39,8 +40,7 @@ struct ListExtract {
if (str.len < idx) {
result.set("", 0);
} else {
auto pos = idx > 0 ? min(idx, (int64_t)str.len) : max(str.len + idx, (int64_t)0) + 1;
result.set((char*)(str.getData() + pos - 1), 1 /* length */);
ArrayExtract::operation(str, idx, result);
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/function/string/operations/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@ cc_library(
],
deps = [
"//src/common/types",
"//third_party/utf8proc:utf8proc",
"//src/common:vector"
],
)
50 changes: 50 additions & 0 deletions src/function/string/operations/base_lower_upper_operation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include "include/base_lower_upper_operation.h"

namespace kuzu {
namespace function {
namespace operation {

namespace {

// Describes how the non-ASCII sequence at the current input position is converted.
struct ConvertedSequence {
    int consumed;     // bytes to consume from the input (always >= 1)
    int produced;     // bytes to emit to the output (always >= 1)
    int codepoint;    // case-mapped code point; only meaningful when passThrough is false
    bool passThrough; // true: emit the input bytes unchanged instead of encoding `codepoint`
};

// Decodes the sequence starting at `input` (with `remaining` readable bytes) and case-maps it.
// Both getResultLen and convertCase go through this helper so that the number of bytes counted
// and the number of bytes written can never disagree.
ConvertedSequence convertSequence(char* input, uint32_t remaining, bool toUpper) {
    int size = 0;
    int codepoint = utf8proc_codepoint(input, size);
    if (size <= 0 || static_cast<uint32_t>(size) > remaining) {
        // The decoder reported no usable length (or one that runs past the buffer). Pass a
        // single byte through so the caller always makes forward progress and stays in bounds.
        return {1, 1, 0, true};
    }
    int convertedCodepoint = toUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
    int newSize = utf8proc_codepoint_length(convertedCodepoint);
    assert(newSize >= 0);
    if (newSize <= 0) {
        // Not encodable: keep the original bytes rather than corrupting the length bookkeeping.
        return {size, size, 0, true};
    }
    return {size, newSize, convertedCodepoint, false};
}

} // namespace

// Returns the number of bytes needed to store `inputStr` (of `inputLen` bytes) after converting
// it to upper case (`isUpper` == true) or lower case.
uint32_t BaseLowerUpperOperation::getResultLen(char* inputStr, uint32_t inputLen, bool isUpper) {
    uint32_t outputLength = 0;
    for (uint32_t i = 0; i < inputLen;) {
        if (!(inputStr[i] & 0x80)) {
            // ASCII: one byte in, one byte out.
            outputLength++;
            i++;
            continue;
        }
        // Case mapping is done one code point at a time, and the mapped code point may need a
        // different number of bytes than the original, e.g. 'ſ' (U+017F, 2 bytes) upper-cases
        // to 'S' (1 byte). Multi-character mappings such as 'ß' -> "SS" are not produced here.
        const auto sequence = convertSequence(inputStr + i, inputLen - i, isUpper);
        outputLength += sequence.produced;
        i += sequence.consumed;
    }
    return outputLength;
}

// Writes the case-converted form of `input` (of `len` bytes) to `result`, which must be able to
// hold getResultLen(input, len, toUpper) bytes. No terminating NUL is written.
void BaseLowerUpperOperation::convertCase(char* result, uint32_t len, char* input, bool toUpper) {
    for (uint32_t i = 0; i < len;) {
        if (!(input[i] & 0x80)) {
            *result = toUpper ? toupper(input[i]) : tolower(input[i]);
            result++;
            i++;
            continue;
        }
        const auto sequence = convertSequence(input + i, len - i, toUpper);
        if (sequence.passThrough) {
            memcpy(result, input + i, sequence.produced);
        } else {
            int written = 0;
            auto success = utf8proc_codepoint_to_utf8(sequence.codepoint, written, result);
            assert(success && written == sequence.produced);
            (void)success; // only inspected by the assert above
        }
        // Advance by the size that getResultLen accounted for, never by an unchecked value.
        result += sequence.produced;
        i += sequence.consumed;
    }
}

} // namespace operation
} // namespace function
} // namespace kuzu
52 changes: 50 additions & 2 deletions src/function/string/operations/include/array_extract_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
#include <cassert>
#include <cstring>

#include "length_operation.h"
#include "substr_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace kuzu::common;
Expand All @@ -13,8 +16,53 @@ namespace operation {

// Extracts the single character at 1-based position `idx` of `str` into `result`. A negative
// `idx` counts from the end of the string. Strings whose inspected prefix contains a non-ASCII
// byte are indexed by the units reported by utf8proc_grapheme_callback instead of by bytes.
struct ArrayExtract {
static inline void operation(ku_string_t& str, int64_t& idx, ku_string_t& result) {
// NOTE(review): the next two statements look like the previous byte-indexed implementation
// that the diff view shows as deleted (the file header reports 2 deletions) — confirm they are
// not part of the current file; `pos` is not used by anything below.
auto pos = idx > 0 ? min(idx, (int64_t)str.len) : max(str.len + idx, (int64_t)0) + 1;
result.set((char*)(str.getData() + pos - 1), 1 /* length */);
// Position 0 yields an empty result.
if (idx == 0) {
result.len = 0;
return;
}
auto stringVal = str.getAsString();
int64_t strLen;
Length::operation(str, strLen);
// Positive idx is clamped to strLen; a negative idx is counted back from the end and clamped
// so that the smallest resulting position is 1.
// NOTE(review): for an empty `str` with idx > 0 this gives idxPos == 0, and the ASCII branch
// below then copies from getData() - 1 — confirm that callers rule this case out.
auto idxPos = idx > 0 ? min(idx, strLen) : max(strLen + idx, (int64_t)0) + 1;
auto startPos = idxPos - 1;
auto endPos = startPos + 1;
bool isAscii = true;
// Only the first idxPos + 1 bytes (bounded by the string size) are inspected: if they are all
// ASCII, the requested character can be addressed directly by byte position.
for (auto i = 0u; i < min((uint64_t)idxPos + 1, stringVal.size()); i++) {
if (stringVal[i] & 0x80) {
isAscii = false;
break;
}
}
if (isAscii) {
// idxPos is 1-based here; copySubstr subtracts 1 on the ASCII path.
copySubstr(str, idxPos, 1 /* length */, result, isAscii);
} else {
int64_t characterCount = 0, startBytePos = 0, endBytePos = 0;
// Record the byte offset at which unit number startPos begins and the offset at which the
// following unit begins. Returning false presumably ends the iteration early — confirm
// against utf8proc_grapheme_callback.
kuzu::utf8proc::utf8proc_grapheme_callback(
stringVal.c_str(), stringVal.size(), [&](int64_t gstart, int64_t gend) {
if (characterCount == startPos) {
startBytePos = gstart;
} else if (characterCount == endPos) {
endBytePos = gstart;
return false;
}
characterCount++;
return true;
});
// endBytePos is still 0 when no unit after the requested one was seen, i.e. the requested
// unit runs to the end of the string.
if (endBytePos == 0) {
endBytePos = str.len;
}
copySubstr(str, startBytePos, endBytePos - startBytePos, result, isAscii);
}
}

// Copies up to `len` bytes of `src` into `result` and sets `result.len`.
// `start` is a 1-based position when `isAscii` is true and a 0-based byte offset otherwise.
// NOTE(review): the bytes are written through result.getData() and no overflow storage is
// allocated in this function — verify this is safe when the copied unit is longer than what
// ku_string_t stores inline (e.g. a multi-code-point emoji sequence).
static inline void copySubstr(
ku_string_t& src, int64_t start, int64_t len, ku_string_t& result, bool isAscii) {
result.len = min(len, src.len - start + 1);
if (isAscii) {
memcpy((uint8_t*)result.getData(), src.getData() + start - 1, result.len);
} else {
memcpy((uint8_t*)result.getData(), src.getData() + start, result.len);
}
}
};

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once

#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/ku_string.h"

using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

// Common implementation for case conversion of a ku_string_t: sizes the converted string first,
// picks the destination storage, then performs the conversion into it.
struct BaseLowerUpperOperation {

    // Converts `input` to upper case (`isUpper` == true) or lower case and stores it in `result`.
    // Results longer than ku_string_t::SHORT_STR_LENGTH are placed in space allocated from the
    // overflow buffer of `resultValueVector`.
    static inline void operation(
        ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector, bool isUpper) {
        const uint32_t convertedLen = getResultLen((char*)input.getData(), input.len, isUpper);
        result.len = convertedLen;
        const bool storedInline = convertedLen <= ku_string_t::SHORT_STR_LENGTH;
        char* destination;
        if (storedInline) {
            // Short results are written in place, starting at the prefix field.
            destination = (char*)result.prefix;
        } else {
            result.overflowPtr = reinterpret_cast<uint64_t>(
                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
            destination = reinterpret_cast<char*>(result.overflowPtr);
        }
        convertCase(destination, input.len, (char*)input.getData(), isUpper);
        if (!storedInline) {
            // Long results additionally keep their leading bytes in the prefix field.
            memcpy(result.prefix, destination, ku_string_t::PREFIX_LENGTH);
        }
    }

private:
    // Number of bytes `inputStr` occupies after case conversion.
    static uint32_t getResultLen(char* inputStr, uint32_t inputLen, bool isUpper);
    // Writes the case-converted form of `input` (`len` bytes) to `result`.
    static void convertCase(char* result, uint32_t len, char* input, bool toUpper);
};

} // namespace operation
} // namespace function
} // namespace kuzu
76 changes: 76 additions & 0 deletions src/function/string/operations/include/base_pad_operation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#pragma once

#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

// Padding logic has been taken from DuckDB:
// https://github.com/duckdb/duckdb/blob/master/src/function/scalar/string/pad.cpp
//
// Shared driver and helpers for the string padding functions. The concrete padding strategy is
// supplied by the caller as `padOperation`; this struct only stores the produced string in the
// result ku_string_t and offers UTF-8 aware helpers for counting and inserting characters.
struct BasePadOperation {
public:
    // Runs `padOperation` on (`src`, `count`, `characterToPad`) and stores the string it builds
    // in `result`. A negative `count` is treated as 0. Results that are not short strings are
    // placed in space allocated from the overflow buffer of `resultValueVector`.
    static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
        ku_string_t& result, ValueVector& resultValueVector,
        void (*padOperation)(
            ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult)) {
        if (count < 0) {
            count = 0;
        }
        string paddedResult;
        padOperation(src, count, characterToPad, paddedResult);
        result.len = paddedResult.size();
        if (ku_string_t::isShortString(result.len)) {
            memcpy(result.prefix, paddedResult.data(), result.len);
        } else {
            result.overflowPtr = reinterpret_cast<uint64_t>(
                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
            auto buffer = reinterpret_cast<char*>(result.overflowPtr);
            memcpy(buffer, paddedResult.data(), result.len);
            memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
        }
    }

    // Walks at most `count` characters of the `size`-byte buffer `data`.
    // Returns {bytes consumed, characters consumed}.
    static pair<uint32_t, uint32_t> padCountChars(
        const uint32_t count, const char* data, const uint32_t size) {
        auto str = reinterpret_cast<const utf8proc_uint8_t*>(data);
        uint32_t byteCount = 0, charCount = 0;
        for (; charCount < count && byteCount < size; charCount++) {
            utf8proc_int32_t codepoint;
            auto bytes = utf8proc_iterate(str + byteCount, size - byteCount, &codepoint);
            // utf8proc_iterate reports failures as a negative value; adding that to the unsigned
            // counter would wrap it around. Count an undecodable byte as one character instead.
            byteCount += bytes > 0 ? static_cast<uint32_t>(bytes) : 1;
        }
        return {byteCount, charCount};
    }

    // Appends `charCount` characters to `result`, taken from `pad` and cycling back to its start
    // whenever the end of `pad` is reached. An empty `pad` appends nothing.
    static void insertPadding(uint32_t charCount, ku_string_t pad, string& result) {
        auto padData = pad.getData();
        auto padSize = pad.len;
        if (padSize == 0) {
            // Nothing to repeat; avoids looping charCount times without appending anything.
            return;
        }
        uint32_t padByteCount = 0;
        for (uint32_t i = 0; i < charCount; i++) {
            if (padByteCount >= padSize) {
                // The whole pad has been consumed: flush it and start over.
                result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
                padByteCount = 0;
            }
            utf8proc_int32_t codepoint;
            auto bytes =
                utf8proc_iterate(padData + padByteCount, padSize - padByteCount, &codepoint);
            // Same guard as in padCountChars: a negative return must not wrap padByteCount,
            // otherwise the inserts would read far beyond the pad buffer.
            padByteCount += bytes > 0 ? static_cast<uint32_t>(bytes) : 1;
        }
        // Flush the partially consumed pad.
        result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
    }
};

} // namespace operation
} // namespace function
} // namespace kuzu
7 changes: 5 additions & 2 deletions src/function/string/operations/include/left_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
#include <cassert>
#include <cstring>

#include "length_operation.h"
#include "substr_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
Expand All @@ -18,8 +20,9 @@ struct Left {
public:
// Stores the leading part of `left` in `result`, delegating the copy to SubStr::operation
// starting at position 1. A non-negative `right` is the number of characters to keep (clamped to
// the string length); a negative `right` drops that many characters from the end (never going
// below 0).
static inline void operation(
ku_string_t& left, int64_t& right, ku_string_t& result, ValueVector& resultValueVector) {
// NOTE(review): the next statement (two lines) looks like the previous byte-length based
// computation that the diff view shows as deleted; it declares `len` a second time — confirm it
// is not part of the current file.
auto len = right >= 0 ? min(left.len, (uint32_t)right) :
((uint32_t)max(left.len + right, (int64_t)0));
// The length comes from Length::operation rather than from left.len, so the clamping below is
// done in whatever unit Length::operation reports.
int64_t leftLen;
Length::operation(left, leftLen);
int64_t len = (right > -1) ? min(leftLen, right) : max(leftLen + right, (int64_t)0);
SubStr::operation(left, 1, len, result, resultValueVector);
}
};
Expand Down
23 changes: 22 additions & 1 deletion src/function/string/operations/include/length_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,38 @@
#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

// Computes the length of a string. A string without any byte that has the high bit set gets its
// byte length; otherwise the result is the number of times utf8proc_grapheme_callback invokes
// the counting callback.
struct Length {
// NOTE(review): the next line looks like the previous byte-length implementation that the diff
// view shows as deleted; it has the same signature as the definition below — confirm it is not
// part of the current file.
static inline void operation(ku_string_t& input, int64_t& result) { result = input.len; }
static inline void operation(ku_string_t& input, int64_t& result) {
auto totalByteLength = input.len;
auto inputString = input.getAsString();
// Scan for the first byte with the high bit set (i.e. a non-ASCII byte).
for (auto i = 0; i < totalByteLength; i++) {
if (inputString[i] & 0x80) {
int64_t length = 0;
// Use grapheme iterator to identify bytes of utf8 char and increment once for each
// char.
utf8proc_grapheme_callback(
inputString.c_str(), totalByteLength, [&](size_t start, size_t end) {
length++;
return true;
});
// The callback above counted over the whole string, so the scan can stop here.
result = length;
return;
}
}
// No non-ASCII byte was found: every character is one byte.
result = totalByteLength;
}
};

} // namespace operation
Expand Down
Loading

0 comments on commit 20fb489

Please sign in to comment.