Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UTF-8 string #1037

Merged
merged 1 commit into from
Nov 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dataset/tinysnb/copy_csv.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
COPY person FROM "dataset/tinysnb/vPerson.csv" (HEADER=true);
COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
COPY movies FROM "dataset/tinysnb/vMovies.csv";
COPY knows FROM "dataset/tinysnb/eKnows.csv";
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADER=true);
COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"
Expand Down
1 change: 1 addition & 0 deletions dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table movies (name STRING, PRIMARY KEY (name));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, MANY_ONE);
Expand Down
3 changes: 3 additions & 0 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Sóló cón tu párejâ
The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie
Roma
1 change: 1 addition & 0 deletions src/common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ cc_library(
"configs",
"type_utils",
"//src/common/types",
"//third_party/utf8proc:utf8proc"
],
)

Expand Down
12 changes: 11 additions & 1 deletion src/common/csv_reader/csv_reader.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
#include "src/common/include/csv_reader/csv_reader.h"

#include "spdlog/spdlog.h"
#include "third_party/utf8proc/include/utf8proc_wrapper.h"

#include "src/common/include/configs.h"
#include "src/common/include/type_utils.h"
#include "src/common/include/utils.h"

using namespace kuzu::utf8proc;

namespace kuzu {
namespace common {

Expand Down Expand Up @@ -232,7 +235,14 @@ char* CSVReader::getString() {
// If the string is too long, truncate it.
strVal[DEFAULT_PAGE_SIZE] = '\0';
}
return strVal;
auto unicodeType = Utf8Proc::analyze(strVal, strlen(strVal));
if (unicodeType == UnicodeType::ASCII) {
return strVal;
} else if (unicodeType == UnicodeType::UNICODE) {
return Utf8Proc::normalize(strVal, strlen(strVal));
} else {
throw CSVReaderException("Invalid UTF-8 character encountered.");
}
}

date_t CSVReader::getDate() {
Expand Down
1 change: 1 addition & 0 deletions src/function/list/operations/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ cc_library(
],
deps = [
"//src/common/types",
"//src/function/string/operations:string_operations"
],
)
4 changes: 2 additions & 2 deletions src/function/list/operations/include/list_extract_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "src/common/types/include/ku_list.h"
#include "src/common/types/include/ku_string.h"
#include "src/function/string/operations/include/array_extract_operation.h"

using namespace std;
using namespace kuzu::common;
Expand Down Expand Up @@ -39,8 +40,7 @@ struct ListExtract {
if (str.len < idx) {
result.set("", 0);
} else {
auto pos = idx > 0 ? min(idx, (int64_t)str.len) : max(str.len + idx, (int64_t)0) + 1;
result.set((char*)(str.getData() + pos - 1), 1 /* length */);
ArrayExtract::operation(str, idx, result);
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/function/string/operations/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@ cc_library(
],
deps = [
"//src/common/types",
"//third_party/utf8proc:utf8proc",
"//src/common:vector"
],
)
50 changes: 50 additions & 0 deletions src/function/string/operations/base_lower_upper_operation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include "include/base_lower_upper_operation.h"

namespace kuzu {
namespace function {
namespace operation {

/**
 * Computes how many bytes the case-converted form of a UTF-8 string occupies.
 *
 * Bytes without the high bit set are counted as one output byte each. For every other
 * position a codepoint is decoded, case-mapped, and the encoded length of the mapped
 * codepoint is added, because a codepoint and its upper/lower counterpart do not always
 * have the same UTF-8 byte length.
 *
 * @param inputStr pointer to the input bytes.
 * @param inputLen number of bytes readable at inputStr.
 * @param isUpper  true to measure the upper-case form, false for the lower-case form.
 * @return the total number of bytes the converted string needs.
 */
uint32_t BaseLowerUpperOperation::getResultLen(char* inputStr, uint32_t inputLen, bool isUpper) {
    uint32_t outputLength = 0;
    for (uint32_t i = 0; i < inputLen;) {
        if (inputStr[i] & 0x80) {
            int size = 0;
            int codepoint = utf8proc_codepoint(inputStr + i, size);
            if (size <= 0) {
                // The decoder reported no consumed bytes (size is still non-positive, e.g. for
                // a malformed sequence). Without this guard `i` would never advance and the
                // loop would spin forever, so count the byte as-is and step over it.
                outputLength++;
                i++;
                continue;
            }
            int convertedCodepoint =
                isUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
            int newSize = utf8proc_codepoint_length(convertedCodepoint);
            assert(newSize >= 0);
            outputLength += newSize;
            i += size;
        } else {
            // Plain ASCII byte: case conversion keeps it a single byte.
            outputLength++;
            i++;
        }
    }
    return outputLength;
}

/**
 * Writes the upper- or lower-cased form of `input` into `result`.
 *
 * Bytes without the high bit set go through the C library toupper/tolower. At every other
 * position a codepoint is decoded, case-mapped and re-encoded into the output.
 *
 * @param result  destination buffer; it must be large enough for the converted string
 *                (BaseLowerUpperOperation::operation sizes it with getResultLen).
 * @param len     number of bytes to read from `input`.
 * @param input   pointer to the input bytes.
 * @param toUpper true to convert to upper case, false to convert to lower case.
 */
void BaseLowerUpperOperation::convertCase(char* result, uint32_t len, char* input, bool toUpper) {
    for (auto i = 0u; i < len;) {
        if (input[i] & 0x80) {
            int size = 0, newSize = 0;
            int codepoint = utf8proc_codepoint(input + i, size);
            if (size <= 0) {
                // The decoder reported no consumed bytes (e.g. malformed sequence). Copy the
                // byte through unchanged and step over it so the loop is guaranteed to
                // terminate instead of re-decoding the same position forever.
                *result = input[i];
                result++;
                i++;
                continue;
            }
            int convertedCodepoint =
                toUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
            auto success = utf8proc_codepoint_to_utf8(convertedCodepoint, newSize, result);
            assert(success);
            (void)success; // only inspected by the assert; avoids an unused warning with NDEBUG
            // Advance the output by the re-encoded length and the input by the decoded length;
            // the two can differ.
            result += newSize;
            i += size;
        } else {
            *result = toUpper ? toupper(input[i]) : tolower(input[i]);
            i++;
            result++;
        }
    }
}

} // namespace operation
} // namespace function
} // namespace kuzu
52 changes: 50 additions & 2 deletions src/function/string/operations/include/array_extract_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
#include <cassert>
#include <cstring>

#include "length_operation.h"
#include "substr_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace kuzu::common;
Expand All @@ -13,8 +16,53 @@ namespace operation {

struct ArrayExtract {
static inline void operation(ku_string_t& str, int64_t& idx, ku_string_t& result) {
auto pos = idx > 0 ? min(idx, (int64_t)str.len) : max(str.len + idx, (int64_t)0) + 1;
result.set((char*)(str.getData() + pos - 1), 1 /* length */);
if (idx == 0) {
result.len = 0;
return;
}
auto stringVal = str.getAsString();
int64_t strLen;
Length::operation(str, strLen);
auto idxPos = idx > 0 ? min(idx, strLen) : max(strLen + idx, (int64_t)0) + 1;
auto startPos = idxPos - 1;
auto endPos = startPos + 1;
bool isAscii = true;
for (auto i = 0u; i < min((uint64_t)idxPos + 1, stringVal.size()); i++) {
if (stringVal[i] & 0x80) {
isAscii = false;
break;
}
}
if (isAscii) {
copySubstr(str, idxPos, 1 /* length */, result, isAscii);
} else {
int64_t characterCount = 0, startBytePos = 0, endBytePos = 0;
kuzu::utf8proc::utf8proc_grapheme_callback(
stringVal.c_str(), stringVal.size(), [&](int64_t gstart, int64_t gend) {
if (characterCount == startPos) {
startBytePos = gstart;
} else if (characterCount == endPos) {
endBytePos = gstart;
return false;
}
characterCount++;
return true;
});
if (endBytePos == 0) {
endBytePos = str.len;
}
copySubstr(str, startBytePos, endBytePos - startBytePos, result, isAscii);
}
}

static inline void copySubstr(
ku_string_t& src, int64_t start, int64_t len, ku_string_t& result, bool isAscii) {
result.len = min(len, src.len - start + 1);
if (isAscii) {
memcpy((uint8_t*)result.getData(), src.getData() + start - 1, result.len);
} else {
memcpy((uint8_t*)result.getData(), src.getData() + start, result.len);
}
}
};

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once

#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/ku_string.h"

using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

struct BaseLowerUpperOperation {

    // Shared implementation behind the upper/lower string functions. The converted byte
    // length is computed first; a result that fits in ku_string_t::SHORT_STR_LENGTH bytes is
    // written starting at result.prefix, anything longer is written into space allocated
    // from the result vector's overflow buffer, with the leading PREFIX_LENGTH bytes
    // mirrored into result.prefix.
    static inline void operation(
        ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector, bool isUpper) {
        const uint32_t convertedLen = getResultLen((char*)input.getData(), input.len, isUpper);
        result.len = convertedLen;
        if (convertedLen > ku_string_t::SHORT_STR_LENGTH) {
            // Long result: convert into freshly allocated overflow space.
            result.overflowPtr = reinterpret_cast<uint64_t>(
                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
            auto overflowData = reinterpret_cast<char*>(result.overflowPtr);
            convertCase(overflowData, input.len, (char*)input.getData(), isUpper);
            memcpy(result.prefix, overflowData, ku_string_t::PREFIX_LENGTH);
            return;
        }
        // Short result: convert in place, starting at the prefix bytes.
        convertCase((char*)result.prefix, input.len, (char*)input.getData(), isUpper);
    }

private:
    // Byte length of inputStr (inputLen bytes) after converting to upper or lower case.
    static uint32_t getResultLen(char* inputStr, uint32_t inputLen, bool isUpper);
    // Writes the case-converted form of the first len bytes of input into result.
    static void convertCase(char* result, uint32_t len, char* input, bool toUpper);
};
acquamarin marked this conversation as resolved.
Show resolved Hide resolved

} // namespace operation
} // namespace function
} // namespace kuzu
76 changes: 76 additions & 0 deletions src/function/string/operations/include/base_pad_operation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#pragma once

#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

// Padding logic has been taken from DuckDB:
// https://github.com/duckdb/duckdb/blob/master/src/function/scalar/string/pad.cpp
struct BasePadOperation {
public:
    /**
     * Runs a padding routine and stores its output in `result`.
     *
     * A negative `count` is treated as 0. `padOperation` builds the padded text into a
     * temporary std::string, which is then copied into `result`: directly into the prefix
     * for short strings, otherwise into space allocated from the result vector's overflow
     * buffer (with the leading PREFIX_LENGTH bytes mirrored into the prefix).
     *
     * @param src               string to pad.
     * @param count             requested length; negative values are clamped to 0.
     * @param characterToPad    string used as padding.
     * @param result            receives the padded string.
     * @param resultValueVector vector whose overflow buffer backs long results.
     * @param padOperation      routine that produces the padded text.
     */
    static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
        ku_string_t& result, ValueVector& resultValueVector,
        void (*padOperation)(
            ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult)) {
        if (count < 0) {
            count = 0;
        }
        string paddedResult;
        padOperation(src, count, characterToPad, paddedResult);
        result.len = paddedResult.size();
        if (ku_string_t::isShortString(result.len)) {
            memcpy(result.prefix, paddedResult.data(), result.len);
        } else {
            result.overflowPtr = reinterpret_cast<uint64_t>(
                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
            auto buffer = reinterpret_cast<char*>(result.overflowPtr);
            memcpy(buffer, paddedResult.data(), result.len);
            memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
        }
    }

    /**
     * Walks `data` one codepoint at a time until `count` codepoints or `size` bytes have
     * been consumed.
     *
     * @param count maximum number of codepoints to consume.
     * @param data  UTF-8 bytes.
     * @param size  number of bytes in `data`.
     * @return {bytes consumed, codepoints counted}.
     */
    static pair<uint32_t, uint32_t> padCountChars(
        const uint32_t count, const char* data, const uint32_t size) {
        auto str = reinterpret_cast<const utf8proc_uint8_t*>(data);
        uint32_t byteCount = 0, charCount = 0;
        for (; charCount < count && byteCount < size; charCount++) {
            utf8proc_int32_t codepoint;
            auto bytes = utf8proc_iterate(str + byteCount, size - byteCount, &codepoint);
            // utf8proc_iterate reports errors as a negative value. Adding that to the unsigned
            // counter would wrap it to a huge offset, so step over a single byte instead.
            byteCount += bytes > 0 ? (uint32_t)bytes : 1;
        }
        return {byteCount, charCount};
    }

    /**
     * Appends `charCount` codepoints of padding to `result`, cycling through `pad` as often
     * as needed. Nothing is appended when `pad` is empty.
     *
     * @param charCount number of padding codepoints to append.
     * @param pad       padding string.
     * @param result    string to append to.
     */
    static void insertPadding(uint32_t charCount, ku_string_t pad, string& result) {
        auto padData = pad.getData();
        auto padSize = pad.len;
        if (padSize == 0) {
            // An empty pad cannot contribute any bytes; skip the loop rather than iterating
            // charCount times without making progress.
            return;
        }
        uint32_t padByteCount = 0;
        // Unsigned counter to match charCount (a signed int would overflow for large counts).
        for (uint32_t i = 0; i < charCount; i++) {
            if (padByteCount >= padSize) {
                // Reached the end of the pad: flush it and start over from its beginning.
                result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
                padByteCount = 0;
            }
            utf8proc_int32_t codepoint;
            auto bytes =
                utf8proc_iterate(padData + padByteCount, padSize - padByteCount, &codepoint);
            // Negative return means a decoding error; advance one byte so the unsigned
            // counter never wraps past the pad buffer.
            padByteCount += bytes > 0 ? (uint32_t)bytes : 1;
        }
        // Flush the partially consumed pad.
        result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
    }
};

} // namespace operation
} // namespace function
} // namespace kuzu
7 changes: 5 additions & 2 deletions src/function/string/operations/include/left_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
#include <cassert>
#include <cstring>

#include "length_operation.h"
#include "substr_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
Expand All @@ -18,8 +20,9 @@ struct Left {
public:
// Returns the leading characters of `left`. A non-negative `right` is the number of
// characters to keep (capped at the string's character count); a negative `right` drops
// that many characters from the end. The count is taken from Length::operation and the
// copy is delegated to SubStr::operation starting at position 1.
static inline void operation(
    ku_string_t& left, int64_t& right, ku_string_t& result, ValueVector& resultValueVector) {
    int64_t numChars;
    Length::operation(left, numChars);
    int64_t len;
    if (right >= 0) {
        len = min(numChars, right);
    } else {
        len = max(numChars + right, (int64_t)0);
    }
    SubStr::operation(left, 1, len, result, resultValueVector);
}
};
Expand Down
23 changes: 22 additions & 1 deletion src/function/string/operations/include/length_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,38 @@
#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

struct Length {
    // Stores the length of `input` in `result`. If no byte has its high bit set the byte
    // length is returned directly; otherwise the string is walked with
    // utf8proc_grapheme_callback and the number of callback invocations is returned.
    static inline void operation(ku_string_t& input, int64_t& result) {
        auto byteLen = input.len;
        auto text = input.getAsString();
        bool hasNonAsciiByte = false;
        for (decltype(byteLen) pos = 0; pos < byteLen; pos++) {
            if (text[pos] & 0x80) {
                hasNonAsciiByte = true;
                break;
            }
        }
        if (!hasNonAsciiByte) {
            // Pure ASCII: one byte per character.
            result = byteLen;
            return;
        }
        // The callback fires once per character, so counting the calls gives the length.
        int64_t numChars = 0;
        utf8proc_grapheme_callback(text.c_str(), byteLen, [&](size_t start, size_t end) {
            numChars++;
            return true;
        });
        result = numChars;
    }
};

} // namespace operation
Expand Down
Loading