Skip to content

Commit

Permalink
merge utf8 to master
Browse files Browse the repository at this point in the history
  • Loading branch information
anuchak committed Nov 18, 2022
1 parent 3aff141 commit caa5e3a
Show file tree
Hide file tree
Showing 30 changed files with 80,337 additions and 123 deletions.
1 change: 1 addition & 0 deletions dataset/tinysnb/copy_csv.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
COPY person FROM "dataset/tinysnb/vPerson.csv" (HEADER=true);
COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
COPY movies FROM "dataset/tinysnb/vMovies.csv";
COPY knows FROM "dataset/tinysnb/eKnows.csv";
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADER=true);
COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"
Expand Down
1 change: 1 addition & 0 deletions dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table movies (ID INT64, name STRING, PRIMARY KEY (ID));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, MANY_ONE);
Expand Down
3 changes: 3 additions & 0 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1,Sóló cón tu párejâ
2,The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie
3,Roma
1 change: 1 addition & 0 deletions src/common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ cc_library(
"configs",
"type_utils",
"//src/common/types",
"//third_party/utf8proc:utf8proc"
],
)

Expand Down
12 changes: 11 additions & 1 deletion src/common/csv_reader/csv_reader.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
#include "src/common/include/csv_reader/csv_reader.h"

#include "spdlog/spdlog.h"
#include "third_party/utf8proc/include/utf8proc_wrapper.h"

#include "src/common/include/configs.h"
#include "src/common/include/type_utils.h"
#include "src/common/include/utils.h"

using namespace kuzu::utf8proc;

namespace kuzu {
namespace common {

Expand Down Expand Up @@ -232,7 +235,14 @@ char* CSVReader::getString() {
// If the string is too long, truncate it.
strVal[DEFAULT_PAGE_SIZE] = '\0';
}
return strVal;
auto unicodeType = Utf8Proc::analyze(strVal, strlen(strVal));
if (unicodeType == UnicodeType::ASCII) {
return strVal;
} else if (unicodeType == UnicodeType::UNICODE) {
return Utf8Proc::normalize(strVal, strlen(strVal));
} else {
throw CSVReaderException("Invalid UTF-8 character encountered.");
}
}

date_t CSVReader::getDate() {
Expand Down
2 changes: 2 additions & 0 deletions src/function/string/operations/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@ cc_library(
],
deps = [
"//src/common/types",
"//third_party/utf8proc:utf8proc",
"//src/common:vector"
],
)
50 changes: 50 additions & 0 deletions src/function/string/operations/base_lower_upper_operation.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include "include/base_lower_upper_operation.h"

namespace kuzu {
namespace function {
namespace operation {

// Returns the number of bytes needed to hold the case-converted form of the
// first `inputLen` bytes of `inputStr`.
//
// inputStr: UTF-8 encoded bytes to measure (not modified here).
// inputLen: number of bytes of `inputStr` to examine.
// isUpper:  true to measure the upper-cased form, false for the lower-cased form.
uint32_t BaseLowerUpperOperation::getResultLen(char* inputStr, uint32_t inputLen, bool isUpper) {
    uint32_t totalBytes = 0;
    uint32_t pos = 0;
    while (pos < inputLen) {
        if (!(inputStr[pos] & 0x80)) {
            // High bit clear: a single-byte character, counted as exactly one output byte.
            totalBytes++;
            pos++;
            continue;
        }
        // High bit set: decode the multi-byte character, map its case, and count the
        // encoded width of the mapped codepoint, which may differ from the input width.
        int consumed = 0;
        int codepoint = utf8proc_codepoint(inputStr + pos, consumed);
        int mapped = isUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
        int mappedWidth = utf8proc_codepoint_length(mapped);
        assert(mappedWidth >= 0);
        totalBytes += mappedWidth;
        pos += consumed;
    }
    return totalBytes;
}

// Writes the case-converted form of the first `len` bytes of `input` into `result`.
//
// result:  destination buffer; the caller must have sized it for the converted
//          string (see getResultLen).
// len:     number of input bytes to convert.
// input:   UTF-8 encoded source bytes.
// toUpper: true to convert to upper case, false to convert to lower case.
void BaseLowerUpperOperation::convertCase(char* result, uint32_t len, char* input, bool toUpper) {
    char* cursor = result;
    uint32_t pos = 0u;
    while (pos < len) {
        const bool isSingleByte = (input[pos] & 0x80) == 0;
        if (isSingleByte) {
            // High bit clear: the C library case functions handle this byte.
            *cursor = toUpper ? toupper(input[pos]) : tolower(input[pos]);
            pos++;
            cursor++;
            continue;
        }
        // High bit set: decode one codepoint, map its case, and re-encode it at the
        // write cursor. The number of bytes read and written may differ.
        int consumed = 0;
        int written = 0;
        int codepoint = utf8proc_codepoint(input + pos, consumed);
        int mapped = toUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint);
        auto success = utf8proc_codepoint_to_utf8(mapped, written, cursor);
        assert(success);
        cursor += written;
        pos += consumed;
    }
}

} // namespace operation
} // namespace function
} // namespace kuzu
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once

#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/ku_string.h"

using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

// Shared implementation behind the lower-case and upper-case string functions.
struct BaseLowerUpperOperation {

    // Converts `input` to upper case (isUpper == true) or lower case (isUpper == false)
    // and stores the converted string in `result`.
    //
    // The converted byte length is computed first because case mapping can change how
    // many bytes a character occupies. Results longer than SHORT_STR_LENGTH are written
    // to space allocated from resultValueVector's overflow buffer, and their leading
    // PREFIX_LENGTH bytes are mirrored into result.prefix.
    static inline void operation(
        ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector, bool isUpper) {
        const uint32_t convertedLen = getResultLen((char*)input.getData(), input.len, isUpper);
        result.len = convertedLen;
        if (convertedLen > ku_string_t::SHORT_STR_LENGTH) {
            // Long result: convert into overflow storage, then copy the prefix.
            result.overflowPtr = reinterpret_cast<uint64_t>(
                resultValueVector.getOverflowBuffer().allocateSpace(result.len));
            auto overflowChars = reinterpret_cast<char*>(result.overflowPtr);
            convertCase(overflowChars, input.len, (char*)input.getData(), isUpper);
            memcpy(result.prefix, overflowChars, ku_string_t::PREFIX_LENGTH);
            return;
        }
        // Short result: convert straight into the inline storage starting at prefix.
        convertCase((char*)result.prefix, input.len, (char*)input.getData(), isUpper);
    }

private:
    // Byte length of the case-converted form of the first inputLen bytes of inputStr.
    static uint32_t getResultLen(char* inputStr, uint32_t inputLen, bool isUpper);
    // Writes the case-converted form of the first len bytes of input into result.
    static void convertCase(char* result, uint32_t len, char* input, bool toUpper);
};

} // namespace operation
} // namespace function
} // namespace kuzu
76 changes: 76 additions & 0 deletions src/function/string/operations/include/base_pad_operation.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#pragma once

#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

// Padding logic has been taken from DuckDB:
// https://github.com/duckdb/duckdb/blob/master/src/function/scalar/string/pad.cpp
struct BasePadOperation {
public:
static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
ku_string_t& result, ValueVector& resultValueVector,
void (*padOperation)(
ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult)) {
if (count < 0) {
count = 0;
}
string paddedResult;
padOperation(src, count, characterToPad, paddedResult);
result.len = paddedResult.size();
if (ku_string_t::isShortString(result.len)) {
memcpy(result.prefix, paddedResult.data(), result.len);
} else {
result.overflowPtr = reinterpret_cast<uint64_t>(
resultValueVector.getOverflowBuffer().allocateSpace(result.len));
auto buffer = reinterpret_cast<char*>(result.overflowPtr);
memcpy(buffer, paddedResult.data(), result.len);
memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
}
}

static pair<uint32_t, uint32_t> padCountChars(
const uint32_t count, const char* data, const uint32_t size) {
auto str = reinterpret_cast<const utf8proc_uint8_t*>(data);
uint32_t byteCount = 0, charCount = 0;
for (; charCount < count && byteCount < size; charCount++) {
utf8proc_int32_t codepoint;
auto bytes = utf8proc_iterate(str + byteCount, size - byteCount, &codepoint);
byteCount += bytes;
}
return {byteCount, charCount};
}

static void insertPadding(uint32_t charCount, ku_string_t pad, string& result) {
auto padData = pad.getData();
auto padSize = pad.len;
uint32_t padByteCount = 0;
for (auto i = 0; i < charCount; i++) {
if (padByteCount >= padSize) {
result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
padByteCount = 0;
}
utf8proc_int32_t codepoint;
auto bytes =
utf8proc_iterate(padData + padByteCount, padSize - padByteCount, &codepoint);
padByteCount += bytes;
}
result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount));
}
};

} // namespace operation
} // namespace function
} // namespace kuzu
7 changes: 5 additions & 2 deletions src/function/string/operations/include/left_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
#include <cassert>
#include <cstring>

#include "length_operation.h"
#include "substr_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
Expand All @@ -18,8 +20,9 @@ struct Left {
public:
// Stores the leading part of `left` in `result`.
//
// left:              source string.
// right:             when non-negative, the maximum number of characters to keep
//                    (0 keeps none); when negative, the number of trailing characters
//                    to drop.
// result:            receives the extracted string via SubStr::operation.
// resultValueVector: forwarded to SubStr::operation.
static inline void operation(
    ku_string_t& left, int64_t& right, ku_string_t& result, ValueVector& resultValueVector) {
    // Measure with Length::operation rather than left.len so the count is in the same
    // unit Length reports for strings containing multi-byte characters.
    int64_t leftLen;
    Length::operation(left, leftLen);
    // `right >= 0` (not `> 0`): with right == 0 the negative branch would evaluate to
    // leftLen and return the entire string instead of an empty one.
    int64_t len = (right >= 0) ? min(leftLen, right) : max(leftLen + right, (int64_t)0);
    // NOTE(review): start position 1 presumably means "first character" in SubStr's
    // convention — confirm against SubStr::operation.
    SubStr::operation(left, 1, len, result, resultValueVector);
}
};
Expand Down
23 changes: 22 additions & 1 deletion src/function/string/operations/include/length_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,38 @@
#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

struct Length {
static inline void operation(ku_string_t& input, int64_t& result) { result = input.len; }
// Computes the length of `input` and stores it in `result`.
//
// If every byte has its high bit clear, the length is simply the byte count. As soon
// as a byte with the high bit set is found, the whole string is handed to
// utf8proc_grapheme_callback and the length is the number of times it invokes the
// callback (once per grapheme, going by the function's name).
//
// The bytes are read in place through getData(); no temporary std::string copy of the
// input is made.
static inline void operation(ku_string_t& input, int64_t& result) {
    const uint32_t totalByteLength = input.len;
    auto bytes = reinterpret_cast<const char*>(input.getData());
    for (uint32_t i = 0; i < totalByteLength; i++) {
        if (bytes[i] & 0x80) {
            int64_t length = 0;
            // Multi-byte content present: count callback invocations over the full
            // string (not just the remainder) so earlier single-byte characters are
            // included.
            utf8proc_grapheme_callback(bytes, totalByteLength, [&length](size_t, size_t) {
                length++;
                return true;
            });
            result = length;
            return;
        }
    }
    result = totalByteLength;
}
};

} // namespace operation
Expand Down
13 changes: 3 additions & 10 deletions src/function/string/operations/include/lower_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
#include <cassert>
#include <cstring>

#include "base_str_operation.h"
#include "base_lower_upper_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::function::operation;

namespace kuzu {
namespace function {
Expand All @@ -18,15 +19,7 @@ struct Lower {
public:
static inline void operation(
ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector) {
BaseStrOperation::operation(input, result, resultValueVector, lowerStr);
}

private:
static uint32_t lowerStr(char* str, uint32_t len) {
for (auto i = 0u; i < len; i++) {
str[i] = tolower(str[i]);
}
return len;
BaseLowerUpperOperation::operation(input, result, resultValueVector, false /* isUpper */);
}
};

Expand Down
23 changes: 11 additions & 12 deletions src/function/string/operations/include/lpad_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,32 @@
#include <cassert>
#include <cstring>

#include "pad_operation.h"
#include "base_pad_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::function::operation;

namespace kuzu {
namespace function {
namespace operation {

struct Lpad : PadOperation {
struct Lpad : BasePadOperation {
public:
static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
ku_string_t& result, ValueVector& resultValueVector) {
PadOperation::operation(
BasePadOperation::operation(
src, count, characterToPad, result, resultValueVector, lpadOperation);
}

static void lpadOperation(ku_string_t& result, ku_string_t& src, ku_string_t& characterToPad) {
auto offset = 0u;
if (result.len > src.len) {
for (; offset < result.len - src.len; offset++) {
memcpy((uint8_t*)result.getData() + offset, characterToPad.getData(), 1);
}
}
memcpy((uint8_t*)result.getData() + offset, src.getData(), src.len);
// Builds the left-padded form of `src` in `paddedResult`.
//
// At most `count` characters of `src` are kept. The shortfall between `count` and the
// number of characters kept is filled with characters from `characterToPad`, which
// are appended first; the kept bytes of `src` follow.
static void lpadOperation(
    ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult) {
    const char* srcBytes = (const char*)src.getData();
    // first = bytes of src to keep, second = characters those bytes represent.
    const auto kept = BasePadOperation::padCountChars(count, srcBytes, src.len);
    const uint32_t keptByteCount = kept.first;
    const uint32_t keptCharCount = kept.second;
    BasePadOperation::insertPadding(count - keptCharCount, characterToPad, paddedResult);
    paddedResult.append(srcBytes, keptByteCount);
}
};

Expand Down
Loading

0 comments on commit caa5e3a

Please sign in to comment.