Skip to content

Commit

Permalink
merge utf8 branch to master
Browse files Browse the repository at this point in the history
  • Loading branch information
anuchak committed Nov 16, 2022
1 parent cfa0a26 commit 3bedc52
Show file tree
Hide file tree
Showing 28 changed files with 79,847 additions and 117 deletions.
1 change: 1 addition & 0 deletions dataset/tinysnb/copy_csv.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
COPY person FROM "dataset/tinysnb/vPerson.csv" (HEADER=true);
COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
COPY movies FROM "dataset/tinysnb/vMovies.csv";
COPY knows FROM "dataset/tinysnb/eKnows.csv";
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADER=true);
COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"
Expand Down
1 change: 1 addition & 0 deletions dataset/tinysnb/schema.cypher
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], PRIMARY KEY (ID));
create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, PRIMARY KEY (ID));
create node table movies (ID INT64, name STRING, PRIMARY KEY (ID));
create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], MANY_ONE);
create rel table workAt (FROM person TO organisation, year INT64, MANY_ONE);
Expand Down
3 changes: 3 additions & 0 deletions dataset/tinysnb/vMovies.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1,Sóló cón tu párejâ
2,The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie
3,Roma
1 change: 1 addition & 0 deletions src/common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ cc_library(
"configs",
"type_utils",
"//src/common/types",
"//third_party/utf8proc:utf8proc"
],
)

Expand Down
12 changes: 11 additions & 1 deletion src/common/csv_reader/csv_reader.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
#include "src/common/include/csv_reader/csv_reader.h"

#include "spdlog/spdlog.h"
#include "third_party/utf8proc/include/utf8proc_wrapper.h"

#include "src/common/include/configs.h"
#include "src/common/include/type_utils.h"
#include "src/common/include/utils.h"

using namespace kuzu::utf8proc;

namespace kuzu {
namespace common {

Expand Down Expand Up @@ -232,7 +235,14 @@ char* CSVReader::getString() {
// If the string is too long, truncate it.
strVal[DEFAULT_PAGE_SIZE] = '\0';
}
return strVal;
auto unicodeType = Utf8Proc::analyze(strVal, strlen(strVal));
if (unicodeType == UnicodeType::ASCII) {
return strVal;
} else if (unicodeType == UnicodeType::UNICODE) {
return Utf8Proc::normalize(strVal, strlen(strVal));
} else {
throw CSVReaderException("Invalid UTF-8 character encountered.");
}
}

date_t CSVReader::getDate() {
Expand Down
1 change: 1 addition & 0 deletions src/function/string/operations/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ cc_library(
],
deps = [
"//src/common/types",
"//third_party/utf8proc:utf8proc",
"//src/common:vector"
],
)
4 changes: 2 additions & 2 deletions src/function/string/operations/base_lower_upper_operation.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "include/base_lower_upper_operation.h"

namespace graphflow {
namespace kuzu {
namespace function {
namespace operation {

Expand Down Expand Up @@ -47,4 +47,4 @@ void BaseLowerUpperOperation::convertCase(char* result, uint32_t len, char* inpu

} // namespace operation
} // namespace function
} // namespace graphflow
} // namespace kuzu
Original file line number Diff line number Diff line change
Expand Up @@ -6,29 +6,29 @@
#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/gf_string.h"
#include "src/common/types/include/ku_string.h"

using namespace graphflow::common;
using namespace graphflow::utf8proc;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace graphflow {
namespace kuzu {
namespace function {
namespace operation {

struct BaseLowerUpperOperation {

static inline void operation(
gf_string_t& input, gf_string_t& result, ValueVector& resultValueVector, bool isUpper) {
ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector, bool isUpper) {
uint32_t resultLen = getResultLen((char*)input.getData(), input.len, isUpper);
result.len = resultLen;
if (resultLen <= gf_string_t::SHORT_STR_LENGTH) {
if (resultLen <= ku_string_t::SHORT_STR_LENGTH) {
convertCase((char*)result.prefix, input.len, (char*)input.getData(), isUpper);
} else {
result.overflowPtr = reinterpret_cast<uint64_t>(
resultValueVector.getOverflowBuffer().allocateSpace(result.len));
auto buffer = reinterpret_cast<char*>(result.overflowPtr);
convertCase(buffer, input.len, (char*)input.getData(), isUpper);
memcpy(result.prefix, buffer, gf_string_t::PREFIX_LENGTH);
memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
}
}

Expand All @@ -38,4 +38,4 @@ struct BaseLowerUpperOperation {
};
} // namespace operation
} // namespace function
} // namespace graphflow
} // namespace kuzu
24 changes: 14 additions & 10 deletions src/function/string/operations/include/base_pad_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,38 @@
#include <cassert>
#include <cstring>

#include "src/common/types/include/gf_string.h"
#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/include/vector/value_vector.h"
#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace graphflow::common;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace graphflow {
namespace kuzu {
namespace function {
namespace operation {

// Padding logic has been taken from DuckDB:
// https://github.com/duckdb/duckdb/blob/master/src/function/scalar/string/pad.cpp
struct BasePadOperation {
public:
static inline void operation(gf_string_t& src, int64_t count, gf_string_t& characterToPad,
gf_string_t& result, ValueVector& resultValueVector,
static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
ku_string_t& result, ValueVector& resultValueVector,
void (*padOperation)(
gf_string_t& src, int64_t count, gf_string_t& characterToPad, string& paddedResult)) {
ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult)) {
string paddedResult;
padOperation(src, count, characterToPad, paddedResult);
result.len = paddedResult.size();
if (gf_string_t::isShortString(result.len)) {
if (ku_string_t::isShortString(result.len)) {
memcpy(result.prefix, paddedResult.data(), result.len);
} else {
result.overflowPtr = reinterpret_cast<uint64_t>(
resultValueVector.getOverflowBuffer().allocateSpace(result.len));
auto buffer = reinterpret_cast<char*>(result.overflowPtr);
memcpy(buffer, paddedResult.data(), result.len);
memcpy(result.prefix, buffer, gf_string_t::PREFIX_LENGTH);
memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH);
}
}

Expand All @@ -46,7 +50,7 @@ struct BasePadOperation {
return {byteCount, charCount};
}

static void insertPadding(uint32_t charCount, gf_string_t pad, string& result) {
static void insertPadding(uint32_t charCount, ku_string_t pad, string& result) {
auto padData = pad.getData();
auto padSize = pad.len;
uint32_t padByteCount = 0;
Expand All @@ -66,4 +70,4 @@ struct BasePadOperation {

} // namespace operation
} // namespace function
} // namespace graphflow
} // namespace kuzu
7 changes: 5 additions & 2 deletions src/function/string/operations/include/left_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
#include <cassert>
#include <cstring>

#include "length_operation.h"
#include "substr_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
Expand All @@ -18,8 +20,9 @@ struct Left {
public:
static inline void operation(
ku_string_t& left, int64_t& right, ku_string_t& result, ValueVector& resultValueVector) {
auto len = right > 0 ? min(left.len, (uint32_t)right) :
max(left.len + (uint32_t)right, (uint32_t)0u);
int64_t leftLen;
Length::operation(left, leftLen);
int64_t len = (right > 0) ? min(leftLen, right) : max(leftLen + right, (int64_t)0);
SubStr::operation(left, 1, len, result, resultValueVector);
}
};
Expand Down
23 changes: 22 additions & 1 deletion src/function/string/operations/include/length_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,38 @@
#include <cassert>
#include <cstring>

#include "third_party/utf8proc/include/utf8proc.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::utf8proc;

namespace kuzu {
namespace function {
namespace operation {

struct Length {
static inline void operation(ku_string_t& input, int64_t& result) { result = input.len; }
// Writes the length of `input` into `result`.
//
// For pure-ASCII input (no byte has its high bit set) the length is simply the
// byte count. As soon as a byte with the high bit set is found, the string is
// treated as UTF-8 and the length is the number of times
// utf8proc_grapheme_callback invokes the callback over the whole string
// (presumably once per grapheme cluster — confirm against the utf8proc wrapper).
//
// @param input  string whose length is requested
// @param result receives the computed length
static inline void operation(ku_string_t& input, int64_t& result) {
    const auto totalByteLength = input.len;
    const auto inputString = input.getAsString();
    // Unsigned index: the byte length is unsigned, so a signed `int` counter would
    // trigger a sign-compare mismatch and overflow for lengths above INT_MAX.
    for (uint32_t i = 0; i < totalByteLength; i++) {
        if (inputString[i] & 0x80) {
            // Non-ASCII byte found: count callback invocations over the whole
            // string instead of counting bytes.
            int64_t length = 0;
            utf8proc_grapheme_callback(inputString.c_str(), totalByteLength,
                [&](size_t /*start*/, size_t /*end*/) {
                    length++;
                    // Returning true keeps the iteration going, as in the original.
                    return true;
                });
            result = length;
            return;
        }
    }
    // Every byte is ASCII, so one byte is one character.
    result = totalByteLength;
}
};

} // namespace operation
Expand Down
13 changes: 3 additions & 10 deletions src/function/string/operations/include/lower_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
#include <cassert>
#include <cstring>

#include "base_str_operation.h"
#include "base_lower_upper_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::function::operation;

namespace kuzu {
namespace function {
Expand All @@ -18,15 +19,7 @@ struct Lower {
public:
static inline void operation(
ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector) {
BaseStrOperation::operation(input, result, resultValueVector, lowerStr);
}

private:
static uint32_t lowerStr(char* str, uint32_t len) {
for (auto i = 0u; i < len; i++) {
str[i] = tolower(str[i]);
}
return len;
BaseLowerUpperOperation::operation(input, result, resultValueVector, false /* isUpper */);
}
};

Expand Down
23 changes: 11 additions & 12 deletions src/function/string/operations/include/lpad_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,33 +3,32 @@
#include <cassert>
#include <cstring>

#include "pad_operation.h"
#include "base_pad_operation.h"

#include "src/common/types/include/ku_string.h"

using namespace std;
using namespace kuzu::common;
using namespace kuzu::function::operation;

namespace kuzu {
namespace function {
namespace operation {

struct Lpad : PadOperation {
struct Lpad : BasePadOperation {
public:
static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad,
ku_string_t& result, ValueVector& resultValueVector) {
PadOperation::operation(
BasePadOperation::operation(
src, count, characterToPad, result, resultValueVector, lpadOperation);
}

static void lpadOperation(ku_string_t& result, ku_string_t& src, ku_string_t& characterToPad) {
auto offset = 0u;
if (result.len > src.len) {
for (; offset < result.len - src.len; offset++) {
memcpy((uint8_t*)result.getData() + offset, characterToPad.getData(), 1);
}
}
memcpy((uint8_t*)result.getData() + offset, src.getData(), src.len);
// Builds the left-padded form of `src` into `paddedResult`.
//
// BasePadOperation::padCountChars returns a {byteCount, charCount} pair for
// `src` given the requested `count`. The padding produced by
// BasePadOperation::insertPadding for the remaining `count - charCount`
// characters is emitted first, followed by the first `byteCount` bytes of `src`.
//
// @param src            string to pad
// @param count          requested length of the padded result
// @param characterToPad string used as padding material
// @param paddedResult   output buffer the padded string is appended to
static void lpadOperation(
    ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult) {
    const char* srcBytes = (const char*)src.getData();
    auto keptInfo = BasePadOperation::padCountChars(count, srcBytes, src.len);
    auto keptByteCount = keptInfo.first;
    auto keptCharCount = keptInfo.second;
    // Left pad: padding goes in before the source bytes.
    BasePadOperation::insertPadding(count - keptCharCount, characterToPad, paddedResult);
    paddedResult.append(srcBytes, keptByteCount);
}
};

Expand Down
35 changes: 0 additions & 35 deletions src/function/string/operations/include/pad_operation.h

This file was deleted.

28 changes: 27 additions & 1 deletion src/function/string/operations/include/reverse_operation.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,33 @@ struct Reverse {
public:
static inline void operation(
ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector) {
BaseStrOperation::operation(input, result, resultValueVector, reverseStr);
bool isAscii = true;
string inputStr = input.getAsString();
for (uint32_t i = 0; i < input.len; i++) {
if (inputStr[i] & 0x80) {
isAscii = false;
break;
}
}
if (isAscii) {
BaseStrOperation::operation(input, result, resultValueVector, reverseStr);
} else {
result.len = input.len;
if (result.len > ku_string_t::SHORT_STR_LENGTH) {
result.overflowPtr = reinterpret_cast<uint64_t>(
resultValueVector.getOverflowBuffer().allocateSpace(input.len));
}
auto resultBuffer = result.len <= ku_string_t::SHORT_STR_LENGTH ?
reinterpret_cast<char*>(result.prefix) :
reinterpret_cast<char*>(result.overflowPtr);
utf8proc_grapheme_callback(inputStr.c_str(), input.len, [&](size_t start, size_t end) {
memcpy(resultBuffer + input.len - end, input.getData() + start, end - start);
return true;
});
if (result.len > ku_string_t::SHORT_STR_LENGTH) {
memcpy(result.prefix, resultBuffer, ku_string_t::PREFIX_LENGTH);
}
}
}

private:
Expand Down
Loading

0 comments on commit 3bedc52

Please sign in to comment.