-
Notifications
You must be signed in to change notification settings - Fork 96
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
30 changed files
with
80,319 additions
and
118 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
1,Sóló cón tu párejâ | ||
2,The 😂😃🧘🏻♂️🌍🌦️🍞🚗 movie | ||
3,Roma |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -103,6 +103,7 @@ cc_library( | |
"configs", | ||
"type_utils", | ||
"//src/common/types", | ||
"//third_party/utf8proc:utf8proc" | ||
], | ||
) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,5 +12,7 @@ cc_library( | |
], | ||
deps = [ | ||
"//src/common/types", | ||
"//third_party/utf8proc:utf8proc", | ||
"//src/common:vector" | ||
], | ||
) |
50 changes: 50 additions & 0 deletions
50
src/function/string/operations/base_lower_upper_operation.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#include "include/base_lower_upper_operation.h" | ||
|
||
namespace kuzu { | ||
namespace function { | ||
namespace operation { | ||
|
||
uint32_t BaseLowerUpperOperation::getResultLen(char* inputStr, uint32_t inputLen, bool isUpper) { | ||
uint32_t outputLength = 0; | ||
for (uint32_t i = 0; i < inputLen;) { | ||
// For UTF-8 characters, changing case can increase / decrease total byte length. | ||
// Eg.: 'ß' lower case -> 'SS' upper case [more bytes + more chars] | ||
if (inputStr[i] & 0x80) { | ||
int size = 0; | ||
int codepoint = utf8proc_codepoint(inputStr + i, size); | ||
int convertedCodepoint = | ||
isUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint); | ||
int newSize = utf8proc_codepoint_length(convertedCodepoint); | ||
assert(newSize >= 0); | ||
outputLength += newSize; | ||
i += size; | ||
} else { | ||
outputLength++; | ||
i++; | ||
} | ||
} | ||
return outputLength; | ||
} | ||
|
||
void BaseLowerUpperOperation::convertCase(char* result, uint32_t len, char* input, bool toUpper) { | ||
for (auto i = 0u; i < len;) { | ||
if (input[i] & 0x80) { | ||
int size = 0, newSize = 0; | ||
int codepoint = utf8proc_codepoint(input + i, size); | ||
int convertedCodepoint = | ||
toUpper ? utf8proc_toupper(codepoint) : utf8proc_tolower(codepoint); | ||
auto success = utf8proc_codepoint_to_utf8(convertedCodepoint, newSize, result); | ||
assert(success); | ||
result += newSize; | ||
i += size; | ||
} else { | ||
*result = toUpper ? toupper(input[i]) : tolower(input[i]); | ||
i++; | ||
result++; | ||
} | ||
} | ||
} | ||
|
||
} // namespace operation | ||
} // namespace function | ||
} // namespace kuzu |
41 changes: 41 additions & 0 deletions
41
src/function/string/operations/include/base_lower_upper_operation.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#pragma once | ||
|
||
#include <cassert> | ||
#include <cstring> | ||
|
||
#include "third_party/utf8proc/include/utf8proc.h" | ||
|
||
#include "src/common/include/vector/value_vector.h" | ||
#include "src/common/types/include/ku_string.h" | ||
|
||
using namespace kuzu::common; | ||
using namespace kuzu::utf8proc; | ||
|
||
namespace kuzu { | ||
namespace function { | ||
namespace operation { | ||
|
||
struct BaseLowerUpperOperation { | ||
|
||
static inline void operation( | ||
ku_string_t& input, ku_string_t& result, ValueVector& resultValueVector, bool isUpper) { | ||
uint32_t resultLen = getResultLen((char*)input.getData(), input.len, isUpper); | ||
result.len = resultLen; | ||
if (resultLen <= ku_string_t::SHORT_STR_LENGTH) { | ||
convertCase((char*)result.prefix, input.len, (char*)input.getData(), isUpper); | ||
} else { | ||
result.overflowPtr = reinterpret_cast<uint64_t>( | ||
resultValueVector.getOverflowBuffer().allocateSpace(result.len)); | ||
auto buffer = reinterpret_cast<char*>(result.overflowPtr); | ||
convertCase(buffer, input.len, (char*)input.getData(), isUpper); | ||
memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH); | ||
} | ||
} | ||
|
||
private: | ||
static uint32_t getResultLen(char* inputStr, uint32_t inputLen, bool isUpper); | ||
static void convertCase(char* result, uint32_t len, char* input, bool toUpper); | ||
}; | ||
} // namespace operation | ||
} // namespace function | ||
} // namespace kuzu |
73 changes: 73 additions & 0 deletions
73
src/function/string/operations/include/base_pad_operation.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#pragma once | ||
|
||
#include <cassert> | ||
#include <cstring> | ||
|
||
#include "third_party/utf8proc/include/utf8proc.h" | ||
|
||
#include "src/common/include/vector/value_vector.h" | ||
#include "src/common/types/include/ku_string.h" | ||
|
||
using namespace std; | ||
using namespace kuzu::common; | ||
using namespace kuzu::utf8proc; | ||
|
||
namespace kuzu { | ||
namespace function { | ||
namespace operation { | ||
|
||
// Padding logic has been taken from DuckDB: | ||
// https://github.com/duckdb/duckdb/blob/master/src/function/scalar/string/pad.cpp | ||
struct BasePadOperation { | ||
public: | ||
static inline void operation(ku_string_t& src, int64_t count, ku_string_t& characterToPad, | ||
ku_string_t& result, ValueVector& resultValueVector, | ||
void (*padOperation)( | ||
ku_string_t& src, int64_t count, ku_string_t& characterToPad, string& paddedResult)) { | ||
string paddedResult; | ||
padOperation(src, count, characterToPad, paddedResult); | ||
result.len = paddedResult.size(); | ||
if (ku_string_t::isShortString(result.len)) { | ||
memcpy(result.prefix, paddedResult.data(), result.len); | ||
} else { | ||
result.overflowPtr = reinterpret_cast<uint64_t>( | ||
resultValueVector.getOverflowBuffer().allocateSpace(result.len)); | ||
auto buffer = reinterpret_cast<char*>(result.overflowPtr); | ||
memcpy(buffer, paddedResult.data(), result.len); | ||
memcpy(result.prefix, buffer, ku_string_t::PREFIX_LENGTH); | ||
} | ||
} | ||
|
||
static pair<uint32_t, uint32_t> padCountChars( | ||
const uint32_t count, const char* data, const uint32_t size) { | ||
auto str = reinterpret_cast<const utf8proc_uint8_t*>(data); | ||
uint32_t byteCount = 0, charCount = 0; | ||
for (; charCount < count && byteCount < size; charCount++) { | ||
utf8proc_int32_t codepoint; | ||
auto bytes = utf8proc_iterate(str + byteCount, size - byteCount, &codepoint); | ||
byteCount += bytes; | ||
} | ||
return {byteCount, charCount}; | ||
} | ||
|
||
static void insertPadding(uint32_t charCount, ku_string_t pad, string& result) { | ||
auto padData = pad.getData(); | ||
auto padSize = pad.len; | ||
uint32_t padByteCount = 0; | ||
for (auto i = 0; i < charCount; i++) { | ||
if (padByteCount >= padSize) { | ||
result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount)); | ||
padByteCount = 0; | ||
} | ||
utf8proc_int32_t codepoint; | ||
auto bytes = | ||
utf8proc_iterate(padData + padByteCount, padSize - padByteCount, &codepoint); | ||
padByteCount += bytes; | ||
} | ||
result.insert(result.end(), (char*)padData, (char*)(padData + padByteCount)); | ||
} | ||
}; | ||
|
||
} // namespace operation | ||
} // namespace function | ||
} // namespace kuzu |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.