From 2b75b18835f621a5bdbfd2f63aa049a3ce211352 Mon Sep 17 00:00:00 2001 From: ziyi chen Date: Mon, 19 Jun 2023 14:17:26 -0400 Subject: [PATCH] Implement loading for blob-type --- dataset/tinysnb/eMeets.csv | 14 +++---- dataset/tinysnb/schema.cypher | 4 +- dataset/tinysnb/vMovies.csv | 6 +-- src/common/string_utils.cpp | 13 ++++++ src/common/types/blob.cpp | 18 ++++---- src/common/types/types.cpp | 20 +++++---- src/function/built_in_aggregate_functions.cpp | 13 ++---- src/include/common/string_utils.h | 2 + src/include/common/types/blob.h | 16 +++---- src/include/common/types/types.h | 2 +- src/include/function/cast/cast_operations.h | 6 ++- .../in_mem_storage_structure/in_mem_column.h | 10 ++--- .../in_mem_column_chunk.h | 4 +- .../in_mem_storage_structure/in_mem_lists.h | 1 + .../storage/storage_structure/column.h | 1 + .../storage/storage_structure/lists/lists.h | 1 + src/parser/transformer.cpp | 2 +- src/processor/operator/order_by/order_by.cpp | 2 +- src/storage/copier/table_copy_utils.cpp | 1 + .../in_mem_column.cpp | 8 ++-- .../in_mem_column_chunk.cpp | 14 +++++++ .../in_mem_storage_structure/in_mem_lists.cpp | 18 +++++++- test/c_api/connection_test.cpp | 6 ++- .../exceptions/binder/binder_error.test | 2 + test/test_files/tinysnb/agg/hash.test | 31 ++++++++++++++ test/test_files/tinysnb/agg/simple.test | 26 ++++++++++++ test/test_files/tinysnb/function/blob.test | 42 +++++++++++++++++++ .../tinysnb/order_by/single_label.test | 9 ++++ .../tinysnb/projection/multi_label.test | 12 +++--- .../tinysnb/projection/single_label.test | 23 ++++++++++ tools/python_api/test/conftest.py | 2 +- 31 files changed, 259 insertions(+), 70 deletions(-) diff --git a/dataset/tinysnb/eMeets.csv b/dataset/tinysnb/eMeets.csv index b22bd5d942..1d78ef1aaf 100644 --- a/dataset/tinysnb/eMeets.csv +++ b/dataset/tinysnb/eMeets.csv @@ -1,7 +1,7 @@ -0,2,"[7.82,3.54]",5 -2,5,"[2.87,4.23]",2 -3,7,"[3.65,8.44]",3 -7,3,"[2.11,3.1]",7 -8,3,"[2.2,9.0]",9 -9,3,"[3,5.2]",11 -10,2,"[3.5,1.1]",13 +0,2,"[7.82,3.54]",5,"\\xAA\\xBB\\xCC\\xDD" +2,5,"[2.87,4.23]",2,"NO hex code" +3,7,"[3.65,8.44]",3,"MIXED \\xAC with ASCII \\x0A" +7,3,"[2.11,3.1]",7,"\\xA1\\x2A" +8,3,"[2.2,9.0]",9,"\\x3A\\xA3" +9,3,"[3,5.2]",11,"NO hex code" +10,2,"[3.5,1.1]",13,"\\x3A\\xA3" diff --git a/dataset/tinysnb/schema.cypher b/dataset/tinysnb/schema.cypher index f485a2343a..2c749f3d66 100644 --- a/dataset/tinysnb/schema.cypher +++ b/dataset/tinysnb/schema.cypher @@ -1,8 +1,8 @@ create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID)); create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID)); -create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name)); +create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), content BYTEA, PRIMARY KEY (name)); create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY); create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE); create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float, MANY_ONE); -create rel table meets (FROM person TO person, location FLOAT[2], times INT, MANY_ONE); +create rel table meets (FROM person TO person, location FLOAT[2], times INT, data BYTEA, MANY_ONE); create rel table marries (FROM person TO person, usedAddress STRING[], address INT16[2], note STRING, ONE_ONE); diff --git a/dataset/tinysnb/vMovies.csv b/dataset/tinysnb/vMovies.csv index bd1fd0cfde..6fdf986dea 100644 --- a/dataset/tinysnb/vMovies.csv +++ b/dataset/tinysnb/vMovies.csv @@ -1,3 +1,3 @@ -Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}" -The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}" -Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}" +Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}","\\xAA\\xABinteresting\\x0B" +The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}","\\xAB\\xCD" +Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}","pure ascii characters" diff --git a/src/common/string_utils.cpp b/src/common/string_utils.cpp index f85554ed9f..fef576ac1d 100644 --- a/src/common/string_utils.cpp +++ b/src/common/string_utils.cpp @@ -55,5 +55,18 @@ std::string StringUtils::extractStringBetween( return input.substr(posStart, posEnd - posStart); } +std::string StringUtils::removeEscapedCharacters(const std::string& input) { + std::string resultStr; + for (auto i = 1u; i < input.length() - 1; i++) { + // Antlr4 already guarantees that the character followed by the escaped character is + // valid. So we can safely skip the escaped character. + if (input[i] == '\\') { + i++; + } + resultStr += input[i]; + } + return resultStr; +} + } // namespace common } // namespace kuzu diff --git a/src/common/types/blob.cpp b/src/common/types/blob.cpp index 6ed6f78b0a..4ae4ad732d 100644 --- a/src/common/types/blob.cpp +++ b/src/common/types/blob.cpp @@ -43,27 +43,27 @@ uint64_t Blob::getBlobSize(const ku_string_t& blob) { return blobSize; } -void Blob::fromString(ku_string_t& str, uint8_t* resultBuffer) { +uint64_t Blob::fromString(const char* str, uint64_t length, uint8_t* resultBuffer) { auto resultPos = 0u; - auto blobData = str.getData(); - for (auto i = 0u; i < str.len; i++) { - if (blobData[i] == '\\') { - validateHexCode(blobData, str.len, i); + for (auto i = 0u; i < length; i++) { + if (str[i] == '\\') { + validateHexCode(reinterpret_cast(str), length, i); auto firstByte = - HexFormatConstants::HEX_MAP[blobData[i + HexFormatConstants::FIRST_BYTE_POS]]; + HexFormatConstants::HEX_MAP[str[i + HexFormatConstants::FIRST_BYTE_POS]]; auto secondByte = - HexFormatConstants::HEX_MAP[blobData[i + HexFormatConstants::SECOND_BYTES_POS]]; + HexFormatConstants::HEX_MAP[str[i + HexFormatConstants::SECOND_BYTES_POS]]; resultBuffer[resultPos++] = (firstByte << HexFormatConstants::NUM_BYTES_TO_SHIFT_FOR_FIRST_BYTE) + secondByte; i += HexFormatConstants::LENGTH - 1; - } else if (blobData[i] <= 127) { - resultBuffer[resultPos++] = blobData[i]; + } else if (str[i] <= 127) { + resultBuffer[resultPos++] = str[i]; } else { throw ConversionException( "Invalid byte encountered in STRING -> BLOB conversion. All non-ascii characters " "must be escaped with hex codes (e.g. \\xAA)"); } } + return resultPos; } std::string Blob::toString(blob_t& blob) { diff --git a/src/common/types/types.cpp b/src/common/types/types.cpp index 592b2d5c92..244d2420a8 100644 --- a/src/common/types/types.cpp +++ b/src/common/types/types.cpp @@ -332,6 +332,8 @@ LogicalTypeID LogicalTypeUtils::dataTypeIDFromString(const std::string& dataType return LogicalTypeID::FLOAT; } else if ("BOOLEAN" == dataTypeIDString) { return LogicalTypeID::BOOL; + } else if ("BYTEA" == dataTypeIDString || "BLOB" == dataTypeIDString) { + return LogicalTypeID::BLOB; } else if ("STRING" == dataTypeIDString) { return LogicalTypeID::STRING; } else if ("DATE" == dataTypeIDString) { @@ -530,7 +532,8 @@ std::vector LogicalTypeUtils::getAllValidComparableLogicalTypes() { LogicalType{LogicalTypeID::INT16}, LogicalType{LogicalTypeID::DOUBLE}, LogicalType{LogicalTypeID::FLOAT}, LogicalType{LogicalTypeID::DATE}, LogicalType{LogicalTypeID::TIMESTAMP}, LogicalType{LogicalTypeID::INTERVAL}, - LogicalType{LogicalTypeID::STRING}, LogicalType{LogicalTypeID::SERIAL}}; + LogicalType{LogicalTypeID::BLOB}, LogicalType{LogicalTypeID::STRING}, + LogicalType{LogicalTypeID::SERIAL}}; } std::vector LogicalTypeUtils::getNumericalLogicalTypeIDs() { @@ -538,14 +541,17 @@ std::vector LogicalTypeUtils::getNumericalLogicalTypeIDs() { LogicalTypeID::INT16, LogicalTypeID::DOUBLE, LogicalTypeID::FLOAT, LogicalTypeID::SERIAL}; } -std::vector LogicalTypeUtils::getAllValidLogicTypeIDs() { +std::vector LogicalTypeUtils::getAllValidLogicTypes() { // TODO(Ziyi): Add FIX_LIST,STRUCT,MAP type to allValidTypeID when we support functions on // FIXED_LIST,STRUCT,MAP. - return std::vector{LogicalTypeID::INTERNAL_ID, LogicalTypeID::BOOL, - LogicalTypeID::INT64, LogicalTypeID::INT32, LogicalTypeID::INT16, LogicalTypeID::DOUBLE, - LogicalTypeID::STRING, LogicalTypeID::DATE, LogicalTypeID::TIMESTAMP, - LogicalTypeID::INTERVAL, LogicalTypeID::VAR_LIST, LogicalTypeID::FLOAT, - LogicalTypeID::SERIAL}; + return std::vector{LogicalType{LogicalTypeID::INTERNAL_ID}, + LogicalType{LogicalTypeID::BOOL}, LogicalType{LogicalTypeID::INT64}, + LogicalType{LogicalTypeID::INT32}, LogicalType{LogicalTypeID::INT16}, + LogicalType{LogicalTypeID::DOUBLE}, LogicalType{LogicalTypeID::STRING}, + LogicalType{LogicalTypeID::BLOB}, LogicalType{LogicalTypeID::DATE}, + LogicalType{LogicalTypeID::TIMESTAMP}, LogicalType{LogicalTypeID::INTERVAL}, + LogicalType{LogicalTypeID::VAR_LIST}, LogicalType{LogicalTypeID::FLOAT}, + LogicalType{LogicalTypeID::SERIAL}}; } std::vector LogicalTypeUtils::parseStructFields(const std::string& structTypeStr) { diff --git a/src/function/built_in_aggregate_functions.cpp b/src/function/built_in_aggregate_functions.cpp index e06ba6d9fe..01c01d2838 100644 --- a/src/function/built_in_aggregate_functions.cpp +++ b/src/function/built_in_aggregate_functions.cpp @@ -85,18 +85,11 @@ void BuiltInAggregateFunctions::registerCountStar() { void BuiltInAggregateFunctions::registerCount() { std::vector> definitions; - LogicalType inputType; - for (auto& typeID : LogicalTypeUtils::getAllValidLogicTypeIDs()) { - if (typeID == LogicalTypeID::VAR_LIST) { - inputType = LogicalType( - typeID, std::make_unique(std::make_unique())); - } else { - inputType = LogicalType(typeID); - } + for (auto& type : LogicalTypeUtils::getAllValidLogicTypes()) { for (auto isDistinct : std::vector{true, false}) { definitions.push_back(std::make_unique(COUNT_FUNC_NAME, - std::vector{typeID}, LogicalTypeID::INT64, - AggregateFunctionUtil::getCountFunction(inputType, isDistinct), isDistinct)); + std::vector{type.getLogicalTypeID()}, LogicalTypeID::INT64, + AggregateFunctionUtil::getCountFunction(type, isDistinct), isDistinct)); } } aggregateFunctions.insert({COUNT_FUNC_NAME, std::move(definitions)}); diff --git a/src/include/common/string_utils.h b/src/include/common/string_utils.h index e3df50e3bc..6b1ef45b86 100644 --- a/src/include/common/string_utils.h +++ b/src/include/common/string_utils.h @@ -69,6 +69,8 @@ class StringUtils { static std::string extractStringBetween(const std::string& input, char delimiterStart, char delimiterEnd, bool includeDelimiter = false); + + static std::string removeEscapedCharacters(const std::string& input); }; } // namespace common diff --git a/src/include/common/types/blob.h b/src/include/common/types/blob.h index 09077d4edd..bc0eb4fb87 100644 --- a/src/include/common/types/blob.h +++ b/src/include/common/types/blob.h @@ -11,17 +11,17 @@ struct blob_t { }; struct HexFormatConstants { - static constexpr const char PREFIX[] = "\\\\x"; - static constexpr const uint64_t PREFIX_LENGTH = 3; - static constexpr const uint64_t FIRST_BYTE_POS = PREFIX_LENGTH; - static constexpr const uint64_t SECOND_BYTES_POS = PREFIX_LENGTH + 1; - static constexpr const uint64_t LENGTH = 5; - static constexpr const uint64_t NUM_BYTES_TO_SHIFT_FOR_FIRST_BYTE = 4; - static constexpr const uint64_t SECOND_BYTE_MASK = 0x0F; // map of integer -> hex value. static constexpr const char* HEX_TABLE = "0123456789ABCDEF"; // reverse map of byte -> integer value, or -1 for invalid hex values. static const int HEX_MAP[256]; + static constexpr const uint64_t NUM_BYTES_TO_SHIFT_FOR_FIRST_BYTE = 4; + static constexpr const uint64_t SECOND_BYTE_MASK = 0x0F; + static constexpr const char PREFIX[] = "\\x"; + static constexpr const uint64_t PREFIX_LENGTH = 2; + static constexpr const uint64_t FIRST_BYTE_POS = PREFIX_LENGTH; + static constexpr const uint64_t SECOND_BYTES_POS = PREFIX_LENGTH + 1; + static constexpr const uint64_t LENGTH = 4; }; struct Blob { @@ -29,7 +29,7 @@ struct Blob { static uint64_t getBlobSize(const ku_string_t& blob); - static void fromString(ku_string_t& str, uint8_t* resultBuffer); + static uint64_t fromString(const char* str, uint64_t length, uint8_t* resultBuffer); private: static void validateHexCode(const uint8_t* blobStr, uint64_t length, uint64_t curPos); diff --git a/src/include/common/types/types.h b/src/include/common/types/types.h index 9917bd5909..b0f8ec6710 100644 --- a/src/include/common/types/types.h +++ b/src/include/common/types/types.h @@ -323,7 +323,7 @@ class LogicalTypeUtils { static bool isNumerical(const LogicalType& dataType); static std::vector getAllValidComparableLogicalTypes(); static std::vector getNumericalLogicalTypeIDs(); - static std::vector getAllValidLogicTypeIDs(); + static std::vector getAllValidLogicTypes(); private: static LogicalTypeID dataTypeIDFromString(const std::string& dataTypeIDString); diff --git a/src/include/function/cast/cast_operations.h b/src/include/function/cast/cast_operations.h index 8471183558..20a29a943d 100644 --- a/src/include/function/cast/cast_operations.h +++ b/src/include/function/cast/cast_operations.h @@ -57,7 +57,11 @@ struct CastToBlob { common::StringVector::getInMemOverflowBuffer(&resultVector) ->allocateSpace(result.value.len)); } - common::Blob::fromString(input, result.value.getDataWritable()); + common::Blob::fromString(reinterpret_cast(input.getData()), input.len, + result.value.getDataWritable()); + if (!common::ku_string_t::isShortString(result.value.len)) { + memcpy(result.value.prefix, result.value.getData(), common::ku_string_t::PREFIX_LENGTH); + } } }; diff --git a/src/include/storage/in_mem_storage_structure/in_mem_column.h b/src/include/storage/in_mem_storage_structure/in_mem_column.h index aaa2e43aad..6d09bbc89a 100644 --- a/src/include/storage/in_mem_storage_structure/in_mem_column.h +++ b/src/include/storage/in_mem_storage_structure/in_mem_column.h @@ -16,17 +16,17 @@ class InMemColumn { std::unique_ptr getInMemColumnChunk(common::offset_t startNodeOffset, common::offset_t endNodeOffset, const common::CopyDescription* copyDescription) { - switch (dataType.getLogicalTypeID()) { - case common::LogicalTypeID::STRING: - case common::LogicalTypeID::VAR_LIST: { + switch (dataType.getPhysicalType()) { + case common::PhysicalTypeID::STRING: + case common::PhysicalTypeID::VAR_LIST: { return std::make_unique( dataType, startNodeOffset, endNodeOffset, copyDescription, inMemOverflowFile.get()); } - case common::LogicalTypeID::FIXED_LIST: { + case common::PhysicalTypeID::FIXED_LIST: { return std::make_unique( dataType, startNodeOffset, endNodeOffset, copyDescription); } - case common::LogicalTypeID::STRUCT: { + case common::PhysicalTypeID::STRUCT: { auto inMemStructColumnChunk = std::make_unique( dataType, startNodeOffset, endNodeOffset, copyDescription); for (auto& fieldColumn : childColumns) { diff --git a/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h b/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h index c03a473603..08ec72b7c6 100644 --- a/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h +++ b/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h @@ -87,7 +87,8 @@ class InMemColumnChunkWithOverflow : public InMemColumnChunk { common::offset_t endNodeOffset, const common::CopyDescription* copyDescription, InMemOverflowFile* inMemOverflowFile) : InMemColumnChunk{std::move(dataType), startNodeOffset, endNodeOffset, copyDescription}, - inMemOverflowFile{inMemOverflowFile} {} + inMemOverflowFile{inMemOverflowFile}, blobBuffer{std::make_unique( + common::BufferPoolConstants::PAGE_4KB_SIZE)} {} void copyArrowArray(arrow::Array& array, arrow::Array* nodeOffsets = nullptr) final; @@ -106,6 +107,7 @@ class InMemColumnChunkWithOverflow : public InMemColumnChunk { storage::InMemOverflowFile* inMemOverflowFile; // TODO(Ziyi/Guodong): Fix this for rel columns. PageByteCursor overflowCursor; + std::unique_ptr blobBuffer; }; class InMemStructColumnChunk : public InMemColumnChunk { diff --git a/src/include/storage/in_mem_storage_structure/in_mem_lists.h b/src/include/storage/in_mem_storage_structure/in_mem_lists.h index fdb8f90613..6211a5342c 100644 --- a/src/include/storage/in_mem_storage_structure/in_mem_lists.h +++ b/src/include/storage/in_mem_storage_structure/in_mem_lists.h @@ -142,6 +142,7 @@ class InMemListsWithOverflow : public InMemLists { std::unique_ptr overflowInMemFile; // TODO(Guodong/Ziyi): Fix for concurrent writes. PageByteCursor overflowCursor; + std::unique_ptr blobBuffer; }; class InMemAdjLists : public InMemLists { diff --git a/src/include/storage/storage_structure/column.h b/src/include/storage/storage_structure/column.h index d0852a48f6..e7c66e9cfa 100644 --- a/src/include/storage/storage_structure/column.h +++ b/src/include/storage/storage_structure/column.h @@ -221,6 +221,7 @@ class ColumnFactory { case common::LogicalTypeID::INTERVAL: case common::LogicalTypeID::FIXED_LIST: return std::make_unique(structureIDAndFName, logicalType, bufferManager, wal); + case common::LogicalTypeID::BLOB: case common::LogicalTypeID::STRING: return std::make_unique( structureIDAndFName, logicalType, bufferManager, wal); diff --git a/src/include/storage/storage_structure/lists/lists.h b/src/include/storage/storage_structure/lists/lists.h index 9861923058..430e8160ca 100644 --- a/src/include/storage/storage_structure/lists/lists.h +++ b/src/include/storage/storage_structure/lists/lists.h @@ -264,6 +264,7 @@ class ListsFactory { return std::make_unique(structureIDAndFName, dataType, storage::StorageUtils::getDataTypeSize(dataType), adjListsHeaders, bufferManager, wal, listsUpdatesStore); + case common::LogicalTypeID::BLOB: case common::LogicalTypeID::STRING: return std::make_unique( structureIDAndFName, adjListsHeaders, bufferManager, wal, listsUpdatesStore); diff --git a/src/parser/transformer.cpp b/src/parser/transformer.cpp index 6b18761a0a..0f63970707 100644 --- a/src/parser/transformer.cpp +++ b/src/parser/transformer.cpp @@ -1088,7 +1088,7 @@ Transformer::transformParsingOptions(CypherParser::KU_ParsingOptionsContext& ctx std::string Transformer::transformStringLiteral(antlr4::tree::TerminalNode& stringLiteral) { auto str = stringLiteral.getText(); - return str.substr(1, str.size() - 2); + return common::StringUtils::removeEscapedCharacters(str); } } // namespace parser diff --git a/src/processor/operator/order_by/order_by.cpp b/src/processor/operator/order_by/order_by.cpp index 7218981754..c06e6555e6 100644 --- a/src/processor/operator/order_by/order_by.cpp +++ b/src/processor/operator/order_by/order_by.cpp @@ -50,7 +50,7 @@ void OrderBy::initGlobalStateInternal(kuzu::processor::ExecutionContext* context auto tableSchema = populateTableSchema(); for (auto i = 0u; i < orderByDataInfo.keysPosAndType.size(); ++i) { auto [dataPos, dataType] = orderByDataInfo.keysPosAndType[i]; - if (LogicalTypeID::STRING == dataType.getLogicalTypeID()) { + if (PhysicalTypeID::STRING == dataType.getPhysicalType()) { // If this is a string column, we need to find the factorizedTable offset for this // column. auto factorizedTableColIdx = 0ul; diff --git a/src/storage/copier/table_copy_utils.cpp b/src/storage/copier/table_copy_utils.cpp index 350dbb117c..d6a2d01b97 100644 --- a/src/storage/copier/table_copy_utils.cpp +++ b/src/storage/copier/table_copy_utils.cpp @@ -365,6 +365,7 @@ std::shared_ptr TableCopyUtils::toArrowDataType(const LogicalTy case LogicalTypeID::INTERVAL: case LogicalTypeID::FIXED_LIST: case LogicalTypeID::VAR_LIST: + case LogicalTypeID::BLOB: case LogicalTypeID::STRING: case LogicalTypeID::STRUCT: { return arrow::utf8(); diff --git a/src/storage/in_mem_storage_structure/in_mem_column.cpp b/src/storage/in_mem_storage_structure/in_mem_column.cpp index 21572c1359..294f314c04 100644 --- a/src/storage/in_mem_storage_structure/in_mem_column.cpp +++ b/src/storage/in_mem_storage_structure/in_mem_column.cpp @@ -12,8 +12,8 @@ namespace storage { InMemColumn::InMemColumn(std::string filePath, LogicalType dataType, bool requireNullBits) : filePath{std::move(filePath)}, dataType{std::move(dataType)} { // TODO(Guodong): Separate this as a function. - switch (this->dataType.getLogicalTypeID()) { - case LogicalTypeID::STRUCT: { + switch (this->dataType.getPhysicalType()) { + case PhysicalTypeID::STRUCT: { auto fieldTypes = common::StructType::getFieldTypes(&this->dataType); childColumns.reserve(fieldTypes.size()); for (auto i = 0u; i < fieldTypes.size(); i++) { @@ -22,8 +22,8 @@ InMemColumn::InMemColumn(std::string filePath, LogicalType dataType, bool requir true /* hasNull */)); } } break; - case LogicalTypeID::STRING: - case LogicalTypeID::VAR_LIST: { + case PhysicalTypeID::STRING: + case PhysicalTypeID::VAR_LIST: { inMemOverflowFile = std::make_unique(StorageUtils::getOverflowFileName(this->filePath)); fileHandle = std::make_unique( diff --git a/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp b/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp index 4d7554b9bb..c3d2d82eb3 100644 --- a/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp +++ b/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp @@ -220,6 +220,9 @@ void InMemColumnChunkWithOverflow::copyArrowArray(arrow::Array& array, arrow::Ar case common::LogicalTypeID::STRING: { templateCopyValuesAsStringToPageWithOverflow(array, nodeOffsets); } break; + case common::LogicalTypeID::BLOB: { + templateCopyValuesAsStringToPageWithOverflow(array, nodeOffsets); + } break; case common::LogicalTypeID::VAR_LIST: { templateCopyValuesAsStringToPageWithOverflow(array, nodeOffsets); } break; @@ -303,6 +306,17 @@ void InMemColumnChunkWithOverflow::setValWithOverflow( setValue(val, pos); } +// BLOB +template<> +void InMemColumnChunkWithOverflow::setValWithOverflow( + const char* value, uint64_t length, uint64_t pos) { + auto blobLen = Blob::fromString( + value, std::min(length, BufferPoolConstants::PAGE_4KB_SIZE), blobBuffer.get()); + auto blobVal = inMemOverflowFile->copyString( + reinterpret_cast(blobBuffer.get()), blobLen, overflowCursor); + setValue(blobVal, pos); +} + // VAR_LIST template<> void InMemColumnChunkWithOverflow::setValWithOverflow( diff --git a/src/storage/in_mem_storage_structure/in_mem_lists.cpp b/src/storage/in_mem_storage_structure/in_mem_lists.cpp index 2c6279ef81..f24a8b5226 100644 --- a/src/storage/in_mem_storage_structure/in_mem_lists.cpp +++ b/src/storage/in_mem_storage_structure/in_mem_lists.cpp @@ -312,7 +312,8 @@ InMemListsWithOverflow::InMemListsWithOverflow(std::string fName, LogicalType da uint64_t numNodes, std::shared_ptr listHeadersBuilder, const common::CopyDescription* copyDescription) : InMemLists{std::move(fName), std::move(dataType), - storage::StorageUtils::getDataTypeSize(dataType), numNodes, copyDescription, true} { + storage::StorageUtils::getDataTypeSize(dataType), numNodes, copyDescription, true}, + blobBuffer{std::make_unique(BufferPoolConstants::PAGE_4KB_SIZE)} { assert(this->dataType.getLogicalTypeID() == LogicalTypeID::STRING || this->dataType.getLogicalTypeID() == LogicalTypeID::VAR_LIST); overflowInMemFile = @@ -324,6 +325,10 @@ void InMemListsWithOverflow::copyArrowArray( arrow::Array* boundNodeOffsets, arrow::Array* posInRelLists, arrow::Array* array) { assert(array->type_id() == arrow::Type::STRING); switch (dataType.getLogicalTypeID()) { + case common::LogicalTypeID::BLOB: { + templateCopyArrayAsStringToRelListsWithOverflow( + boundNodeOffsets, posInRelLists, array); + } break; case common::LogicalTypeID::STRING: { templateCopyArrayAsStringToRelListsWithOverflow( boundNodeOffsets, posInRelLists, array); @@ -360,6 +365,16 @@ void InMemListsWithOverflow::setValueFromStringWithOverflow( setValue(nodeOffset, pos, (uint8_t*)&stringVal); } +template<> +void InMemListsWithOverflow::setValueFromStringWithOverflow( + offset_t nodeOffset, uint64_t pos, const char* val, uint64_t length) { + auto blobLen = Blob::fromString( + val, std::min(length, BufferPoolConstants::PAGE_4KB_SIZE), blobBuffer.get()); + auto blobVal = overflowInMemFile->copyString( + reinterpret_cast(blobBuffer.get()), blobLen, overflowCursor); + setValue(nodeOffset, pos, (uint8_t*)&blobVal); +} + template<> void InMemListsWithOverflow::setValueFromStringWithOverflow( offset_t nodeOffset, uint64_t pos, const char* val, uint64_t length) { @@ -390,6 +405,7 @@ std::unique_ptr InMemListsFactory::getInMemPropertyLists(const std:: return make_unique(fName, dataType, storage::StorageUtils::getDataTypeSize(dataType), numNodes, std::move(listHeadersBuilder), copyDescription, true /* hasNULLBytes */); + case LogicalTypeID::BLOB: case LogicalTypeID::STRING: return make_unique(fName, numNodes, std::move(listHeadersBuilder)); case LogicalTypeID::VAR_LIST: diff --git a/test/c_api/connection_test.cpp b/test/c_api/connection_test.cpp index 9ff5d3633a..70765c4903 100644 --- a/test/c_api/connection_test.cpp +++ b/test/c_api/connection_test.cpp @@ -173,7 +173,8 @@ TEST_F(CApiConnectionTest, GetNodePropertyNames) { "\tname STRING(PRIMARY KEY)\n" "\tlength INT32\n" "\tnote STRING\n" - "\tdescription STRUCT(RATING:DOUBLE, VIEWS:INT64, RELEASE:TIMESTAMP, FILM:DATE)\n"); + "\tdescription STRUCT(RATING:DOUBLE, VIEWS:INT64, RELEASE:TIMESTAMP, FILM:DATE)\n" + "\tcontent BLOB\n"); free(result); } @@ -186,7 +187,8 @@ TEST_F(CApiConnectionTest, GetRelPropertyNames) { "meets dst node: person\n" "meets properties: \n" "\tlocation FLOAT[2]\n" - "\ttimes INT32\n"); + "\ttimes INT32\n" + "\tdata BLOB\n"); free(result); } diff --git a/test/test_files/exceptions/binder/binder_error.test b/test/test_files/exceptions/binder/binder_error.test index 4c42318ac2..a125c19047 100644 --- a/test/test_files/exceptions/binder/binder_error.test +++ b/test/test_files/exceptions/binder/binder_error.test @@ -334,6 +334,8 @@ DISTINCT (TIMESTAMP) -> TIMESTAMP (TIMESTAMP) -> TIMESTAMP DISTINCT (INTERVAL) -> INTERVAL (INTERVAL) -> INTERVAL +DISTINCT (BLOB) -> BLOB +(BLOB) -> BLOB DISTINCT (STRING) -> STRING (STRING) -> STRING DISTINCT (SERIAL) -> SERIAL diff --git a/test/test_files/tinysnb/agg/hash.test b/test/test_files/tinysnb/agg/hash.test index 317dc58335..0443daa077 100644 --- a/test/test_files/tinysnb/agg/hash.test +++ b/test/test_files/tinysnb/agg/hash.test @@ -138,6 +138,13 @@ Alice|1 40|[Greg] 83|[Hubert Blaine Wolfeschlegelsteinhausenbergerdorff] +-NAME HashCollectBLOBTest +-QUERY MATCH (m:movies) RETURN m.length > 130, collect(m.content) +---- 2 +False|[\xAA\xABinteresting\x0B] +True|[\xAB\xCD,pure ascii characters] + + -NAME HashCollectLISTOfINT64Test -QUERY MATCH (p:person) RETURN p.gender, collect(p.workedHours) ---- 2 @@ -190,3 +197,27 @@ Elizabeth|[25,40] ---- 2 2020|[55,22] 2021|[5] + +-NAME HashAggCountBlob +-QUERY MATCH (m:movies) RETURN m.length > 2542, COUNT(m.content) +-PARALLELISM 4 +-ENUMERATE +---- 2 +False|2 +True|1 + +-NAME HashAggMinBlob +-QUERY MATCH (m:movies) RETURN m.length <> 298, MIN(m.content) +-PARALLELISM 2 +-ENUMERATE +---- 2 +False|pure ascii characters +True|\xAA\xABinteresting\x0B + +-NAME HashAggMaxBlob +-QUERY MATCH (m:movies) RETURN m.length <> 126, MAX(m.content) +-PARALLELISM 1 +-ENUMERATE +---- 2 +False|\xAA\xABinteresting\x0B +True|\xAB\xCD diff --git a/test/test_files/tinysnb/agg/simple.test b/test/test_files/tinysnb/agg/simple.test index 010c2637fb..12415e9920 100644 --- a/test/test_files/tinysnb/agg/simple.test +++ b/test/test_files/tinysnb/agg/simple.test @@ -93,6 +93,11 @@ False ---- 1 [5,55,22] +-NAME SimpleCollectBLOBTest +-QUERY MATCH (m:movies) RETURN collect(m.content) +---- 1 +[\xAA\xABinteresting\x0B,\xAB\xCD,pure ascii characters] + -NAME SimpleCollectSTRINGTest -QUERY MATCH (p:person) RETURN collect(p.fName) ---- 1 @@ -172,3 +177,24 @@ False -ENUMERATE ---- 1 989.333333 + +-NAME SimpleAggCountBlob +-QUERY MATCH (m:movies) RETURN COUNT(m.content) +-PARALLELISM 3 +-ENUMERATE +---- 1 +3 + +-NAME SimpleAggMinBlob +-QUERY MATCH (m:movies) RETURN MIN(m.content) +-PARALLELISM 3 +-ENUMERATE +---- 1 +pure ascii characters + +-NAME SimpleAggMaxBlob +-QUERY MATCH (m:movies) RETURN MAX(m.content) +-PARALLELISM 3 +-ENUMERATE +---- 1 +\xAB\xCD diff --git a/test/test_files/tinysnb/function/blob.test b/test/test_files/tinysnb/function/blob.test index 8504d50ca0..7ed4d6e18b 100644 --- a/test/test_files/tinysnb/function/blob.test +++ b/test/test_files/tinysnb/function/blob.test @@ -39,3 +39,45 @@ decode this utf-8 char: ü -QUERY RETURN decode(BLOB('decode this invalid utf-8 char: \\xA1\\xBC')) ---- error Runtime exception: Failure in decode: could not convert blob to UTF8 string, the blob contained invalid UTF8 characters + +-NAME BlobGreaterThan +-QUERY MATCH (m:movies) RETURN m.content > BLOB("test"); +---- 3 +True +True +False + +-NAME BlobGreaterThanOrEqualTo +-QUERY MATCH (m:movies) RETURN m.content >= BLOB("\\xAA\\xABinteresting\\x0B"); +---- 3 +True +True +False + +-NAME BlobEqualTo +-QUERY MATCH (m:movies) RETURN m.content = BLOB("\\xAA\\xABinteresting\\x0B"); +---- 3 +True +False +False + +-NAME BlobNotEqualTo +-QUERY MATCH (m:movies) RETURN m.content <> BLOB("\\xAA\\xABinteresting\\x0B"); +---- 3 +False +True +True + +-NAME BlobGreaterThan +-QUERY MATCH (m:movies) RETURN m.content < BLOB("\\xAB\\xCD"); +---- 3 +True +False +True + +-NAME BlobGreaterThanOrEqualTo +-QUERY MATCH (m:movies) RETURN m.content <= BLOB("pure ascii characters"); +---- 3 +False +False +True diff --git a/test/test_files/tinysnb/order_by/single_label.test b/test/test_files/tinysnb/order_by/single_label.test index eacefd2054..39e72da174 100644 --- a/test/test_files/tinysnb/order_by/single_label.test +++ b/test/test_files/tinysnb/order_by/single_label.test @@ -134,6 +134,15 @@ Carol Bob Alice +-NAME OrderByBlobTest +-QUERY MATCH (m:movies) RETURN m.content ORDER BY m.content DESC +-CHECK_ORDER +-PARALLELISM 2 +---- 3 +\xAB\xCD +\xAA\xABinteresting\x0B +pure ascii characters + -NAME OrderByStrMultipleColTest -QUERY MATCH (p:person) RETURN p.age, p.eyeSight ORDER BY p.isWorker desc, p.age, p.eyeSight desc -CHECK_ORDER diff --git a/test/test_files/tinysnb/projection/multi_label.test b/test/test_files/tinysnb/projection/multi_label.test index dd2cdd4e2a..e918a73b17 100644 --- a/test/test_files/tinysnb/projection/multi_label.test +++ b/test/test_files/tinysnb/projection/multi_label.test @@ -30,9 +30,9 @@ -NAME MultiLabelReturnStar -QUERY MATCH (a:movies:organisation) RETURN * ---- 6 -(label:movies, 2:0, {ID:, name:Sóló cón tu párejâ, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:126, note: this is a very very good movie, description:{RATING: 5.300000, VIEWS: 152, RELEASE: 2011-08-20 11:25:30, FILM: 2012-05-11}}) -(label:movies, 2:1, {ID:, name:The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:2544, note: the movie is very very good, description:{RATING: 7.000000, VIEWS: 982, RELEASE: 2018-11-13 13:33:11, FILM: 2014-09-12}}) -(label:movies, 2:2, {ID:, name:Roma, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:298, note:the movie is very interesting and funny, description:{RATING: 1223.000000, VIEWS: 10003, RELEASE: 2011-02-11 16:44:22, FILM: 2013-02-22}}) -(label:organisation, 1:0, {ID:1, name:ABFsUni, orgCode:325, mark:3.700000, score:-2, history:10 years 5 months 13 hours 24 us, licenseValidInterval:3 years 5 days, rating:1.000000, state:{REVENUE: 138, LOCATION: ['toronto', 'montr,eal'], STOCK: {PRICE: [96,56], VOLUME: 1000}}, length:, note:, description:}) -(label:organisation, 1:1, {ID:4, name:CsWork, orgCode:934, mark:4.100000, score:-100, history:2 years 4 days 10 hours, licenseValidInterval:26 years 52 days 48:00:00, rating:0.780000, state:{REVENUE: 152, LOCATION: ["vanco,uver north area"], STOCK: {PRICE: [15,78,671], VOLUME: 432}}, length:, note:, description:}) -(label:organisation, 1:2, {ID:6, name:DEsWork, orgCode:824, mark:4.100000, score:7, history:2 years 4 hours 22 us 34 minutes, licenseValidInterval:82:00:00.1, rating:0.520000, state:{REVENUE: 558, LOCATION: ['very long city name', 'new york'], STOCK: {PRICE: [22], VOLUME: 99}}, length:, note:, description:}) +(label:movies, 2:0, {ID:, name:Sóló cón tu párejâ, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:126, note: this is a very very good movie, description:{RATING: 5.300000, VIEWS: 152, RELEASE: 2011-08-20 11:25:30, FILM: 2012-05-11}, content:\xAA\xABinteresting\x0B}) +(label:movies, 2:1, {ID:, name:The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:2544, note: the movie is very very good, description:{RATING: 7.000000, VIEWS: 982, RELEASE: 2018-11-13 13:33:11, FILM: 2014-09-12}, content:\xAB\xCD}) +(label:movies, 2:2, {ID:, name:Roma, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:298, note:the movie is very interesting and funny, description:{RATING: 1223.000000, VIEWS: 10003, RELEASE: 2011-02-11 16:44:22, FILM: 2013-02-22}, content:pure ascii characters}) +(label:organisation, 1:0, {ID:1, name:ABFsUni, orgCode:325, mark:3.700000, score:-2, history:10 years 5 months 13 hours 24 us, licenseValidInterval:3 years 5 days, rating:1.000000, state:{REVENUE: 138, LOCATION: ['toronto', 'montr,eal'], STOCK: {PRICE: [96,56], VOLUME: 1000}}, length:, note:, description:, content:}) +(label:organisation, 1:1, {ID:4, name:CsWork, orgCode:934, mark:4.100000, score:-100, history:2 years 4 days 10 hours, licenseValidInterval:26 years 52 days 48:00:00, rating:0.780000, state:{REVENUE: 152, LOCATION: ["vanco,uver north area"], STOCK: {PRICE: [15,78,671], VOLUME: 432}}, length:, note:, description:, content:}) +(label:organisation, 1:2, {ID:6, name:DEsWork, orgCode:824, mark:4.100000, score:7, history:2 years 4 hours 22 us 34 minutes, licenseValidInterval:82:00:00.1, rating:0.520000, state:{REVENUE: 558, LOCATION: ['very long city name', 'new york'], STOCK: {PRICE: [22], VOLUME: 99}}, length:, note:, description:, content:}) diff --git a/test/test_files/tinysnb/projection/single_label.test b/test/test_files/tinysnb/projection/single_label.test index 3c287cdfbc..98ef93669c 100644 --- a/test/test_files/tinysnb/projection/single_label.test +++ b/test/test_files/tinysnb/projection/single_label.test @@ -619,3 +619,26 @@ Dan|Carol {{8=Farooq}=25} {{9=Greg}=40} {{10=Hubert Blaine Wolfeschlegelsteinhausenbergerdorff}=83} + +-NAME ReturnNodeBlobProp +-QUERY MATCH (m:movies) RETURN m.content; +---- 3 +\xAA\xABinteresting\x0B +\xAB\xCD +pure ascii characters + +-NAME ReturnRelBlobProp +-QUERY MATCH (p:person)-[e:meets]->(p1:person) RETURN e.data; +---- 7 +MIXED \x5CxAC with ASCII \x5Cx0A +NO hex code +NO hex code +\x5Cx3A\x5CxA3 +\x5Cx3A\x5CxA3 +\x5CxA1\x5Cx2A +\x5CxAA\x5CxBB\x5CxCC\x5CxDD + +-NAME ReturnStringLiteralWithSpecialChar +-QUERY RETURN "hEl \'\"\\"; +---- 1 +hEl '"\ diff --git a/tools/python_api/test/conftest.py b/tools/python_api/test/conftest.py index c7f1d23dff..222545a8e4 100644 --- a/tools/python_api/test/conftest.py +++ b/tools/python_api/test/conftest.py @@ -28,7 +28,7 @@ def init_tiny_snb(tmp_path): "STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID));") conn.execute('COPY organisation FROM "../../../dataset/tinysnb/vOrganisation.csv"') conn.execute('CREATE NODE TABLE movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, ' - 'views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name))') + 'views INT64, release TIMESTAMP, film DATE), content BYTEA, PRIMARY KEY (name))') conn.execute('COPY movies FROM "../../../dataset/tinysnb/vMovies.csv"') conn.execute('create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float,' ' MANY_ONE)')