From 2b75b18835f621a5bdbfd2f63aa049a3ce211352 Mon Sep 17 00:00:00 2001
From: ziyi chen <chenziyi990424@gmail.com>
Date: Mon, 19 Jun 2023 14:17:26 -0400
Subject: [PATCH] Implement loading for blob-type

---
 dataset/tinysnb/eMeets.csv                    | 14 +++----
 dataset/tinysnb/schema.cypher                 |  4 +-
 dataset/tinysnb/vMovies.csv                   |  6 +--
 src/common/string_utils.cpp                   | 13 ++++++
 src/common/types/blob.cpp                     | 18 ++++----
 src/common/types/types.cpp                    | 20 +++++----
 src/function/built_in_aggregate_functions.cpp | 13 ++----
 src/include/common/string_utils.h             |  2 +
 src/include/common/types/blob.h               | 16 +++----
 src/include/common/types/types.h              |  2 +-
 src/include/function/cast/cast_operations.h   |  6 ++-
 .../in_mem_storage_structure/in_mem_column.h  | 10 ++---
 .../in_mem_column_chunk.h                     |  4 +-
 .../in_mem_storage_structure/in_mem_lists.h   |  1 +
 .../storage/storage_structure/column.h        |  1 +
 .../storage/storage_structure/lists/lists.h   |  1 +
 src/parser/transformer.cpp                    |  2 +-
 src/processor/operator/order_by/order_by.cpp  |  2 +-
 src/storage/copier/table_copy_utils.cpp       |  1 +
 .../in_mem_column.cpp                         |  8 ++--
 .../in_mem_column_chunk.cpp                   | 14 +++++++
 .../in_mem_storage_structure/in_mem_lists.cpp | 18 +++++++-
 test/c_api/connection_test.cpp                |  6 ++-
 .../exceptions/binder/binder_error.test       |  2 +
 test/test_files/tinysnb/agg/hash.test         | 31 ++++++++++++++
 test/test_files/tinysnb/agg/simple.test       | 26 ++++++++++++
 test/test_files/tinysnb/function/blob.test    | 42 +++++++++++++++++++
 .../tinysnb/order_by/single_label.test        |  9 ++++
 .../tinysnb/projection/multi_label.test       | 12 +++---
 .../tinysnb/projection/single_label.test      | 23 ++++++++++
 tools/python_api/test/conftest.py             |  2 +-
 31 files changed, 259 insertions(+), 70 deletions(-)

diff --git a/dataset/tinysnb/eMeets.csv b/dataset/tinysnb/eMeets.csv
index b22bd5d942..1d78ef1aaf 100644
--- a/dataset/tinysnb/eMeets.csv
+++ b/dataset/tinysnb/eMeets.csv
@@ -1,7 +1,7 @@
-0,2,"[7.82,3.54]",5
-2,5,"[2.87,4.23]",2
-3,7,"[3.65,8.44]",3
-7,3,"[2.11,3.1]",7
-8,3,"[2.2,9.0]",9
-9,3,"[3,5.2]",11
-10,2,"[3.5,1.1]",13
+0,2,"[7.82,3.54]",5,"\\xAA\\xBB\\xCC\\xDD"
+2,5,"[2.87,4.23]",2,"NO hex code"
+3,7,"[3.65,8.44]",3,"MIXED \\xAC with ASCII \\x0A"
+7,3,"[2.11,3.1]",7,"\\xA1\\x2A"
+8,3,"[2.2,9.0]",9,"\\x3A\\xA3"
+9,3,"[3,5.2]",11,"NO hex code"
+10,2,"[3.5,1.1]",13,"\\x3A\\xA3"
diff --git a/dataset/tinysnb/schema.cypher b/dataset/tinysnb/schema.cypher
index f485a2343a..2c749f3d66 100644
--- a/dataset/tinysnb/schema.cypher
+++ b/dataset/tinysnb/schema.cypher
@@ -1,8 +1,8 @@
 create node table person (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float, PRIMARY KEY (ID));
 create node table organisation (ID INT64, name STRING, orgCode INT64, mark DOUBLE, score INT64, history STRING, licenseValidInterval INTERVAL, rating DOUBLE, state STRUCT(revenue INT16, location STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID));
-create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name));
+create node table movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, views INT64, release TIMESTAMP, film DATE), content BYTEA, PRIMARY KEY (name));
 create rel table knows (FROM person TO person, date DATE, meetTime TIMESTAMP, validInterval INTERVAL, comments STRING[], MANY_MANY);
 create rel table studyAt (FROM person TO organisation, year INT64, places STRING[], length INT16,MANY_ONE);
 create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float, MANY_ONE);
-create rel table meets (FROM person TO person, location FLOAT[2], times INT, MANY_ONE);
+create rel table meets (FROM person TO person, location FLOAT[2], times INT, data BYTEA, MANY_ONE);
 create rel table marries (FROM person TO person, usedAddress STRING[], address INT16[2], note STRING, ONE_ONE);
diff --git a/dataset/tinysnb/vMovies.csv b/dataset/tinysnb/vMovies.csv
index bd1fd0cfde..6fdf986dea 100644
--- a/dataset/tinysnb/vMovies.csv
+++ b/dataset/tinysnb/vMovies.csv
@@ -1,3 +1,3 @@
-Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}"
-The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}"
-Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}"
+Sóló cón tu párejâ,126, this is a very very good movie,"{rating: 5.3, views: 152, release: 2011-08-20 11:25:30, film: 2012-05-11}","\\xAA\\xABinteresting\\x0B"
+The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie,2544, the movie is very very good,"{rating: 7, views: 982, release: 2018-11-13 13:33:11, film: 2014-09-12}","\\xAB\\xCD"
+Roma,298,the movie is very interesting and funny,"{rating: 1223, views: 10003, release: 2011-02-11 16:44:22, film: 2013-02-22}","pure ascii characters"
diff --git a/src/common/string_utils.cpp b/src/common/string_utils.cpp
index f85554ed9f..fef576ac1d 100644
--- a/src/common/string_utils.cpp
+++ b/src/common/string_utils.cpp
@@ -55,5 +55,18 @@ std::string StringUtils::extractStringBetween(
     return input.substr(posStart, posEnd - posStart);
 }
 
+std::string StringUtils::removeEscapedCharacters(const std::string& input) {
+    std::string resultStr;
+    for (auto i = 1u; i < input.length() - 1; i++) {
+        // Antlr4 already guarantees that the character followed by the escaped character is
+        // valid. So we can safely skip the escaped character.
+        if (input[i] == '\\') {
+            i++;
+        }
+        resultStr += input[i];
+    }
+    return resultStr;
+}
+
 } // namespace common
 } // namespace kuzu
diff --git a/src/common/types/blob.cpp b/src/common/types/blob.cpp
index 6ed6f78b0a..4ae4ad732d 100644
--- a/src/common/types/blob.cpp
+++ b/src/common/types/blob.cpp
@@ -43,27 +43,27 @@ uint64_t Blob::getBlobSize(const ku_string_t& blob) {
     return blobSize;
 }
 
-void Blob::fromString(ku_string_t& str, uint8_t* resultBuffer) {
+uint64_t Blob::fromString(const char* str, uint64_t length, uint8_t* resultBuffer) {
     auto resultPos = 0u;
-    auto blobData = str.getData();
-    for (auto i = 0u; i < str.len; i++) {
-        if (blobData[i] == '\\') {
-            validateHexCode(blobData, str.len, i);
+    for (auto i = 0u; i < length; i++) {
+        if (str[i] == '\\') {
+            validateHexCode(reinterpret_cast<const uint8_t*>(str), length, i);
             auto firstByte =
-                HexFormatConstants::HEX_MAP[blobData[i + HexFormatConstants::FIRST_BYTE_POS]];
+                HexFormatConstants::HEX_MAP[str[i + HexFormatConstants::FIRST_BYTE_POS]];
             auto secondByte =
-                HexFormatConstants::HEX_MAP[blobData[i + HexFormatConstants::SECOND_BYTES_POS]];
+                HexFormatConstants::HEX_MAP[str[i + HexFormatConstants::SECOND_BYTES_POS]];
             resultBuffer[resultPos++] =
                 (firstByte << HexFormatConstants::NUM_BYTES_TO_SHIFT_FOR_FIRST_BYTE) + secondByte;
             i += HexFormatConstants::LENGTH - 1;
-        } else if (blobData[i] <= 127) {
-            resultBuffer[resultPos++] = blobData[i];
+        } else if (str[i] <= 127) {
+            resultBuffer[resultPos++] = str[i];
         } else {
             throw ConversionException(
                 "Invalid byte encountered in STRING -> BLOB conversion. All non-ascii characters "
                 "must be escaped with hex codes (e.g. \\xAA)");
         }
     }
+    return resultPos;
 }
 
 std::string Blob::toString(blob_t& blob) {
diff --git a/src/common/types/types.cpp b/src/common/types/types.cpp
index 592b2d5c92..244d2420a8 100644
--- a/src/common/types/types.cpp
+++ b/src/common/types/types.cpp
@@ -332,6 +332,8 @@ LogicalTypeID LogicalTypeUtils::dataTypeIDFromString(const std::string& dataType
         return LogicalTypeID::FLOAT;
     } else if ("BOOLEAN" == dataTypeIDString) {
         return LogicalTypeID::BOOL;
+    } else if ("BYTEA" == dataTypeIDString || "BLOB" == dataTypeIDString) {
+        return LogicalTypeID::BLOB;
     } else if ("STRING" == dataTypeIDString) {
         return LogicalTypeID::STRING;
     } else if ("DATE" == dataTypeIDString) {
@@ -530,7 +532,8 @@ std::vector<LogicalType> LogicalTypeUtils::getAllValidComparableLogicalTypes() {
         LogicalType{LogicalTypeID::INT16}, LogicalType{LogicalTypeID::DOUBLE},
         LogicalType{LogicalTypeID::FLOAT}, LogicalType{LogicalTypeID::DATE},
         LogicalType{LogicalTypeID::TIMESTAMP}, LogicalType{LogicalTypeID::INTERVAL},
-        LogicalType{LogicalTypeID::STRING}, LogicalType{LogicalTypeID::SERIAL}};
+        LogicalType{LogicalTypeID::BLOB}, LogicalType{LogicalTypeID::STRING},
+        LogicalType{LogicalTypeID::SERIAL}};
 }
 
 std::vector<LogicalTypeID> LogicalTypeUtils::getNumericalLogicalTypeIDs() {
@@ -538,14 +541,17 @@ std::vector<LogicalTypeID> LogicalTypeUtils::getNumericalLogicalTypeIDs() {
         LogicalTypeID::INT16, LogicalTypeID::DOUBLE, LogicalTypeID::FLOAT, LogicalTypeID::SERIAL};
 }
 
-std::vector<LogicalTypeID> LogicalTypeUtils::getAllValidLogicTypeIDs() {
+std::vector<LogicalType> LogicalTypeUtils::getAllValidLogicTypes() {
     // TODO(Ziyi): Add FIX_LIST,STRUCT,MAP type to allValidTypeID when we support functions on
     // FIXED_LIST,STRUCT,MAP.
-    return std::vector<LogicalTypeID>{LogicalTypeID::INTERNAL_ID, LogicalTypeID::BOOL,
-        LogicalTypeID::INT64, LogicalTypeID::INT32, LogicalTypeID::INT16, LogicalTypeID::DOUBLE,
-        LogicalTypeID::STRING, LogicalTypeID::DATE, LogicalTypeID::TIMESTAMP,
-        LogicalTypeID::INTERVAL, LogicalTypeID::VAR_LIST, LogicalTypeID::FLOAT,
-        LogicalTypeID::SERIAL};
+    return std::vector<LogicalType>{LogicalType{LogicalTypeID::INTERNAL_ID},
+        LogicalType{LogicalTypeID::BOOL}, LogicalType{LogicalTypeID::INT64},
+        LogicalType{LogicalTypeID::INT32}, LogicalType{LogicalTypeID::INT16},
+        LogicalType{LogicalTypeID::DOUBLE}, LogicalType{LogicalTypeID::STRING},
+        LogicalType{LogicalTypeID::BLOB}, LogicalType{LogicalTypeID::DATE},
+        LogicalType{LogicalTypeID::TIMESTAMP}, LogicalType{LogicalTypeID::INTERVAL},
+        LogicalType{LogicalTypeID::VAR_LIST}, LogicalType{LogicalTypeID::FLOAT},
+        LogicalType{LogicalTypeID::SERIAL}};
 }
 
 std::vector<std::string> LogicalTypeUtils::parseStructFields(const std::string& structTypeStr) {
diff --git a/src/function/built_in_aggregate_functions.cpp b/src/function/built_in_aggregate_functions.cpp
index e06ba6d9fe..01c01d2838 100644
--- a/src/function/built_in_aggregate_functions.cpp
+++ b/src/function/built_in_aggregate_functions.cpp
@@ -85,18 +85,11 @@ void BuiltInAggregateFunctions::registerCountStar() {
 
 void BuiltInAggregateFunctions::registerCount() {
     std::vector<std::unique_ptr<AggregateFunctionDefinition>> definitions;
-    LogicalType inputType;
-    for (auto& typeID : LogicalTypeUtils::getAllValidLogicTypeIDs()) {
-        if (typeID == LogicalTypeID::VAR_LIST) {
-            inputType = LogicalType(
-                typeID, std::make_unique<VarListTypeInfo>(std::make_unique<LogicalType>()));
-        } else {
-            inputType = LogicalType(typeID);
-        }
+    for (auto& type : LogicalTypeUtils::getAllValidLogicTypes()) {
         for (auto isDistinct : std::vector<bool>{true, false}) {
             definitions.push_back(std::make_unique<AggregateFunctionDefinition>(COUNT_FUNC_NAME,
-                std::vector<LogicalTypeID>{typeID}, LogicalTypeID::INT64,
-                AggregateFunctionUtil::getCountFunction(inputType, isDistinct), isDistinct));
+                std::vector<LogicalTypeID>{type.getLogicalTypeID()}, LogicalTypeID::INT64,
+                AggregateFunctionUtil::getCountFunction(type, isDistinct), isDistinct));
         }
     }
     aggregateFunctions.insert({COUNT_FUNC_NAME, std::move(definitions)});
diff --git a/src/include/common/string_utils.h b/src/include/common/string_utils.h
index e3df50e3bc..6b1ef45b86 100644
--- a/src/include/common/string_utils.h
+++ b/src/include/common/string_utils.h
@@ -69,6 +69,8 @@ class StringUtils {
 
     static std::string extractStringBetween(const std::string& input, char delimiterStart,
         char delimiterEnd, bool includeDelimiter = false);
+
+    static std::string removeEscapedCharacters(const std::string& input);
 };
 
 } // namespace common
diff --git a/src/include/common/types/blob.h b/src/include/common/types/blob.h
index 09077d4edd..bc0eb4fb87 100644
--- a/src/include/common/types/blob.h
+++ b/src/include/common/types/blob.h
@@ -11,17 +11,17 @@ struct blob_t {
 };
 
 struct HexFormatConstants {
-    static constexpr const char PREFIX[] = "\\\\x";
-    static constexpr const uint64_t PREFIX_LENGTH = 3;
-    static constexpr const uint64_t FIRST_BYTE_POS = PREFIX_LENGTH;
-    static constexpr const uint64_t SECOND_BYTES_POS = PREFIX_LENGTH + 1;
-    static constexpr const uint64_t LENGTH = 5;
-    static constexpr const uint64_t NUM_BYTES_TO_SHIFT_FOR_FIRST_BYTE = 4;
-    static constexpr const uint64_t SECOND_BYTE_MASK = 0x0F;
     // map of integer -> hex value.
     static constexpr const char* HEX_TABLE = "0123456789ABCDEF";
     // reverse map of byte -> integer value, or -1 for invalid hex values.
     static const int HEX_MAP[256];
+    static constexpr const uint64_t NUM_BYTES_TO_SHIFT_FOR_FIRST_BYTE = 4;
+    static constexpr const uint64_t SECOND_BYTE_MASK = 0x0F;
+    static constexpr const char PREFIX[] = "\\x";
+    static constexpr const uint64_t PREFIX_LENGTH = 2;
+    static constexpr const uint64_t FIRST_BYTE_POS = PREFIX_LENGTH;
+    static constexpr const uint64_t SECOND_BYTES_POS = PREFIX_LENGTH + 1;
+    static constexpr const uint64_t LENGTH = 4;
 };
 
 struct Blob {
@@ -29,7 +29,7 @@ struct Blob {
 
     static uint64_t getBlobSize(const ku_string_t& blob);
 
-    static void fromString(ku_string_t& str, uint8_t* resultBuffer);
+    static uint64_t fromString(const char* str, uint64_t length, uint8_t* resultBuffer);
 
 private:
     static void validateHexCode(const uint8_t* blobStr, uint64_t length, uint64_t curPos);
diff --git a/src/include/common/types/types.h b/src/include/common/types/types.h
index 9917bd5909..b0f8ec6710 100644
--- a/src/include/common/types/types.h
+++ b/src/include/common/types/types.h
@@ -323,7 +323,7 @@ class LogicalTypeUtils {
     static bool isNumerical(const LogicalType& dataType);
     static std::vector<LogicalType> getAllValidComparableLogicalTypes();
     static std::vector<LogicalTypeID> getNumericalLogicalTypeIDs();
-    static std::vector<LogicalTypeID> getAllValidLogicTypeIDs();
+    static std::vector<LogicalType> getAllValidLogicTypes();
 
 private:
     static LogicalTypeID dataTypeIDFromString(const std::string& dataTypeIDString);
diff --git a/src/include/function/cast/cast_operations.h b/src/include/function/cast/cast_operations.h
index 8471183558..20a29a943d 100644
--- a/src/include/function/cast/cast_operations.h
+++ b/src/include/function/cast/cast_operations.h
@@ -57,7 +57,11 @@ struct CastToBlob {
                 common::StringVector::getInMemOverflowBuffer(&resultVector)
                     ->allocateSpace(result.value.len));
         }
-        common::Blob::fromString(input, result.value.getDataWritable());
+        common::Blob::fromString(reinterpret_cast<const char*>(input.getData()), input.len,
+            result.value.getDataWritable());
+        if (!common::ku_string_t::isShortString(result.value.len)) {
+            memcpy(result.value.prefix, result.value.getData(), common::ku_string_t::PREFIX_LENGTH);
+        }
     }
 };
 
diff --git a/src/include/storage/in_mem_storage_structure/in_mem_column.h b/src/include/storage/in_mem_storage_structure/in_mem_column.h
index aaa2e43aad..6d09bbc89a 100644
--- a/src/include/storage/in_mem_storage_structure/in_mem_column.h
+++ b/src/include/storage/in_mem_storage_structure/in_mem_column.h
@@ -16,17 +16,17 @@ class InMemColumn {
 
     std::unique_ptr<InMemColumnChunk> getInMemColumnChunk(common::offset_t startNodeOffset,
         common::offset_t endNodeOffset, const common::CopyDescription* copyDescription) {
-        switch (dataType.getLogicalTypeID()) {
-        case common::LogicalTypeID::STRING:
-        case common::LogicalTypeID::VAR_LIST: {
+        switch (dataType.getPhysicalType()) {
+        case common::PhysicalTypeID::STRING:
+        case common::PhysicalTypeID::VAR_LIST: {
             return std::make_unique<InMemColumnChunkWithOverflow>(
                 dataType, startNodeOffset, endNodeOffset, copyDescription, inMemOverflowFile.get());
         }
-        case common::LogicalTypeID::FIXED_LIST: {
+        case common::PhysicalTypeID::FIXED_LIST: {
             return std::make_unique<InMemFixedListColumnChunk>(
                 dataType, startNodeOffset, endNodeOffset, copyDescription);
         }
-        case common::LogicalTypeID::STRUCT: {
+        case common::PhysicalTypeID::STRUCT: {
             auto inMemStructColumnChunk = std::make_unique<InMemStructColumnChunk>(
                 dataType, startNodeOffset, endNodeOffset, copyDescription);
             for (auto& fieldColumn : childColumns) {
diff --git a/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h b/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h
index c03a473603..08ec72b7c6 100644
--- a/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h
+++ b/src/include/storage/in_mem_storage_structure/in_mem_column_chunk.h
@@ -87,7 +87,8 @@ class InMemColumnChunkWithOverflow : public InMemColumnChunk {
         common::offset_t endNodeOffset, const common::CopyDescription* copyDescription,
         InMemOverflowFile* inMemOverflowFile)
         : InMemColumnChunk{std::move(dataType), startNodeOffset, endNodeOffset, copyDescription},
-          inMemOverflowFile{inMemOverflowFile} {}
+          inMemOverflowFile{inMemOverflowFile}, blobBuffer{std::make_unique<uint8_t[]>(
+                                                    common::BufferPoolConstants::PAGE_4KB_SIZE)} {}
 
     void copyArrowArray(arrow::Array& array, arrow::Array* nodeOffsets = nullptr) final;
 
@@ -106,6 +107,7 @@ class InMemColumnChunkWithOverflow : public InMemColumnChunk {
     storage::InMemOverflowFile* inMemOverflowFile;
     // TODO(Ziyi/Guodong): Fix this for rel columns.
     PageByteCursor overflowCursor;
+    std::unique_ptr<uint8_t[]> blobBuffer;
 };
 
 class InMemStructColumnChunk : public InMemColumnChunk {
diff --git a/src/include/storage/in_mem_storage_structure/in_mem_lists.h b/src/include/storage/in_mem_storage_structure/in_mem_lists.h
index fdb8f90613..6211a5342c 100644
--- a/src/include/storage/in_mem_storage_structure/in_mem_lists.h
+++ b/src/include/storage/in_mem_storage_structure/in_mem_lists.h
@@ -142,6 +142,7 @@ class InMemListsWithOverflow : public InMemLists {
     std::unique_ptr<InMemOverflowFile> overflowInMemFile;
     // TODO(Guodong/Ziyi): Fix for concurrent writes.
     PageByteCursor overflowCursor;
+    std::unique_ptr<uint8_t[]> blobBuffer;
 };
 
 class InMemAdjLists : public InMemLists {
diff --git a/src/include/storage/storage_structure/column.h b/src/include/storage/storage_structure/column.h
index d0852a48f6..e7c66e9cfa 100644
--- a/src/include/storage/storage_structure/column.h
+++ b/src/include/storage/storage_structure/column.h
@@ -221,6 +221,7 @@ class ColumnFactory {
         case common::LogicalTypeID::INTERVAL:
         case common::LogicalTypeID::FIXED_LIST:
             return std::make_unique<Column>(structureIDAndFName, logicalType, bufferManager, wal);
+        case common::LogicalTypeID::BLOB:
         case common::LogicalTypeID::STRING:
             return std::make_unique<StringPropertyColumn>(
                 structureIDAndFName, logicalType, bufferManager, wal);
diff --git a/src/include/storage/storage_structure/lists/lists.h b/src/include/storage/storage_structure/lists/lists.h
index 9861923058..430e8160ca 100644
--- a/src/include/storage/storage_structure/lists/lists.h
+++ b/src/include/storage/storage_structure/lists/lists.h
@@ -264,6 +264,7 @@ class ListsFactory {
             return std::make_unique<Lists>(structureIDAndFName, dataType,
                 storage::StorageUtils::getDataTypeSize(dataType), adjListsHeaders, bufferManager,
                 wal, listsUpdatesStore);
+        case common::LogicalTypeID::BLOB:
         case common::LogicalTypeID::STRING:
             return std::make_unique<StringPropertyLists>(
                 structureIDAndFName, adjListsHeaders, bufferManager, wal, listsUpdatesStore);
diff --git a/src/parser/transformer.cpp b/src/parser/transformer.cpp
index 6b18761a0a..0f63970707 100644
--- a/src/parser/transformer.cpp
+++ b/src/parser/transformer.cpp
@@ -1088,7 +1088,7 @@ Transformer::transformParsingOptions(CypherParser::KU_ParsingOptionsContext& ctx
 
 std::string Transformer::transformStringLiteral(antlr4::tree::TerminalNode& stringLiteral) {
     auto str = stringLiteral.getText();
-    return str.substr(1, str.size() - 2);
+    return common::StringUtils::removeEscapedCharacters(str);
 }
 
 } // namespace parser
diff --git a/src/processor/operator/order_by/order_by.cpp b/src/processor/operator/order_by/order_by.cpp
index 7218981754..c06e6555e6 100644
--- a/src/processor/operator/order_by/order_by.cpp
+++ b/src/processor/operator/order_by/order_by.cpp
@@ -50,7 +50,7 @@ void OrderBy::initGlobalStateInternal(kuzu::processor::ExecutionContext* context
     auto tableSchema = populateTableSchema();
     for (auto i = 0u; i < orderByDataInfo.keysPosAndType.size(); ++i) {
         auto [dataPos, dataType] = orderByDataInfo.keysPosAndType[i];
-        if (LogicalTypeID::STRING == dataType.getLogicalTypeID()) {
+        if (PhysicalTypeID::STRING == dataType.getPhysicalType()) {
             // If this is a string column, we need to find the factorizedTable offset for this
             // column.
             auto factorizedTableColIdx = 0ul;
diff --git a/src/storage/copier/table_copy_utils.cpp b/src/storage/copier/table_copy_utils.cpp
index 350dbb117c..d6a2d01b97 100644
--- a/src/storage/copier/table_copy_utils.cpp
+++ b/src/storage/copier/table_copy_utils.cpp
@@ -365,6 +365,7 @@ std::shared_ptr<arrow::DataType> TableCopyUtils::toArrowDataType(const LogicalTy
     case LogicalTypeID::INTERVAL:
     case LogicalTypeID::FIXED_LIST:
     case LogicalTypeID::VAR_LIST:
+    case LogicalTypeID::BLOB:
     case LogicalTypeID::STRING:
     case LogicalTypeID::STRUCT: {
         return arrow::utf8();
diff --git a/src/storage/in_mem_storage_structure/in_mem_column.cpp b/src/storage/in_mem_storage_structure/in_mem_column.cpp
index 21572c1359..294f314c04 100644
--- a/src/storage/in_mem_storage_structure/in_mem_column.cpp
+++ b/src/storage/in_mem_storage_structure/in_mem_column.cpp
@@ -12,8 +12,8 @@ namespace storage {
 InMemColumn::InMemColumn(std::string filePath, LogicalType dataType, bool requireNullBits)
     : filePath{std::move(filePath)}, dataType{std::move(dataType)} {
     // TODO(Guodong): Separate this as a function.
-    switch (this->dataType.getLogicalTypeID()) {
-    case LogicalTypeID::STRUCT: {
+    switch (this->dataType.getPhysicalType()) {
+    case PhysicalTypeID::STRUCT: {
         auto fieldTypes = common::StructType::getFieldTypes(&this->dataType);
         childColumns.reserve(fieldTypes.size());
         for (auto i = 0u; i < fieldTypes.size(); i++) {
@@ -22,8 +22,8 @@ InMemColumn::InMemColumn(std::string filePath, LogicalType dataType, bool requir
                 true /* hasNull */));
         }
     } break;
-    case LogicalTypeID::STRING:
-    case LogicalTypeID::VAR_LIST: {
+    case PhysicalTypeID::STRING:
+    case PhysicalTypeID::VAR_LIST: {
         inMemOverflowFile =
             std::make_unique<InMemOverflowFile>(StorageUtils::getOverflowFileName(this->filePath));
         fileHandle = std::make_unique<FileHandle>(
diff --git a/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp b/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp
index 4d7554b9bb..c3d2d82eb3 100644
--- a/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp
+++ b/src/storage/in_mem_storage_structure/in_mem_column_chunk.cpp
@@ -220,6 +220,9 @@ void InMemColumnChunkWithOverflow::copyArrowArray(arrow::Array& array, arrow::Ar
         case common::LogicalTypeID::STRING: {
             templateCopyValuesAsStringToPageWithOverflow<ku_string_t>(array, nodeOffsets);
         } break;
+        case common::LogicalTypeID::BLOB: {
+            templateCopyValuesAsStringToPageWithOverflow<blob_t>(array, nodeOffsets);
+        } break;
         case common::LogicalTypeID::VAR_LIST: {
             templateCopyValuesAsStringToPageWithOverflow<ku_list_t>(array, nodeOffsets);
         } break;
@@ -303,6 +306,17 @@ void InMemColumnChunkWithOverflow::setValWithOverflow<ku_string_t>(
     setValue(val, pos);
 }
 
+// BLOB
+template<>
+void InMemColumnChunkWithOverflow::setValWithOverflow<blob_t>(
+    const char* value, uint64_t length, uint64_t pos) {
+    auto blobLen = Blob::fromString(
+        value, std::min(length, BufferPoolConstants::PAGE_4KB_SIZE), blobBuffer.get());
+    auto blobVal = inMemOverflowFile->copyString(
+        reinterpret_cast<const char*>(blobBuffer.get()), blobLen, overflowCursor);
+    setValue(blobVal, pos);
+}
+
 // VAR_LIST
 template<>
 void InMemColumnChunkWithOverflow::setValWithOverflow<ku_list_t>(
diff --git a/src/storage/in_mem_storage_structure/in_mem_lists.cpp b/src/storage/in_mem_storage_structure/in_mem_lists.cpp
index 2c6279ef81..f24a8b5226 100644
--- a/src/storage/in_mem_storage_structure/in_mem_lists.cpp
+++ b/src/storage/in_mem_storage_structure/in_mem_lists.cpp
@@ -312,7 +312,8 @@ InMemListsWithOverflow::InMemListsWithOverflow(std::string fName, LogicalType da
     uint64_t numNodes, std::shared_ptr<ListHeadersBuilder> listHeadersBuilder,
     const common::CopyDescription* copyDescription)
     : InMemLists{std::move(fName), std::move(dataType),
-          storage::StorageUtils::getDataTypeSize(dataType), numNodes, copyDescription, true} {
+          storage::StorageUtils::getDataTypeSize(dataType), numNodes, copyDescription, true},
+      blobBuffer{std::make_unique<uint8_t[]>(BufferPoolConstants::PAGE_4KB_SIZE)} {
     assert(this->dataType.getLogicalTypeID() == LogicalTypeID::STRING ||
            this->dataType.getLogicalTypeID() == LogicalTypeID::VAR_LIST);
     overflowInMemFile =
@@ -324,6 +325,10 @@ void InMemListsWithOverflow::copyArrowArray(
     arrow::Array* boundNodeOffsets, arrow::Array* posInRelLists, arrow::Array* array) {
     assert(array->type_id() == arrow::Type::STRING);
     switch (dataType.getLogicalTypeID()) {
+    case common::LogicalTypeID::BLOB: {
+        templateCopyArrayAsStringToRelListsWithOverflow<blob_t>(
+            boundNodeOffsets, posInRelLists, array);
+    } break;
     case common::LogicalTypeID::STRING: {
         templateCopyArrayAsStringToRelListsWithOverflow<ku_string_t>(
             boundNodeOffsets, posInRelLists, array);
@@ -360,6 +365,16 @@ void InMemListsWithOverflow::setValueFromStringWithOverflow<ku_string_t>(
     setValue(nodeOffset, pos, (uint8_t*)&stringVal);
 }
 
+template<>
+void InMemListsWithOverflow::setValueFromStringWithOverflow<blob_t>(
+    offset_t nodeOffset, uint64_t pos, const char* val, uint64_t length) {
+    auto blobLen = Blob::fromString(
+        val, std::min(length, BufferPoolConstants::PAGE_4KB_SIZE), blobBuffer.get());
+    auto blobVal = overflowInMemFile->copyString(
+        reinterpret_cast<const char*>(blobBuffer.get()), blobLen, overflowCursor);
+    setValue(nodeOffset, pos, (uint8_t*)&blobVal);
+}
+
 template<>
 void InMemListsWithOverflow::setValueFromStringWithOverflow<ku_list_t>(
     offset_t nodeOffset, uint64_t pos, const char* val, uint64_t length) {
@@ -390,6 +405,7 @@ std::unique_ptr<InMemLists> InMemListsFactory::getInMemPropertyLists(const std::
         return make_unique<InMemLists>(fName, dataType,
             storage::StorageUtils::getDataTypeSize(dataType), numNodes,
             std::move(listHeadersBuilder), copyDescription, true /* hasNULLBytes */);
+    case LogicalTypeID::BLOB:
     case LogicalTypeID::STRING:
         return make_unique<InMemStringLists>(fName, numNodes, std::move(listHeadersBuilder));
     case LogicalTypeID::VAR_LIST:
diff --git a/test/c_api/connection_test.cpp b/test/c_api/connection_test.cpp
index 9ff5d3633a..70765c4903 100644
--- a/test/c_api/connection_test.cpp
+++ b/test/c_api/connection_test.cpp
@@ -173,7 +173,8 @@ TEST_F(CApiConnectionTest, GetNodePropertyNames) {
         "\tname STRING(PRIMARY KEY)\n"
         "\tlength INT32\n"
         "\tnote STRING\n"
-        "\tdescription STRUCT(RATING:DOUBLE, VIEWS:INT64, RELEASE:TIMESTAMP, FILM:DATE)\n");
+        "\tdescription STRUCT(RATING:DOUBLE, VIEWS:INT64, RELEASE:TIMESTAMP, FILM:DATE)\n"
+        "\tcontent BLOB\n");
     free(result);
 }
 
@@ -186,7 +187,8 @@ TEST_F(CApiConnectionTest, GetRelPropertyNames) {
                             "meets dst node: person\n"
                             "meets properties: \n"
                             "\tlocation FLOAT[2]\n"
-                            "\ttimes INT32\n");
+                            "\ttimes INT32\n"
+                            "\tdata BLOB\n");
     free(result);
 }
 
diff --git a/test/test_files/exceptions/binder/binder_error.test b/test/test_files/exceptions/binder/binder_error.test
index 4c42318ac2..a125c19047 100644
--- a/test/test_files/exceptions/binder/binder_error.test
+++ b/test/test_files/exceptions/binder/binder_error.test
@@ -334,6 +334,8 @@ DISTINCT (TIMESTAMP) -> TIMESTAMP
 (TIMESTAMP) -> TIMESTAMP
 DISTINCT (INTERVAL) -> INTERVAL
 (INTERVAL) -> INTERVAL
+DISTINCT (BLOB) -> BLOB
+(BLOB) -> BLOB
 DISTINCT (STRING) -> STRING
 (STRING) -> STRING
 DISTINCT (SERIAL) -> SERIAL
diff --git a/test/test_files/tinysnb/agg/hash.test b/test/test_files/tinysnb/agg/hash.test
index 317dc58335..0443daa077 100644
--- a/test/test_files/tinysnb/agg/hash.test
+++ b/test/test_files/tinysnb/agg/hash.test
@@ -138,6 +138,13 @@ Alice|1
 40|[Greg]
 83|[Hubert Blaine Wolfeschlegelsteinhausenbergerdorff]
 
+-NAME HashCollectBLOBTest
+-QUERY MATCH (m:movies) RETURN m.length > 130, collect(m.content)
+---- 2
+False|[\xAA\xABinteresting\x0B]
+True|[\xAB\xCD,pure ascii characters]
+
+
 -NAME HashCollectLISTOfINT64Test
 -QUERY MATCH (p:person) RETURN p.gender, collect(p.workedHours)
 ---- 2
@@ -190,3 +197,27 @@ Elizabeth|[25,40]
 ---- 2
 2020|[55,22]
 2021|[5]
+
+-NAME HashAggCountBlob
+-QUERY MATCH (m:movies) RETURN m.length > 2542, COUNT(m.content)
+-PARALLELISM 4
+-ENUMERATE
+---- 2
+False|2
+True|1
+
+-NAME HashAggMinBlob
+-QUERY MATCH (m:movies) RETURN m.length <> 298, MIN(m.content)
+-PARALLELISM 2
+-ENUMERATE
+---- 2
+False|pure ascii characters
+True|\xAA\xABinteresting\x0B
+
+-NAME HashAggMaxBlob
+-QUERY MATCH (m:movies) RETURN m.length <> 126, MAX(m.content)
+-PARALLELISM 1
+-ENUMERATE
+---- 2
+False|\xAA\xABinteresting\x0B
+True|\xAB\xCD
diff --git a/test/test_files/tinysnb/agg/simple.test b/test/test_files/tinysnb/agg/simple.test
index 010c2637fb..12415e9920 100644
--- a/test/test_files/tinysnb/agg/simple.test
+++ b/test/test_files/tinysnb/agg/simple.test
@@ -93,6 +93,11 @@ False
 ---- 1
 [5,55,22]
 
+-NAME SimpleCollectBLOBTest
+-QUERY MATCH (m:movies) RETURN collect(m.content)
+---- 1
+[\xAA\xABinteresting\x0B,\xAB\xCD,pure ascii characters]
+
 -NAME SimpleCollectSTRINGTest
 -QUERY MATCH (p:person) RETURN collect(p.fName)
 ---- 1
@@ -172,3 +177,24 @@ False
 -ENUMERATE
 ---- 1
 989.333333
+
+-NAME SimpleAggCountBlob
+-QUERY MATCH (m:movies) RETURN COUNT(m.content)
+-PARALLELISM 3
+-ENUMERATE
+---- 1
+3
+
+-NAME SimpleAggMinBlob
+-QUERY MATCH (m:movies) RETURN MIN(m.content)
+-PARALLELISM 3
+-ENUMERATE
+---- 1
+pure ascii characters
+
+-NAME SimpleAggMaxBlob
+-QUERY MATCH (m:movies) RETURN MAX(m.content)
+-PARALLELISM 3
+-ENUMERATE
+---- 1
+\xAB\xCD
diff --git a/test/test_files/tinysnb/function/blob.test b/test/test_files/tinysnb/function/blob.test
index 8504d50ca0..7ed4d6e18b 100644
--- a/test/test_files/tinysnb/function/blob.test
+++ b/test/test_files/tinysnb/function/blob.test
@@ -39,3 +39,45 @@ decode this utf-8 char: ü
 -QUERY RETURN decode(BLOB('decode this invalid utf-8 char: \\xA1\\xBC'))
 ---- error
 Runtime exception: Failure in decode: could not convert blob to UTF8 string, the blob contained invalid UTF8 characters
+
+-NAME BlobGreaterThan
+-QUERY MATCH (m:movies) RETURN m.content > BLOB("test");
+---- 3
+True
+True
+False
+
+-NAME BlobGreaterThanOrEqualTo
+-QUERY MATCH (m:movies) RETURN m.content >= BLOB("\\xAA\\xABinteresting\\x0B");
+---- 3
+True
+True
+False
+
+-NAME BlobEqualTo
+-QUERY MATCH (m:movies) RETURN m.content = BLOB("\\xAA\\xABinteresting\\x0B");
+---- 3
+True
+False
+False
+
+-NAME BlobNotEqualTo
+-QUERY MATCH (m:movies) RETURN m.content <> BLOB("\\xAA\\xABinteresting\\x0B");
+---- 3
+False
+True
+True
+
+-NAME BlobGreaterThan
+-QUERY MATCH (m:movies) RETURN m.content < BLOB("\\xAB\\xCD");
+---- 3
+True
+False
+True
+
+-NAME BlobGreaterThanOrEqualTo
+-QUERY MATCH (m:movies) RETURN m.content <= BLOB("pure ascii characters");
+---- 3
+False
+False
+True
diff --git a/test/test_files/tinysnb/order_by/single_label.test b/test/test_files/tinysnb/order_by/single_label.test
index eacefd2054..39e72da174 100644
--- a/test/test_files/tinysnb/order_by/single_label.test
+++ b/test/test_files/tinysnb/order_by/single_label.test
@@ -134,6 +134,15 @@ Carol
 Bob
 Alice
 
+-NAME OrderByBlobTest
+-QUERY MATCH (m:movies) RETURN m.content ORDER BY m.content DESC
+-CHECK_ORDER
+-PARALLELISM 2
+---- 3
+\xAB\xCD
+\xAA\xABinteresting\x0B
+pure ascii characters
+
 -NAME OrderByStrMultipleColTest
 -QUERY MATCH (p:person) RETURN p.age, p.eyeSight ORDER BY p.isWorker desc, p.age, p.eyeSight desc
 -CHECK_ORDER
diff --git a/test/test_files/tinysnb/projection/multi_label.test b/test/test_files/tinysnb/projection/multi_label.test
index dd2cdd4e2a..e918a73b17 100644
--- a/test/test_files/tinysnb/projection/multi_label.test
+++ b/test/test_files/tinysnb/projection/multi_label.test
@@ -30,9 +30,9 @@
 -NAME MultiLabelReturnStar
 -QUERY MATCH (a:movies:organisation) RETURN *
 ---- 6
-(label:movies, 2:0, {ID:, name:Sóló cón tu párejâ, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:126, note: this is a very very good movie, description:{RATING: 5.300000, VIEWS: 152, RELEASE: 2011-08-20 11:25:30, FILM: 2012-05-11}})
-(label:movies, 2:1, {ID:, name:The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:2544, note: the movie is very very good, description:{RATING: 7.000000, VIEWS: 982, RELEASE: 2018-11-13 13:33:11, FILM: 2014-09-12}})
-(label:movies, 2:2, {ID:, name:Roma, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:298, note:the movie is very interesting and funny, description:{RATING: 1223.000000, VIEWS: 10003, RELEASE: 2011-02-11 16:44:22, FILM: 2013-02-22}})
-(label:organisation, 1:0, {ID:1, name:ABFsUni, orgCode:325, mark:3.700000, score:-2, history:10 years 5 months 13 hours 24 us, licenseValidInterval:3 years 5 days, rating:1.000000, state:{REVENUE: 138, LOCATION: ['toronto', 'montr,eal'], STOCK: {PRICE: [96,56], VOLUME: 1000}}, length:, note:, description:})
-(label:organisation, 1:1, {ID:4, name:CsWork, orgCode:934, mark:4.100000, score:-100, history:2 years 4 days 10 hours, licenseValidInterval:26 years 52 days 48:00:00, rating:0.780000, state:{REVENUE: 152, LOCATION: ["vanco,uver north area"], STOCK: {PRICE: [15,78,671], VOLUME: 432}}, length:, note:, description:})
-(label:organisation, 1:2, {ID:6, name:DEsWork, orgCode:824, mark:4.100000, score:7, history:2 years 4 hours 22 us 34 minutes, licenseValidInterval:82:00:00.1, rating:0.520000, state:{REVENUE: 558, LOCATION: ['very long city name', 'new york'], STOCK: {PRICE: [22], VOLUME: 99}}, length:, note:, description:})
+(label:movies, 2:0, {ID:, name:Sóló cón tu párejâ, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:126, note: this is a very very good movie, description:{RATING: 5.300000, VIEWS: 152, RELEASE: 2011-08-20 11:25:30, FILM: 2012-05-11}, content:\xAA\xABinteresting\x0B})
+(label:movies, 2:1, {ID:, name:The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:2544, note: the movie is very very good, description:{RATING: 7.000000, VIEWS: 982, RELEASE: 2018-11-13 13:33:11, FILM: 2014-09-12}, content:\xAB\xCD})
+(label:movies, 2:2, {ID:, name:Roma, orgCode:, mark:, score:, history:, licenseValidInterval:, rating:, state:, length:298, note:the movie is very interesting and funny, description:{RATING: 1223.000000, VIEWS: 10003, RELEASE: 2011-02-11 16:44:22, FILM: 2013-02-22}, content:pure ascii characters})
+(label:organisation, 1:0, {ID:1, name:ABFsUni, orgCode:325, mark:3.700000, score:-2, history:10 years 5 months 13 hours 24 us, licenseValidInterval:3 years 5 days, rating:1.000000, state:{REVENUE: 138, LOCATION: ['toronto', 'montr,eal'], STOCK: {PRICE: [96,56], VOLUME: 1000}}, length:, note:, description:, content:})
+(label:organisation, 1:1, {ID:4, name:CsWork, orgCode:934, mark:4.100000, score:-100, history:2 years 4 days 10 hours, licenseValidInterval:26 years 52 days 48:00:00, rating:0.780000, state:{REVENUE: 152, LOCATION: ["vanco,uver north area"], STOCK: {PRICE: [15,78,671], VOLUME: 432}}, length:, note:, description:, content:})
+(label:organisation, 1:2, {ID:6, name:DEsWork, orgCode:824, mark:4.100000, score:7, history:2 years 4 hours 22 us 34 minutes, licenseValidInterval:82:00:00.1, rating:0.520000, state:{REVENUE: 558, LOCATION: ['very long city name', 'new york'], STOCK: {PRICE: [22], VOLUME: 99}}, length:, note:, description:, content:})
diff --git a/test/test_files/tinysnb/projection/single_label.test b/test/test_files/tinysnb/projection/single_label.test
index 3c287cdfbc..98ef93669c 100644
--- a/test/test_files/tinysnb/projection/single_label.test
+++ b/test/test_files/tinysnb/projection/single_label.test
@@ -619,3 +619,26 @@ Dan|Carol
 {{8=Farooq}=25}
 {{9=Greg}=40}
 {{10=Hubert Blaine Wolfeschlegelsteinhausenbergerdorff}=83}
+
+-NAME ReturnNodeBlobProp
+-QUERY MATCH (m:movies) RETURN m.content;
+---- 3
+\xAA\xABinteresting\x0B
+\xAB\xCD
+pure ascii characters
+
+-NAME ReturnRelBlobProp
+-QUERY MATCH (p:person)-[e:meets]->(p1:person) RETURN e.data;
+---- 7
+MIXED \x5CxAC with ASCII \x5Cx0A
+NO hex code
+NO hex code
+\x5Cx3A\x5CxA3
+\x5Cx3A\x5CxA3
+\x5CxA1\x5Cx2A
+\x5CxAA\x5CxBB\x5CxCC\x5CxDD
+
+-NAME ReturnStringLiteralWithSpecialChar
+-QUERY RETURN "hEl \'\"\\";
+---- 1
+hEl '"\
diff --git a/tools/python_api/test/conftest.py b/tools/python_api/test/conftest.py
index c7f1d23dff..222545a8e4 100644
--- a/tools/python_api/test/conftest.py
+++ b/tools/python_api/test/conftest.py
@@ -28,7 +28,7 @@ def init_tiny_snb(tmp_path):
                  "STRING[], stock STRUCT(price INT64[], volume INT64)), PRIMARY KEY (ID));")
     conn.execute('COPY organisation FROM "../../../dataset/tinysnb/vOrganisation.csv"')
     conn.execute('CREATE NODE TABLE movies (name STRING, length INT32, note STRING, description STRUCT(rating DOUBLE, '
-                 'views INT64, release TIMESTAMP, film DATE), PRIMARY KEY (name))')
+                 'views INT64, release TIMESTAMP, film DATE), content BYTEA, PRIMARY KEY (name))')
     conn.execute('COPY movies FROM "../../../dataset/tinysnb/vMovies.csv"')
     conn.execute('create rel table workAt (FROM person TO organisation, year INT64, grading DOUBLE[2], rating float,'
                  ' MANY_ONE)')