finish string cast to num

kuzudb · Sep 29, 2023 · af4bd0b · af4bd0b
1 parent d4b0f19
commit af4bd0b
Show file tree

Hide file tree

Showing 19 changed files with 3,018 additions and 116 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -223,6 +223,7 @@ include_directories(third_party/re2/include)
 include_directories(third_party/concurrentqueue)
 include_directories(third_party/serd/include)
 include_directories(third_party/miniparquet/src)
+include_directories(third_party/fast_float/include)
 
 add_subdirectory(third_party)
 add_subdirectory(src)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -14,7 +14,7 @@ add_subdirectory(transaction)
 
 add_library(kuzu STATIC ${ALL_OBJECT_FILES})
 target_link_libraries(kuzu
-        PUBLIC antlr4_cypher antlr4_runtime utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads fastpfor miniparquet)
+        PUBLIC antlr4_cypher antlr4_runtime fast_float utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads fastpfor miniparquet)
 target_include_directories(kuzu
         PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 add_library(kuzu_shared SHARED ${ALL_OBJECT_FILES})
@@ -24,6 +24,6 @@ else()
         set_target_properties(kuzu_shared PROPERTIES OUTPUT_NAME kuzu)
 endif()
 target_link_libraries(kuzu_shared
-        PUBLIC antlr4_cypher antlr4_runtime utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads fastpfor miniparquet)
+        PUBLIC antlr4_cypher antlr4_runtime fast_float utf8proc re2 serd ${PARQUET_LIB} ${ARROW_LIB} Threads::Threads fastpfor miniparquet)
 target_include_directories(kuzu_shared
         PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
diff --git a/src/common/string_utils.cpp b/src/common/string_utils.cpp
@@ -31,6 +31,21 @@ std::vector<std::string> StringUtils::splitBySpace(const std::string& input) {
     return result;
 }
 
+// Trims leading/trailing whitespace in place: advances `input` and sets `len` to the trimmed
+// length (0 if empty or all spaces). unsigned char cast: isspace(negative char) is undefined.
+void StringUtils::removeCStringWhiteSpaces(const char*& input, uint64_t& len) {
+    uint64_t start = 0;
+    uint64_t end = len; // exclusive bound, so len == 0 cannot underflow
+    while (start < end && isspace(static_cast<unsigned char>(input[start]))) {
+        start++;
+    }
+    while (end > start && isspace(static_cast<unsigned char>(input[end - 1]))) {
+        end--;
+    }
+    input += start;
+    len = end - start;
+}
+
 void StringUtils::replaceAll(
     std::string& str, const std::string& search, const std::string& replacement) {
     size_t pos = 0;

diff --git a/src/function/vector_cast_functions.cpp b/src/function/vector_cast_functions.cpp
@@ -252,6 +252,8 @@ vector_function_definitions CastToDoubleVectorFunction::getDefinitions() {
         CAST_TO_DOUBLE_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::DOUBLE));
     result.push_back(bindVectorFunction<float_t, double_t, CastToDouble>(
         CAST_TO_DOUBLE_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::DOUBLE));
+    result.push_back(bindVectorFunction<ku_string_t, double_t, CastToDouble>(
+        CAST_TO_DOUBLE_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::DOUBLE));
     return result;
 }
 
@@ -273,6 +275,8 @@ vector_function_definitions CastToFloatVectorFunction::getDefinitions() {
         CAST_TO_FLOAT_FUNC_NAME, LogicalTypeID::UINT16, LogicalTypeID::FLOAT));
     result.push_back(bindVectorFunction<uint8_t, float_t, CastToFloat>(
         CAST_TO_FLOAT_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::FLOAT));
+    result.push_back(bindVectorFunction<ku_string_t, float_t, CastToFloat>(
+        CAST_TO_FLOAT_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::FLOAT));
     // down cast
     result.push_back(bindVectorFunction<double_t, float_t, CastToFloat>(
         CAST_TO_FLOAT_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::FLOAT));
@@ -314,16 +318,19 @@ vector_function_definitions CastToInt64VectorFunction::getDefinitions() {
     // down cast
     result.push_back(bindVectorFunction<uint64_t, int64_t, CastToInt64>(
         CAST_TO_INT64_FUNC_NAME, LogicalTypeID::UINT64, LogicalTypeID::INT64));
+    result.push_back(bindVectorFunction<float_t, int64_t, CastToInt64>(
+        CAST_TO_INT64_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::INT64));
+    result.push_back(bindVectorFunction<double_t, int64_t, CastToInt64>(
+        CAST_TO_INT64_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::INT64));
+    // down cast end
     result.push_back(bindVectorFunction<uint32_t, int64_t, CastToInt64>(
         CAST_TO_INT64_FUNC_NAME, LogicalTypeID::UINT32, LogicalTypeID::INT64));
     result.push_back(bindVectorFunction<uint16_t, int64_t, CastToInt64>(
         CAST_TO_INT64_FUNC_NAME, LogicalTypeID::UINT16, LogicalTypeID::INT64));
     result.push_back(bindVectorFunction<uint8_t, int64_t, CastToInt64>(
         CAST_TO_INT64_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::INT64));
-    result.push_back(bindVectorFunction<float_t, int64_t, CastToInt64>(
-        CAST_TO_INT64_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::INT64));
-    result.push_back(bindVectorFunction<double_t, int64_t, CastToInt64>(
-        CAST_TO_INT64_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::INT64));
+    result.push_back(bindVectorFunction<ku_string_t, int64_t, CastToInt64>(
+        CAST_TO_INT64_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::INT64));
     return result;
 }
 
@@ -340,14 +347,17 @@ vector_function_definitions CastToInt32VectorFunction::getDefinitions() {
         CAST_TO_INT32_FUNC_NAME, LogicalTypeID::UINT64, LogicalTypeID::INT32));
     result.push_back(bindVectorFunction<uint32_t, int32_t, CastToInt32>(
         CAST_TO_INT32_FUNC_NAME, LogicalTypeID::UINT32, LogicalTypeID::INT32));
-    result.push_back(bindVectorFunction<uint16_t, int32_t, CastToInt32>(
-        CAST_TO_INT32_FUNC_NAME, LogicalTypeID::UINT16, LogicalTypeID::INT32));
-    result.push_back(bindVectorFunction<uint8_t, int32_t, CastToInt32>(
-        CAST_TO_INT32_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::INT32));
     result.push_back(bindVectorFunction<float_t, int32_t, CastToInt32>(
         CAST_TO_INT32_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::INT32));
     result.push_back(bindVectorFunction<double_t, int32_t, CastToInt32>(
         CAST_TO_INT32_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::INT32));
+    // down cast end
+    result.push_back(bindVectorFunction<uint16_t, int32_t, CastToInt32>(
+        CAST_TO_INT32_FUNC_NAME, LogicalTypeID::UINT16, LogicalTypeID::INT32));
+    result.push_back(bindVectorFunction<uint8_t, int32_t, CastToInt32>(
+        CAST_TO_INT32_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::INT32));
+    result.push_back(bindVectorFunction<ku_string_t, int32_t, CastToInt32>(
+        CAST_TO_INT32_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::INT32));
     return result;
 }
 
@@ -366,12 +376,15 @@ vector_function_definitions CastToInt16VectorFunction::getDefinitions() {
         CAST_TO_INT16_FUNC_NAME, LogicalTypeID::UINT32, LogicalTypeID::INT16));
     result.push_back(bindVectorFunction<uint16_t, int16_t, CastToInt16>(
         CAST_TO_INT16_FUNC_NAME, LogicalTypeID::UINT16, LogicalTypeID::INT16));
-    result.push_back(bindVectorFunction<uint8_t, int16_t, CastToInt16>(
-        CAST_TO_INT16_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::INT16));
     result.push_back(bindVectorFunction<float_t, int16_t, CastToInt16>(
         CAST_TO_INT16_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::INT16));
     result.push_back(bindVectorFunction<double_t, int16_t, CastToInt16>(
         CAST_TO_INT16_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::INT16));
+    // down cast end
+    result.push_back(bindVectorFunction<uint8_t, int16_t, CastToInt16>(
+        CAST_TO_INT16_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::INT16));
+    result.push_back(bindVectorFunction<ku_string_t, int16_t, CastToInt16>(
+        CAST_TO_INT16_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::INT16));
     return result;
 }
 
@@ -386,8 +399,20 @@ vector_function_definitions CastToInt8VectorFunction::getDefinitions() {
         CAST_TO_INT8_FUNC_NAME, LogicalTypeID::INT64, LogicalTypeID::INT8));
     result.push_back(bindVectorFunction<float_t, int8_t, CastToInt8>(
         CAST_TO_INT8_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::INT8));
+    result.push_back(bindVectorFunction<uint64_t, int8_t, CastToInt8>(
+        CAST_TO_INT8_FUNC_NAME, LogicalTypeID::UINT64, LogicalTypeID::INT8));
+    result.push_back(bindVectorFunction<uint32_t, int8_t, CastToInt8>(
+        CAST_TO_INT8_FUNC_NAME, LogicalTypeID::UINT32, LogicalTypeID::INT8));
+    result.push_back(bindVectorFunction<uint16_t, int8_t, CastToInt8>(
+        CAST_TO_INT8_FUNC_NAME, LogicalTypeID::UINT16, LogicalTypeID::INT8));
+    result.push_back(bindVectorFunction<uint8_t, int8_t, CastToInt8>(
+        CAST_TO_INT8_FUNC_NAME, LogicalTypeID::UINT8, LogicalTypeID::INT8));
     result.push_back(bindVectorFunction<double_t, int8_t, CastToInt8>(
         CAST_TO_INT8_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::INT8));
+    result.push_back(bindVectorFunction<float_t, int8_t, CastToInt8>(
+        CAST_TO_INT8_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::INT8));
+    result.push_back(bindVectorFunction<ku_string_t, int8_t, CastToInt8>(
+        CAST_TO_INT8_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::INT8));
     return result;
 }
 
@@ -399,7 +424,6 @@ vector_function_definitions CastToUInt64VectorFunction::getDefinitions() {
         CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::UINT16, LogicalTypeID::UINT64));
     result.push_back(bindVectorFunction<uint32_t, uint64_t, CastToUInt64>(
         CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::UINT32, LogicalTypeID::UINT64));
-    // down cast
     result.push_back(bindVectorFunction<int8_t, uint64_t, CastToUInt64>(
         CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::INT8, LogicalTypeID::UINT64));
     result.push_back(bindVectorFunction<int16_t, uint64_t, CastToUInt64>(
@@ -408,10 +432,13 @@ vector_function_definitions CastToUInt64VectorFunction::getDefinitions() {
         CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::INT32, LogicalTypeID::UINT64));
     result.push_back(bindVectorFunction<int64_t, uint64_t, CastToUInt64>(
         CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::INT64, LogicalTypeID::UINT64));
+    // down cast
     result.push_back(bindVectorFunction<float_t, uint64_t, CastToUInt64>(
         CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::UINT64));
     result.push_back(bindVectorFunction<double_t, uint64_t, CastToUInt64>(
         CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::UINT64));
+    result.push_back(bindVectorFunction<ku_string_t, uint64_t, CastToUInt64>(
+        CAST_TO_UINT64_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::UINT64));
     return result;
 }
 
@@ -436,6 +463,8 @@ vector_function_definitions CastToUInt32VectorFunction::getDefinitions() {
         CAST_TO_UINT32_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::UINT32));
     result.push_back(bindVectorFunction<double_t, uint32_t, CastToUInt32>(
         CAST_TO_UINT32_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::UINT32));
+    result.push_back(bindVectorFunction<ku_string_t, uint32_t, CastToUInt32>(
+        CAST_TO_UINT32_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::UINT32));
     return result;
 }
 
@@ -460,6 +489,8 @@ vector_function_definitions CastToUInt16VectorFunction::getDefinitions() {
         CAST_TO_UINT16_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::UINT16));
     result.push_back(bindVectorFunction<double_t, uint16_t, CastToUInt16>(
         CAST_TO_UINT16_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::UINT16));
+    result.push_back(bindVectorFunction<ku_string_t, uint16_t, CastToUInt16>(
+        CAST_TO_UINT16_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::UINT16));
     return result;
 }
 
@@ -484,6 +515,8 @@ vector_function_definitions CastToUInt8VectorFunction::getDefinitions() {
         CAST_TO_UINT8_FUNC_NAME, LogicalTypeID::FLOAT, LogicalTypeID::UINT8));
     result.push_back(bindVectorFunction<double_t, uint8_t, CastToUInt8>(
         CAST_TO_UINT8_FUNC_NAME, LogicalTypeID::DOUBLE, LogicalTypeID::UINT8));
+    result.push_back(bindVectorFunction<ku_string_t, uint8_t, CastToUInt8>(
+        CAST_TO_UINT8_FUNC_NAME, LogicalTypeID::STRING, LogicalTypeID::UINT8));
     return result;
 }
 

diff --git a/src/include/common/string_utils.h b/src/include/common/string_utils.h
@@ -69,6 +69,8 @@ class StringUtils {
         str = std::regex_replace(str, whiteSpacePattern, "");
     }
 
+    static void removeCStringWhiteSpaces(const char*& input, uint64_t& len);
+
     static void replaceAll(
         std::string& str, const std::string& search, const std::string& replacement);
 

diff --git a/src/include/common/type_utils.h b/src/include/common/type_utils.h
@@ -19,39 +19,6 @@ class StringCastUtils {
 public:
     static bool tryCastToBoolean(const char* data, uint64_t length, bool& result);
     static bool castToBool(const char* data, uint64_t length);
-    template<typename T>
-    static bool tryCastToNum(const char* data, uint64_t length, T& result) {
-        auto numStr = std::string{data, length};
-        removeSpace(numStr);
-        std::istringstream iss{numStr};
-        if (iss.str().empty()) {
-            throw ConversionException{"Empty string."};
-        }
-
-        if constexpr (std::is_same_v<int8_t, T>) {
-            int val;
-            iss >> val; // C++ will recognize int8 as char if we don't separate this case.
-            result = val;
-        } else if constexpr (std::is_same_v<uint8_t, T>) {
-            int val;
-            iss >> val; // C++ will recognize int8 as char if we don't separate this case.
-            result = val;
-        } else
-            iss >> result;
-
-        if (iss.fail() || !iss.eof()) {
-            return false;
-        }
-        return true;
-    }
-    template<typename T>
-    static T castToNum(const char* data, uint64_t length) {
-        T result;
-        if (!tryCastToNum(data, length, result)) {
-            throw ConversionException{"Invalid number: " + std::string{data, length} + "."};
-        }
-        return result;
-    }
 
 private:
     static void removeSpace(std::string& str);