From 8ca311e5fe35e2d0e92b60eec4555aa49424e6c3 Mon Sep 17 00:00:00 2001 From: Manh Dinh Date: Wed, 24 Apr 2024 19:55:30 -0400 Subject: [PATCH 1/4] Fix bind file scan --- dataset/copy-test/node/parquet/copy.cypher | 2 +- src/binder/bind/bind_file_scan.cpp | 3 +++ src/binder/bind/bind_import_database.cpp | 17 ++++++++++++----- test/test_files/copy/copy_node_parquet.test | 6 ++++++ test/test_files/copy/copy_partial_column.test | 4 ++-- .../exceptions/copy/wrong_header.test | 4 ++-- 6 files changed, 26 insertions(+), 10 deletions(-) diff --git a/dataset/copy-test/node/parquet/copy.cypher b/dataset/copy-test/node/parquet/copy.cypher index a5823700cd..4e9d385846 100644 --- a/dataset/copy-test/node/parquet/copy.cypher +++ b/dataset/copy-test/node/parquet/copy.cypher @@ -1 +1 @@ -COPY tableOfTypes FROM "dataset/copy-test/node/parquet/types_50k*.parquet" (HEADER=true); +COPY tableOfTypes FROM "dataset/copy-test/node/parquet/types_50k*.parquet"; diff --git a/src/binder/bind/bind_file_scan.cpp b/src/binder/bind/bind_file_scan.cpp index cf7c3035ab..ec33fbde1f 100644 --- a/src/binder/bind/bind_file_scan.cpp +++ b/src/binder/bind/bind_file_scan.cpp @@ -75,6 +75,9 @@ std::unique_ptr Binder::bindScanSource(BaseScanSource* sour auto filePaths = bindFilePaths(fileSource->filePaths); // Bind file type. auto fileType = bindFileType(filePaths); + if (fileType != FileType::CSV && options.size() != 0) { + throw BinderException{"Only copy from CSV can have options."}; + } auto config = std::make_unique(fileType, std::move(filePaths)); // Bind options. config->options = bindParsingOptions(options); diff --git a/src/binder/bind/bind_import_database.cpp b/src/binder/bind/bind_import_database.cpp index a919214764..383149567d 100644 --- a/src/binder/bind/bind_import_database.cpp +++ b/src/binder/bind/bind_import_database.cpp @@ -55,12 +55,19 @@ std::unique_ptr Binder::bindImportDatabaseClause(const Statement copyFromStatement->getSource()) ->filePaths; KU_ASSERT(filePaths.size() == 1); + auto fileType = bindFileType(filePaths); auto copyFilePath = boundFilePath + "/" + filePaths[0]; - auto csvConfig = CSVReaderConfig::construct( - bindParsingOptions(copyFromStatement->getParsingOptionsRef())); - auto csvQuery = stringFormat("COPY {} FROM \"{}\" {};", copyFromStatement->getTableName(), - copyFilePath, csvConfig.option.toCypher()); - finalQueryStatements += csvQuery; + std::string query; + if (fileType == FileType::CSV) { + auto csvConfig = CSVReaderConfig::construct( + bindParsingOptions(copyFromStatement->getParsingOptionsRef())); + query = stringFormat("COPY {} FROM \"{}\" {};", copyFromStatement->getTableName(), + copyFilePath, csvConfig.option.toCypher()); + } else { + query = stringFormat("COPY {} FROM \"{}\";", copyFromStatement->getTableName(), + copyFilePath); + } + finalQueryStatements += query; } finalQueryStatements += getQueryFromFile(fs, boundFilePath, ImportDBConstants::MACRO_NAME); return std::make_unique(boundFilePath, finalQueryStatements); diff --git a/test/test_files/copy/copy_node_parquet.test b/test/test_files/copy/copy_node_parquet.test index f169341072..6fc212bdca 100644 --- a/test/test_files/copy/copy_node_parquet.test +++ b/test/test_files/copy/copy_node_parquet.test @@ -53,3 +53,9 @@ -STATEMENT MATCH (row:tableOfTypes) WHERE 0 <= row.doubleColumn AND row.doubleColumn <= 10 AND 0 <= row.int64Column AND row.int64Column <= 10 RETURN count(*); ---- 1 546 + +-CASE CopyWithOptionsErrorTest + +-STATEMENT COPY tableOfTypes FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k*.parquet" (HEADER=true); +---- error +Binder exception: Only copy from CSV can have options. diff --git a/test/test_files/copy/copy_partial_column.test b/test/test_files/copy/copy_partial_column.test index 9b662766e9..91c1f3ee6c 100644 --- a/test/test_files/copy/copy_partial_column.test +++ b/test/test_files/copy/copy_partial_column.test @@ -184,7 +184,7 @@ Copy exception: Found NULL, which violates the non-null constraint of the primar ---- ok -STATEMENT COPY tableOfTypes12 (id, int64Column, doubleColumn, booleanColumn, dateColumn, stringColumn, listOfInt64, listOfString, listOfListOfInt64, structColumn) FROM -"${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k*.parquet" (HEADER=true); +"${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k*.parquet"; ---- ok -STATEMENT MATCH (row:tableOfTypes12) WHERE row.id >= 20 AND row.id <= 24 RETURN row.*; ---- 5 @@ -205,6 +205,6 @@ listOfString, listOfListOfInt64, structColumn) FROM ---- ok -STATEMENT COPY tableOfTypes12 (id, dateColumn, doubleColumn, booleanColumn, int64Column, stringColumn, listOfInt64, listOfString, listOfListOfInt64, structColumn) FROM -"${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k*.parquet" (HEADER=true); +"${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k*.parquet"; ---- error Binder exception: Column `dateColumn` type mismatch. Expected DATE but got INT64. diff --git a/test/test_files/exceptions/copy/wrong_header.test b/test/test_files/exceptions/copy/wrong_header.test index b9b0351b76..1172f3b0c0 100644 --- a/test/test_files/exceptions/copy/wrong_header.test +++ b/test/test_files/exceptions/copy/wrong_header.test @@ -111,7 +111,7 @@ Binder exception: Number of columns mismatch. Expected 4 but got 3. -CASE NodeUnmatchedNumColumns -STATEMENT create node table person (ID1 SERIAL, ID INT64, fName INT64, age INT64, PRIMARY KEY (ID1)) ---- ok --STATEMENT COPY person FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_1.parquet" (HEADER=true) +-STATEMENT COPY person FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_1.parquet" ---- error Binder exception: Number of columns mismatch. Expected 3 but got 10. @@ -120,6 +120,6 @@ Binder exception: Number of columns mismatch. Expected 3 but got 10. ---- ok -STATEMENT create rel table knows (FROM person TO person, time date, age INT64) ---- ok --STATEMENT COPY knows FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/follows.parquet" (HEADER=true) +-STATEMENT COPY knows FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/follows.parquet" ---- error Binder exception: Number of columns mismatch. Expected 4 but got 3. From 680d7c17c681d938990190a9a0ff34b0a0a57147 Mon Sep 17 00:00:00 2001 From: Manh Dinh Date: Thu, 25 Apr 2024 14:37:29 -0400 Subject: [PATCH 2/4] Fix on review --- src/binder/bind/bind_file_scan.cpp | 3 --- src/processor/operator/persistent/reader/npy/npy_reader.cpp | 4 ++++ .../operator/persistent/reader/parquet/parquet_reader.cpp | 4 ++++ test/test_files/copy/copy_node_parquet.test | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/binder/bind/bind_file_scan.cpp b/src/binder/bind/bind_file_scan.cpp index ec33fbde1f..cf7c3035ab 100644 --- a/src/binder/bind/bind_file_scan.cpp +++ b/src/binder/bind/bind_file_scan.cpp @@ -75,9 +75,6 @@ std::unique_ptr Binder::bindScanSource(BaseScanSource* sour auto filePaths = bindFilePaths(fileSource->filePaths); // Bind file type. auto fileType = bindFileType(filePaths); - if (fileType != FileType::CSV && options.size() != 0) { - throw BinderException{"Only copy from CSV can have options."}; - } auto config = std::make_unique(fileType, std::move(filePaths)); // Bind options. config->options = bindParsingOptions(options); diff --git a/src/processor/operator/persistent/reader/npy/npy_reader.cpp b/src/processor/operator/persistent/reader/npy/npy_reader.cpp index 5fd2298bbf..320d52cc1b 100644 --- a/src/processor/operator/persistent/reader/npy/npy_reader.cpp +++ b/src/processor/operator/persistent/reader/npy/npy_reader.cpp @@ -16,6 +16,7 @@ #include #endif +#include "common/exception/binder.h" #include "common/exception/copy.h" #include "common/string_format.h" #include "common/utils.h" @@ -296,6 +297,9 @@ static void bindColumns(const common::ReaderConfig& readerConfig, static std::unique_ptr bindFunc(main::ClientContext* /*context*/, function::TableFuncBindInput* input) { auto scanInput = reinterpret_cast(input); + if (!scanInput->config.options.empty()) { + throw BinderException{"Copy from Npy cannot have options."}; + } std::vector detectedColumnNames; std::vector detectedColumnTypes; bindColumns(scanInput->config, detectedColumnNames, detectedColumnTypes); diff --git a/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp b/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp index f7e40d05bb..419485ddbd 100644 --- a/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp +++ b/src/processor/operator/persistent/reader/parquet/parquet_reader.cpp @@ -2,6 +2,7 @@ #include +#include "common/exception/binder.h" #include "common/exception/copy.h" #include "common/file_system/virtual_file_system.h" #include "common/string_format.h" @@ -652,6 +653,9 @@ static std::unique_ptr bindFunc(main::ClientContext function::TableFuncBindInput* input) { auto scanInput = ku_dynamic_cast(input); + if (!scanInput->config.options.empty()) { + throw BinderException{"Copy from Parquet cannot have options."}; + } std::vector detectedColumnNames; std::vector detectedColumnTypes; bindColumns(scanInput, detectedColumnNames, detectedColumnTypes); diff --git a/test/test_files/copy/copy_node_parquet.test b/test/test_files/copy/copy_node_parquet.test index 6fc212bdca..7671bc4a27 100644 --- a/test/test_files/copy/copy_node_parquet.test +++ b/test/test_files/copy/copy_node_parquet.test @@ -58,4 +58,4 @@ -STATEMENT COPY tableOfTypes FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k*.parquet" (HEADER=true); ---- error -Binder exception: Only copy from CSV can have options. +Binder exception: Copy from Parquet cannot have options. From 3e5e0883fdb4de930d231213389f475737c0f676 Mon Sep 17 00:00:00 2001 From: Manh Dinh Date: Thu, 25 Apr 2024 17:26:10 -0400 Subject: [PATCH 3/4] Fix on review --- src/processor/operator/persistent/reader/npy/npy_reader.cpp | 4 +--- test/test_files/copy/copy_node_parquet.test | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/processor/operator/persistent/reader/npy/npy_reader.cpp b/src/processor/operator/persistent/reader/npy/npy_reader.cpp index 320d52cc1b..0ade2f56c6 100644 --- a/src/processor/operator/persistent/reader/npy/npy_reader.cpp +++ b/src/processor/operator/persistent/reader/npy/npy_reader.cpp @@ -297,9 +297,7 @@ static void bindColumns(const common::ReaderConfig& readerConfig, static std::unique_ptr bindFunc(main::ClientContext* /*context*/, function::TableFuncBindInput* input) { auto scanInput = reinterpret_cast(input); - if (!scanInput->config.options.empty()) { - throw BinderException{"Copy from Npy cannot have options."}; - } + KU_ASSERT(scanInput->config.options.empty()); std::vector detectedColumnNames; std::vector detectedColumnTypes; bindColumns(scanInput->config, detectedColumnNames, detectedColumnTypes); diff --git a/test/test_files/copy/copy_node_parquet.test b/test/test_files/copy/copy_node_parquet.test index 7671bc4a27..43148238f2 100644 --- a/test/test_files/copy/copy_node_parquet.test +++ b/test/test_files/copy/copy_node_parquet.test @@ -54,8 +54,7 @@ ---- 1 546 --CASE CopyWithOptionsErrorTest - +-LOG CopyWithOptionsErrorTest -STATEMENT COPY tableOfTypes FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k*.parquet" (HEADER=true); ---- error Binder exception: Copy from Parquet cannot have options. From 3eab7266d79888a80ef95c343a42c44af798c3c3 Mon Sep 17 00:00:00 2001 From: Manh Dinh Date: Thu, 25 Apr 2024 17:35:49 -0400 Subject: [PATCH 4/4] Fix clang tidy --- src/processor/operator/persistent/reader/npy/npy_reader.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/processor/operator/persistent/reader/npy/npy_reader.cpp b/src/processor/operator/persistent/reader/npy/npy_reader.cpp index 0ade2f56c6..4672322531 100644 --- a/src/processor/operator/persistent/reader/npy/npy_reader.cpp +++ b/src/processor/operator/persistent/reader/npy/npy_reader.cpp @@ -16,7 +16,6 @@ #include #endif -#include "common/exception/binder.h" #include "common/exception/copy.h" #include "common/string_format.h" #include "common/utils.h"