From 80e99057a3fb1d83e0cd38e2c44bf9fc42eface4 Mon Sep 17 00:00:00 2001 From: xiyang Date: Fri, 13 Oct 2023 10:47:31 +0800 Subject: [PATCH] Validate file header for LOAD and COPY reader/csv: skip empty lines when sniffing On CSVs without headers, we should skip any leading empty lines, and return zero if all lines are empty. Co-authored-by: Keenan G <41458184+Riolku@users.noreply.github.com> --- src/binder/bind/bind_copy.cpp | 56 ++++-- src/binder/bind/bind_reading_clause.cpp | 171 ++++++++++++------ src/include/binder/binder.h | 12 ++ .../persistent/reader/csv/base_csv_reader.h | 3 - .../operator/persistent/reader/csv/driver.h | 1 + .../persistent/reader/csv/base_csv_reader.cpp | 9 +- .../operator/persistent/reader/csv/driver.cpp | 14 +- .../exceptions/copy/wrong_header.test | 80 +++++++- .../cast/cast_string_to_other_type.test | 2 +- .../tinysnb/load_from/load_from.test | 19 +- 10 files changed, 264 insertions(+), 103 deletions(-) diff --git a/src/binder/bind/bind_copy.cpp b/src/binder/bind/bind_copy.cpp index e3551d12b5..093bf85ad3 100644 --- a/src/binder/bind/bind_copy.cpp +++ b/src/binder/bind/bind_copy.cpp @@ -49,14 +49,6 @@ static void validateByColumnKeyword(FileType fileType, bool byColumn) { } } -static void validateCopyNpyFilesMatchSchema(uint32_t numFiles, TableSchema* schema) { - if (schema->properties.size() != numFiles) { - throw BinderException(StringUtils::string_format( - "Number of npy files is not equal to number of properties in table {}.", - schema->tableName)); - } -} - static void validateCopyNpyNotForRelTables(TableSchema* schema) { if (schema->tableType == TableType::REL) { throw BinderException( @@ -99,7 +91,6 @@ std::unique_ptr Binder::bindCopyFromClause(const Statement& stat std::make_unique(fileType, std::move(filePaths), std::move(csvReaderConfig)); validateByColumnKeyword(readerConfig->fileType, copyStatement.byColumn()); if (readerConfig->fileType == FileType::NPY) { - validateCopyNpyFilesMatchSchema(readerConfig->getNumFiles(), 
tableSchema); validateCopyNpyNotForRelTables(tableSchema); } switch (tableSchema->tableType) { @@ -194,16 +185,16 @@ static bool skipPropertyInFile(const Property& property) { expression_vector Binder::bindExpectedNodeFileColumns( TableSchema* tableSchema, ReaderConfig& readerConfig) { - expression_vector columns; + // Resolve expected columns. + std::vector expectedColumnNames; + std::vector> expectedColumnTypes; switch (readerConfig.fileType) { case FileType::TURTLE: { auto stringType = LogicalType{LogicalTypeID::STRING}; - auto columnNames = std::vector{ + expectedColumnNames = { std::string(RDF_SUBJECT), std::string(RDF_PREDICATE), std::string(RDF_OBJECT)}; - for (auto& columnName : columnNames) { - readerConfig.columnNames.push_back(columnName); - readerConfig.columnTypes.push_back(stringType.copy()); - columns.push_back(createVariable(columnName, stringType)); + for (auto _ : expectedColumnNames) { + expectedColumnTypes.push_back(stringType.copy()); } } break; case FileType::NPY: @@ -213,16 +204,29 @@ expression_vector Binder::bindExpectedNodeFileColumns( if (skipPropertyInFile(*property)) { continue; } - readerConfig.columnNames.push_back(property->getName()); - readerConfig.columnTypes.push_back(property->getDataType()->copy()); - columns.push_back(createVariable(property->getName(), *property->getDataType())); + expectedColumnNames.push_back(property->getName()); + expectedColumnTypes.push_back(property->getDataType()->copy()); } } break; default: { throw NotImplementedException{"Binder::bindCopyNodeColumns"}; } } - return columns; + if (readerConfig.fileType == common::FileType::TURTLE) { + // Nothing to validate for turtle + return createColumnExpressions(readerConfig, expectedColumnNames, expectedColumnTypes); + } + // Detect columns from file. + std::vector detectedColumnNames; + std::vector> detectedColumnTypes; + sniffFiles(readerConfig, detectedColumnNames, detectedColumnTypes); + // Validate. 
+ validateNumColumns(expectedColumnTypes.size(), detectedColumnTypes.size()); + if (readerConfig.fileType == common::FileType::PARQUET) { + // HACK(Ziyi): We should allow casting in Parquet reader. + validateColumnTypes(expectedColumnNames, expectedColumnTypes, detectedColumnTypes); + } + return createColumnExpressions(readerConfig, expectedColumnNames, expectedColumnTypes); } expression_vector Binder::bindExpectedRelFileColumns( @@ -273,6 +277,20 @@ expression_vector Binder::bindExpectedRelFileColumns( throw NotImplementedException{"Binder::bindCopyRelColumns"}; } } + if (readerConfig.fileType == common::FileType::TURTLE) { + // Nothing to validate for turtle + return columns; + } + // Detect columns from file. + std::vector detectedColumnNames; + std::vector> detectedColumnTypes; + sniffFiles(readerConfig, detectedColumnNames, detectedColumnTypes); + // Validate number of columns. + validateNumColumns(readerConfig.getNumColumns(), detectedColumnTypes.size()); + if (readerConfig.fileType == common::FileType::PARQUET) { + validateColumnTypes( + readerConfig.columnNames, readerConfig.columnTypes, detectedColumnTypes); + } return columns; } diff --git a/src/binder/bind/bind_reading_clause.cpp b/src/binder/bind/bind_reading_clause.cpp index d7fd6cdeab..cec57961b5 100644 --- a/src/binder/bind/bind_reading_clause.cpp +++ b/src/binder/bind/bind_reading_clause.cpp @@ -137,86 +137,139 @@ std::unique_ptr Binder::bindLoadFrom( if (readerConfig->getNumFiles() > 1) { throw BinderException("Load from multiple files is not supported."); } - std::vector inputColumnNames; - std::vector> inputColumnTypes; + // Bind columns from input. + std::vector expectedColumnNames; + std::vector> expectedColumnTypes; for (auto& [name, type] : loadFrom.getColumnNameDataTypesRef()) { - inputColumnNames.push_back(name); - inputColumnTypes.push_back(bindDataType(type)); + expectedColumnNames.push_back(name); + expectedColumnTypes.push_back(bindDataType(type)); } + // Detect columns from file. 
std::vector detectedColumnNames; std::vector> detectedColumnTypes; - switch (fileType) { + sniffFiles(*readerConfig, detectedColumnNames, detectedColumnTypes); + // Validate and resolve columns to use. + expression_vector columns; + if (expectedColumnTypes.empty()) { // Input is empty. Use detected columns. + columns = createColumnExpressions(*readerConfig, detectedColumnNames, detectedColumnTypes); + } else { + validateNumColumns(expectedColumnTypes.size(), detectedColumnTypes.size()); + if (fileType == common::FileType::PARQUET) { + validateColumnTypes(expectedColumnNames, expectedColumnTypes, detectedColumnTypes); + } + columns = createColumnExpressions(*readerConfig, expectedColumnNames, expectedColumnTypes); + } + auto info = std::make_unique( + std::move(readerConfig), std::move(columns), nullptr /* offset */, TableType::UNKNOWN); + auto boundLoadFrom = std::make_unique(std::move(info)); + if (loadFrom.hasWherePredicate()) { + auto wherePredicate = expressionBinder.bindExpression(*loadFrom.getWherePredicate()); + boundLoadFrom->setWherePredicate(std::move(wherePredicate)); + } + return boundLoadFrom; +} + +expression_vector Binder::createColumnExpressions(common::ReaderConfig& readerConfig, + const std::vector& columnNames, + const std::vector>& columnTypes) { + expression_vector columns; + for (auto i = 0u; i < columnTypes.size(); ++i) { + auto columnName = columnNames[i]; + auto columnType = columnTypes[i].get(); + readerConfig.columnNames.push_back(columnName); + readerConfig.columnTypes.push_back(columnType->copy()); + columns.push_back(createVariable(columnName, *columnType)); + } + return columns; +} + +void Binder::validateColumnTypes(const std::vector& columnNames, + const std::vector>& expectedColumnTypes, + const std::vector>& detectedColumnTypes) { + assert(expectedColumnTypes.size() == detectedColumnTypes.size()); + for (auto i = 0; i < expectedColumnTypes.size(); ++i) { + if (*expectedColumnTypes[i] != *detectedColumnTypes[i]) { + throw 
BinderException( + StringUtils::string_format("Column `{}` type mismatch. Expected {} but got {}.", + columnNames[i], LogicalTypeUtils::dataTypeToString(*expectedColumnTypes[i]), + LogicalTypeUtils::dataTypeToString(*detectedColumnTypes[i]))); + } + } +} + +void Binder::validateNumColumns(uint32_t expectedNumber, uint32_t detectedNumber) { + if (detectedNumber == 0) { + return; // Empty CSV. Continue processing. + } + if (expectedNumber != detectedNumber) { + throw BinderException(StringUtils::string_format( + "Number of columns mismatch. Expected {} but got {}.", expectedNumber, detectedNumber)); + } +} + +void Binder::sniffFiles(const common::ReaderConfig& readerConfig, + std::vector& columnNames, + std::vector>& columnTypes) { + assert(readerConfig.getNumFiles() > 0); + sniffFile(readerConfig, 0, columnNames, columnTypes); + for (auto i = 1u; i < readerConfig.getNumFiles(); ++i) { + std::vector tmpColumnNames; + std::vector> tmpColumnTypes; + sniffFile(readerConfig, i, tmpColumnNames, tmpColumnTypes); + switch (readerConfig.fileType) { + case FileType::CSV: { + validateNumColumns(columnTypes.size(), tmpColumnTypes.size()); + } break; // CSV: count check only; must not fall into the PARQUET type check. + case FileType::PARQUET: { + validateNumColumns(columnTypes.size(), tmpColumnTypes.size()); + validateColumnTypes(columnNames, columnTypes, tmpColumnTypes); + } break; + case FileType::NPY: { + validateNumColumns(1, tmpColumnTypes.size()); + columnNames.push_back(tmpColumnNames[0]); + columnTypes.push_back(tmpColumnTypes[0]->copy()); + } break; + case FileType::TURTLE: + break; + default: + // LCOV_EXCL_START + throw NotImplementedException("Binder::sniffFiles"); + // LCOV_EXCL_END + } + } +} + +void Binder::sniffFile(const common::ReaderConfig& readerConfig, uint32_t fileIdx, + std::vector& columnNames, std::vector>& columnTypes) { + switch (readerConfig.fileType) { case FileType::CSV: { - auto csvReader = SerialCSVReader(readerConfig->filePaths[0], *readerConfig); + auto csvReader = SerialCSVReader(readerConfig.filePaths[fileIdx], 
readerConfig); auto sniffedColumns = csvReader.sniffCSV(); for (auto& [name, type] : sniffedColumns) { - detectedColumnNames.push_back(name); - detectedColumnTypes.push_back(type.copy()); + columnNames.push_back(name); + columnTypes.push_back(type.copy()); } } break; case FileType::PARQUET: { - auto reader = ParquetReader(readerConfig->filePaths[0], memoryManager); + auto reader = ParquetReader(readerConfig.filePaths[fileIdx], memoryManager); auto state = std::make_unique(); reader.initializeScan(*state, std::vector{}); for (auto i = 0u; i < reader.getNumColumns(); ++i) { - detectedColumnNames.push_back(reader.getColumnName(i)); - detectedColumnTypes.push_back(reader.getColumnType(i)->copy()); + columnNames.push_back(reader.getColumnName(i)); + columnTypes.push_back(reader.getColumnType(i)->copy()); } } break; case FileType::NPY: { - auto reader = NpyReader(readerConfig->filePaths[0]); - auto columnName = std::string("column0"); + auto reader = NpyReader(readerConfig.filePaths[fileIdx]); // Sniff the requested file, not always the first. + auto columnName = std::string("column" + std::to_string(fileIdx)); auto columnType = bindFixedListType(reader.getShape(), reader.getType()); - detectedColumnNames.push_back(columnName); - detectedColumnTypes.push_back(columnType->copy()); + columnNames.push_back(columnName); + columnTypes.push_back(columnType->copy()); } break; default: throw BinderException(StringUtils::string_format( - "Load from {} file is not supported.", FileTypeUtils::toString(fileType))); + "Cannot sniff header of file type {}", FileTypeUtils::toString(readerConfig.fileType))); } - expression_vector columns; - if (inputColumnTypes.empty()) { - for (auto i = 0u; i < detectedColumnTypes.size(); ++i) { - auto columnName = detectedColumnNames[i]; - auto columnType = detectedColumnTypes[i].get(); - readerConfig->columnNames.push_back(columnName); - readerConfig->columnTypes.push_back(columnType->copy()); - columns.push_back(createVariable(columnName, *columnType)); - } - } else { - if (inputColumnTypes.size() != 
detectedColumnTypes.size()) { - throw BinderException( - StringUtils::string_format("Number of columns mismatch. Detect {} but expect {}.", - detectedColumnTypes.size(), inputColumnTypes.size())); - } - if (fileType == common::FileType::PARQUET) { - for (auto i = 0u; i < inputColumnTypes.size(); ++i) { - auto inputType = inputColumnTypes[i].get(); - auto detectType = detectedColumnTypes[i].get(); - if (*inputType != *detectType) { - throw BinderException(StringUtils::string_format( - "Column {} data type mismatch. Detect {} but expect {}.", - inputColumnNames[i], LogicalTypeUtils::dataTypeToString(*detectType), - LogicalTypeUtils::dataTypeToString(*inputType))); - } - } - } - for (auto i = 0u; i < inputColumnTypes.size(); ++i) { - auto columnName = inputColumnNames[i]; - auto columnType = inputColumnTypes[i].get(); - readerConfig->columnNames.push_back(columnName); - readerConfig->columnTypes.push_back(columnType->copy()); - columns.push_back(createVariable(columnName, *columnType)); - } - } - auto info = std::make_unique( - std::move(readerConfig), std::move(columns), nullptr, TableType::UNKNOWN); - auto boundLoadFrom = std::make_unique(std::move(info)); - if (loadFrom.hasWherePredicate()) { - auto wherePredicate = expressionBinder.bindExpression(*loadFrom.getWherePredicate()); - boundLoadFrom->setWherePredicate(std::move(wherePredicate)); - } - return boundLoadFrom; } } // namespace binder diff --git a/src/include/binder/binder.h b/src/include/binder/binder.h index b7d27fe8f3..ce7bcdb2b4 100644 --- a/src/include/binder/binder.h +++ b/src/include/binder/binder.h @@ -162,6 +162,18 @@ class Binder { const parser::ReadingClause& readingClause); std::unique_ptr bindInQueryCall(const parser::ReadingClause& readingClause); std::unique_ptr bindLoadFrom(const parser::ReadingClause& readingClause); + expression_vector createColumnExpressions(common::ReaderConfig& readerConfig, + const std::vector& columnNames, + const std::vector>& columnTypes); + void sniffFiles(const 
common::ReaderConfig& readerConfig, std::vector& columnNames, + std::vector>& columnTypes); + void sniffFile(const common::ReaderConfig& readerConfig, uint32_t fileIdx, + std::vector& columnNames, + std::vector>& columnTypes); + static void validateNumColumns(uint32_t expectedNumber, uint32_t detectedNumber); + static void validateColumnTypes(const std::vector& expectedColumnNames, + const std::vector>& expectedColumnTypes, + const std::vector>& detectedColumnTypes); /*** bind updating clause ***/ // TODO(Guodong/Xiyang): Is update clause an accurate name? How about (data)modificationClause? diff --git a/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h b/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h index 97001938ed..e67a9f8fa7 100644 --- a/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h +++ b/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h @@ -29,9 +29,6 @@ class BaseCSVReader { void addValue(Driver&, uint64_t rowNum, common::column_id_t columnIdx, std::string_view strVal, std::vector& escapePositions); - template - bool addRow(Driver&, uint64_t rowNum, common::column_id_t column_count); - //! Read BOM and header. 
void handleFirstBlock(); diff --git a/src/include/processor/operator/persistent/reader/csv/driver.h b/src/include/processor/operator/persistent/reader/csv/driver.h index 263eb3a9af..4d1843b0ab 100644 --- a/src/include/processor/operator/persistent/reader/csv/driver.h +++ b/src/include/processor/operator/persistent/reader/csv/driver.h @@ -62,6 +62,7 @@ struct SniffCSVNameAndTypeDriver { }; struct SniffCSVColumnCountDriver { + bool emptyRow = true; uint64_t numColumns = 0; bool done(uint64_t rowNum); diff --git a/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp b/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp index 610649ca0b..8f7a5b7d5d 100644 --- a/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp +++ b/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp @@ -158,11 +158,6 @@ void BaseCSVReader::addValue(Driver& driver, uint64_t rowNum, column_id_t column } } -template -bool BaseCSVReader::addRow(Driver& driver, uint64_t rowNum, column_id_t column) { - return driver.addRow(rowNum, column); -} - void BaseCSVReader::handleFirstBlock() { readBOM(); if (csvReaderConfig.hasHeader) { @@ -308,7 +303,7 @@ add_row : { std::string_view(buffer.get() + start, position - start - hasQuotes), escapePositions); column++; - rowNum += addRow(driver, rowNum, column); + rowNum += driver.addRow(rowNum, column); column = 0; position++; @@ -423,7 +418,7 @@ add_row : { column++; } if (column > 0) { - rowNum += addRow(driver, rowNum, column); + rowNum += driver.addRow(rowNum, column); } return rowNum; } diff --git a/src/processor/operator/persistent/reader/csv/driver.cpp b/src/processor/operator/persistent/reader/csv/driver.cpp index 6f6671289c..89659839e7 100644 --- a/src/processor/operator/persistent/reader/csv/driver.cpp +++ b/src/processor/operator/persistent/reader/csv/driver.cpp @@ -612,14 +612,24 @@ bool SniffCSVNameAndTypeDriver::addRow(uint64_t, common::column_id_t) { } bool SniffCSVColumnCountDriver::done(uint64_t) { - 
return true; + return !emptyRow; } -void SniffCSVColumnCountDriver::addValue(uint64_t, common::column_id_t, std::string_view value) { +void SniffCSVColumnCountDriver::addValue( + uint64_t, common::column_id_t columnIdx, std::string_view value) { + if (value != "" || columnIdx > 0) { + emptyRow = false; + } numColumns++; } bool SniffCSVColumnCountDriver::addRow(uint64_t, common::column_id_t) { + if (emptyRow) { + // If this is the last row, we just return zero: we don't know how many columns there are + // supposed to be. + numColumns = 0; + return false; + } return true; } diff --git a/test/test_files/exceptions/copy/wrong_header.test b/test/test_files/exceptions/copy/wrong_header.test index 0d50e20017..9b420ccfda 100644 --- a/test/test_files/exceptions/copy/wrong_header.test +++ b/test/test_files/exceptions/copy/wrong_header.test @@ -3,6 +3,78 @@ -- +-CASE CSVHeaderMismatch +-STATEMENT CREATE NODE TABLE person (ID INT64, fName STRING, PRIMARY KEY (ID)) +---- ok +-STATEMENT COPY person FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/vPerson.csv" (HEADER=true) +---- ok +-STATEMENT CREATE NODE TABLE person1 (ID STRING, fName STRING, PRIMARY KEY (ID)) +---- ok +-STATEMENT CREATE NODE TABLE person2 (ID STRING, fName INT64, PRIMARY KEY (ID)); +---- ok +-STATEMENT CREATE NODE TABLE person3 (ID STRING, PRIMARY KEY (ID)); +---- ok +-STATEMENT COPY person1 FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/vPerson.csv" (HEADER=true) +---- ok +-STATEMENT MATCH (a:person1) RETURN a.ID STARTS WITH '1'; +---- 3 +False +False +True +-STATEMENT COPY person2 FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/vPerson.csv" (HEADER=true) +---- error +Conversion exception: Cast failed. Guodong is not in INT64 range. +-STATEMENT COPY person3 FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/vPerson.csv" (HEADER=true) +---- error +Binder exception: Number of columns mismatch. Expected 1 but got 2. 
+-STATEMENT COPY person FROM + ["${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/vPerson.csv", + "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/vPersonMissingColumn.csv"] (HEADER=true) +---- error +Binder exception: Number of columns mismatch. Expected 2 but got 1. +-STATEMENT CREATE REL TABLE knows (FROM person TO person, prop1 INTERVAL); +---- ok +-STATEMENT COPY knows FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/eKnowsWrongColumnName.csv" (HEADER=true) +---- error +Binder exception: Number of columns mismatch. Expected 3 but got 4. +-STATEMENT COPY knows FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/eKnowsMissingColumn.csv" (HEADER=true) +---- error +Conversion exception: Error occurred during parsing interval. Field name is missing. + +-CASE ParquetHeaderMismatch +-STATEMENT CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY (name)); +---- ok +-STATEMENT CREATE NODE TABLE User1(name INT64, age INT64, PRIMARY KEY (name)); +---- ok +-STATEMENT CREATE NODE TABLE User2(name INT64, age INT64, dummy INT32[], PRIMARY KEY (name)); +---- ok +-STATEMENT CREATE NODE TABLE User3(name STRING, age INT16[], PRIMARY KEY (name)); +---- ok +-STATEMENT COPY User FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/user.parquet"; +---- ok +-STATEMENT COPY User1 FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/user.parquet"; +---- error +Binder exception: Column `name` type mismatch. Expected INT64 but got STRING. +-STATEMENT COPY User1 FROM ["${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/user.parquet", + "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/lives-in.parquet"]; +---- error +Binder exception: Column `f1` type mismatch. Expected INT64 but got STRING. +-STATEMENT COPY User2 FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/user.parquet"; +---- error +Binder exception: Number of columns mismatch. Expected 3 but got 2. 
+-STATEMENT COPY User3 FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/user.parquet"; +---- error +Binder exception: Column `age` type mismatch. Expected INT16[] but got INT64. +-STATEMENT CREATE REL TABLE Follows1(FROM User TO User, since INT64[]); +---- ok +-STATEMENT COPY Follows1 FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/follows.parquet"; +---- error +Binder exception: Column `since` type mismatch. Expected INT64[] but got INT64. +-STATEMENT COPY Follows1 FROM ["${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/follows.parquet", + "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/lives-in.parquet"]; +---- error +Binder exception: Number of columns mismatch. Expected 3 but got 2. + -CASE UnMatchedColumnTypeError -STATEMENT create node table person (ID INT64, fName INT64, gender INT64, isStudent BOOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, @@ -43,16 +115,14 @@ Copy exception: COPY commands can only be executed once on a table. ---- ok -STATEMENT COPY knows FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-fault-tests/wrong-header/eKnowsMissingColumn.csv" (HEADER=true) ---- error -Copy exception: Invalid: CSV parse error: Expected 4 columns, got 3: 10,24,1 +Binder exception: Number of columns mismatch. Expected 4 but got 3. -CASE NodeUnmatchedNumColumns --SKIP -#binder needs to check the number of columns -STATEMENT create node table person (ID1 SERIAL, ID INT64, fName INT64, age INT64, PRIMARY KEY (ID1)) ---- ok -STATEMENT COPY person FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_1.parquet" (HEADER=true) ---- error -Copy exception: Unmatched number of columns in parquet file. Expect: 3, got: 13. +Binder exception: Number of columns mismatch. Expected 3 but got 10. -CASE RelUnmatchedNumColumns -STATEMENT create node table person (ID1 SERIAL, ID INT64, fName INT64, age INT64, PRIMARY KEY (ID1)) @@ -61,4 +131,4 @@ Copy exception: Unmatched number of columns in parquet file. Expect: 3, got: 13. 
---- ok -STATEMENT COPY knows FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/follows.parquet" (HEADER=true) ---- error -Copy exception: Unmatched number of columns in parquet file. Expect: 4, got: 3. +Binder exception: Number of columns mismatch. Expected 4 but got 3. diff --git a/test/test_files/tinysnb/cast/cast_string_to_other_type.test b/test/test_files/tinysnb/cast/cast_string_to_other_type.test index 256bc70204..8c1a9143b9 100644 --- a/test/test_files/tinysnb/cast/cast_string_to_other_type.test +++ b/test/test_files/tinysnb/cast/cast_string_to_other_type.test @@ -50,7 +50,7 @@ Conversion exception: Cast failed. "((hello),(bdfadf),)" is not in STRING[][] ra Conversion exception: Cast failed. (() is not in STRING[][] range. -STATEMENT LOAD WITH HEADERS (list INT32[]) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/quote_fail.csv" RETURN * ; ---- error -Binder exception: Number of columns mismatch. Detect 3 but expect 1. +Binder exception: Number of columns mismatch. Expected 1 but got 3. -STATEMENT LOAD WITH HEADERS (list STRING[]) FROM "${KUZU_ROOT_DIRECTORY}/dataset/load-from-test/single_quote.csv" RETURN *; ---- error Conversion exception: Cast failed. ['fdsfdsfe werw] is not in STRING[] range. diff --git a/test/test_files/tinysnb/load_from/load_from.test b/test/test_files/tinysnb/load_from/load_from.test index bfe03c7684..d790e52c6e 100644 --- a/test/test_files/tinysnb/load_from/load_from.test +++ b/test/test_files/tinysnb/load_from/load_from.test @@ -4,7 +4,6 @@ -- -CASE LoadFromNpyTest --SKIP -STATEMENT LOAD FROM "${KUZU_ROOT_DIRECTORY}/dataset/npy-1d/one_dim_double.npy" RETURN * ORDER BY column0 LIMIT 5; ---- 3 1.000000 @@ -21,18 +20,17 @@ [7,8,9] -CASE LoadFromParquetTest --SKIP -STATEMENT LOAD WITH HEADERS (id INT64) FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_0.parquet" RETURN *; ---- error -Binder exception: Number of columns mismatch. Detect 10 but expect 1. +Binder exception: Number of columns mismatch. 
Expected 1 but got 10. -STATEMENT LOAD WITH HEADERS (id INT64, int64Column INT64, doubleColumn DOUBLE, booleanColumn BOOLEAN, dateColumn DATE, stringColumn STRING, listOfInt64 INT64[], listOfString STRING[], listOfListOfInt64 INT64[][], structColumn STRUCT(ID int64, name STRING)) FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_0.parquet" RETURN id, dateColumn ORDER BY id LIMIT 1; ---- 1 0|1994-01-12 -STATEMENT LOAD WITH HEADERS (id INT64, int64Column INT64, doubleColumn INT64, booleanColumn INT64, dateColumn INT64, stringColumn INT64, listOfInt64 INT64, listOfString INT64, listOfListOfInt64 INT64, structColumn INT64) FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_0.parquet" RETURN * ORDER BY id LIMIT 1; ----- 1 -Binder exception: Column doubleColumn data type mismatch. Detect DOUBLE but expect INT64. +---- error +Binder exception: Column `doubleColumn` type mismatch. Expected INT64 but got DOUBLE. -STATEMENT LOAD FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_0.parquet" RETURN * ORDER BY id LIMIT 5; ---- 5 0|73|3.258507|True|1994-01-12|FrPZkcHFuepVxcAiMwyAsRqDlRtQx|[65,25]|[4deQc5]|[[163,237],[28,60,77,31,137],[286,186,249,206]]|{id: 764, name: CwFRaCoEp} @@ -63,12 +61,19 @@ Greg|0|1994-01-12 Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|0|1994-01-12 -STATEMENT LOAD FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/rdf/taxonomy.ttl" RETURN COUNT(*) ---- error -Binder exception: Load from TURTLE file is not supported. +Binder exception: Cannot sniff header of file type TURTLE +-STATEMENT LOAD WITH HEADERS (a INT64, b INT64) FROM "${KUZU_ROOT_DIRECTORY}/dataset/demo-db/parquet/user.parquet" RETURN *; +---- error +Binder exception: Column `a` type mismatch. Expected INT64 but got STRING. 
+-STATEMENT LOAD WITH HEADERS (id INT64, int64Column INT64, doubleColumn DOUBLE, booleanColumn BOOLEAN, dateColumn INT32, stringColumn STRING, listOfInt64 INT64[], listOfString STRING[], listOfListOfInt64 INT64[][], structColumn STRUCT(ID int64, name STRING)) + FROM "${KUZU_ROOT_DIRECTORY}/dataset/copy-test/node/parquet/types_50k_0.parquet" RETURN *; +---- error +Binder exception: Column `dateColumn` type mismatch. Expected INT32 but got DATE. -CASE LoadFromCSVTest -STATEMENT LOAD WITH HEADERS (a INT64) FROM "${KUZU_ROOT_DIRECTORY}/dataset/tinysnb/eStudyAt.csv" (HEADER=True) RETURN `from`, `to`, YEAR, Places; ---- error -Binder exception: Number of columns mismatch. Detect 10 but expect 1. +Binder exception: Number of columns mismatch. Expected 1 but got 10. -STATEMENT LOAD WITH HEADERS (ID INt64, fName StRING, gender INT64, isStudent BoOLEAN, isWorker BOOLEAN, age INT64, eyeSight DOUBLE, birthdate DATE, registerTime TIMESTAMP, lastJobDuration interval, workedHours INT64[], usedNames STRING[], courseScoresPerTerm INT64[][], grades INT64[4], height float) FROM "${KUZU_ROOT_DIRECTORY}/dataset/tinysnb/vPerson.csv" (HEADER=True) RETURN fName, gender, birthdate; ---- 8