From c7f160449819eea93ece3c3bf213cd040b2b9cbe Mon Sep 17 00:00:00 2001 From: Keenan Gugeler Date: Fri, 13 Oct 2023 14:42:44 -0400 Subject: [PATCH] reader/csv: skip empty lines when sniffing On CSVs without headers, we should skip any leading empty lines, and return zero if all lines are empty. --- src/binder/bind/bind_copy.cpp | 1 + .../persistent/reader/csv/base_csv_reader.h | 3 --- .../operator/persistent/reader/csv/driver.h | 1 + .../persistent/reader/csv/base_csv_reader.cpp | 9 ++------- .../operator/persistent/reader/csv/driver.cpp | 14 ++++++++++++-- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/binder/bind/bind_copy.cpp b/src/binder/bind/bind_copy.cpp index 8ede2aa43ec..eb6ca90cdae 100644 --- a/src/binder/bind/bind_copy.cpp +++ b/src/binder/bind/bind_copy.cpp @@ -223,6 +223,7 @@ expression_vector Binder::bindExpectedNodeFileColumns( // Validate. validateNumColumns(expectedColumnTypes.size(), detectedColumnTypes.size()); if (readerConfig.fileType == common::FileType::PARQUET) { + // HACK(Ziyi): We should allow casting in Parquet reader. validateColumnTypes(expectedColumnNames, expectedColumnTypes, detectedColumnTypes); } return createColumnExpressions(readerConfig, expectedColumnNames, expectedColumnTypes); diff --git a/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h b/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h index 97001938ed7..e67a9f8fa7e 100644 --- a/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h +++ b/src/include/processor/operator/persistent/reader/csv/base_csv_reader.h @@ -29,9 +29,6 @@ class BaseCSVReader { void addValue(Driver&, uint64_t rowNum, common::column_id_t columnIdx, std::string_view strVal, std::vector& escapePositions); - template - bool addRow(Driver&, uint64_t rowNum, common::column_id_t column_count); - //! Read BOM and header. void handleFirstBlock(); diff --git a/src/include/processor/operator/persistent/reader/csv/driver.h b/src/include/processor/operator/persistent/reader/csv/driver.h index 263eb3a9afe..4d1843b0ab1 100644 --- a/src/include/processor/operator/persistent/reader/csv/driver.h +++ b/src/include/processor/operator/persistent/reader/csv/driver.h @@ -62,6 +62,7 @@ struct SniffCSVNameAndTypeDriver { }; struct SniffCSVColumnCountDriver { + bool emptyRow = true; uint64_t numColumns = 0; bool done(uint64_t rowNum); diff --git a/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp b/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp index 610649ca0bd..8f7a5b7d5d6 100644 --- a/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp +++ b/src/processor/operator/persistent/reader/csv/base_csv_reader.cpp @@ -158,11 +158,6 @@ void BaseCSVReader::addValue(Driver& driver, uint64_t rowNum, column_id_t column } } -template -bool BaseCSVReader::addRow(Driver& driver, uint64_t rowNum, column_id_t column) { - return driver.addRow(rowNum, column); -} - void BaseCSVReader::handleFirstBlock() { readBOM(); if (csvReaderConfig.hasHeader) { @@ -308,7 +303,7 @@ add_row : { std::string_view(buffer.get() + start, position - start - hasQuotes), escapePositions); column++; - rowNum += addRow(driver, rowNum, column); + rowNum += driver.addRow(rowNum, column); column = 0; position++; @@ -423,7 +418,7 @@ add_row : { column++; } if (column > 0) { - rowNum += addRow(driver, rowNum, column); + rowNum += driver.addRow(rowNum, column); } return rowNum; } diff --git a/src/processor/operator/persistent/reader/csv/driver.cpp b/src/processor/operator/persistent/reader/csv/driver.cpp index 1c5423717c6..83ec38049bb 100644 --- a/src/processor/operator/persistent/reader/csv/driver.cpp +++ b/src/processor/operator/persistent/reader/csv/driver.cpp @@ -608,14 +608,24 @@ bool SniffCSVNameAndTypeDriver::addRow(uint64_t, common::column_id_t) { } bool SniffCSVColumnCountDriver::done(uint64_t) { - return true; + return !emptyRow; } -void SniffCSVColumnCountDriver::addValue(uint64_t, common::column_id_t, std::string_view value) { +void SniffCSVColumnCountDriver::addValue( + uint64_t, common::column_id_t columnIdx, std::string_view value) { + if (value != "" || columnIdx > 0) { + emptyRow = false; + } numColumns++; } bool SniffCSVColumnCountDriver::addRow(uint64_t, common::column_id_t) { + if (emptyRow) { + // If this is the last row, we just return zero: we don't know how many columns there are + // supposed to be. + numColumns = 0; + return false; + } return true; }