Skip to content

Commit

Permalink
reader/csv: skip empty lines when sniffing
Browse files Browse the repository at this point in the history
When sniffing a CSV without a header, we should skip any leading empty lines, and
report a column count of zero if every line is empty.
  • Loading branch information
Riolku committed Oct 13, 2023
1 parent 5bbf57d commit c7f1604
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 12 deletions.
1 change: 1 addition & 0 deletions src/binder/bind/bind_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ expression_vector Binder::bindExpectedNodeFileColumns(
// Validate.
validateNumColumns(expectedColumnTypes.size(), detectedColumnTypes.size());
if (readerConfig.fileType == common::FileType::PARQUET) {
// HACK(Ziyi): We should allow casting in Parquet reader.
validateColumnTypes(expectedColumnNames, expectedColumnTypes, detectedColumnTypes);
}
return createColumnExpressions(readerConfig, expectedColumnNames, expectedColumnTypes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ class BaseCSVReader {
void addValue(Driver&, uint64_t rowNum, common::column_id_t columnIdx, std::string_view strVal,
std::vector<uint64_t>& escapePositions);

template<typename Driver>
bool addRow(Driver&, uint64_t rowNum, common::column_id_t column_count);

//! Read BOM and header.
void handleFirstBlock();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct SniffCSVNameAndTypeDriver {
};

struct SniffCSVColumnCountDriver {
bool emptyRow = true;
uint64_t numColumns = 0;

bool done(uint64_t rowNum);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,6 @@ void BaseCSVReader::addValue(Driver& driver, uint64_t rowNum, column_id_t column
}
}

// Hands a finished row to the driver and reports the driver's answer as a bool.
// The parse loop adds this result to rowNum (see the `rowNum += addRow(...)` call sites).
template<typename Driver>
bool BaseCSVReader::addRow(Driver& driver, uint64_t rowNum, column_id_t column) {
    const bool result = driver.addRow(rowNum, column);
    return result;
}

void BaseCSVReader::handleFirstBlock() {
readBOM();
if (csvReaderConfig.hasHeader) {
Expand Down Expand Up @@ -308,7 +303,7 @@ add_row : {
std::string_view(buffer.get() + start, position - start - hasQuotes), escapePositions);
column++;

rowNum += addRow(driver, rowNum, column);
rowNum += driver.addRow(rowNum, column);

column = 0;
position++;
Expand Down Expand Up @@ -423,7 +418,7 @@ add_row : {
column++;
}
if (column > 0) {
rowNum += addRow(driver, rowNum, column);
rowNum += driver.addRow(rowNum, column);
}
return rowNum;
}
Expand Down
14 changes: 12 additions & 2 deletions src/processor/operator/persistent/reader/csv/driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -608,14 +608,24 @@ bool SniffCSVNameAndTypeDriver::addRow(uint64_t, common::column_id_t) {
}

// Reports whether column-count sniffing is finished.
//
// Returns true only once a non-empty row has been observed (emptyRow is cleared by addValue).
// While every row seen so far has been empty this returns false, so leading empty lines do
// not end the sniff. The row-number argument is unused.
bool SniffCSVColumnCountDriver::done(uint64_t) {
    return !emptyRow;
}

void SniffCSVColumnCountDriver::addValue(uint64_t, common::column_id_t, std::string_view value) {
void SniffCSVColumnCountDriver::addValue(
uint64_t, common::column_id_t columnIdx, std::string_view value) {
if (value != "" || columnIdx > 0) {
emptyRow = false;
}
numColumns++;
}

// Finishes the current row. Returns true when the row had content, false when it was empty.
//
// For an empty row the column count is reset to zero: addValue has already counted the
// row's single empty field, and if this turns out to be the last row we cannot know how
// many columns there were supposed to be, so zero is what remains.
bool SniffCSVColumnCountDriver::addRow(uint64_t, common::column_id_t) {
    const bool rowHasContent = !emptyRow;
    if (!rowHasContent) {
        numColumns = 0;
    }
    return rowHasContent;
}

Expand Down

0 comments on commit c7f1604

Please sign in to comment.