-
Notifications
You must be signed in to change notification settings - Fork 90
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
reader: implement parallel CSV reading
This also refactors the CSVReader class to enable this change.
- Loading branch information
Showing
23 changed files
with
733 additions
and
453 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
col1,col2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
99 changes: 99 additions & 0 deletions
99
src/include/processor/operator/persistent/reader/csv/base_csv_reader.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#pragma once | ||
|
||
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "common/copier_config/copier_config.h"
#include "common/data_chunk/data_chunk.h"
#include "common/types/types.h"
|
||
namespace kuzu { | ||
namespace processor { | ||
|
||
//! Mode the CSV parser is currently operating in. Stored in a single byte.
//! BaseCSVReader::finishedBlock() treats every mode other than PARSING as "stop after this row".
enum class ParserMode : uint8_t {
    PARSING = 0,
    PARSING_HEADER,   // == 1
    SNIFFING_DIALECT, // == 2
    INVALID = 0xFF    // == 255, sentinel value
};
|
||
class BaseCSVReader { | ||
protected: | ||
//! Initial buffer read size; can be extended for long values. | ||
static constexpr uint64_t INITIAL_BUFFER_SIZE = 16384; | ||
|
||
public: | ||
BaseCSVReader(const std::string& filePath, const common::ReaderConfig& readerConfig); | ||
|
||
virtual ~BaseCSVReader(); | ||
|
||
uint64_t ParseBlock(common::block_idx_t blockIdx, common::DataChunk& resultChunk); | ||
|
||
uint64_t CountRows(); | ||
|
||
protected: | ||
void AddValue(common::DataChunk& resultChunk, std::string strVal, common::column_id_t columnIdx, | ||
std::vector<uint64_t>& escapePositions); | ||
void AddRow(common::DataChunk&, common::column_id_t column); | ||
|
||
//! If this finds a BOM, it advances `position`. | ||
void ReadBOM(); | ||
void ReadHeader(); | ||
//! Reads a new buffer from the CSV file. | ||
//! Uses the start value to ensure the current value stays within the buffer. | ||
//! Modifies the start value to point to the new start of the current value. | ||
//! If start is NULL, none of the buffer is kept. | ||
//! Returns false if the file has been exhausted. | ||
bool ReadBuffer(uint64_t* start); | ||
|
||
//! Like ReadBuffer, but only reads if position >= bufferSize. | ||
//! If this returns true, buffer[position] is a valid character that we can read. | ||
inline bool MaybeReadBuffer(uint64_t* start) { | ||
return position < bufferSize || ReadBuffer(start); | ||
} | ||
|
||
uint64_t ParseCSV(common::DataChunk& resultChunk); | ||
|
||
inline bool isNewLine(char c) { return c == '\n' || c == '\r'; } | ||
|
||
// Get the file offset of the current buffer position. | ||
uint64_t getFileOffset() const; | ||
uint64_t getLineNumber(); | ||
|
||
protected: | ||
//! Called when starting the parsing of a new block. | ||
virtual void parseBlockHook() = 0; | ||
virtual bool finishedBlockDetail() const = 0; | ||
virtual void handleQuotedNewline() = 0; | ||
|
||
private: | ||
void copyStringToVector(common::ValueVector*, std::string); | ||
//! Called after a row is finished to determine if we should keep processing. | ||
inline bool finishedBlock() { | ||
return mode != ParserMode::PARSING || rowToAdd >= common::DEFAULT_VECTOR_CAPACITY || | ||
finishedBlockDetail(); | ||
} | ||
|
||
protected: | ||
std::string filePath; | ||
common::CSVReaderConfig& csvReaderConfig; | ||
|
||
uint64_t expectedNumColumns; | ||
uint64_t numColumnsDetected; | ||
int fd; | ||
|
||
common::block_idx_t currentBlockIdx; | ||
|
||
std::unique_ptr<char[]> buffer; | ||
uint64_t bufferSize; | ||
uint64_t position; | ||
|
||
bool rowEmpty = false; | ||
|
||
ParserMode mode; | ||
|
||
uint64_t rowToAdd; | ||
}; | ||
|
||
} // namespace processor | ||
} // namespace kuzu |
Oops, something went wrong.