Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reader: remove counting of blocks #2166

Merged
merged 1 commit into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class BaseCSVReader {
virtual uint64_t parseBlock(common::block_idx_t blockIdx, common::DataChunk& resultChunk) = 0;

uint64_t countRows();
bool isEOF() const;

protected:
template<typename Driver>
Expand Down
41 changes: 18 additions & 23 deletions src/include/processor/operator/persistent/reader_functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,32 @@ struct ReaderFunctionData {

virtual ~ReaderFunctionData() = default;

virtual inline bool emptyBlockImpliesDone() const { return false; }
// Called after receiving an empty block from readFunc.
virtual inline bool doneAfterEmptyBlock() const { return true; }

// Called to determine if the current block has more data.
virtual inline bool hasMoreToRead() const { return false; }
};

struct RelCSVReaderFunctionData : public ReaderFunctionData {
struct RelCSVReaderFunctionData final : public ReaderFunctionData {
std::shared_ptr<arrow::csv::StreamingReader> reader = nullptr;

inline bool emptyBlockImpliesDone() const override { return true; }
};

struct SerialCSVReaderFunctionData : public ReaderFunctionData {
struct SerialCSVReaderFunctionData final : public ReaderFunctionData {
std::unique_ptr<SerialCSVReader> reader = nullptr;
};

struct ParallelCSVReaderFunctionData : public ReaderFunctionData {
struct ParallelCSVReaderFunctionData final : public ReaderFunctionData {
std::unique_ptr<ParallelCSVReader> reader = nullptr;

// NOTE: It is *critical* that `emptyBlockImpliesDone` is false for Parallel CSV Reader!
// NOTE: It is *critical* that `doneAfterEmptyBlock` is false for Parallel CSV Reader!
// Otherwise, when the parallel CSV reader gets a block that resides in the middle of a header
// or a long line, it will return zero and cause rows to not be loaded!
inline bool doneAfterEmptyBlock() const override { return reader->isEOF(); }
inline bool hasMoreToRead() const override { return reader->hasMoreToRead(); }
};

struct RelParquetReaderFunctionData : public ReaderFunctionData {
struct RelParquetReaderFunctionData final : public ReaderFunctionData {
std::unique_ptr<parquet::arrow::FileReader> reader = nullptr;
};

Expand All @@ -60,23 +62,18 @@ struct RDFReaderFunctionData : public ReaderFunctionData {
std::unique_ptr<RDFReader> reader = nullptr;
};

struct FileBlocksInfo {
common::row_idx_t numRows = 0;
common::block_idx_t numBlocks = 0;
};

using validate_func_t = std::function<void(const common::ReaderConfig& config)>;
using init_reader_data_func_t =
std::function<void(ReaderFunctionData& funcData, common::vector_idx_t fileIdx,
const common::ReaderConfig& config, storage::MemoryManager* memoryManager)>;
using count_blocks_func_t = std::function<std::vector<FileBlocksInfo>(
using count_rows_func_t = std::function<common::row_idx_t(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager)>;
using read_rows_func_t = std::function<void(
ReaderFunctionData& funcData, common::block_idx_t blockIdx, common::DataChunk*)>;

struct ReaderFunctions {
static validate_func_t getValidateFunc(common::FileType fileType);
static count_blocks_func_t getCountBlocksFunc(
static count_rows_func_t getCountRowsFunc(
const common::ReaderConfig& config, common::TableType tableType);
static init_reader_data_func_t getInitDataFunc(
const common::ReaderConfig& config, common::TableType tableType);
Expand All @@ -90,19 +87,17 @@ struct ReaderFunctions {
}
static void validateNPYFiles(const common::ReaderConfig& config);

static std::vector<FileBlocksInfo> countRowsNoOp(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager);
static std::vector<FileBlocksInfo> countRowsInSerialCSVFile(
static common::row_idx_t countRowsNoOp(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager);
static std::vector<FileBlocksInfo> countRowsInParallelCSVFile(
static common::row_idx_t countRowsInCSVFile(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager);
static std::vector<FileBlocksInfo> countRowsInRelParquetFile(
static common::row_idx_t countRowsInRelParquetFile(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager);
static std::vector<FileBlocksInfo> countRowsInParquetFile(
static common::row_idx_t countRowsInParquetFile(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager);
static std::vector<FileBlocksInfo> countRowsInNPYFile(
static common::row_idx_t countRowsInNPYFile(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager);
static std::vector<FileBlocksInfo> countRowsInRDFFile(
static common::row_idx_t countRowsInRDFFile(
const common::ReaderConfig& config, storage::MemoryManager* memoryManager);

static void initRelCSVReadData(ReaderFunctionData& funcData, common::vector_idx_t fileIdx,
Expand Down
34 changes: 9 additions & 25 deletions src/include/processor/operator/persistent/reader_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,46 +40,30 @@ class ReaderSharedState {
};

explicit ReaderSharedState(std::unique_ptr<common::ReaderConfig> readerConfig)
: readerConfig{std::move(readerConfig)}, numRows{0}, currFileIdx{0}, currBlockIdx{0},
currRowIdx{0} {}
: readerConfig{std::move(readerConfig)}, numRows{common::INVALID_ROW_IDX}, currFileIdx{0},
currBlockIdx{0}, currRowIdx{0} {}

void initialize(common::TableType tableType);
void initialize(storage::MemoryManager* memoryManager, common::TableType tableType);
void validate() const;
void countBlocks(storage::MemoryManager* memoryManager);
void countRows(storage::MemoryManager* memoryManager);

// Advances the shared cursor to the next input and restarts block numbering.
// For NPY input the file index jumps by the number of configured file paths;
// for every other file type it advances by one.
inline void moveToNextFile() {
    const bool isNpy = readerConfig->fileType == common::FileType::NPY;
    if (isNpy) {
        currFileIdx += readerConfig->filePaths.size();
    } else {
        currFileIdx += 1;
    }
    currBlockIdx = 0;
}
// Signal that we are done with the given file.
// No-op if we have already moved to the next file.
template<ReadMode READ_MODE>
void doneFile(common::vector_idx_t fileIdx);

template<ReadMode READ_MODE>
std::unique_ptr<ReaderMorsel> getMorsel();

// Returns a mutable reference to the `numRows` member so callers can read or update it in place.
inline common::row_idx_t& getNumRowsRef() { return numRows; }

private:
// Locks `mtx` when READ_MODE is ReadMode::PARALLEL. For any other mode the
// `if constexpr` branch is discarded and the call does nothing.
// NOTE(review): the lock is not scoped, so each call presumably has to be paired
// with unlockForParallel<READ_MODE>() on every exit path — confirm against callers.
template<ReadMode READ_MODE>
inline void lockForParallel() {
if constexpr (READ_MODE == ReadMode::PARALLEL) {
mtx.lock();
}
}
// Unlocks `mtx` when READ_MODE is ReadMode::PARALLEL. For any other mode the
// `if constexpr` branch is discarded and the call does nothing.
template<ReadMode READ_MODE>
inline void unlockForParallel() {
if constexpr (READ_MODE == ReadMode::PARALLEL) {
mtx.unlock();
}
}

public:
validate_func_t validateFunc;
init_reader_data_func_t initFunc;
count_blocks_func_t countBlocksFunc;
count_rows_func_t countRowsFunc;
std::shared_ptr<ReaderFunctionData> readFuncData;

common::row_idx_t numRows;
std::vector<FileBlocksInfo> fileInfos;

common::vector_idx_t currFileIdx;
common::block_idx_t currBlockIdx;
Expand Down
8 changes: 4 additions & 4 deletions src/processor/operator/persistent/reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ namespace kuzu {
namespace processor {

void Reader::initGlobalStateInternal(ExecutionContext* context) {
sharedState->initialize(info->tableType);
sharedState->initialize(context->memoryManager, info->tableType);
sharedState->validate();
sharedState->countBlocks(context->memoryManager);
sharedState->countRows(context->memoryManager);
}

void Reader::initLocalStateInternal(ResultSet* resultSet, ExecutionContext* context) {
Expand Down Expand Up @@ -80,8 +80,8 @@ void Reader::readNextDataChunk() {
readFunc(*readFuncData, morsel->blockIdx, dataChunk.get());
if (dataChunk->state->selVector->selectedSize > 0) {
leftArrowArrays.appendFromDataChunk(dataChunk.get());
} else if (readFuncData->emptyBlockImpliesDone()) {
sharedState->moveToNextFile();
} else if (readFuncData->doneAfterEmptyBlock()) {
sharedState->doneFile<READ_MODE>(morsel->fileIdx);
}
}
unlockForSerial<READ_MODE>();
Expand Down
18 changes: 18 additions & 0 deletions src/processor/operator/persistent/reader/csv/base_csv_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,24 @@ uint64_t BaseCSVReader::countRows() {
goto in_quotes;
}

// Reports whether the offset returned by getFileOffset() is at or past the end of the file.
//
// The end position is obtained by seeking `fd` to SEEK_END; afterwards `fd` is seeked
// back to the offset that getFileOffset() reported before the probe.
//
// Throws CopyException if either seek fails.
bool BaseCSVReader::isEOF() const {
    const uint64_t offset = getFileOffset();
    // Keep lseek's own (signed) result type so the -1 failure sentinel is compared
    // directly instead of through an implicit signed-to-unsigned conversion.
    const auto end = lseek(fd, 0, SEEK_END);
    if (end == -1) {
        // LCOV_EXCL_START
        throw CopyException(StringUtils::string_format(
            "Could not seek to end of file {}: {}", filePath, posixErrMessage()));
        // LCOV_EXCL_STOP
    }
    if (lseek(fd, offset, SEEK_SET) == -1) {
        // LCOV_EXCL_START
        throw CopyException(StringUtils::string_format(
            "Could not reset position of file {}: {}", filePath, posixErrMessage()));
        // LCOV_EXCL_STOP
    }
    // `end` is known to be non-negative here, so the conversion is value-preserving.
    return offset >= static_cast<uint64_t>(end);
}

template<typename Driver>
void BaseCSVReader::addValue(Driver& driver, uint64_t rowNum, column_id_t columnIdx,
std::string_view strVal, std::vector<uint64_t>& escapePositions) {
Expand Down
15 changes: 10 additions & 5 deletions src/processor/operator/persistent/reader/npy/npy_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,16 @@ void NpyReader::validate(const LogicalType& type_, offset_t numRows) {

// Copies one block of rows from the array into `vectorToRead`.
//
// Block `blockIdx` starts at row DEFAULT_VECTOR_CAPACITY * blockIdx and holds at most
// DEFAULT_VECTOR_CAPACITY rows. The number of rows actually copied is written to the
// vector's selVector->selectedSize. A block that starts at or beyond getNumRows()
// yields selectedSize == 0 and copies nothing.
void NpyReader::readBlock(block_idx_t blockIdx, common::ValueVector* vectorToRead) const {
    uint64_t rowNumber = DEFAULT_VECTOR_CAPACITY * blockIdx;
    auto numRows = getNumRows();
    // Check bounds before touching the data: for a block past the end,
    // `numRows - rowNumber` would underflow and the copy would read out of range.
    if (rowNumber >= numRows) {
        vectorToRead->state->selVector->selectedSize = 0;
        return;
    }
    auto rowPointer = getPointerToRow(rowNumber);
    // The last block may be shorter than a full vector.
    auto numRowsToRead = std::min(DEFAULT_VECTOR_CAPACITY, numRows - rowNumber);
    memcpy(vectorToRead->getData(), rowPointer,
        numRowsToRead * vectorToRead->getNumBytesPerValue());
    vectorToRead->state->selVector->selectedSize = numRowsToRead;
}

NpyMultiFileReader::NpyMultiFileReader(const std::vector<std::string>& filePaths) {
Expand Down
Loading
Loading