Skip to content

Commit

Permalink
Test Framework: Support CSV to Parquet conversion (#1611)
Browse files Browse the repository at this point in the history
Convert CSV dataset to PARQUET dataset inside .test files by using:
-DATASET PARQUET CSV_TO_PARQUET(dataset)
  • Loading branch information
rfdavid committed Jun 7, 2023
1 parent c6ce3fe commit 20d696a
Show file tree
Hide file tree
Showing 106 changed files with 3,836 additions and 105 deletions.
2 changes: 1 addition & 1 deletion dataset/tinysnb/copy.cypher
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ COPY person FROM "dataset/tinysnb/vPerson.csv" (HeaDER=true, deLim=',');
COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
COPY movies FROM "dataset/tinysnb/vMovies.csv";
COPY knows FROM "dataset/tinysnb/eKnows.csv";
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADeR=true);
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HeaDER=true);
COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"
COPY meets FROM "dataset/tinysnb/eMeets.csv"
COPY marries FROM "dataset/tinysnb/eMarries.csv"
12 changes: 9 additions & 3 deletions scripts/parquet/csv_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
import pyarrow.parquet as pq

csv_files = ['dummy.csv']
read_options = csv.ReadOptions(autogenerate_column_names=True)
has_header = True
# CSV:
# has header? autogenerate_column_names=False
# no header? autogenerate_column_names=True
read_options = csv.ReadOptions(autogenerate_column_names=not has_header)
parse_options = csv.ParseOptions(delimiter=",")
for csv_file in csv_files:
table = csv.read_csv(csv_file, read_options=read_options)
pq.write_table(table, csv_file.replace('.csv', '.parquet'))
table = csv.read_csv(csv_file, read_options=read_options,
parse_options=parse_options)
pq.write_table(table, csv_file.replace('.csv', '.parquet'))
20 changes: 16 additions & 4 deletions src/common/file_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,23 @@ void FileUtils::writeToFile(
}
}

void FileUtils::overwriteFile(const std::string& from, const std::string& to) {
if (!fileOrPathExists(from) || !fileOrPathExists(to))
void FileUtils::copyFile(
const std::string& from, const std::string& to, std::filesystem::copy_options options) {
if (!fileOrPathExists(from))
return;
std::error_code errorCode;
if (!std::filesystem::copy_file(
from, to, std::filesystem::copy_options::overwrite_existing, errorCode)) {
if (!std::filesystem::copy_file(from, to, options, errorCode)) {
throw Exception(StringUtils::string_format(
"Error copying file {} to {}. ErrorMessage: {}", from, to, errorCode.message()));
}
}

void FileUtils::overwriteFile(const std::string& from, const std::string& to) {
if (!fileOrPathExists(from) || !fileOrPathExists(to))
return;
copyFile(from, to, std::filesystem::copy_options::overwrite_existing);
}

void FileUtils::readFromFile(
FileInfo* fileInfo, void* buffer, uint64_t numBytes, uint64_t position) {
#if defined(_WIN32)
Expand Down Expand Up @@ -179,6 +185,12 @@ void FileUtils::createDir(const std::string& dir) {
}
}

void FileUtils::createDirIfNotExists(const std::string& path) {
if (!fileOrPathExists(path)) {
createDir(path);
}
}

void FileUtils::removeDir(const std::string& dir) {
std::error_code removeErrorCode;
if (!fileOrPathExists(dir))
Expand Down
3 changes: 3 additions & 0 deletions src/include/common/file_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ class FileUtils {
FileInfo* fileInfo, uint8_t* buffer, uint64_t numBytes, uint64_t offset);
// This function is a no-op if either file, from or to, does not exist.
static void overwriteFile(const std::string& from, const std::string& to);
static void copyFile(const std::string& from, const std::string& to,
std::filesystem::copy_options options = std::filesystem::copy_options::none);
static void createDir(const std::string& dir);
static void createDirIfNotExists(const std::string& path);
static void removeDir(const std::string& dir);

static inline std::string joinPath(const std::string& base, const std::string& part) {
Expand Down
5 changes: 5 additions & 0 deletions test/include/test_helper/test_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,14 @@ class TestHelper {
static constexpr char E2E_TEST_FILES_DIRECTORY[] = "test/test_files";
static constexpr char SCHEMA_FILE_NAME[] = "schema.cypher";
static constexpr char COPY_FILE_NAME[] = "copy.cypher";
static constexpr char PARQUET_TEMP_DATASET_PATH[] = "dataset/parquet_temp/";

static std::string getTmpTestDir() { return appendKuzuRootPath("test/unittest_temp/"); }

static std::string appendParquetDatasetTempDir(const std::string& dataset) {
return TestHelper::appendKuzuRootPath(TestHelper::PARQUET_TEMP_DATASET_PATH + dataset);
}

static std::string appendKuzuRootPath(const std::string& path) {
return KUZU_ROOT_DIRECTORY + std::string("/") + path;
}
Expand Down
49 changes: 49 additions & 0 deletions test/include/test_runner/csv_to_parquet_converter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "main/kuzu.h"
#include <arrow/api.h>

namespace kuzu {
namespace testing {

// Convert an entire CSV dataset directory to parquet.
// The dataset directory must contain schema and copy files.
class CSVToParquetConverter {
public:
static void convertCSVDatasetToParquet(std::string& dataset);

private:
struct CopyCommandInfo {
std::string table;
std::string csvFilePath;
std::string parquetFilePath;
bool csvHasHeader;
char delimiter;
};

static std::vector<CopyCommandInfo> readCopyCommandsFromCopyCypherFile(
const std::string& dataset);

static void convertCSVFilesToParquet(
const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);

static CopyCommandInfo createCopyCommandInfo(
const std::string& dataset, std::string copyStatement);

static arrow::Status runCSVToParquetConversion(const std::string& inputFile,
const std::string& outputFile, char delimiter, bool hasHeader);

static void copySchema(
const std::string& csvDatasetPath, const std::string& parquetDatasetPath);

static void createCopyFile(const std::string& dataset,
const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);

inline static std::string replaceSlashesWithUnderscores(std::string dataset) {
std::replace(dataset.begin(), dataset.end(), '/', '_');
return dataset;
}

static std::string extractPath(std::string& str, char delimiter);
};

} // namespace testing
} // namespace kuzu
7 changes: 4 additions & 3 deletions test/include/test_runner/test_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ struct TestStatement {
struct TestGroup {
std::string group;
std::string dataset;
uint64_t bufferPoolSize =
kuzu::common::BufferPoolConstants::DEFAULT_BUFFER_POOL_SIZE_FOR_TESTING;
std::unordered_map<std::string, std::vector<std::unique_ptr<TestStatement>>> testCases;
std::unordered_map<std::string, std::vector<std::unique_ptr<TestStatement>>>
testCasesStatementBlocks;
uint64_t bufferPoolSize = common::BufferPoolConstants::DEFAULT_BUFFER_POOL_SIZE_FOR_TESTING;

bool isValid() const { return !group.empty() && !dataset.empty(); }
enum class DatasetType { CSV, PARQUET, NPY, CSV_TO_PARQUET };
DatasetType datasetType;

bool isValid() const { return !group.empty() && !dataset.empty(); }
bool hasStatements() const { return !testCases.empty(); }
};

Expand Down
1 change: 1 addition & 0 deletions test/include/test_runner/test_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class TestParser {
void parseBody();
void extractExpectedResult(TestStatement* statement);
void extractStatementBlock();
void extractDataset();
void addStatementBlock(const std::string& blockName, const std::string& testGroupName);
void replaceVariables(std::string& str);

Expand Down
20 changes: 15 additions & 5 deletions test/runner/e2e_test.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "graph_test/graph_test.h"
#include "test_runner/csv_to_parquet_converter.h"
#include "test_runner/test_parser.h"

using ::testing::Test;
Expand All @@ -19,9 +20,8 @@ class EndToEndTest : public DBTest {
initGraph();
}

std::string getInputDir() override {
return TestHelper::appendKuzuRootPath("dataset/" + dataset + "/");
}
std::string getInputDir() override { return dataset + "/"; }

void TestBody() override { runTest(testStatements); }

private:
Expand All @@ -37,6 +37,11 @@ void parseAndRegisterTestGroup(const std::string& path) {
auto dataset = testGroup->dataset;
auto testCases = std::move(testGroup->testCases);
auto bufferPoolSize = testGroup->bufferPoolSize;
if (testGroup->datasetType == TestGroup::DatasetType::CSV_TO_PARQUET) {
CSVToParquetConverter::convertCSVDatasetToParquet(dataset);
} else {
dataset = TestHelper::appendKuzuRootPath("dataset/" + dataset);
}
for (auto& [testCaseName, testStatements] : testCases) {
testing::RegisterTest(testGroup->group.c_str(), testCaseName.c_str(), nullptr, nullptr,
__FILE__, __LINE__,
Expand Down Expand Up @@ -66,12 +71,17 @@ int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
std::string path = TestHelper::E2E_TEST_FILES_DIRECTORY;
if (argc > 1) {
path = path + "/" + argv[1];
path = FileUtils::joinPath(path, argv[1]);
}
path = TestHelper::appendKuzuRootPath(path);
if (!FileUtils::fileOrPathExists(path)) {
throw TestException("Test path not exists [" + path + "].");
}
auto parquetDatasetTempDir =
TestHelper::appendKuzuRootPath(TestHelper::PARQUET_TEMP_DATASET_PATH);
FileUtils::createDirIfNotExists(parquetDatasetTempDir);
scanTestFiles(path);
return RUN_ALL_TESTS();
auto result = RUN_ALL_TESTS();
FileUtils::removeDir(parquetDatasetTempDir);
return result;
}
2 changes: 1 addition & 1 deletion test/test_files/copy/csv/copy_node.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP CopyNodeFromCSVTest
-DATASET copy-test/node/csv
-DATASET CSV copy-test/node/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/copy/parquet/copy_node.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP CopyNodeFromParquetTest
-DATASET copy-test/node/parquet
-DATASET CSV copy-test/node/parquet

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_create.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBCreateTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_delete.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBDeleteTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_order.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_order_parquet.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/parquet
-DATASET PARQUET demo-db/parquet

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_parquet.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/parquet
-DATASET PARQUET demo-db/parquet

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_set_copy.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBSetAndCopyTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/binder/binder_error.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP BinderErrorTest
-DATASET tinysnb
-DATASET CSV tinysnb

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/catalog/catalog.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP CatalogErrorTest
-DATASET tinysnb
-DATASET CSV tinysnb

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/parser/parse_type.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP SyntaxErrorTest
-DATASET dummy
-DATASET CSV dummy

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/parser/syntax_error.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP SyntaxErrorTest
-DATASET dummy
-DATASET CSV dummy

--

Expand Down
Loading

0 comments on commit 20d696a

Please sign in to comment.