Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test Framework: Support CSV to Parquet conversion #1611

Merged
merged 7 commits into from
Jun 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dataset/tinysnb/copy.cypher
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ COPY person FROM "dataset/tinysnb/vPerson.csv" (HeaDER=true, deLim=',');
COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
COPY movies FROM "dataset/tinysnb/vMovies.csv";
COPY knows FROM "dataset/tinysnb/eKnows.csv";
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADeR=true);
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HeaDER=true);
COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"
COPY meets FROM "dataset/tinysnb/eMeets.csv"
COPY marries FROM "dataset/tinysnb/eMarries.csv"
12 changes: 9 additions & 3 deletions scripts/parquet/csv_to_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
import pyarrow.parquet as pq

csv_files = ['dummy.csv']
read_options = csv.ReadOptions(autogenerate_column_names=True)
has_header = True
# CSV:
# has header? autogenerate_column_names=False
# no header? autogenerate_column_names=True
read_options = csv.ReadOptions(autogenerate_column_names=not has_header)
parse_options = csv.ParseOptions(delimiter=",")
for csv_file in csv_files:
table = csv.read_csv(csv_file, read_options=read_options)
pq.write_table(table, csv_file.replace('.csv', '.parquet'))
table = csv.read_csv(csv_file, read_options=read_options,
parse_options=parse_options)
pq.write_table(table, csv_file.replace('.csv', '.parquet'))
20 changes: 16 additions & 4 deletions src/common/file_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,23 @@
}
}

void FileUtils::overwriteFile(const std::string& from, const std::string& to) {
if (!fileOrPathExists(from) || !fileOrPathExists(to))
void FileUtils::copyFile(
const std::string& from, const std::string& to, std::filesystem::copy_options options) {
if (!fileOrPathExists(from))
return;
std::error_code errorCode;
if (!std::filesystem::copy_file(
from, to, std::filesystem::copy_options::overwrite_existing, errorCode)) {
if (!std::filesystem::copy_file(from, to, options, errorCode)) {
throw Exception(StringUtils::string_format(
"Error copying file {} to {}. ErrorMessage: {}", from, to, errorCode.message()));
}
}

void FileUtils::overwriteFile(const std::string& from, const std::string& to) {
if (!fileOrPathExists(from) || !fileOrPathExists(to))
return;

Check warning on line 137 in src/common/file_utils.cpp

View check run for this annotation

Codecov / codecov/patch

src/common/file_utils.cpp#L137

Added line #L137 was not covered by tests
copyFile(from, to, std::filesystem::copy_options::overwrite_existing);
}

void FileUtils::readFromFile(
FileInfo* fileInfo, void* buffer, uint64_t numBytes, uint64_t position) {
#if defined(_WIN32)
Expand Down Expand Up @@ -179,6 +185,12 @@
}
}

void FileUtils::createDirIfNotExists(const std::string& path) {
if (!fileOrPathExists(path)) {
createDir(path);
}
}

void FileUtils::removeDir(const std::string& dir) {
std::error_code removeErrorCode;
if (!fileOrPathExists(dir))
Expand Down
3 changes: 3 additions & 0 deletions src/include/common/file_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ class FileUtils {
FileInfo* fileInfo, uint8_t* buffer, uint64_t numBytes, uint64_t offset);
// This function is a no-op if either file, from or to, does not exist.
static void overwriteFile(const std::string& from, const std::string& to);
static void copyFile(const std::string& from, const std::string& to,
std::filesystem::copy_options options = std::filesystem::copy_options::none);
static void createDir(const std::string& dir);
static void createDirIfNotExists(const std::string& path);
static void removeDir(const std::string& dir);

static inline std::string joinPath(const std::string& base, const std::string& part) {
Expand Down
5 changes: 5 additions & 0 deletions test/include/test_helper/test_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,14 @@ class TestHelper {
static constexpr char E2E_TEST_FILES_DIRECTORY[] = "test/test_files";
static constexpr char SCHEMA_FILE_NAME[] = "schema.cypher";
static constexpr char COPY_FILE_NAME[] = "copy.cypher";
static constexpr char PARQUET_TEMP_DATASET_PATH[] = "dataset/parquet_temp/";

static std::string getTmpTestDir() { return appendKuzuRootPath("test/unittest_temp/"); }

static std::string appendParquetDatasetTempDir(const std::string& dataset) {
return TestHelper::appendKuzuRootPath(TestHelper::PARQUET_TEMP_DATASET_PATH + dataset);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Better to call FileUtils::joinPath to concatenation file paths?
Also in appendKuzuRootPath.

}

static std::string appendKuzuRootPath(const std::string& path) {
return KUZU_ROOT_DIRECTORY + std::string("/") + path;
}
Expand Down
49 changes: 49 additions & 0 deletions test/include/test_runner/csv_to_parquet_converter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "main/kuzu.h"
#include <arrow/api.h>

namespace kuzu {
namespace testing {

// Convert an entire CSV dataset directory to parquet.
// The dataset directory must contain schema and copy files.
class CSVToParquetConverter {
public:
static void convertCSVDatasetToParquet(std::string& dataset);

private:
struct CopyCommandInfo {
std::string table;
std::string csvFilePath;
std::string parquetFilePath;
bool csvHasHeader;
char delimiter;
};

static std::vector<CopyCommandInfo> readCopyCommandsFromCopyCypherFile(
const std::string& dataset);

static void convertCSVFilesToParquet(
const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);

static CopyCommandInfo createCopyCommandInfo(
const std::string& dataset, std::string copyStatement);

static arrow::Status runCSVToParquetConversion(const std::string& inputFile,
const std::string& outputFile, char delimiter, bool hasHeader);

static void copySchema(
const std::string& csvDatasetPath, const std::string& parquetDatasetPath);

static void createCopyFile(const std::string& dataset,
const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);

inline static std::string replaceSlashesWithUnderscores(std::string dataset) {
std::replace(dataset.begin(), dataset.end(), '/', '_');
return dataset;
}

static std::string extractPath(std::string& str, char delimiter);
};

} // namespace testing
} // namespace kuzu
7 changes: 4 additions & 3 deletions test/include/test_runner/test_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@ struct TestStatement {
struct TestGroup {
std::string group;
std::string dataset;
uint64_t bufferPoolSize =
kuzu::common::BufferPoolConstants::DEFAULT_BUFFER_POOL_SIZE_FOR_TESTING;
std::unordered_map<std::string, std::vector<std::unique_ptr<TestStatement>>> testCases;
std::unordered_map<std::string, std::vector<std::unique_ptr<TestStatement>>>
testCasesStatementBlocks;
uint64_t bufferPoolSize = common::BufferPoolConstants::DEFAULT_BUFFER_POOL_SIZE_FOR_TESTING;

bool isValid() const { return !group.empty() && !dataset.empty(); }
enum class DatasetType { CSV, PARQUET, NPY, CSV_TO_PARQUET };
DatasetType datasetType;

bool isValid() const { return !group.empty() && !dataset.empty(); }
bool hasStatements() const { return !testCases.empty(); }
};

Expand Down
1 change: 1 addition & 0 deletions test/include/test_runner/test_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class TestParser {
void parseBody();
void extractExpectedResult(TestStatement* statement);
void extractStatementBlock();
void extractDataset();
void addStatementBlock(const std::string& blockName, const std::string& testGroupName);
void replaceVariables(std::string& str);

Expand Down
20 changes: 15 additions & 5 deletions test/runner/e2e_test.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "graph_test/graph_test.h"
#include "test_runner/csv_to_parquet_converter.h"
#include "test_runner/test_parser.h"

using ::testing::Test;
Expand All @@ -19,9 +20,8 @@ class EndToEndTest : public DBTest {
initGraph();
}

std::string getInputDir() override {
return TestHelper::appendKuzuRootPath("dataset/" + dataset + "/");
}
std::string getInputDir() override { return dataset + "/"; }

void TestBody() override { runTest(testStatements); }

private:
Expand All @@ -37,6 +37,11 @@ void parseAndRegisterTestGroup(const std::string& path) {
auto dataset = testGroup->dataset;
auto testCases = std::move(testGroup->testCases);
auto bufferPoolSize = testGroup->bufferPoolSize;
if (testGroup->datasetType == TestGroup::DatasetType::CSV_TO_PARQUET) {
CSVToParquetConverter::convertCSVDatasetToParquet(dataset);
} else {
dataset = TestHelper::appendKuzuRootPath("dataset/" + dataset);
}
for (auto& [testCaseName, testStatements] : testCases) {
testing::RegisterTest(testGroup->group.c_str(), testCaseName.c_str(), nullptr, nullptr,
__FILE__, __LINE__,
Expand Down Expand Up @@ -66,12 +71,17 @@ int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
std::string path = TestHelper::E2E_TEST_FILES_DIRECTORY;
if (argc > 1) {
path = path + "/" + argv[1];
path = FileUtils::joinPath(path, argv[1]);
}
path = TestHelper::appendKuzuRootPath(path);
if (!FileUtils::fileOrPathExists(path)) {
throw TestException("Test path not exists [" + path + "].");
}
auto parquetDatasetTempDir =
TestHelper::appendKuzuRootPath(TestHelper::PARQUET_TEMP_DATASET_PATH);
FileUtils::createDirIfNotExists(parquetDatasetTempDir);
scanTestFiles(path);
return RUN_ALL_TESTS();
auto result = RUN_ALL_TESTS();
FileUtils::removeDir(parquetDatasetTempDir);
return result;
}
2 changes: 1 addition & 1 deletion test/test_files/copy/csv/copy_node.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP CopyNodeFromCSVTest
-DATASET copy-test/node/csv
-DATASET CSV copy-test/node/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/copy/parquet/copy_node.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP CopyNodeFromParquetTest
-DATASET copy-test/node/parquet
-DATASET CSV copy-test/node/parquet

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_create.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBCreateTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_delete.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBDeleteTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_order.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_order_parquet.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/parquet
-DATASET PARQUET demo-db/parquet

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_parquet.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET demo-db/parquet
-DATASET PARQUET demo-db/parquet

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_set_copy.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBSetAndCopyTest
-DATASET demo-db/csv
-DATASET CSV demo-db/csv

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/binder/binder_error.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP BinderErrorTest
-DATASET tinysnb
-DATASET CSV tinysnb

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/catalog/catalog.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP CatalogErrorTest
-DATASET tinysnb
-DATASET CSV tinysnb

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/parser/parse_type.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP SyntaxErrorTest
-DATASET dummy
-DATASET CSV dummy

--

Expand Down
2 changes: 1 addition & 1 deletion test/test_files/exceptions/parser/syntax_error.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP SyntaxErrorTest
-DATASET dummy
-DATASET CSV dummy

--

Expand Down
Loading
Loading