From a3214443a0124ce2a1783ab6dbd7012c9584d2cc Mon Sep 17 00:00:00 2001 From: Manh Dinh Date: Thu, 29 Feb 2024 13:51:29 -0500 Subject: [PATCH] Rework CSV_TO_PARQUET testing feature --- dataset/long-string-pk-tests/schema.cypher | 2 +- src/common/file_system/file_system.cpp | 4 + src/common/file_system/local_file_system.cpp | 12 ++ src/include/common/file_system/file_system.h | 2 + .../common/file_system/local_file_system.h | 2 + .../test_runner/csv_to_parquet_converter.h | 64 +++--- test/runner/e2e_test.cpp | 23 ++- .../copy/copy_pk_long_string_parquet.test | 1 - ...isabled => interactive-short-parquet.test} | 32 +-- ...uet.disabled => lsqb_queries_parquet.test} | 1 - .../test_files/order_by/order_by_parquet.test | 1 - .../read_list/large_adj_list_parquet.test | 1 - .../shortest_path/bfs_sssp_parquet.test | 59 +++--- .../tinysnb/parquet/tinysnb_parquet.test | 6 +- test/test_runner/CMakeLists.txt | 1 + test/test_runner/csv_to_parquet_converter.cpp | 191 ++++++++++++++++++ 16 files changed, 316 insertions(+), 86 deletions(-) rename test/test_files/ldbc/ldbc-interactive/{interactive-short-parquet.disabled => interactive-short-parquet.test} (69%) rename test/test_files/lsqb/{lsqb_queries_parquet.disabled => lsqb_queries_parquet.test} (99%) create mode 100644 test/test_runner/csv_to_parquet_converter.cpp diff --git a/dataset/long-string-pk-tests/schema.cypher b/dataset/long-string-pk-tests/schema.cypher index c2d3476cb8..014a774e00 100644 --- a/dataset/long-string-pk-tests/schema.cypher +++ b/dataset/long-string-pk-tests/schema.cypher @@ -1,2 +1,2 @@ -CREATE NODE TABLE Person(name STRING, spouse STRING, PRIMARY KEY (name)) +CREATE NODE TABLE Person (name STRING, spouse STRING, PRIMARY KEY (name)) create REL TABLE Knows (FROM Person TO Person); diff --git a/src/common/file_system/file_system.cpp b/src/common/file_system/file_system.cpp index 6bdca6dd63..bc595b94d8 100644 --- a/src/common/file_system/file_system.cpp +++ b/src/common/file_system/file_system.cpp @@ -7,6 +7,10 @@ void FileSystem::overwriteFile(const std::string& /*from*/, const std::string& / KU_UNREACHABLE; } +void FileSystem::copyFile(const std::string& /*from*/, const std::string& /*to*/) const { + KU_UNREACHABLE; +} + void FileSystem::createDir(const std::string& /*dir*/) const { KU_UNREACHABLE; } diff --git a/src/common/file_system/local_file_system.cpp b/src/common/file_system/local_file_system.cpp index 1f91d006b0..71ff32ff9a 100644 --- a/src/common/file_system/local_file_system.cpp +++ b/src/common/file_system/local_file_system.cpp @@ -147,6 +147,18 @@ void LocalFileSystem::overwriteFile(const std::string& from, const std::string& } } +void LocalFileSystem::copyFile(const std::string& from, const std::string& to) const { + if (!fileOrPathExists(from)) + return; + std::error_code errorCode; + if (!std::filesystem::copy_file(from, to, std::filesystem::copy_options::none, errorCode)) { + // LCOV_EXCL_START + throw Exception(stringFormat( + "Error copying file {} to {}. 
ErrorMessage: {}", from, to, errorCode.message())); + // LCOV_EXCL_STOP + } +} + void LocalFileSystem::createDir(const std::string& dir) const { try { if (std::filesystem::exists(dir)) { diff --git a/src/include/common/file_system/file_system.h b/src/include/common/file_system/file_system.h index dce4f2c0db..8e7860a134 100644 --- a/src/include/common/file_system/file_system.h +++ b/src/include/common/file_system/file_system.h @@ -29,6 +29,8 @@ class KUZU_API FileSystem { virtual void overwriteFile(const std::string& from, const std::string& to) const; + virtual void copyFile(const std::string& from, const std::string& to) const; + virtual void createDir(const std::string& dir) const; virtual void removeFileIfExists(const std::string& path) const; diff --git a/src/include/common/file_system/local_file_system.h b/src/include/common/file_system/local_file_system.h index caad85db2b..c29f7b577d 100644 --- a/src/include/common/file_system/local_file_system.h +++ b/src/include/common/file_system/local_file_system.h @@ -36,6 +36,8 @@ class LocalFileSystem final : public FileSystem { void overwriteFile(const std::string& from, const std::string& to) const override; + void copyFile(const std::string& from, const std::string& to) const override; + void createDir(const std::string& dir) const override; void removeFileIfExists(const std::string& path) const override; diff --git a/test/include/test_runner/csv_to_parquet_converter.h b/test/include/test_runner/csv_to_parquet_converter.h index 9a2fe67114..54b9d80551 100644 --- a/test/include/test_runner/csv_to_parquet_converter.h +++ b/test/include/test_runner/csv_to_parquet_converter.h @@ -2,6 +2,8 @@ #include #include +#include "main/kuzu.h" + namespace kuzu { namespace testing { @@ -9,41 +11,51 @@ namespace testing { // The dataset directory must contain schema and copy files. class CSVToParquetConverter { public: - static std::string convertCSVDatasetToParquet( - const std::string& csvDatasetPath, const std::string& parquetDatasetPath); + explicit CSVToParquetConverter( + std::string csvDatasetPath, std::string parquetDatasetPath, uint64_t bufferPoolSize) + : csvDatasetPath{csvDatasetPath}, parquetDatasetPath{parquetDatasetPath}, + bufferPoolSize{bufferPoolSize} {} + + void convertCSVDatasetToParquet(); - inline static std::string replaceSlashesWithUnderscores(std::string dataset) { - std::replace(dataset.begin(), dataset.end(), '/', '_'); - return dataset; - } +private: + void copySchemaFile(); + void createTableInfo(std::string schemaFile); + void readCopyCommandsFromCSVDataset(); + void createCopyFile(); + void convertCSVFilesToParquet(); private: - struct CopyCommandInfo { - std::string table; + struct TableInfo { + std::string name; std::string csvFilePath; std::string parquetFilePath; - bool csvHasHeader; - char delimiter; + // get cypher query to convert csv file to parquet file + virtual std::string getConverterQuery() const = 0; }; - static std::vector readCopyCommandsFromCopyCypherFile( - const std::string& csvDatasetPath, const std::string& parquetDatasetPath); - - static void convertCSVFilesToParquet( - const std::vector& copyCommands); - - static CopyCommandInfo createCopyCommandInfo( - const std::string& parquetDatasetPath, std::string copyStatement); - - // TODO: This has to be re-implemented to not rely on arrow, instead of on our own parquet lib. 
- // static arrow::Status runCSVToParquetConversion(const std::string& inputFile, - // const std::string& outputFile, char delimiter, bool hasHeader); + struct NodeTableInfo final : public TableInfo { + std::string primaryKey; + // get converter query for node table + std::string getConverterQuery() const override; + }; - static void copySchema( - const std::string& csvDatasetPath, const std::string& parquetDatasetPath); + struct RelTableInfo final : public TableInfo { + std::shared_ptr fromTable; + std::shared_ptr toTable; + // get converter query for rel table + std::string getConverterQuery() const override; + }; - static void createCopyFile(const std::string& parquetDatasetPath, - const std::vector& copyCommands); +private: + std::string csvDatasetPath; + std::string parquetDatasetPath; + std::vector> tables; + std::unordered_map> tableNameMap; + uint64_t bufferPoolSize; + std::unique_ptr systemConfig; + std::unique_ptr tempDb; + std::unique_ptr tempConn; }; } // namespace testing diff --git a/test/runner/e2e_test.cpp b/test/runner/e2e_test.cpp index 4b397a2287..3cf9c9d0ac 100644 --- a/test/runner/e2e_test.cpp +++ b/test/runner/e2e_test.cpp @@ -30,17 +30,23 @@ class EndToEndTest : public DBTest { } void setUpDataset() { - parquetTempDatasetPath = generateParquetTempDatasetPath(); - dataset = TestHelper::appendKuzuRootPath("dataset/" + dataset); if (datasetType == TestGroup::DatasetType::CSV_TO_PARQUET) { - throw NotImplementedException("CSV_TO_PARQUET dataset type is not implemented yet."); + auto csvDatasetPath = TestHelper::appendKuzuRootPath("dataset/" + dataset); + parquetTempDatasetPath = generateParquetTempDatasetPath(); + CSVToParquetConverter converter(csvDatasetPath, parquetTempDatasetPath, bufferPoolSize); + converter.convertCSVDatasetToParquet(); + dataset = parquetTempDatasetPath; + } else { + dataset = TestHelper::appendKuzuRootPath("dataset/" + dataset); } } void TearDown() override { std::filesystem::remove_all(databasePath); - std::filesystem::remove_all(parquetTempDatasetPath); BaseGraphTest::removeIEDBPath(); + if (datasetType == TestGroup::DatasetType::CSV_TO_PARQUET) { + std::filesystem::remove_all(parquetTempDatasetPath); + } } void TestBody() override { runTest(testStatements, checkpointWaitTimeout, connNames); } @@ -56,10 +62,11 @@ class EndToEndTest : public DBTest { std::set connNames; std::string generateParquetTempDatasetPath() { - return TestHelper::appendKuzuRootPath( - TestHelper::PARQUET_TEMP_DATASET_PATH + - CSVToParquetConverter::replaceSlashesWithUnderscores(dataset) + getTestGroupAndName() + - TestHelper::getMillisecondsSuffix()); + std::string datasetName = dataset; + std::replace(datasetName.begin(), datasetName.end(), '/', '_'); + return TestHelper::appendKuzuRootPath(TestHelper::PARQUET_TEMP_DATASET_PATH + datasetName + + "_" + getTestGroupAndName() + "_" + + TestHelper::getMillisecondsSuffix()); } }; diff --git a/test/test_files/copy/copy_pk_long_string_parquet.test b/test/test_files/copy/copy_pk_long_string_parquet.test index 0a34d037ff..c70f0e7058 100644 --- a/test/test_files/copy/copy_pk_long_string_parquet.test +++ b/test/test_files/copy/copy_pk_long_string_parquet.test @@ -1,6 +1,5 @@ -GROUP LongStringPKTest -DATASET PARQUET CSV_TO_PARQUET(long-string-pk-tests) --SKIP -- diff --git a/test/test_files/ldbc/ldbc-interactive/interactive-short-parquet.disabled b/test/test_files/ldbc/ldbc-interactive/interactive-short-parquet.test similarity index 69% rename from test/test_files/ldbc/ldbc-interactive/interactive-short-parquet.disabled rename to 
test/test_files/ldbc/ldbc-interactive/interactive-short-parquet.test index d89129ac1b..1a2cafdfa1 100644 --- a/test/test_files/ldbc/ldbc-interactive/interactive-short-parquet.disabled +++ b/test/test_files/ldbc/ldbc-interactive/interactive-short-parquet.test @@ -1,9 +1,7 @@ -# FIXME: this test is failing on Parquet dataset -GROUP LDBCTest -DATASET PARQUET CSV_TO_PARQUET(ldbc-sf01) -BUFFER_POOL_SIZE 1073741824 -SKIP - -- -CASE LDBCInteractiveShortParquet @@ -22,6 +20,20 @@ ---- 1 Mahinda|Perera|19891203|119.235.7.103|Firefox|1353|male|20100214153210447 +# IS2 should be changed to use Kleene Star relationship once that is implemented. +# The 'Case When' statement should be supported as coalesce(). +-LOG IS2 +-STATEMENT MATCH (:Person {id: 21990232555803})<-[:Post_hasCreator|:Comment_hasCreator]-(message) + WITH message,message.id AS messageId, message.creationDate AS messageCreationDate + ORDER BY messageCreationDate DESC, messageId ASC LIMIT 10 + MATCH (message)-[:replyOf_Post|:replyOf_Comment*1..2]->(post:Post), (post)-[:Post_hasCreator]->(person) + RETURN messageId, CASE WHEN message.imageFile is NULL THEN message.content ELSE message.imageFile END AS messageContent, messageCreationDate, post.id AS postId, person.id AS personId, person.firstName AS personFirstName, person.lastName AS personLastName + ORDER BY messageCreationDate DESC, messageId ASC; +---- 3 +1030792343617|About H•A•M, d Kanye West, released as the first single from their collaborati|20120823142307823|1030792343610|21990232556463|Victor|Antonescu +1030792343876|thx|20120818110412997|1030792343869|21990232555803|Carlos|Lopez +1030792343986|fine|20120810030544084|1030792343978|21990232556837|Bing|Li + -LOG IS3 -STATEMENT MATCH (n:Person {id: 1129})-[r:knows]-(friend) RETURN friend.id AS personId, @@ -58,17 +70,11 @@ Mahinda|Perera|19891203|119.235.7.103|Firefox|1353|male|20100214153210447 2199023256077|Ibrahim Bare|Ousmane # IS6 should be changed to use Kleene Star relationship once that is implemented -# This query is currently commented out, but will work as soon as multilabelled recursive rels are merged to master. 
-# -LOG IS6 -# -STATEMENT MATCH (m:Comment {id: 962072675825 })-[:replyOf_Post|:replyOf_Comment*1..2]->(p:Post)<-[:containerOf]-(f:Forum)-[:hasModerator]->(mod:Person) -# RETURN -# f.id AS forumId, -# f.title AS forumTitle, -# mod.id AS moderatorId, -# mod.firstName AS moderatorFirstName, -# mod.lastName AS moderatorLastName; -# ---- 1 -# 687194767491|Wall of Faisal Malik|21990232556585|Faisal|Malik +-LOG IS6 +-STATEMENT MATCH (m:Comment {id: 962072675825 })-[:replyOf_Post|:replyOf_Comment*1..2]->(p:Post)<-[:containerOf]-(f:Forum)-[:hasModerator]->(mod:Person) + RETURN f.id AS forumId,f.title AS forumTitle,mod.id AS moderatorId,mod.firstName AS moderatorFirstName, mod.lastName AS moderatorLastName; +---- 1 +687194767491|Wall of Faisal Malik|21990232556585|Faisal|Malik -LOG IS7 -STATEMENT MATCH (m:Post:Comment {id: 962072971887})<-[:replyOf_Comment|:replyOf_Post]-(c:Comment)-[:Comment_hasCreator]->(p:Person) diff --git a/test/test_files/lsqb/lsqb_queries_parquet.disabled b/test/test_files/lsqb/lsqb_queries_parquet.test similarity index 99% rename from test/test_files/lsqb/lsqb_queries_parquet.disabled rename to test/test_files/lsqb/lsqb_queries_parquet.test index 52497b65ed..1694656414 100644 --- a/test/test_files/lsqb/lsqb_queries_parquet.disabled +++ b/test/test_files/lsqb/lsqb_queries_parquet.test @@ -2,7 +2,6 @@ -DATASET PARQUET CSV_TO_PARQUET(lsqb-sf01) -BUFFER_POOL_SIZE 1073741824 -SKIP - -- -CASE LSQBTestParquet diff --git a/test/test_files/order_by/order_by_parquet.test b/test/test_files/order_by/order_by_parquet.test index 7e59732e50..6b587d3c68 100644 --- a/test/test_files/order_by/order_by_parquet.test +++ b/test/test_files/order_by/order_by_parquet.test @@ -10,7 +10,6 @@ -GROUP OrderByTests -DATASET PARQUET CSV_TO_PARQUET(order-by-tests) --SKIP -- diff --git a/test/test_files/read_list/large_adj_list_parquet.test b/test/test_files/read_list/large_adj_list_parquet.test index 5012aad215..629908f427 100644 --- a/test/test_files/read_list/large_adj_list_parquet.test +++ b/test/test_files/read_list/large_adj_list_parquet.test @@ -1,6 +1,5 @@ -GROUP EndToEndReadLargeListsTest -DATASET PARQUET CSV_TO_PARQUET(read-list-tests/large-list) --SKIP -- diff --git a/test/test_files/shortest_path/bfs_sssp_parquet.test b/test/test_files/shortest_path/bfs_sssp_parquet.test index 6fdaa31dba..fde3cc4253 100644 --- a/test/test_files/shortest_path/bfs_sssp_parquet.test +++ b/test/test_files/shortest_path/bfs_sssp_parquet.test @@ -1,42 +1,41 @@ -GROUP ShortestPathTest -DATASET PARQUET CSV_TO_PARQUET(shortest-path-tests) --SKIP # Timestamp and Date columns are written as strings in parquet file. Parquet reader should support casting operations. 
-- -CASE BfsParquet -LOG SingleSourceAllDestinationsSSP --STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' RETURN a.fName, b.fName, rels(r), properties(nodes(r), 'fName') +-STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' RETURN a.fName, b.fName, properties(nodes(r), 'fName') ---- 7 -Alice|Bob|[(0:0)-{_LABEL: knows, _ID: 1:0}->(0:1)]|[] -Alice|Carol|[(0:0)-{_LABEL: knows, _ID: 1:1}->(0:2)]|[] -Alice|Dan|[(0:0)-{_LABEL: knows, _ID: 1:2}->(0:3)]|[] -Alice|Elizabeth|[(0:0)-{_LABEL: knows, _ID: 1:0}->(0:1),(0:1)-{_LABEL: knows, _ID: 1:6}->(0:4)]|[Bob] -Alice|Farooq|[(0:0)-{_LABEL: knows, _ID: 1:0}->(0:1),(0:1)-{_LABEL: knows, _ID: 1:6}->(0:4),(0:4)-{_LABEL: knows, _ID: 1:13}->(0:5)]|[Bob,Elizabeth] -Alice|Greg|[(0:0)-{_LABEL: knows, _ID: 1:0}->(0:1),(0:1)-{_LABEL: knows, _ID: 1:6}->(0:4),(0:4)-{_LABEL: knows, _ID: 1:14}->(0:6)]|[Bob,Elizabeth] -Alice|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|[(0:0)-{_LABEL: knows, _ID: 1:0}->(0:1),(0:1)-{_LABEL: knows, _ID: 1:6}->(0:4),(0:4)-{_LABEL: knows, _ID: 1:15}->(0:7)]|[Bob,Elizabeth] +Alice|Bob|[] +Alice|Carol|[] +Alice|Dan|[] +Alice|Elizabeth|[Bob] +Alice|Farooq|[Bob,Elizabeth] +Alice|Greg|[Bob,Elizabeth] +Alice|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|[Bob,Elizabeth] -LOG AllSourcesSingleDestinationQuery --STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE b.fName = 'Alice' RETURN a.fName, b.fName, rels(r), properties(nodes(r), 'usedNames') +-STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE b.fName = 'Alice' RETURN a.fName, b.fName, properties(nodes(r), 'usedNames') ---- 6 -Bob|Alice|[(0:0)-{_LABEL: knows, _ID: 1:3}->(0:1)]|[] -Carol|Alice|[(0:0)-{_LABEL: knows, _ID: 1:7}->(0:2)]|[] -Dan|Alice|[(0:0)-{_LABEL: knows, _ID: 1:10}->(0:3)]|[] -Elizabeth|Alice|[(0:0)-{_LABEL: knows, _ID: 1:20}->(0:7),(0:7)-{_LABEL: knows, _ID: 1:15}->(0:4)]|[[Ad,De,Hi,Kye,Orlan]] -Farooq|Alice|[(0:0)-{_LABEL: knows, _ID: 1:20}->(0:7),(0:7)-{_LABEL: knows, _ID: 1:17}->(0:5)]|[[Ad,De,Hi,Kye,Orlan]] -Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|Alice|[(0:0)-{_LABEL: knows, _ID: 1:20}->(0:7)]|[] +Bob|Alice|[] +Carol|Alice|[] +Dan|Alice|[] +Elizabeth|Alice|[[Ad,De,Hi,Kye,Orlan]] +Farooq|Alice|[[Ad,De,Hi,Kye,Orlan]] +Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|Alice|[] -LOG SingleSourceWithAllProperties --STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' RETURN length(r), b, a +-STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' RETURN length(r), b.*, a.* ---- 7 -1|{_ID: 0:1, _LABEL: person, ID: 2, fName: Bob, gender: 2, isStudent: True, isWorker: False, age: 30, eyeSight: 5.100000, birthdate: 1900-01-01, registerTime: 2008-11-03 15:25:30.000526, lastJobDuration: 10 years 5 months 13:00:00.000024, workedHours: [12,8], usedNames: [Bobby], courseScoresPerTerm: [[8,9],[9,10]]}|{_ID: 0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]]} -1|{_ID: 0:2, _LABEL: person, ID: 3, fName: Carol, gender: 1, isStudent: False, isWorker: True, age: 45, eyeSight: 5.000000, birthdate: 1940-06-22, registerTime: 1911-08-20 02:32:21, lastJobDuration: 48:24:11, workedHours: [4,5], usedNames: [Carmen,Fred], courseScoresPerTerm: [[8,10]]}|{_ID: 
0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]]} -1|{_ID: 0:3, _LABEL: person, ID: 5, fName: Dan, gender: 2, isStudent: False, isWorker: True, age: 20, eyeSight: 4.800000, birthdate: 1950-07-23, registerTime: 2031-11-30 12:25:30, lastJobDuration: 10 years 5 months 13:00:00.000024, workedHours: [1,9], usedNames: [Wolfeschlegelstein,Daniel], courseScoresPerTerm: [[7,4],[8,8],[9]]}|{_ID: 0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]]} -2|{_ID: 0:4, _LABEL: person, ID: 7, fName: Elizabeth, gender: 1, isStudent: False, isWorker: True, age: 20, eyeSight: 4.700000, birthdate: 1980-10-26, registerTime: 1976-12-23 11:21:42, lastJobDuration: 48:24:11, workedHours: [2], usedNames: [Ein], courseScoresPerTerm: [[6],[7],[8]]}|{_ID: 0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]]} -3|{_ID: 0:5, _LABEL: person, ID: 8, fName: Farooq, gender: 2, isStudent: True, isWorker: False, age: 25, eyeSight: 4.500000, birthdate: 1980-10-26, registerTime: 1972-07-31 13:22:30.678559, lastJobDuration: 00:18:00.024, workedHours: [3,4,5,6,7], usedNames: [Fesdwe], courseScoresPerTerm: [[8]]}|{_ID: 0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]]} -3|{_ID: 0:6, _LABEL: person, ID: 9, fName: Greg, gender: 2, isStudent: False, isWorker: False, age: 40, eyeSight: 4.900000, birthdate: 1980-10-26, registerTime: 1976-12-23 04:41:42, lastJobDuration: 10 years 5 months 13:00:00.000024, workedHours: [1], usedNames: [Grad], courseScoresPerTerm: [[10]]}|{_ID: 0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]]} -3|{_ID: 0:7, _LABEL: person, ID: 10, fName: Hubert Blaine Wolfeschlegelsteinhausenbergerdorff, gender: 2, isStudent: False, isWorker: True, age: 83, eyeSight: 4.900000, birthdate: 1990-11-27, registerTime: 2023-02-21 13:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,11,12,3,4,5,6,7], usedNames: [Ad,De,Hi,Kye,Orlan], courseScoresPerTerm: [[7],[10],[6,7]]}|{_ID: 0:0, _LABEL: person, ID: 0, fName: Alice, gender: 1, isStudent: True, isWorker: False, age: 35, eyeSight: 5.000000, birthdate: 1900-01-01, registerTime: 2011-08-20 11:25:30, lastJobDuration: 3 years 2 days 13:02:00, workedHours: [10,5], usedNames: [Aida], courseScoresPerTerm: [[10,8],[6,7,8]]} +1|2|Bob|2|True|False|30|5.100000|1900-01-01|2008-11-03 15:25:30.000526|10 years 5 months 
13:00:00|[12,8]|[Bobby]|[[8,9],[9,10]]|0|Alice|1|True|False|35|5.000000|1900-01-01|2011-08-20 11:25:30|3 years 2 days 13:02:00|[10,5]|[Aida]|[[10,8],[6,7,8]] +1|3|Carol|1|False|True|45|5.000000|1940-06-22|1911-08-20 02:32:21|48:24:11|[4,5]|[Carmen,Fred]|[[8,10]]|0|Alice|1|True|False|35|5.000000|1900-01-01|2011-08-20 11:25:30|3 years 2 days 13:02:00|[10,5]|[Aida]|[[10,8],[6,7,8]] +1|5|Dan|2|False|True|20|4.800000|1950-07-23|2031-11-30 12:25:30|10 years 5 months 13:00:00|[1,9]|[Wolfeschlegelstein,Daniel]|[[7,4],[8,8],[9]]|0|Alice|1|True|False|35|5.000000|1900-01-01|2011-08-20 11:25:30|3 years 2 days 13:02:00|[10,5]|[Aida]|[[10,8],[6,7,8]] +2|7|Elizabeth|1|False|True|20|4.700000|1980-10-26|1976-12-23 11:21:42|48:24:11|[2]|[Ein]|[[6],[7],[8]]|0|Alice|1|True|False|35|5.000000|1900-01-01|2011-08-20 11:25:30|3 years 2 days 13:02:00|[10,5]|[Aida]|[[10,8],[6,7,8]] +3|10|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|2|False|True|83|4.900000|1990-11-27|2023-02-21 13:25:30|3 years 2 days 13:02:00|[10,11,12,3,4,5,6,7]|[Ad,De,Hi,Kye,Orlan]|[[7],[10],[6,7]]|0|Alice|1|True|False|35|5.000000|1900-01-01|2011-08-20 11:25:30|3 years 2 days 13:02:00|[10,5]|[Aida]|[[10,8],[6,7,8]] +3|8|Farooq|2|True|False|25|4.500000|1980-10-26|1972-07-31 13:22:30.678559|00:18:00.024|[3,4,5,6,7]|[Fesdwe]|[[8]]|0|Alice|1|True|False|35|5.000000|1900-01-01|2011-08-20 11:25:30|3 years 2 days 13:02:00|[10,5]|[Aida]|[[10,8],[6,7,8]] +3|9|Greg|2|False|False|40|4.900000|1980-10-26|1976-12-23 04:41:42|10 years 5 months 13:00:00|[1]|[Grad]|[[10]]|0|Alice|1|True|False|35|5.000000|1900-01-01|2011-08-20 11:25:30|3 years 2 days 13:02:00|[10,5]|[Aida]|[[10,8],[6,7,8]] -LOG SingleSourceSingleDestination -STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' AND b.fName = 'Bob' RETURN a.fName, b.fName, length(r) @@ -44,13 +43,13 @@ Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|Alice|[(0:0)-{_LABEL: knows, _ Alice|Bob|1 -LOG SingleSourceAllDestinations2 --STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..2]->(b:person) WHERE a.fName = 'Elizabeth' RETURN a.fName, b.fName, rels(r), properties(nodes(r), '_Label') +-STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..2]->(b:person) WHERE a.fName = 'Elizabeth' RETURN a.fName, b.fName, properties(nodes(r), '_Label') ---- 5 -Elizabeth|Alice|[(0:4)-{_LABEL: knows, _ID: 1:15}->(0:7),(0:7)-{_LABEL: knows, _ID: 1:20}->(0:0)]|[person] -Elizabeth|Dan|[(0:4)-{_LABEL: knows, _ID: 1:15}->(0:7),(0:7)-{_LABEL: knows, _ID: 1:21}->(0:3)]|[person] -Elizabeth|Farooq|[(0:4)-{_LABEL: knows, _ID: 1:13}->(0:5)]|[] -Elizabeth|Greg|[(0:4)-{_LABEL: knows, _ID: 1:14}->(0:6)]|[] -Elizabeth|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|[(0:4)-{_LABEL: knows, _ID: 1:15}->(0:7)]|[] +Elizabeth|Alice|[person] +Elizabeth|Dan|[person] +Elizabeth|Farooq|[] +Elizabeth|Greg|[] +Elizabeth|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|[] -LOG SingleSourceUnreachableDestination -STATEMENT MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' AND b.fName = 'Alice11' RETURN a.fName, b.fName, r diff --git a/test/test_files/tinysnb/parquet/tinysnb_parquet.test b/test/test_files/tinysnb/parquet/tinysnb_parquet.test index 18378e4dab..bff6331e0e 100644 --- a/test/test_files/tinysnb/parquet/tinysnb_parquet.test +++ b/test/test_files/tinysnb/parquet/tinysnb_parquet.test @@ -1,9 +1,7 @@ -# one sample from each tinysnb csv test CSV_TO_PARQUET -# https://github.com/kuzudb/kuzu/issues/1627 +# This test group is currently disabled since exporting fixed-list to parquet is not 
supported yet
 -GROUP TinySnbParquet
--SKIP
 -DATASET PARQUET CSV_TO_PARQUET(tinysnb)
-
+-SKIP
 --
 
 -CASE TinySnbParquet
diff --git a/test/test_runner/CMakeLists.txt b/test/test_runner/CMakeLists.txt
index 594f6028b6..f1703a4572 100644
--- a/test/test_runner/CMakeLists.txt
+++ b/test/test_runner/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_library(
         test_runner
         OBJECT
+        csv_to_parquet_converter.cpp
         test_parser.cpp
         test_runner.cpp
 )
diff --git a/test/test_runner/csv_to_parquet_converter.cpp b/test/test_runner/csv_to_parquet_converter.cpp
new file mode 100644
index 0000000000..a76047b2c9
--- /dev/null
+++ b/test/test_runner/csv_to_parquet_converter.cpp
@@ -0,0 +1,191 @@
+#include "test_runner/csv_to_parquet_converter.h"
+
+#include
+
+#include "common/exception/test.h"
+#include "common/file_system/local_file_system.h"
+#include "common/string_utils.h"
+#include "spdlog/spdlog.h"
+#include "test_helper/test_helper.h"
+
+using namespace kuzu::common;
+
+namespace kuzu {
+namespace testing {
+
+void CSVToParquetConverter::copySchemaFile() {
+    LocalFileSystem localFileSystem;
+    auto csvSchemaFile =
+        localFileSystem.joinPath(csvDatasetPath, std::string(TestHelper::SCHEMA_FILE_NAME));
+    auto parquetSchemaFile =
+        localFileSystem.joinPath(parquetDatasetPath, std::string(TestHelper::SCHEMA_FILE_NAME));
+    if (!localFileSystem.fileOrPathExists(parquetSchemaFile)) {
+        localFileSystem.copyFile(csvSchemaFile, parquetSchemaFile);
+    } else {
+        localFileSystem.overwriteFile(csvSchemaFile, parquetSchemaFile);
+    }
+    createTableInfo(parquetSchemaFile);
+}
+
+void CSVToParquetConverter::createTableInfo(std::string schemaFile) {
+    std::ifstream file(schemaFile);
+    if (!file.is_open()) {
+        throw TestException(stringFormat("Error opening file: {}, errno: {}.", schemaFile, errno));
+    }
+    // This implementation stays as a temporary solution to create copy statements for rel tables
+    // We'll switch to use table_info once that function can provide everything needed
+    // table_info is mentioned in this issue https://github.com/kuzudb/kuzu/issues/2991
+    std::string line;
+    while (getline(file, line)) {
+        auto tokens = StringUtils::split(line, " ");
+
+        std::transform(tokens[0].begin(), tokens[0].end(), tokens[0].begin(),
+            [](unsigned char c) { return std::tolower(c); });
+        std::transform(tokens[2].begin(), tokens[2].end(), tokens[2].begin(),
+            [](unsigned char c) { return std::tolower(c); });
+        if (tokens[0] != "create" || tokens[2] != "table") {
+            throw TestException(stringFormat("Invalid CREATE statement: {}", line));
+        }
+
+        auto tableType = tokens[1];
+        std::transform(tableType.begin(), tableType.end(), tableType.begin(),
+            [](unsigned char c) { return std::tolower(c); });
+        auto tableName = tokens[3];
+
+        std::shared_ptr<TableInfo> table;
+        if (tableType == "node") {
+            auto nodeTable = std::make_shared<NodeTableInfo>();
+            size_t primaryKeyPos = line.find("PRIMARY KEY");
+            if (primaryKeyPos != std::string::npos) {
+                size_t openParenPos = line.find("(", primaryKeyPos);
+                size_t closeParenPos = line.find(")", primaryKeyPos);
+                if (openParenPos != std::string::npos && closeParenPos != std::string::npos &&
+                    openParenPos < closeParenPos) {
+                    nodeTable->primaryKey =
+                        line.substr(openParenPos + 1, closeParenPos - openParenPos - 1);
+                    table = nodeTable;
+                } else {
+                    throw TestException(
+                        stringFormat("PRIMARY KEY is not defined in node table: {}", line));
+                }
+            } else {
+                throw TestException(
+                    stringFormat("PRIMARY KEY is not defined in node table: {}", line));
+            }
+        } else {
+            auto relTable = std::make_shared<RelTableInfo>();
+            size_t startPos = line.find("FROM");
+            if (startPos != std::string::npos) {
+                size_t endPos = line.find_first_of(",)", startPos);
+                if (endPos != std::string::npos) {
+                    auto tmp = StringUtils::splitBySpace(line.substr(startPos, endPos - startPos));
+                    relTable->fromTable =
+                        std::dynamic_pointer_cast<NodeTableInfo>(tableNameMap[tmp[1]]);
+                    relTable->toTable =
+                        std::dynamic_pointer_cast<NodeTableInfo>(tableNameMap[tmp[3]]);
+                    table = relTable;
+                } else {
+                    throw TestException(stringFormat(
+                        "FROM node and TO node are not defined in rel table: {}", line));
+                }
+            } else {
+                throw TestException(
+                    stringFormat("FROM node and TO node are not defined in rel table: {}", line));
+            }
+        }
+        table->name = tableName;
+        tables.push_back(table);
+        tableNameMap[tableName] = table;
+    }
+}
+
+std::string extractPath(std::string& str, char delimiter) {
+    std::string::size_type posStart = str.find_first_of(delimiter);
+    std::string::size_type posEnd = str.find_last_of(delimiter);
+    return str.substr(posStart + 1, posEnd - posStart - 1);
+}
+
+void CSVToParquetConverter::readCopyCommandsFromCSVDataset() {
+    auto csvCopyFile =
+        LocalFileSystem::joinPath(csvDatasetPath, std::string(TestHelper::COPY_FILE_NAME));
+    std::ifstream file(csvCopyFile);
+    if (!file.is_open()) {
+        throw TestException(stringFormat("Error opening file: {}, errno: {}.", csvCopyFile, errno));
+    }
+    std::string line;
+    while (getline(file, line)) {
+        auto tokens = StringUtils::split(line, " ");
+        auto path = std::filesystem::path(extractPath(tokens[3], '"'));
+        auto table = tableNameMap[tokens[1]];
+        table->csvFilePath = TestHelper::appendKuzuRootPath(path.string());
+        auto parquetFileName = path.stem().string() + ".parquet";
+        table->parquetFilePath = parquetDatasetPath + "/" + parquetFileName;
+    }
+}
+
+void CSVToParquetConverter::createCopyFile() {
+    readCopyCommandsFromCSVDataset();
+    auto parquetCopyFile =
+        LocalFileSystem::joinPath(parquetDatasetPath, std::string(TestHelper::COPY_FILE_NAME));
+    std::ofstream outfile(parquetCopyFile);
+    if (!outfile.is_open()) {
+        throw TestException(
+            stringFormat("Error opening file: {}, errno: {}.", parquetCopyFile, errno));
+    }
+    std::string kuzuRootPath = KUZU_ROOT_DIRECTORY + std::string("/");
+    for (auto table : tables) {
+        auto cmd = stringFormat("COPY {} FROM \"{}\";", table->name,
+            table->parquetFilePath.substr(kuzuRootPath.length()));
+        outfile << cmd << '\n';
+    }
+}
+
+void CSVToParquetConverter::convertCSVFilesToParquet() {
+    // Load CSV Files to temp database
+    TestHelper::executeScript(
+        LocalFileSystem::joinPath(csvDatasetPath, std::string(TestHelper::SCHEMA_FILE_NAME)),
+        *tempConn);
+    TestHelper::executeScript(
+        LocalFileSystem::joinPath(csvDatasetPath, std::string(TestHelper::COPY_FILE_NAME)),
+        *tempConn);
+
+    spdlog::set_level(spdlog::level::info);
+    for (auto table : tables) {
+        spdlog::info("Converting: {} to {}", table->csvFilePath, table->parquetFilePath);
+        auto cmd = table->getConverterQuery();
+        tempConn->query(cmd);
+        spdlog::info("Executed query: {}", cmd);
+    }
+}
+
+void CSVToParquetConverter::convertCSVDatasetToParquet() {
+    LocalFileSystem localFileSystem;
+    if (!localFileSystem.fileOrPathExists(parquetDatasetPath)) {
+        localFileSystem.createDir(parquetDatasetPath);
+    }
+
+    copySchemaFile();
+    createCopyFile();
+
+    systemConfig = std::make_unique<main::SystemConfig>(bufferPoolSize);
+    std::string tempDatabasePath = TestHelper::appendKuzuRootPath(
+        std::string(TestHelper::TMP_TEST_DIR) + "csv_to_parquet_converter_" +
+        TestHelper::getMillisecondsSuffix());
+    tempDb = std::make_unique<main::Database>(tempDatabasePath, *systemConfig);
+    tempConn = std::make_unique<main::Connection>(tempDb.get());
+
+    convertCSVFilesToParquet();
+    std::filesystem::remove_all(tempDatabasePath);
+}
+
+std::string CSVToParquetConverter::NodeTableInfo::getConverterQuery() const {
+    return stringFormat("COPY (MATCH (a:{}) RETURN a.*) TO \"{}\";", name, parquetFilePath);
+}
+
+std::string CSVToParquetConverter::RelTableInfo::getConverterQuery() const {
+    return stringFormat("COPY (MATCH (a)-[e:{}]->(b) RETURN a.{}, b.{}, e.*) TO \"{}\";", name,
+        fromTable->primaryKey, toTable->primaryKey, parquetFilePath);
+}
+
+} // namespace testing
+} // namespace kuzu
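Note (illustration only, not part of the patch): with the tinysnb dataset used by the tests above, and assuming that person's primary key column is ID and that the CSV files are named vPerson.csv and eKnows.csv, the queries produced by NodeTableInfo::getConverterQuery and RelTableInfo::getConverterQuery would look roughly like:

COPY (MATCH (a:person) RETURN a.*) TO "<parquetTempDatasetPath>/vPerson.parquet";
COPY (MATCH (a)-[e:knows]->(b) RETURN a.ID, b.ID, e.*) TO "<parquetTempDatasetPath>/eKnows.parquet";

The generated copy file (TestHelper::COPY_FILE_NAME) in the temp dataset would then contain statements such as COPY person FROM "<parquetTempDatasetPath>/vPerson.parquet"; (with the kuzu root prefix stripped), so the end-to-end test loads the converted Parquet files through the ordinary dataset path.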