Skip to content

Commit

Permalink
Added parquet tinysnb test and parser adjustments
Browse files Browse the repository at this point in the history
  • Loading branch information
rfdavid committed Jun 5, 2023
1 parent 2233517 commit dd0889c
Show file tree
Hide file tree
Showing 15 changed files with 339 additions and 75 deletions.
4 changes: 2 additions & 2 deletions dataset/tinysnb/copy.cypher
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
COPY person FROM "dataset/tinysnb/vPerson.csv" (HeaDER=true, deLim=',');
COPY person FROM "dataset/tinysnb/vPerson.csv" (HEADER=true, deLim=',');
COPY organisation FROM "dataset/tinysnb/vOrganisation.csv";
COPY movies FROM "dataset/tinysnb/vMovies.csv";
COPY knows FROM "dataset/tinysnb/eKnows.csv";
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADeR=true);
COPY studyAt FROM "dataset/tinysnb/eStudyAt.csv" (HEADER=true);
COPY workAt FROM "dataset/tinysnb/eWorkAt.csv"
COPY meets FROM "dataset/tinysnb/eMeets.csv"
COPY marries FROM "dataset/tinysnb/eMarries.csv"
41 changes: 7 additions & 34 deletions scripts/parquet/csv_to_parquet.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,13 @@
from pyarrow import csv
import pyarrow.parquet as pq

csv_files = ['/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Comment.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Comment_hasCreator_Person.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Comment_hasTag_Tag.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Comment_isLocatedIn_Place.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Comment_replyOf_Comment.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Comment_replyOf_Post.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Forum.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Forum_containerOf_Post.csv',
'/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Forum_hasMember_Person.csv',
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Forum_hasModerator_Person.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Forum_hasTag_Tag.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Organisation.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Organisation_isLocatedIn_Place.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person_hasInterest_Tag.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person_isLocatedIn_Place.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person_knows_Person.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person_likes_Comment.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person_likes_Post.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person_studyAt_Organisation.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Person_workAt_Organisation.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Place.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Place_isPartOf_Place.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Post.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Post_hasCreator_Person.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Post_hasTag_Tag.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Post_isLocatedIn_Place.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Tag.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/TagClass.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/TagClass_isSubclassOf_TagClass.csv",
"/Users/rfdavid/Devel/waterloo/kuzu/dataset/ldbc-sf01/Tag_hasType_TagClass.csv"]

read_options = csv.ReadOptions(autogenerate_column_names=False)
parse_options = csv.ParseOptions(delimiter="|")
csv_files = ['/Users/rfdavid/Devel/waterloo/kuzu/dataset/tinysnb/vPerson.csv']
has_header = True
# autogenerate_column_names must be the inverse of has_header:
#   CSV has a header row  -> autogenerate_column_names=False
#   CSV has no header row -> autogenerate_column_names=True
read_options = csv.ReadOptions(autogenerate_column_names=not has_header)
parse_options = csv.ParseOptions(delimiter=",")
for csv_file in csv_files:
table = csv.read_csv(csv_file, read_options=read_options,
parse_options=parse_options)
Expand Down
6 changes: 6 additions & 0 deletions src/common/string_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ std::vector<std::string> StringUtils::splitBySpace(const std::string& input) {
return result;
}

// Returns the text strictly between the first and the last occurrence of `delimiter` in `str`
// (both delimiters excluded), e.g. the contents of a quoted token. `str` is not modified.
//
// Returns an empty string when `delimiter` occurs fewer than two times. Without this guard the
// unsigned arithmetic below wraps around and substr() would return the whole input (no
// delimiter present) or everything after the single delimiter, neither of which is "between"
// two delimiters.
std::string StringUtils::extractSubstring(std::string& str, char delimiter) {
    const std::string::size_type posStart = str.find(delimiter);
    const std::string::size_type posEnd = str.rfind(delimiter);
    // posStart == npos: no delimiter at all; posStart == posEnd: exactly one delimiter.
    if (posStart == std::string::npos || posStart == posEnd) {
        return std::string();
    }
    return str.substr(posStart + 1, posEnd - posStart - 1);
}

void StringUtils::replaceAll(
std::string& str, const std::string& search, const std::string& replacement) {
size_t pos = 0;
Expand Down
2 changes: 2 additions & 0 deletions src/include/common/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class StringUtils {
return s;
}

// Returns the part of `str` that lies between the first and the last occurrence of
// `delimiter`, with both delimiters excluded. `str` itself is only read.
static std::string extractSubstring(std::string& str, char delimiter);

static inline void removeWhiteSpaces(std::string& str) {
std::regex whiteSpacePattern{"\\s"};
str = std::regex_replace(str, whiteSpacePattern, "");
Expand Down
20 changes: 10 additions & 10 deletions test/include/test_runner/csv_to_parquet_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,28 @@ class CSVToParquetConverter {
char delimiter;
};

static std::vector<CopyCommandInfo> readCopyCommandsFromCopyCypherFile(const std::string& dataset);
static std::vector<CopyCommandInfo> readCopyCommandsFromCopyCypherFile(
const std::string& dataset);

static void convertCSVFilesToParquet(const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);
static void convertCSVFilesToParquet(
const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);

static CopyCommandInfo createCopyCommandInfo(const std::string& dataset, const std::string& copyStatement);
static CopyCommandInfo createCopyCommandInfo(
const std::string& dataset, const std::string& copyStatement);

static arrow::Status RunCSVToParquetConversion(
const std::string& inputFile, const std::string& outputFile, char delimiter, bool hasHeader);
static arrow::Status RunCSVToParquetConversion(const std::string& inputFile,
const std::string& outputFile, char delimiter, bool hasHeader);

static void copySchema(
const std::string& csvDatasetPath, const std::string& parquetDatasetPath);

static void createCopyFile(const std::string& dataset, const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);
static void createCopyFile(const std::string& dataset,
const std::vector<CSVToParquetConverter::CopyCommandInfo>& copyCommands);

// Returns a copy of `dataset` in which every '/' has been turned into '_'.
// The argument is taken by value, so the caller's string is left untouched.
inline static std::string replaceSlashesWithUnderscores(std::string dataset) {
    for (char& ch : dataset) {
        if (ch == '/') {
            ch = '_';
        }
    }
    return dataset;
}

// Strips every double-quote character (") from `str` in place; all other characters keep
// their relative order.
inline static void removeQuotes(std::string& str) {
    auto writePos = str.begin();
    for (auto readPos = str.begin(); readPos != str.end(); ++readPos) {
        if (*readPos != '"') {
            *writePos = *readPos;
            ++writePos;
        }
    }
    // Everything from writePos onwards is leftover tail from the compaction above.
    str.erase(writePos, str.end());
}
};

} // namespace testing
Expand Down
2 changes: 1 addition & 1 deletion test/runner/e2e_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,6 @@ int main(int argc, char** argv) {
FileUtils::createDirIfNotExists(parquetDatasetTempDir);
scanTestFiles(path);
auto result = RUN_ALL_TESTS();
// FileUtils::removeDir(parquetDatasetTempDir);
FileUtils::removeDir(parquetDatasetTempDir);
return result;
}
2 changes: 1 addition & 1 deletion test/test_files/demo_db/demo_db_parquet.test
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-GROUP DemoDBTest
-DATASET PARQUET CSV_TO_PARQUET(demo-db/csv)
-DATASET PARQUET demo-db/parquet

--

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
# FIXME: this test is failing on Parquet dataset
-GROUP LDBCTest
-DATASET PARQUET CSV_TO_PARQUET(ldbc-sf01)
-BUFFER_POOL_SIZE 1073741824
-SKIP

--

-CASE LDBCInteractiveShort
-CASE LDBCInteractiveShortParquet

-NAME IS1
-QUERY MATCH (n:Person {id: 933})-[:Person_isLocatedIn_Place]->(p:Place)
Expand Down
4 changes: 2 additions & 2 deletions test/test_files/long_string_pk/long_string_pk_parquet.test
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

--

-CASE LongStringPKTest
-CASE LongStringPKTestParquet

-NAME LongStringPKTest
-NAME LongStringPKTestParquet
-QUERY MATCH (a:Person)-[e:Knows]->(b:Person) WHERE a.name = "AAAAAAAAAAAAAAAAAAAA" RETURN COUNT(*)
---- 1
1
2 changes: 1 addition & 1 deletion test/test_files/lsqb/lsqb_queries_parquet.test
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

--

-CASE LSQBTest
-CASE LSQBTestParquet

-NAME q1
-QUERY MATCH (:Country)<-[:City_isPartOf_Country]-(:City)<-[:Person_isLocatedIn_City]-(:Person)<-[:Forum_hasMember_Person]-(:Forum)-[:Forum_containerOf_Post]->(:Post)<-[Comment_replyOf_Post]-(:Comment)-[:Comment_hasTag_Tag]->(:Tag)-[:Tag_hasType_TagClass]->(:TagClass) RETURN count(*) as count;
Expand Down
2 changes: 1 addition & 1 deletion test/test_files/order_by/order_by_parquet.test
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

--

-CASE OrderByLargeDatasetTest
-CASE OrderByLargeDatasetTestParquet

-NAME OrderByWithLimitTest
-QUERY MATCH (p:person) RETURN p.balance ORDER BY p.balance limit 25
Expand Down
43 changes: 43 additions & 0 deletions test/test_files/read_list/large_adj_list_parquet.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
-GROUP EndToEndReadLargeListsTest
-DATASET PARQUET CSV_TO_PARQUET(read-list-tests/large-list)

--

-CASE LargeAdjListTestParquet

-NAME NodeP0
-QUERY MATCH (a:person)-[:knows]->(b:person) WHERE a.ID = 0 RETURN COUNT(*)
-ENUMERATE
---- 1
5001

-NAME NodeP1
-QUERY MATCH (a:person)-[:knows]->(b:person) WHERE a.ID = 1 RETURN b.ID
-ENUMERATE
---- 1
5000

-NAME NodeP5500
-QUERY MATCH (a:person)-[:knows]->(b:person) WHERE a.ID = 5500 RETURN b.ID
-ENUMERATE
---- 0

-NAME EdgeID
-QUERY MATCH (a:person)-[r:knows]->(b:person) WHERE a.ID = 5000 RETURN ID(r)
-ENUMERATE
---- 1
1:10000

-NAME CrossProduct1
-QUERY MATCH (a:person), (b:person) RETURN COUNT(*)
-PARALLELISM 2
-ENUMERATE
---- 1
36000000

-NAME CrossProduct
-QUERY MATCH (a:person) WITH AVG(a.ID) AS s MATCH (b:person) WHERE s > b.ID RETURN COUNT(*)
-PARALLELISM 8
-ENUMERATE
---- 1
3000
88 changes: 88 additions & 0 deletions test/test_files/shortest_path/bfs_sssp_parquet.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# FIXME: this test is segfaulting
-GROUP ShortestPathTest
-SKIP
-DATASET PARQUET CSV_TO_PARQUET(shortest-path-tests)

--

-CASE BfsParquet

-NAME SingleSourceAllDestinationsSSP
-QUERY MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' RETURN a.fName, b.fName, r
---- 7
Alice|Bob|[0:0,1:0,0:1]
Alice|Carol|[0:0,1:1,0:2]
Alice|Dan|[0:0,1:2,0:3]
Alice|Elizabeth|[0:0,1:0,0:1,1:6,0:4]
Alice|Farooq|[0:0,1:0,0:1,1:6,0:4,1:13,0:5]
Alice|Greg|[0:0,1:0,0:1,1:6,0:4,1:14,0:6]
Alice|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|[0:0,1:0,0:1,1:6,0:4,1:15,0:7]

-NAME AllSourcesSingleDestinationQuery
-QUERY MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE b.fName = 'Alice' RETURN a.fName, b.fName, r
---- 6
Bob|Alice|[0:0,1:3,0:1]
Carol|Alice|[0:0,1:7,0:2]
Dan|Alice|[0:0,1:10,0:3]
Elizabeth|Alice|[0:0,1:20,0:7,1:15,0:4]
Farooq|Alice|[0:0,1:20,0:7,1:17,0:5]
Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|Alice|[0:0,1:20,0:7]

-NAME SingleSourceWithAllProperties
-QUERY MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' RETURN length(r), b, a
---- 7
1|(label:person, 0:1, {ID:2, fName:Bob, gender:2, isStudent:True, isWorker:False, age:30, eyeSight:5.100000, birthdate:1900-01-01, registerTime:2008-11-03 15:25:30.000526, lastJobDuration:10 years 5 months 13:00:00.000024, workedHours:[12,8], usedNames:[Bobby], courseScoresPerTerm:[[8,9],[9,10]]})|(label:person, 0:0, {ID:0, fName:Alice, gender:1, isStudent:True, isWorker:False, age:35, eyeSight:5.000000, birthdate:1900-01-01, registerTime:2011-08-20 11:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,5], usedNames:[Aida], courseScoresPerTerm:[[10,8],[6,7,8]]})
1|(label:person, 0:2, {ID:3, fName:Carol, gender:1, isStudent:False, isWorker:True, age:45, eyeSight:5.000000, birthdate:1940-06-22, registerTime:1911-08-20 02:32:21, lastJobDuration:48:24:11, workedHours:[4,5], usedNames:[Carmen,Fred], courseScoresPerTerm:[[8,10]]})|(label:person, 0:0, {ID:0, fName:Alice, gender:1, isStudent:True, isWorker:False, age:35, eyeSight:5.000000, birthdate:1900-01-01, registerTime:2011-08-20 11:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,5], usedNames:[Aida], courseScoresPerTerm:[[10,8],[6,7,8]]})
1|(label:person, 0:3, {ID:5, fName:Dan, gender:2, isStudent:False, isWorker:True, age:20, eyeSight:4.800000, birthdate:1950-07-23, registerTime:2031-11-30 12:25:30, lastJobDuration:10 years 5 months 13:00:00.000024, workedHours:[1,9], usedNames:[Wolfeschlegelstein,Daniel], courseScoresPerTerm:[[7,4],[8,8],[9]]})|(label:person, 0:0, {ID:0, fName:Alice, gender:1, isStudent:True, isWorker:False, age:35, eyeSight:5.000000, birthdate:1900-01-01, registerTime:2011-08-20 11:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,5], usedNames:[Aida], courseScoresPerTerm:[[10,8],[6,7,8]]})
2|(label:person, 0:4, {ID:7, fName:Elizabeth, gender:1, isStudent:False, isWorker:True, age:20, eyeSight:4.700000, birthdate:1980-10-26, registerTime:1976-12-23 11:21:42, lastJobDuration:48:24:11, workedHours:[2], usedNames:[Ein], courseScoresPerTerm:[[6],[7],[8]]})|(label:person, 0:0, {ID:0, fName:Alice, gender:1, isStudent:True, isWorker:False, age:35, eyeSight:5.000000, birthdate:1900-01-01, registerTime:2011-08-20 11:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,5], usedNames:[Aida], courseScoresPerTerm:[[10,8],[6,7,8]]})
3|(label:person, 0:5, {ID:8, fName:Farooq, gender:2, isStudent:True, isWorker:False, age:25, eyeSight:4.500000, birthdate:1980-10-26, registerTime:1972-07-31 13:22:30.678559, lastJobDuration:00:18:00.024, workedHours:[3,4,5,6,7], usedNames:[Fesdwe], courseScoresPerTerm:[[8]]})|(label:person, 0:0, {ID:0, fName:Alice, gender:1, isStudent:True, isWorker:False, age:35, eyeSight:5.000000, birthdate:1900-01-01, registerTime:2011-08-20 11:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,5], usedNames:[Aida], courseScoresPerTerm:[[10,8],[6,7,8]]})
3|(label:person, 0:6, {ID:9, fName:Greg, gender:2, isStudent:False, isWorker:False, age:40, eyeSight:4.900000, birthdate:1980-10-26, registerTime:1976-12-23 04:41:42, lastJobDuration:10 years 5 months 13:00:00.000024, workedHours:[1], usedNames:[Grad], courseScoresPerTerm:[[10]]})|(label:person, 0:0, {ID:0, fName:Alice, gender:1, isStudent:True, isWorker:False, age:35, eyeSight:5.000000, birthdate:1900-01-01, registerTime:2011-08-20 11:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,5], usedNames:[Aida], courseScoresPerTerm:[[10,8],[6,7,8]]})
3|(label:person, 0:7, {ID:10, fName:Hubert Blaine Wolfeschlegelsteinhausenbergerdorff, gender:2, isStudent:False, isWorker:True, age:83, eyeSight:4.900000, birthdate:1990-11-27, registerTime:2023-02-21 13:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,11,12,3,4,5,6,7], usedNames:[Ad,De,Hi,Kye,Orlan], courseScoresPerTerm:[[7],[10],[6,7]]})|(label:person, 0:0, {ID:0, fName:Alice, gender:1, isStudent:True, isWorker:False, age:35, eyeSight:5.000000, birthdate:1900-01-01, registerTime:2011-08-20 11:25:30, lastJobDuration:3 years 2 days 13:02:00, workedHours:[10,5], usedNames:[Aida], courseScoresPerTerm:[[10,8],[6,7,8]]})

-NAME SingleSourceSingleDestination
-QUERY MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' AND b.fName = 'Bob' RETURN a.fName, b.fName, length(r)
---- 1
Alice|Bob|1

-NAME SingleSourceAllDestinations2
-QUERY MATCH (a:person)-[r:knows* SHORTEST 1..2]->(b:person) WHERE a.fName = 'Elizabeth' RETURN a.fName, b.fName, r
---- 5
Elizabeth|Alice|[0:4,1:15,0:7,1:20,0:0]
Elizabeth|Dan|[0:4,1:15,0:7,1:21,0:3]
Elizabeth|Farooq|[0:4,1:13,0:5]
Elizabeth|Greg|[0:4,1:14,0:6]
Elizabeth|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|[0:4,1:15,0:7]

-NAME SingleSourceUnreachableDestination
-QUERY MATCH (a:person)-[r:knows* SHORTEST 1..30]->(b:person) WHERE a.fName = 'Alice' AND b.fName = 'Alice11' RETURN a.fName, b.fName, r
---- 0


-NAME MultipleSrcMultipleDstQuery
-QUERY MATCH (a:person)-[r:knows* SHORTEST 1..10]->(b:person) WHERE a.isStudent = true AND b.isWorker = true RETURN a.fName, b.fName, length(r)
---- 12
Alice|Carol|1
Alice|Dan|1
Alice|Elizabeth|2
Alice|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|3
Bob|Carol|1
Bob|Dan|1
Bob|Elizabeth|1
Bob|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|2
Farooq|Carol|3
Farooq|Dan|2
Farooq|Elizabeth|1
Farooq|Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|1

-NAME SSPWithExtend
-QUERY MATCH (c:person)<-[:knows* SHORTEST 1..30]-(a:person)-[r:knows* SHORTEST 1..30]->(b:person), (b)-[:knows]->(c) WHERE a.fName = 'Alice' AND b.ID < 6 AND c.ID > 5 RETURN a.fName, b.fName, c.fName
---- 1
Alice|Bob|Elizabeth

-NAME MultiPart
-QUERY MATCH (a)-[r:knows* SHORTEST 1..30]->(b:person) WHERE b.ID > 6 AND a.fName = 'Alice' WITH a, b, r MATCH (c:person)<-[:knows]-(a:person) RETURN b.fName, length(r), COUNT(*)
---- 4
Elizabeth|2|3
Farooq|3|3
Greg|3|3
Hubert Blaine Wolfeschlegelsteinhausenbergerdorff|3|3
Loading

0 comments on commit dd0889c

Please sign in to comment.