Skip to content

Commit

Permalink
Merge pull request #2247 from kuzudb/rdf-literal
Browse files Browse the repository at this point in the history
Add rdf literal and literal triples table
  • Loading branch information
andyfengHKU authored Oct 26, 2023
2 parents fdbcfea + 6aa24bf commit f91a957
Show file tree
Hide file tree
Showing 52 changed files with 1,031 additions and 569 deletions.
6 changes: 4 additions & 2 deletions dataset/copy-test/rdf/copy.cypher
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
COPY taxonomy_RESOURCE FROM "dataset/copy-test/rdf/taxonomy.ttl" ;
COPY taxonomy_TRIPLES FROM "dataset/copy-test/rdf/taxonomy.ttl" ;
COPY taxonomy_resource_t FROM "dataset/copy-test/rdf/taxonomy.ttl" ;
COPY taxonomy_literal_t FROM "dataset/copy-test/rdf/taxonomy.ttl" ;
COPY taxonomy_resource_triples_t FROM "dataset/copy-test/rdf/taxonomy.ttl" ;
COPY taxonomy_literal_triples_t FROM "dataset/copy-test/rdf/taxonomy.ttl" ;
2 changes: 0 additions & 2 deletions dataset/rdf/copy.cypher

This file was deleted.

4 changes: 4 additions & 0 deletions dataset/rdf/rdfox_example/copy.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
COPY example_resource_t FROM "dataset/rdf/rdfox_example/data.ttl";
COPY example_literal_t FROM "dataset/rdf/rdfox_example/data.ttl";
COPY example_resource_triples_t FROM "dataset/rdf/rdfox_example/data.ttl";
COPY example_literal_triples_t FROM "dataset/rdf/rdfox_example/data.ttl";
29 changes: 29 additions & 0 deletions dataset/rdf/rdfox_example/data.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
@prefix : <https://rdfox.com/getting-started/> .

:peter :forename "Peter" ;
a :Person ;
:marriedTo :lois ;
:gender "male" .

:lois :forename "Lois" ;
a :Person ;
:gender "female" .

:meg :forename "Meg" ;
a :Person ;
:hasParent :lois, :peter ;
:gender "female" .

:chris :forename "Chris" ;
a :Person ;
:hasParent :peter ;
:gender "male" .

:stewie :forename "Stewie" ;
a :Person ;
:hasParent :lois ;
:gender "male" .

:brian :forename "Brian" . # Brian is a dog

:andy :age 12 .
1 change: 1 addition & 0 deletions dataset/rdf/rdfox_example/schema.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE RDF GRAPH example;
1 change: 0 additions & 1 deletion dataset/rdf/schema.cypher

This file was deleted.

9 changes: 0 additions & 9 deletions dataset/rdf/tiny.ttl

This file was deleted.

104 changes: 54 additions & 50 deletions src/binder/bind/bind_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "common/string_format.h"
#include "common/table_type.h"
#include "parser/copy.h"
#include "storage/storage_manager.h"

using namespace kuzu::binder;
using namespace kuzu::catalog;
Expand Down Expand Up @@ -95,7 +96,11 @@ std::unique_ptr<BoundStatement> Binder::bindCopyFromClause(const Statement& stat
}
switch (tableSchema->tableType) {
case TableType::NODE:
return bindCopyNodeFrom(std::move(readerConfig), tableSchema);
if (readerConfig->fileType == FileType::TURTLE) {
return bindCopyRdfNodeFrom(std::move(readerConfig), tableSchema);
} else {
return bindCopyNodeFrom(std::move(readerConfig), tableSchema);
}
case TableType::REL: {
if (readerConfig->fileType == FileType::TURTLE) {
return bindCopyRdfRelFrom(std::move(readerConfig), tableSchema);
Expand All @@ -118,7 +123,33 @@ std::unique_ptr<BoundStatement> Binder::bindCopyNodeFrom(
auto boundFileScanInfo = std::make_unique<BoundFileScanInfo>(
std::move(readerConfig), std::move(columns), std::move(nodeID), TableType::NODE);
auto boundCopyFromInfo = std::make_unique<BoundCopyFromInfo>(
tableSchema, std::move(boundFileScanInfo), containsSerial, nullptr);
tableSchema, std::move(boundFileScanInfo), containsSerial, nullptr /* extraInfo */);
return std::make_unique<BoundCopyFrom>(std::move(boundCopyFromInfo));
}

std::unique_ptr<BoundStatement> Binder::bindCopyRdfNodeFrom(
std::unique_ptr<ReaderConfig> readerConfig, TableSchema* tableSchema) {
auto containsSerial = bindContainsSerial(tableSchema);
auto stringType = LogicalType{LogicalTypeID::STRING};
auto nodeID =
createVariable(std::string(Property::INTERNAL_ID_NAME), LogicalTypeID::INTERNAL_ID);
expression_vector columns;
auto columnName = std::string(RDFKeyword::ANONYMOUS);
readerConfig->columnNames.push_back(columnName);
readerConfig->columnTypes.push_back(stringType.copy());
columns.push_back(createVariable(columnName, stringType));
if (tableSchema->tableName.ends_with(common::RDFKeyword::RESOURCE_TABLE_SUFFIX)) {
readerConfig->rdfReaderConfig =
std::make_unique<RdfReaderConfig>(RdfReaderMode::RESOURCE, nullptr /* index */);
} else {
assert(tableSchema->tableName.ends_with(common::RDFKeyword::LITERAL_TABLE_SUFFIX));
readerConfig->rdfReaderConfig =
std::make_unique<RdfReaderConfig>(RdfReaderMode::LITERAL, nullptr /* index */);
}
auto boundFileScanInfo = std::make_unique<BoundFileScanInfo>(
std::move(readerConfig), std::move(columns), std::move(nodeID), TableType::NODE);
auto boundCopyFromInfo = std::make_unique<BoundCopyFromInfo>(
tableSchema, std::move(boundFileScanInfo), containsSerial, nullptr /* extraInfo */);
return std::make_unique<BoundCopyFrom>(std::move(boundCopyFromInfo));
}

Expand Down Expand Up @@ -149,33 +180,32 @@ std::unique_ptr<BoundStatement> Binder::bindCopyRelFrom(
return std::make_unique<BoundCopyFrom>(std::move(boundCopyFromInfo));
}

static constexpr std::string_view RDF_SUBJECT = "_SUBJECT";
static constexpr std::string_view RDF_PREDICATE = "_PREDICATE";
static constexpr std::string_view RDF_OBJECT = "_OBJECT";
static constexpr std::string_view RDF_SUBJECT_OFFSET = "_SUBJECT_OFFSET";
static constexpr std::string_view RDF_PREDICATE_OFFSET = "_PREDICATE_OFFSET";
static constexpr std::string_view RDF_OBJECT_OFFSET = "_OBJECT_OFFSET";

std::unique_ptr<BoundStatement> Binder::bindCopyRdfRelFrom(
std::unique_ptr<ReaderConfig> readerConfig, TableSchema* tableSchema) {
auto columns = bindExpectedRelFileColumns(tableSchema, *readerConfig);
auto subjectKey = columns[0];
auto predicateKey = columns[1];
auto objectKey = columns[2];
auto containsSerial = bindContainsSerial(tableSchema);
auto offsetType = std::make_unique<LogicalType>(LogicalTypeID::ARROW_COLUMN);
expression_vector columns;
for (auto i = 0u; i < 3; ++i) {
auto columnName = std::string(RDFKeyword::ANONYMOUS) + std::to_string(i);
readerConfig->columnNames.push_back(columnName);
readerConfig->columnTypes.push_back(offsetType->copy());
columns.push_back(createVariable(columnName, *offsetType));
}
auto relTableSchema = reinterpret_cast<RelTableSchema*>(tableSchema);
auto resourceTableID = relTableSchema->getSrcTableID();
auto index = storageManager->getNodesStore().getPKIndex(resourceTableID);
if (tableSchema->tableName.ends_with(common::RDFKeyword::RESOURCE_TRIPLE_TABLE_SUFFIX)) {
readerConfig->rdfReaderConfig =
std::make_unique<RdfReaderConfig>(RdfReaderMode::RESOURCE_TRIPLE, index);
} else {
readerConfig->rdfReaderConfig =
std::make_unique<RdfReaderConfig>(RdfReaderMode::LITERAL_TRIPLE, index);
}
auto relID =
createVariable(std::string(Property::INTERNAL_ID_NAME), LogicalTypeID::INTERNAL_ID);
auto containsSerial = false;
auto boundFileScanInfo = std::make_unique<BoundFileScanInfo>(
std::move(readerConfig), std::move(columns), std::move(relID), TableType::REL);
auto relTableSchema = reinterpret_cast<RelTableSchema*>(tableSchema);
assert(relTableSchema->getSrcTableID() == relTableSchema->getDstTableID());
auto nodeTableID = relTableSchema->getSrcTableID();
auto arrowColumnType = LogicalType{LogicalTypeID::ARROW_COLUMN};
auto subjectOffset = createVariable(std::string(RDF_SUBJECT_OFFSET), arrowColumnType);
auto predicateOffset = createVariable(std::string(RDF_PREDICATE_OFFSET), arrowColumnType);
auto objectOffset = createVariable(std::string(RDF_OBJECT_OFFSET), arrowColumnType);
auto extraInfo = std::make_unique<ExtraBoundCopyRdfRelInfo>(nodeTableID, subjectOffset,
predicateOffset, objectOffset, subjectKey, predicateKey, objectKey);
std::move(readerConfig), columns, std::move(relID), TableType::REL);
auto extraInfo = std::make_unique<ExtraBoundCopyRdfRelInfo>(columns[0], columns[1], columns[2]);
auto boundCopyFromInfo = std::make_unique<BoundCopyFromInfo>(
tableSchema, std::move(boundFileScanInfo), containsSerial, std::move(extraInfo));
return std::make_unique<BoundCopyFrom>(std::move(boundCopyFromInfo));
Expand All @@ -192,14 +222,6 @@ expression_vector Binder::bindExpectedNodeFileColumns(
std::vector<std::string> expectedColumnNames;
std::vector<std::unique_ptr<common::LogicalType>> expectedColumnTypes;
switch (readerConfig.fileType) {
case FileType::TURTLE: {
auto stringType = LogicalType{LogicalTypeID::STRING};
expectedColumnNames = {
std::string(RDF_SUBJECT), std::string(RDF_PREDICATE), std::string(RDF_OBJECT)};
for (auto _ : expectedColumnNames) {
expectedColumnTypes.push_back(stringType.copy());
}
} break;
case FileType::NPY:
case FileType::PARQUET:
case FileType::CSV: {
Expand All @@ -215,10 +237,6 @@ expression_vector Binder::bindExpectedNodeFileColumns(
throw NotImplementedException{"Binder::bindCopyNodeColumns"};
}
}
if (readerConfig.fileType == common::FileType::TURTLE) {
// Nothing to validate for turtle
return createColumnExpressions(readerConfig, expectedColumnNames, expectedColumnTypes);
}
// Detect columns from file.
std::vector<std::string> detectedColumnNames;
std::vector<std::unique_ptr<common::LogicalType>> detectedColumnTypes;
Expand All @@ -237,16 +255,6 @@ expression_vector Binder::bindExpectedRelFileColumns(
auto relTableSchema = reinterpret_cast<RelTableSchema*>(tableSchema);
expression_vector columns;
switch (readerConfig.fileType) {
case FileType::TURTLE: {
auto stringType = LogicalType{LogicalTypeID::STRING};
auto columnNames = std::vector<std::string>{
std::string(RDF_SUBJECT), std::string(RDF_PREDICATE), std::string(RDF_OBJECT)};
for (auto& columnName : columnNames) {
readerConfig.columnNames.push_back(columnName);
readerConfig.columnTypes.push_back(stringType.copy());
columns.push_back(createVariable(columnName, stringType));
}
} break;
case FileType::CSV:
case FileType::PARQUET:
case FileType::NPY: {
Expand Down Expand Up @@ -280,10 +288,6 @@ expression_vector Binder::bindExpectedRelFileColumns(
throw NotImplementedException{"Binder::bindCopyRelColumns"};
}
}
if (readerConfig.fileType == common::FileType::TURTLE) {
// Nothing to validate for turtle
return columns;
}
// Detect columns from file.
std::vector<std::string> detectedColumnNames;
std::vector<std::unique_ptr<common::LogicalType>> detectedColumnTypes;
Expand Down
77 changes: 54 additions & 23 deletions src/binder/bind/bind_ddl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,37 +149,68 @@ std::unique_ptr<BoundCreateTableInfo> Binder::bindCreateRelTableGroupInfo(
TableType::REL_GROUP, info->tableName, std::move(boundExtraInfo));
}

static inline std::string getRdfNodeTableName(const std::string& rdfName) {
return rdfName + common::RDFKeyword::NODE_TABLE_SUFFIX;
// Returns the resource table name for RDF graph `rdfName`: the graph name
// followed by RDFKeyword::RESOURCE_TABLE_SUFFIX.
static std::string getRdfResourceTableName(const std::string& rdfName) {
    std::string tableName = rdfName;
    tableName.append(std::string(RDFKeyword::RESOURCE_TABLE_SUFFIX));
    return tableName;
}

static inline std::string getRdfRelTableName(const std::string& rdfName) {
return rdfName + common::RDFKeyword::REL_TABLE_SUFFIX;
// Returns the literal table name for RDF graph `rdfName`: the graph name
// followed by RDFKeyword::LITERAL_TABLE_SUFFIX.
static std::string getRdfLiteralTableName(const std::string& rdfName) {
    std::string tableName = rdfName;
    tableName.append(std::string(RDFKeyword::LITERAL_TABLE_SUFFIX));
    return tableName;
}

// Returns the resource-triple table name for RDF graph `rdfName`: the graph
// name followed by RDFKeyword::RESOURCE_TRIPLE_TABLE_SUFFIX.
// Declared plain `static` to match the other RDF table-name helpers in this
// file; `inline` adds nothing for a file-local function.
static std::string getRdfResourceTripleTableName(const std::string& rdfName) {
    return rdfName + std::string(RDFKeyword::RESOURCE_TRIPLE_TABLE_SUFFIX);
}

// Returns the literal-triple table name for RDF graph `rdfName`: the graph
// name followed by RDFKeyword::LITERAL_TRIPLE_TABLE_SUFFIX.
static std::string getRdfLiteralTripleTableName(const std::string& rdfName) {
    std::string tableName = rdfName;
    tableName.append(std::string(RDFKeyword::LITERAL_TRIPLE_TABLE_SUFFIX));
    return tableName;
}

// Binds CREATE RDF GRAPH: returns a BoundCreateTableInfo of type TableType::RDF,
// named after info->tableName, whose extra info bundles the create infos of the
// graph's child tables. As written below, the child tables are:
//   - resource node table: a single STRING property (RDFKeyword::IRI), primary key index 0;
//   - literal node table: a SERIAL RDFKeyword::ID property (primary key index 0) plus a
//     STRING RDFKeyword::IRI property;
//   - resource-triple and literal-triple rel tables: RelMultiplicity::MANY_MANY with a single
//     INT64 RDFKeyword::PREDICT_ID property; src/dst table IDs are INVALID_TABLE_ID here
//     (presumably filled in when the tables are actually created — confirm).
// NOTE(review): this listing looks like a rendered diff with the +/- markers stripped, so the
// earlier single node/rel table statements and the newer four-table statements appear back to
// back (including two argument lists for BoundExtraCreateRdfGraphInfo). Confirm against the
// repository which lines are live before relying on this text.
std::unique_ptr<BoundCreateTableInfo> Binder::bindCreateRdfGraphInfo(const CreateTableInfo* info) {
auto rdfGraphName = info->tableName;
auto stringType = std::make_unique<LogicalType>(LogicalTypeID::STRING);
// RDF node (resource) table
auto nodeTableName = getRdfNodeTableName(rdfGraphName);
std::vector<std::unique_ptr<Property>> nodeProperties;
nodeProperties.push_back(
std::make_unique<Property>(common::RDFKeyword::IRI, stringType->copy()));
auto boundNodeExtraInfo = std::make_unique<BoundExtraCreateNodeTableInfo>(
0 /* primaryKeyIdx */, std::move(nodeProperties));
auto boundNodeCreateInfo = std::make_unique<BoundCreateTableInfo>(
TableType::NODE, nodeTableName, std::move(boundNodeExtraInfo));
// RDF rel (triples) table
auto relTableName = getRdfRelTableName(rdfGraphName);
std::vector<std::unique_ptr<Property>> relProperties;
relProperties.push_back(std::make_unique<Property>(
common::RDFKeyword::PREDICT_ID, std::make_unique<LogicalType>(LogicalTypeID::INTERNAL_ID)));
auto boundRelExtraInfo = std::make_unique<BoundExtraCreateRelTableInfo>(
RelMultiplicity::MANY_MANY, INVALID_TABLE_ID, INVALID_TABLE_ID, std::move(relProperties));
auto boundRelCreateInfo = std::make_unique<BoundCreateTableInfo>(
TableType::REL, relTableName, std::move(boundRelExtraInfo));
auto serialType = std::make_unique<LogicalType>(LogicalTypeID::SERIAL);
// Resource table.
auto resourceTableName = getRdfResourceTableName(rdfGraphName);
std::vector<std::unique_ptr<Property>> resourceProperties;
resourceProperties.push_back(std::make_unique<Property>(RDFKeyword::IRI, stringType->copy()));
auto resourceExtraInfo = std::make_unique<BoundExtraCreateNodeTableInfo>(
0 /* primaryKeyIdx */, std::move(resourceProperties));
auto resourceCreateInfo = std::make_unique<BoundCreateTableInfo>(
TableType::NODE, resourceTableName, std::move(resourceExtraInfo));
// Literal table.
auto literalTableName = getRdfLiteralTableName(rdfGraphName);
std::vector<std::unique_ptr<Property>> literalProperties;
literalProperties.push_back(std::make_unique<Property>(RDFKeyword::ID, serialType->copy()));
literalProperties.push_back(
std::make_unique<Property>(std::string(RDFKeyword::IRI), stringType->copy()));
auto literalExtraInfo = std::make_unique<BoundExtraCreateNodeTableInfo>(
0 /* primaryKeyIdx */, std::move(literalProperties));
auto literalCreateInfo = std::make_unique<BoundCreateTableInfo>(
TableType::NODE, literalTableName, std::move(literalExtraInfo));
// Resource triple table.
auto resourceTripleTableName = getRdfResourceTripleTableName(rdfGraphName);
std::vector<std::unique_ptr<Property>> resourceTripleProperties;
resourceTripleProperties.push_back(std::make_unique<Property>(
common::RDFKeyword::PREDICT_ID, std::make_unique<LogicalType>(LogicalTypeID::INT64)));
auto boundResourceTripleExtraInfo =
std::make_unique<BoundExtraCreateRelTableInfo>(RelMultiplicity::MANY_MANY, INVALID_TABLE_ID,
INVALID_TABLE_ID, std::move(resourceTripleProperties));
auto boundResourceTripleCreateInfo = std::make_unique<BoundCreateTableInfo>(
TableType::REL, resourceTripleTableName, std::move(boundResourceTripleExtraInfo));
// Literal triple table.
auto literalTripleTableName = getRdfLiteralTripleTableName(rdfGraphName);
std::vector<std::unique_ptr<Property>> literalTripleProperties;
literalTripleProperties.push_back(std::make_unique<Property>(
common::RDFKeyword::PREDICT_ID, std::make_unique<LogicalType>(LogicalTypeID::INT64)));
auto boundLiteralTripleExtraInfo =
std::make_unique<BoundExtraCreateRelTableInfo>(RelMultiplicity::MANY_MANY, INVALID_TABLE_ID,
INVALID_TABLE_ID, std::move(literalTripleProperties));
auto boundLiteralTripleCreateInfo = std::make_unique<BoundCreateTableInfo>(
TableType::REL, literalTripleTableName, std::move(boundLiteralTripleExtraInfo));
// Rdf table.
auto boundExtraInfo = std::make_unique<BoundExtraCreateRdfGraphInfo>(
std::move(boundNodeCreateInfo), std::move(boundRelCreateInfo));
std::move(resourceCreateInfo), std::move(literalCreateInfo),
std::move(boundResourceTripleCreateInfo), std::move(boundLiteralTripleCreateInfo));
return std::make_unique<BoundCreateTableInfo>(
TableType::RDF, rdfGraphName, std::move(boundExtraInfo));
}
Expand Down
6 changes: 4 additions & 2 deletions src/binder/bind/bind_graph_pattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,8 @@ std::vector<table_id_t> Binder::getNodeTableIDs(const std::vector<table_id_t>& t
for (auto& tableID : tableIDs) {
auto rdfGraphSchema =
reinterpret_cast<RdfGraphSchema*>(readVersion->getTableSchema(tableID));
result.push_back(rdfGraphSchema->getNodeTableID());
result.push_back(rdfGraphSchema->getResourceTableID());
result.push_back(rdfGraphSchema->getLiteralTableID());
}
return result;
}
Expand All @@ -603,7 +604,8 @@ std::vector<table_id_t> Binder::getRelTableIDs(const std::vector<table_id_t>& ta
for (auto& tableID : tableIDs) {
auto rdfGraphSchema =
reinterpret_cast<RdfGraphSchema*>(readVersion->getTableSchema(tableID));
result.push_back(rdfGraphSchema->getRelTableID());
result.push_back(rdfGraphSchema->getResourceTripleTableID());
result.push_back(rdfGraphSchema->getLiteralTripleTableID());
}
return result;
}
Expand Down
Loading

0 comments on commit f91a957

Please sign in to comment.