Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Add support for WKB and WKT attributes and dimensions #3210

Merged
merged 1 commit into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions libtiledbsoma/src/soma/column_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,17 @@ size_t ColumnBuffer::update_size(const Query& query) {
return num_cells_;
}

/**
 * Split the column's variable-length data into one byte buffer per cell.
 *
 * Cell `i` is copied from the byte range `[offsets_[i], offsets_[i + 1])`
 * of `data_`, so the loop reads `offsets_[0]` through `offsets_[num_cells_]`.
 *
 * @return std::vector<std::vector<std::byte>> `num_cells_` buffers, in cell
 * order.
 */
std::vector<std::vector<std::byte>> ColumnBuffer::binaries() {
    std::vector<std::vector<std::byte>> result;
    // The final size is known up front; reserve once instead of regrowing.
    result.reserve(num_cells_);

    const auto* base = data_.data();
    for (size_t i = 0; i < num_cells_; i++) {
        // Construct each buffer in place from its [begin, end) byte range
        // rather than building a temporary vector and moving it in.
        result.emplace_back(base + offsets_[i], base + offsets_[i + 1]);
    }

    return result;
}

std::vector<std::string> ColumnBuffer::strings() {
std::vector<std::string> result;

Expand Down
7 changes: 7 additions & 0 deletions libtiledbsoma/src/soma/column_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,13 @@ class ColumnBuffer {
return tcb::span<T>((T*)data_.data(), num_cells_);
}

/**
* @brief Return data in a vector of binary buffers, one buffer per cell.
*
* @return std::vector<std::vector<std::byte>>
*/
std::vector<std::vector<std::byte>> binaries();

/**
* @brief Return data in a vector of strings.
*
Expand Down
10 changes: 10 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,8 @@ bool SOMAArray::_cast_column(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
return _cast_column_aux<std::string>(schema, array, se);
case TILEDB_BOOL:
return _cast_column_aux<bool>(schema, array, se);
Expand Down Expand Up @@ -477,6 +479,8 @@ void SOMAArray::_promote_indexes_to_values(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
return _cast_dictionary_values<std::string>(schema, array);
case TILEDB_BOOL:
return _cast_dictionary_values<bool>(schema, array);
Expand Down Expand Up @@ -784,6 +788,8 @@ bool SOMAArray::_extend_enumeration(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
return _extend_and_evolve_schema<std::string>(
value_schema, value_array, index_schema, index_array, se);
case TILEDB_INT8:
Expand Down Expand Up @@ -1261,6 +1267,8 @@ ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) {

case TILEDB_STRING_ASCII:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
child = ArrowAdapter::make_arrow_array_child_string(
_core_domainish_slot_string(core_dim.name(), which_kind));
break;
Expand Down Expand Up @@ -1750,6 +1758,8 @@ void SOMAArray::_set_soma_joinid_shape_helper(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
// TODO: make these named constants b/c they're shared
// with arrow_adapter.
ndrect.set_range(dim_name, "", "\xff");
Expand Down
2 changes: 1 addition & 1 deletion libtiledbsoma/src/soma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ void SOMADataFrame::update_dataframe_schema(
attr_name,
ArrowAdapter::to_tiledb_format(attr_type));

if (ArrowAdapter::arrow_is_string_type(attr_type.c_str())) {
if (ArrowAdapter::arrow_is_var_length_type(attr_type.c_str())) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

Expand Down
3 changes: 3 additions & 0 deletions libtiledbsoma/src/soma/soma_dense_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ void SOMADenseNDArray::create(
schema->format = strdup("+s");
schema->n_children = index_column_size + 1;
schema->dictionary = nullptr;
schema->metadata = nullptr;
schema->flags = 0;
schema->release = &ArrowAdapter::release_schema;
schema->children = new ArrowSchema*[schema->n_children];
Expand All @@ -64,6 +65,7 @@ void SOMADenseNDArray::create(
dim->name = strdup(
std::string("soma_dim_" + std::to_string(dim_idx)).c_str());
dim->n_children = 0;
dim->metadata = nullptr;
dim->dictionary = nullptr;
dim->release = &ArrowAdapter::release_schema;
index_column_names.push_back(dim->name);
Expand All @@ -75,6 +77,7 @@ void SOMADenseNDArray::create(
attr->n_children = 0;
attr->flags = 0; // or ARROW_FLAG_NULLABLE;
attr->dictionary = nullptr;
attr->metadata = nullptr;
attr->release = &ArrowAdapter::release_schema;

auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema(
Expand Down
3 changes: 3 additions & 0 deletions libtiledbsoma/src/soma/soma_sparse_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ void SOMASparseNDArray::create(
schema->format = strdup("+s");
schema->n_children = index_column_size + 1;
schema->dictionary = nullptr;
schema->metadata = nullptr;
schema->flags = 0;
schema->release = &ArrowAdapter::release_schema;
schema->children = new ArrowSchema*[schema->n_children];
Expand All @@ -66,6 +67,7 @@ void SOMASparseNDArray::create(
std::string("soma_dim_" + std::to_string(dim_idx)).c_str());
dim->n_children = 0;
dim->dictionary = nullptr;
dim->metadata = nullptr;
dim->release = &ArrowAdapter::release_schema;
index_column_names.push_back(dim->name);
}
Expand All @@ -76,6 +78,7 @@ void SOMASparseNDArray::create(
attr->n_children = 0;
attr->flags = 0;
attr->dictionary = nullptr;
attr->metadata = nullptr;
attr->release = &ArrowAdapter::release_schema;

auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema(
Expand Down
42 changes: 32 additions & 10 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,18 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(

for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) {
auto child = arrow_schema->children[sch_idx];
auto type = ArrowAdapter::to_tiledb_format(child->format);
std::string_view type_metadata;

if (ArrowMetadataHasKey(child->metadata, ArrowCharView("dtype"))) {
ArrowStringView out;
NANOARROW_THROW_NOT_OK(ArrowMetadataGetValue(
child->metadata, ArrowCharView("dtype"), &out));

type_metadata = std::string_view(out.data, out.size_bytes);
}

auto type = ArrowAdapter::to_tiledb_format(
child->format, type_metadata);

LOG_DEBUG(fmt::format(
"[ArrowAdapter] schema pass for child {} name {}",
Expand All @@ -971,7 +982,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
auto schild = index_column_schema->children[i];
auto col_name = schild->name;
if (strcmp(child->name, col_name) == 0) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
if (ArrowAdapter::arrow_is_var_length_type(child->format)) {
type = TILEDB_STRING_ASCII;
}

Expand Down Expand Up @@ -1010,7 +1021,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
attr.set_nullable(true);
}

if (ArrowAdapter::arrow_is_string_type(child->format)) {
if (ArrowAdapter::arrow_is_var_length_type(child->format)) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

Expand All @@ -1021,7 +1032,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
*ctx,
child->name,
enmr_type,
ArrowAdapter::arrow_is_string_type(enmr_format) ?
ArrowAdapter::arrow_is_var_length_type(enmr_format) ?
TILEDB_VAR_NUM :
1,
child->flags & ARROW_FLAG_DICTIONARY_ORDERED);
Expand Down Expand Up @@ -1070,7 +1081,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
continue;
}

if (ArrowAdapter::arrow_is_string_type(child->format)) {
if (ArrowAdapter::arrow_is_var_length_type(child->format)) {
// In the core API:
//
// * domain for strings must be set as (nullptr, nullptr)
Expand Down Expand Up @@ -1399,7 +1410,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
return std::pair(std::move(array), std::move(schema));
}

bool ArrowAdapter::arrow_is_string_type(const char* format) {
bool ArrowAdapter::arrow_is_var_length_type(const char* format) {
return (
(strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0));
Expand All @@ -1419,8 +1430,8 @@ std::string_view ArrowAdapter::to_arrow_format(
{TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"},
{TILEDB_BOOL, "b"}, {TILEDB_DATETIME_SEC, "tss:"},
{TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"},
{TILEDB_DATETIME_NS, "tsn:"},
};
{TILEDB_DATETIME_NS, "tsn:"}, {TILEDB_GEOM_WKB, z},
{TILEDB_GEOM_WKT, u}};

try {
return _to_arrow_format_map.at(tiledb_dtype);
Expand All @@ -1431,7 +1442,8 @@ std::string_view ArrowAdapter::to_arrow_format(
}
}

tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) {
tiledb_datatype_t ArrowAdapter::to_tiledb_format(
std::string_view arrow_dtype, std::string_view arrow_dtype_metadata) {
std::map<std::string_view, tiledb_datatype_t> _to_tiledb_format_map = {
{"u", TILEDB_STRING_UTF8}, {"U", TILEDB_STRING_UTF8},
{"z", TILEDB_CHAR}, {"Z", TILEDB_CHAR},
Expand All @@ -1446,7 +1458,17 @@ tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) {
};

try {
return _to_tiledb_format_map.at(arrow_dtype);
auto dtype = _to_tiledb_format_map.at(arrow_dtype);

if (dtype == TILEDB_CHAR && arrow_dtype_metadata.compare("WKB") == 0) {
dtype = TILEDB_GEOM_WKB;
} else if (
dtype == TILEDB_STRING_UTF8 &&
arrow_dtype_metadata.compare("WKT") == 0) {
dtype = TILEDB_GEOM_WKT;
}

return dtype;
} catch (const std::out_of_range& e) {
throw std::out_of_range(fmt::format(
"ArrowAdapter: Unsupported Arrow type: {} ", arrow_dtype));
Expand Down
8 changes: 6 additions & 2 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,15 +275,19 @@ class ArrowAdapter {
* @param const char* Arrow data format
* @return bool Whether the Arrow type represents a variable-length (string or binary) type
*/
static bool arrow_is_string_type(const char* format);
static bool arrow_is_var_length_type(const char* format);

/**
* @brief Get TileDB datatype from Arrow format string.
*
* @param arrow_dtype Arrow format string.
* @param arrow_dtype_metadata Additional datatype info. Useful for
* differentiating between BLOB and WKB.
* @return tiledb_datatype_t TileDB datatype.
*/
static tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype);
static tiledb_datatype_t to_tiledb_format(
std::string_view arrow_dtype,
std::string_view arrow_dtype_metadata = {});

static enum ArrowType to_nanoarrow_type(std::string_view sv);

Expand Down
Loading