Skip to content

Commit

Permalink
Add support for WKB and WKT attributes and dimensions (#3210)
Browse files Browse the repository at this point in the history
  • Loading branch information
XanthosXanthopoulos authored Oct 21, 2024
1 parent 003fd1d commit 928c281
Show file tree
Hide file tree
Showing 8 changed files with 73 additions and 13 deletions.
11 changes: 11 additions & 0 deletions libtiledbsoma/src/soma/column_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,17 @@ size_t ColumnBuffer::update_size(const Query& query) {
return num_cells_;
}

/**
 * @brief Copy every variable-length cell into its own byte buffer.
 *
 * Cell i is the half-open byte range [offsets_[i], offsets_[i + 1]) of
 * data_.
 *
 * NOTE(review): the last iteration reads offsets_[num_cells_], so offsets_
 * is assumed to hold num_cells_ + 1 entries -- confirm against the code
 * that fills offsets_.
 *
 * @return std::vector<std::vector<std::byte>> One buffer per cell, in cell
 * order.
 */
std::vector<std::vector<std::byte>> ColumnBuffer::binaries() {
    std::vector<std::vector<std::byte>> result;
    // The final element count is known up front; reserve once instead of
    // reallocating (and moving every inner vector) as the result grows.
    result.reserve(num_cells_);

    for (size_t i = 0; i < num_cells_; i++) {
        // Forward the iterator pair so the inner vector is constructed in
        // place rather than built as a temporary and then moved.
        result.emplace_back(
            data_.data() + offsets_[i], data_.data() + offsets_[i + 1]);
    }

    return result;
}

std::vector<std::string> ColumnBuffer::strings() {
std::vector<std::string> result;

Expand Down
7 changes: 7 additions & 0 deletions libtiledbsoma/src/soma/column_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,13 @@ class ColumnBuffer {
return tcb::span<T>((T*)data_.data(), num_cells_);
}

/**
* @brief Return data in a vector of binary buffers, one buffer per cell.
*
* @return std::vector<std::vector<std::byte>>
*/
std::vector<std::vector<std::byte>> binaries();

/**
* @brief Return data in a vector of strings.
*
Expand Down
10 changes: 10 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,8 @@ bool SOMAArray::_cast_column(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
return _cast_column_aux<std::string>(schema, array, se);
case TILEDB_BOOL:
return _cast_column_aux<bool>(schema, array, se);
Expand Down Expand Up @@ -477,6 +479,8 @@ void SOMAArray::_promote_indexes_to_values(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
return _cast_dictionary_values<std::string>(schema, array);
case TILEDB_BOOL:
return _cast_dictionary_values<bool>(schema, array);
Expand Down Expand Up @@ -784,6 +788,8 @@ bool SOMAArray::_extend_enumeration(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
return _extend_and_evolve_schema<std::string>(
value_schema, value_array, index_schema, index_array, se);
case TILEDB_INT8:
Expand Down Expand Up @@ -1261,6 +1267,8 @@ ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) {

case TILEDB_STRING_ASCII:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
child = ArrowAdapter::make_arrow_array_child_string(
_core_domainish_slot_string(core_dim.name(), which_kind));
break;
Expand Down Expand Up @@ -1750,6 +1758,8 @@ void SOMAArray::_set_soma_joinid_shape_helper(
case TILEDB_STRING_ASCII:
case TILEDB_STRING_UTF8:
case TILEDB_CHAR:
case TILEDB_GEOM_WKB:
case TILEDB_GEOM_WKT:
// TODO: make these named constants b/c they're shared
// with arrow_adapter.
ndrect.set_range(dim_name, "", "\xff");
Expand Down
2 changes: 1 addition & 1 deletion libtiledbsoma/src/soma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ void SOMADataFrame::update_dataframe_schema(
attr_name,
ArrowAdapter::to_tiledb_format(attr_type));

if (ArrowAdapter::arrow_is_string_type(attr_type.c_str())) {
if (ArrowAdapter::arrow_is_var_length_type(attr_type.c_str())) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

Expand Down
3 changes: 3 additions & 0 deletions libtiledbsoma/src/soma/soma_dense_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ void SOMADenseNDArray::create(
schema->format = strdup("+s");
schema->n_children = index_column_size + 1;
schema->dictionary = nullptr;
schema->metadata = nullptr;
schema->flags = 0;
schema->release = &ArrowAdapter::release_schema;
schema->children = new ArrowSchema*[schema->n_children];
Expand All @@ -64,6 +65,7 @@ void SOMADenseNDArray::create(
dim->name = strdup(
std::string("soma_dim_" + std::to_string(dim_idx)).c_str());
dim->n_children = 0;
dim->metadata = nullptr;
dim->dictionary = nullptr;
dim->release = &ArrowAdapter::release_schema;
index_column_names.push_back(dim->name);
Expand All @@ -75,6 +77,7 @@ void SOMADenseNDArray::create(
attr->n_children = 0;
attr->flags = 0; // or ARROW_FLAG_NULLABLE;
attr->dictionary = nullptr;
attr->metadata = nullptr;
attr->release = &ArrowAdapter::release_schema;

auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema(
Expand Down
3 changes: 3 additions & 0 deletions libtiledbsoma/src/soma/soma_sparse_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ void SOMASparseNDArray::create(
schema->format = strdup("+s");
schema->n_children = index_column_size + 1;
schema->dictionary = nullptr;
schema->metadata = nullptr;
schema->flags = 0;
schema->release = &ArrowAdapter::release_schema;
schema->children = new ArrowSchema*[schema->n_children];
Expand All @@ -66,6 +67,7 @@ void SOMASparseNDArray::create(
std::string("soma_dim_" + std::to_string(dim_idx)).c_str());
dim->n_children = 0;
dim->dictionary = nullptr;
dim->metadata = nullptr;
dim->release = &ArrowAdapter::release_schema;
index_column_names.push_back(dim->name);
}
Expand All @@ -76,6 +78,7 @@ void SOMASparseNDArray::create(
attr->n_children = 0;
attr->flags = 0;
attr->dictionary = nullptr;
attr->metadata = nullptr;
attr->release = &ArrowAdapter::release_schema;

auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema(
Expand Down
42 changes: 32 additions & 10 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -957,7 +957,18 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(

for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) {
auto child = arrow_schema->children[sch_idx];
auto type = ArrowAdapter::to_tiledb_format(child->format);
std::string_view type_metadata;

if (ArrowMetadataHasKey(child->metadata, ArrowCharView("dtype"))) {
ArrowStringView out;
NANOARROW_THROW_NOT_OK(ArrowMetadataGetValue(
child->metadata, ArrowCharView("dtype"), &out));

type_metadata = std::string_view(out.data, out.size_bytes);
}

auto type = ArrowAdapter::to_tiledb_format(
child->format, type_metadata);

LOG_DEBUG(fmt::format(
"[ArrowAdapter] schema pass for child {} name {}",
Expand All @@ -971,7 +982,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
auto schild = index_column_schema->children[i];
auto col_name = schild->name;
if (strcmp(child->name, col_name) == 0) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
if (ArrowAdapter::arrow_is_var_length_type(child->format)) {
type = TILEDB_STRING_ASCII;
}

Expand Down Expand Up @@ -1010,7 +1021,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
attr.set_nullable(true);
}

if (ArrowAdapter::arrow_is_string_type(child->format)) {
if (ArrowAdapter::arrow_is_var_length_type(child->format)) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

Expand All @@ -1021,7 +1032,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
*ctx,
child->name,
enmr_type,
ArrowAdapter::arrow_is_string_type(enmr_format) ?
ArrowAdapter::arrow_is_var_length_type(enmr_format) ?
TILEDB_VAR_NUM :
1,
child->flags & ARROW_FLAG_DICTIONARY_ORDERED);
Expand Down Expand Up @@ -1070,7 +1081,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
continue;
}

if (ArrowAdapter::arrow_is_string_type(child->format)) {
if (ArrowAdapter::arrow_is_var_length_type(child->format)) {
// In the core API:
//
// * domain for strings must be set as (nullptr, nullptr)
Expand Down Expand Up @@ -1399,7 +1410,7 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
return std::pair(std::move(array), std::move(schema));
}

bool ArrowAdapter::arrow_is_string_type(const char* format) {
bool ArrowAdapter::arrow_is_var_length_type(const char* format) {
return (
(strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0));
Expand All @@ -1419,8 +1430,8 @@ std::string_view ArrowAdapter::to_arrow_format(
{TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"},
{TILEDB_BOOL, "b"}, {TILEDB_DATETIME_SEC, "tss:"},
{TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"},
{TILEDB_DATETIME_NS, "tsn:"},
};
// Geometry types have no Arrow format of their own: WKB is exported as
// binary ("z") and WKT as a UTF-8 string ("u").
{TILEDB_DATETIME_NS, "tsn:"}, {TILEDB_GEOM_WKB, "z"},
{TILEDB_GEOM_WKT, "u"}};

try {
return _to_arrow_format_map.at(tiledb_dtype);
Expand All @@ -1431,7 +1442,8 @@ std::string_view ArrowAdapter::to_arrow_format(
}
}

tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) {
tiledb_datatype_t ArrowAdapter::to_tiledb_format(
std::string_view arrow_dtype, std::string_view arrow_dtype_metadata) {
std::map<std::string_view, tiledb_datatype_t> _to_tiledb_format_map = {
{"u", TILEDB_STRING_UTF8}, {"U", TILEDB_STRING_UTF8},
{"z", TILEDB_CHAR}, {"Z", TILEDB_CHAR},
Expand All @@ -1446,7 +1458,17 @@ tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) {
};

try {
return _to_tiledb_format_map.at(arrow_dtype);
auto dtype = _to_tiledb_format_map.at(arrow_dtype);

if (dtype == TILEDB_CHAR && arrow_dtype_metadata.compare("WKB") == 0) {
dtype = TILEDB_GEOM_WKB;
} else if (
dtype == TILEDB_STRING_UTF8 &&
arrow_dtype_metadata.compare("WKT") == 0) {
dtype = TILEDB_GEOM_WKT;
}

return dtype;
} catch (const std::out_of_range& e) {
throw std::out_of_range(fmt::format(
"ArrowAdapter: Unsupported Arrow type: {} ", arrow_dtype));
Expand Down
8 changes: 6 additions & 2 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,15 +275,19 @@ class ArrowAdapter {
* @param const char* Arrow data format
* @return bool Whether the Arrow type represents a variable-length (string or binary) type
*/
static bool arrow_is_string_type(const char* format);
static bool arrow_is_var_length_type(const char* format);

/**
* @brief Get TileDB datatype from Arrow format string.
*
* @param arrow_dtype Arrow format string.
* @param arrow_dtype_metadata Additional datatype info. Useful for
* differentiating between BLOB and WKB.
* @return tiledb_datatype_t TileDB datatype.
*/
static tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype);
static tiledb_datatype_t to_tiledb_format(
std::string_view arrow_dtype,
std::string_view arrow_dtype_metadata = {});

static enum ArrowType to_nanoarrow_type(std::string_view sv);

Expand Down

0 comments on commit 928c281

Please sign in to comment.