diff --git a/libtiledbsoma/src/soma/column_buffer.cc b/libtiledbsoma/src/soma/column_buffer.cc index bafc76f11d..74fb4e47d8 100644 --- a/libtiledbsoma/src/soma/column_buffer.cc +++ b/libtiledbsoma/src/soma/column_buffer.cc @@ -252,6 +252,17 @@ size_t ColumnBuffer::update_size(const Query& query) { return num_cells_; } +std::vector> ColumnBuffer::binaries() { + std::vector> result; + + for (size_t i = 0; i < num_cells_; i++) { + result.emplace_back(std::vector( + data_.data() + offsets_[i], data_.data() + offsets_[i + 1])); + } + + return result; +} + std::vector ColumnBuffer::strings() { std::vector result; diff --git a/libtiledbsoma/src/soma/column_buffer.h b/libtiledbsoma/src/soma/column_buffer.h index 0200aeb651..cdfc8ace86 100644 --- a/libtiledbsoma/src/soma/column_buffer.h +++ b/libtiledbsoma/src/soma/column_buffer.h @@ -195,6 +195,13 @@ class ColumnBuffer { return tcb::span((T*)data_.data(), num_cells_); } + /** + * @brief Return data in a vector of binary buffers. + * + * @return std::vector> + */ + std::vector> binaries(); + /** * @brief Return data in a vector of strings. * diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 57f4550b85..b294b9c326 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -411,6 +411,8 @@ bool SOMAArray::_cast_column( case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: case TILEDB_CHAR: + case TILEDB_GEOM_WKB: + case TILEDB_GEOM_WKT: return _cast_column_aux(schema, array, se); case TILEDB_BOOL: return _cast_column_aux(schema, array, se); @@ -477,6 +479,8 @@ void SOMAArray::_promote_indexes_to_values( case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: case TILEDB_CHAR: + case TILEDB_GEOM_WKB: + case TILEDB_GEOM_WKT: return _cast_dictionary_values(schema, array); case TILEDB_BOOL: return _cast_dictionary_values(schema, array); @@ -784,6 +788,8 @@ bool SOMAArray::_extend_enumeration( case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: case TILEDB_CHAR: + case TILEDB_GEOM_WKB: + case TILEDB_GEOM_WKT: return _extend_and_evolve_schema( value_schema, value_array, index_schema, index_array, se); case TILEDB_INT8: @@ -1261,6 +1267,8 @@ ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) { case TILEDB_STRING_ASCII: case TILEDB_CHAR: + case TILEDB_GEOM_WKB: + case TILEDB_GEOM_WKT: child = ArrowAdapter::make_arrow_array_child_string( _core_domainish_slot_string(core_dim.name(), which_kind)); break; @@ -1750,6 +1758,8 @@ void SOMAArray::_set_soma_joinid_shape_helper( case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: case TILEDB_CHAR: + case TILEDB_GEOM_WKB: + case TILEDB_GEOM_WKT: // TODO: make these named constants b/c they're shared // with arrow_adapter. ndrect.set_range(dim_name, "", "\xff"); diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index bf70adb69f..d7feb8a552 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -108,7 +108,7 @@ void SOMADataFrame::update_dataframe_schema( attr_name, ArrowAdapter::to_tiledb_format(attr_type)); - if (ArrowAdapter::arrow_is_string_type(attr_type.c_str())) { + if (ArrowAdapter::arrow_is_var_length_type(attr_type.c_str())) { attr.set_cell_val_num(TILEDB_VAR_NUM); } diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index 63a16e31d4..13d3f2ee18 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -53,6 +53,7 @@ void SOMADenseNDArray::create( schema->format = strdup("+s"); schema->n_children = index_column_size + 1; schema->dictionary = nullptr; + schema->metadata = nullptr; schema->flags = 0; schema->release = &ArrowAdapter::release_schema; schema->children = new ArrowSchema*[schema->n_children]; @@ -64,6 +65,7 @@ void SOMADenseNDArray::create( dim->name = strdup( std::string("soma_dim_" + std::to_string(dim_idx)).c_str()); dim->n_children = 0; + dim->metadata = nullptr; dim->dictionary = nullptr; dim->release = &ArrowAdapter::release_schema; index_column_names.push_back(dim->name); @@ -75,6 +77,7 @@ void SOMADenseNDArray::create( attr->n_children = 0; attr->flags = 0; // or ARROW_FLAG_NULLABLE; attr->dictionary = nullptr; + attr->metadata = nullptr; attr->release = &ArrowAdapter::release_schema; auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index a1f0ad149f..f518ebe342 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -54,6 +54,7 @@ void SOMASparseNDArray::create( schema->format = strdup("+s"); schema->n_children = index_column_size + 1; schema->dictionary = nullptr; + schema->metadata = nullptr; schema->flags = 0; schema->release = &ArrowAdapter::release_schema; schema->children = new ArrowSchema*[schema->n_children]; @@ -66,6 +67,7 @@ void SOMASparseNDArray::create( std::string("soma_dim_" + std::to_string(dim_idx)).c_str()); dim->n_children = 0; dim->dictionary = nullptr; + dim->metadata = nullptr; dim->release = &ArrowAdapter::release_schema; index_column_names.push_back(dim->name); } @@ -76,6 +78,7 @@ void SOMASparseNDArray::create( attr->n_children = 0; attr->flags = 0; attr->dictionary = nullptr; + attr->metadata = nullptr; attr->release = &ArrowAdapter::release_schema; auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index d176c81ce0..f35b82981e 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -957,7 +957,18 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( for (int64_t sch_idx = 0; sch_idx < arrow_schema->n_children; ++sch_idx) { auto child = arrow_schema->children[sch_idx]; - auto type = ArrowAdapter::to_tiledb_format(child->format); + std::string_view type_metadata; + + if (ArrowMetadataHasKey(child->metadata, ArrowCharView("dtype"))) { + ArrowStringView out; + NANOARROW_THROW_NOT_OK(ArrowMetadataGetValue( + child->metadata, ArrowCharView("dtype"), &out)); + + type_metadata = std::string_view(out.data, out.size_bytes); + } + + auto type = ArrowAdapter::to_tiledb_format( + child->format, type_metadata); LOG_DEBUG(fmt::format( "[ArrowAdapter] schema pass for child {} name {}", @@ -971,7 +982,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( auto schild = index_column_schema->children[i]; auto col_name = schild->name; if (strcmp(child->name, col_name) == 0) { - if (ArrowAdapter::arrow_is_string_type(child->format)) { + if (ArrowAdapter::arrow_is_var_length_type(child->format)) { type = TILEDB_STRING_ASCII; } @@ -1010,7 +1021,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( attr.set_nullable(true); } - if (ArrowAdapter::arrow_is_string_type(child->format)) { + if (ArrowAdapter::arrow_is_var_length_type(child->format)) { attr.set_cell_val_num(TILEDB_VAR_NUM); } @@ -1021,7 +1032,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( *ctx, child->name, enmr_type, - ArrowAdapter::arrow_is_string_type(enmr_format) ? + ArrowAdapter::arrow_is_var_length_type(enmr_format) ? TILEDB_VAR_NUM : 1, child->flags & ARROW_FLAG_DICTIONARY_ORDERED); @@ -1070,7 +1081,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( continue; } - if (ArrowAdapter::arrow_is_string_type(child->format)) { + if (ArrowAdapter::arrow_is_var_length_type(child->format)) { // In the core API: // // * domain for strings must be set as (nullptr, nullptr) @@ -1399,7 +1410,7 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { return std::pair(std::move(array), std::move(schema)); } -bool ArrowAdapter::arrow_is_string_type(const char* format) { +bool ArrowAdapter::arrow_is_var_length_type(const char* format) { return ( (strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) || (strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)); @@ -1419,8 +1430,8 @@ std::string_view ArrowAdapter::to_arrow_format( {TILEDB_FLOAT32, "f"}, {TILEDB_FLOAT64, "g"}, {TILEDB_BOOL, "b"}, {TILEDB_DATETIME_SEC, "tss:"}, {TILEDB_DATETIME_MS, "tsm:"}, {TILEDB_DATETIME_US, "tsu:"}, - {TILEDB_DATETIME_NS, "tsn:"}, - }; + {TILEDB_DATETIME_NS, "tsn:"}, {TILEDB_GEOM_WKB, z}, + {TILEDB_GEOM_WKT, u}}; try { return _to_arrow_format_map.at(tiledb_dtype); @@ -1431,7 +1442,8 @@ std::string_view ArrowAdapter::to_arrow_format( } } -tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { +tiledb_datatype_t ArrowAdapter::to_tiledb_format( + std::string_view arrow_dtype, std::string_view arrow_dtype_metadata) { std::map _to_tiledb_format_map = { {"u", TILEDB_STRING_UTF8}, {"U", TILEDB_STRING_UTF8}, {"z", TILEDB_CHAR}, {"Z", TILEDB_CHAR}, @@ -1446,7 +1458,17 @@ tiledb_datatype_t ArrowAdapter::to_tiledb_format(std::string_view arrow_dtype) { }; try { - return _to_tiledb_format_map.at(arrow_dtype); + auto dtype = _to_tiledb_format_map.at(arrow_dtype); + + if (dtype == TILEDB_CHAR && arrow_dtype_metadata.compare("WKB") == 0) { + dtype = TILEDB_GEOM_WKB; + } else if ( + dtype == TILEDB_STRING_UTF8 && + arrow_dtype_metadata.compare("WKT") == 0) { + dtype = TILEDB_GEOM_WKT; + } + + return dtype; } catch (const std::out_of_range& e) { throw std::out_of_range(fmt::format( "ArrowAdapter: Unsupported Arrow type: {} ", arrow_dtype)); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 566d1f6507..83afcce1f9 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -275,15 +275,19 @@ class ArrowAdapter { * @param const char* Arrow data format * @return bool Whether the Arrow type represents a string type */ - static bool arrow_is_string_type(const char* format); + static bool arrow_is_var_length_type(const char* format); /** * @brief Get TileDB datatype from Arrow format string. * * @param datatype TileDB datatype. + * @param arrow_dtype_metadata Additional datatype info. Useful for + * differentiating between BLOB and WKB. * @return std::string_view Arrow format string. */ - static tiledb_datatype_t to_tiledb_format(std::string_view arrow_dtype); + static tiledb_datatype_t to_tiledb_format( + std::string_view arrow_dtype, + std::string_view arrow_dtype_metadata = {}); static enum ArrowType to_nanoarrow_type(std::string_view sv);