From 02bfa69760779e4e1b47e66ca97b43175a20415a Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Mon, 21 Oct 2024 18:24:39 +0300 Subject: [PATCH] Geometry dataframe implementation, use spatial axes when available when creating schema --- libtiledbsoma/src/CMakeLists.txt | 3 + .../src/soma/soma_geometry_dataframe.cc | 125 ++++++++ .../src/soma/soma_geometry_dataframe.h | 179 +++++++++++ libtiledbsoma/src/soma/soma_object.cc | 4 +- libtiledbsoma/src/tiledbsoma/tiledbsoma | 1 + libtiledbsoma/src/utils/arrow_adapter.cc | 277 ++++++++++++++---- libtiledbsoma/src/utils/arrow_adapter.h | 41 +++ libtiledbsoma/test/CMakeLists.txt | 1 + libtiledbsoma/test/common.cc | 39 ++- .../test/unit_soma_geometry_dataframe.cc | 149 ++++++++++ 10 files changed, 753 insertions(+), 66 deletions(-) create mode 100644 libtiledbsoma/src/soma/soma_geometry_dataframe.cc create mode 100644 libtiledbsoma/src/soma/soma_geometry_dataframe.h create mode 100644 libtiledbsoma/test/unit_soma_geometry_dataframe.cc diff --git a/libtiledbsoma/src/CMakeLists.txt b/libtiledbsoma/src/CMakeLists.txt index 98f1cc697f..90d203770d 100644 --- a/libtiledbsoma/src/CMakeLists.txt +++ b/libtiledbsoma/src/CMakeLists.txt @@ -37,6 +37,7 @@ add_library(TILEDB_SOMA_OBJECTS OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_scene.cc + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_geometry_dataframe.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_point_cloud_dataframe.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_multiscale_image.cc ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_context.cc @@ -188,6 +189,7 @@ endif() # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/soma_experiment.h # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/soma_measurement.h # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/soma_scene.h +# ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/soma_geometry_dataframe.h # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/soma_point_cloud_dataframe.h # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/soma_multiscale_image.h # ${CMAKE_CURRENT_SOURCE_DIR}/cpp_api/soma_object.h @@ -211,6 +213,7 @@ install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_experiment.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_measurement.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_scene.h + ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_geometry_dataframe.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_point_cloud_dataframe.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_multiscale_image.h ${CMAKE_CURRENT_SOURCE_DIR}/soma/soma_object.h diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc new file mode 100644 index 0000000000..d54be5f3dd --- /dev/null +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc @@ -0,0 +1,125 @@ +/** + * @file soma_geometry_dataframe.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMAGeometryDataFrame class. + */ + +#include "soma_geometry_dataframe.h" +#include "../utils/util.h" + +#include + +namespace tiledbsoma { +using namespace tiledb; + +//=================================================================== +//= public static +//=================================================================== + +void SOMAGeometryDataFrame::create( + std::string_view uri, + std::unique_ptr schema, + ArrowTable index_columns, + ArrowTable spatial_columns, + std::shared_ptr ctx, + PlatformConfig platform_config, + std::optional timestamp) { + std::vector spatial_axes; + auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), + std::move(schema), + ArrowTable( + std::move(index_columns.first), std::move(index_columns.second)), + "SOMAGeometryDataFrame", + true, + platform_config, + ArrowTable( + std::move(spatial_columns.first), + std::move(spatial_columns.second))); + auto array = SOMAArray::create( + ctx, uri, tiledb_schema, "SOMAGeometryDataFrame", timestamp); +} + +std::unique_ptr SOMAGeometryDataFrame::open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::vector column_names, + ResultOrder result_order, + std::optional timestamp) { + return std::make_unique( + mode, uri, ctx, column_names, result_order, timestamp); +} + +bool SOMAGeometryDataFrame::exists( + std::string_view uri, std::shared_ptr ctx) { + try { + auto obj = SOMAObject::open(uri, OpenMode::read, ctx); + return "SOMAGeometryDataFrame" == obj->type(); + } catch (TileDBSOMAError& e) { + return false; + } +} + +//=================================================================== +//= public non-static +//=================================================================== + +std::unique_ptr SOMAGeometryDataFrame::schema() const { + return this->arrow_schema(); +} + +const std::vector SOMAGeometryDataFrame::index_column_names() + const { + return this->dimension_names(); +} + +const std::vector SOMAGeometryDataFrame::spatial_column_names() + const { + std::vector names; + std::unordered_set unique_names; + std::regex rgx("tiledb__internal__(\\S+)__"); + std::smatch matches; + for (auto dimension : this->dimension_names()) { + if (std::regex_search(dimension, matches, rgx)) { + if (unique_names.count(matches[1].str()) == 0) { + unique_names.insert(matches[1].str()); + names.push_back(matches[1].str()); + } + } + } + + return names; +} + +uint64_t SOMAGeometryDataFrame::count() { + return this->nnz(); +} + +} // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.h b/libtiledbsoma/src/soma/soma_geometry_dataframe.h new file mode 100644 index 0000000000..5bfedd4648 --- /dev/null +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.h @@ -0,0 +1,179 @@ +/** + * @file soma_geometry_dataframe.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines the SOMAGeometryDataFrame class. + */ + +#ifndef SOMA_GEOMETRY_DATAFRAME +#define SOMA_GEOMETRY_DATAFRAME + +#include + +#include "soma_array.h" + +namespace tiledbsoma { + +class ArrayBuffers; + +using namespace tiledb; + +class SOMAGeometryDataFrame : virtual public SOMAArray { + public: + //=================================================================== + //= public static + //=================================================================== + + /** + * @brief Create a SOMAGeometryDataFrame object at the given URI. + * + * @param uri URI to create the SOMAGeometryDataFrame + * @param schema Arrow schema + * @param index_columns The index column names with associated domains + * and tile extents per dimension + * @param spatial_columns The spatial column names with associated domains + * and tile extents per dimension + * @param ctx SOMAContext + * @param platform_config Optional config parameter dictionary + * @param timestamp Optional the timestamp range to write SOMA metadata info + */ + static void create( + std::string_view uri, + std::unique_ptr schema, + ArrowTable index_columns, + ArrowTable spatial_columns, + std::shared_ptr ctx, + PlatformConfig platform_config = PlatformConfig(), + std::optional timestamp = std::nullopt); + + /** + * @brief Open and return a SOMAGeometryDataFrame object at the given URI. + * + * @param uri URI to create the SOMAGeometryDataFrame + * @param mode read or write + * @param ctx SOMAContext + * @param column_names A list of column names to use as user-defined index + * columns (e.g., ``['cell_type', 'tissue_type']``). All named columns must + * exist in the schema, and at least one index column name is required. + * @param result_order Read result order: automatic (default), rowmajor, or + * colmajor + * @param timestamp If specified, overrides the default timestamp used to + * open this object. If unset, uses the timestamp provided by the context. + * @return std::unique_ptr SOMAGeometryDataFrame + */ + static std::unique_ptr open( + std::string_view uri, + OpenMode mode, + std::shared_ptr ctx, + std::vector column_names = {}, + ResultOrder result_order = ResultOrder::automatic, + std::optional timestamp = std::nullopt); + + /** + * @brief Check if the SOMAGeometryDataFrame exists at the URI. + * + * @param uri URI to create the SOMAGeometryDataFrame + * @param ctx SOMAContext + */ + static bool exists(std::string_view uri, std::shared_ptr ctx); + + //=================================================================== + //= public non-static + //=================================================================== + + /** + * @brief Construct a new SOMAGeometryDataFrame object. + * + * @param mode read or write + * @param uri URI of the array + * @param ctx TileDB context + * @param column_names Columns to read + * @param result_order Read result order: automatic (default), rowmajor, or + * colmajor + * @param timestamp Timestamp + */ + SOMAGeometryDataFrame( + OpenMode mode, + std::string_view uri, + std::shared_ptr ctx, + std::vector column_names, + ResultOrder result_order, + std::optional timestamp = std::nullopt) + : SOMAArray( + mode, + uri, + ctx, + std::filesystem::path(uri).filename().string(), // array name + column_names, + "auto", // batch_size + result_order, + timestamp) { + } + + SOMAGeometryDataFrame(const SOMAArray& other) + : SOMAArray(other) { + } + + SOMAGeometryDataFrame() = delete; + SOMAGeometryDataFrame(const SOMAGeometryDataFrame&) = default; + SOMAGeometryDataFrame(SOMAGeometryDataFrame&&) = delete; + ~SOMAGeometryDataFrame() = default; + + using SOMAArray::open; + + /** + * Return the data schema, in the form of a ArrowSchema. + * + * @return std::unique_ptr + */ + std::unique_ptr schema() const; + + /** + * Return the index (dimension) column names. + * + * @return std::vector + */ + const std::vector index_column_names() const; + + /** + * Return the spatial column names. + * + * @return std::vector + */ + const std::vector spatial_column_names() const; + + /** + * Return the number of rows. + * + * @return int64_t + */ + uint64_t count(); +}; +} // namespace tiledbsoma + +#endif // SOMA_GEOMETRY_DATAFRAME \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_object.cc b/libtiledbsoma/src/soma/soma_object.cc index c610ece465..e9cc3627bf 100644 --- a/libtiledbsoma/src/soma/soma_object.cc +++ b/libtiledbsoma/src/soma/soma_object.cc @@ -7,6 +7,7 @@ #include "soma_dataframe.h" #include "soma_dense_ndarray.h" #include "soma_experiment.h" +#include "soma_geometry_dataframe.h" #include "soma_measurement.h" #include "soma_multiscale_image.h" #include "soma_point_cloud_dataframe.h" @@ -61,8 +62,7 @@ std::unique_ptr SOMAObject::open( } else if (array_type == "somapointclouddataframe") { return std::make_unique(*array_); } else if (array_type == "somageometrydataframe") { - throw TileDBSOMAError( - "Support for SOMAGeometryDataFrame is not yet implemented"); + return std::make_unique(*array_); } else { throw TileDBSOMAError("Saw invalid SOMAArray type"); } diff --git a/libtiledbsoma/src/tiledbsoma/tiledbsoma b/libtiledbsoma/src/tiledbsoma/tiledbsoma index 707bd03d60..3f71c123a4 100644 --- a/libtiledbsoma/src/tiledbsoma/tiledbsoma +++ b/libtiledbsoma/src/tiledbsoma/tiledbsoma @@ -54,6 +54,7 @@ #include "soma/soma_experiment.h" #include "soma/soma_measurement.h" #include "soma/soma_scene.h" +#include "soma/soma_geometry_dataframe.h" #include "soma/soma_point_cloud_dataframe.h" #include "soma/soma_multiscale_image.h" #include "soma/soma_object.h" diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index f35b82981e..caaa46e681 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -920,7 +920,8 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( ArrowTable index_column_info, std::string soma_type, bool is_sparse, - PlatformConfig platform_config) { + PlatformConfig platform_config, + ArrowTable spatial_column_info) { auto index_column_array = std::move(index_column_info.first); auto index_column_schema = std::move(index_column_info.second); @@ -967,9 +968,6 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( type_metadata = std::string_view(out.data, out.size_bytes); } - auto type = ArrowAdapter::to_tiledb_format( - child->format, type_metadata); - LOG_DEBUG(fmt::format( "[ArrowAdapter] schema pass for child {} name {}", sch_idx, @@ -978,85 +976,106 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( bool isattr = true; for (int64_t i = 0; i < index_column_schema->n_children; ++i) { - auto achild = index_column_array->children[i]; - auto schild = index_column_schema->children[i]; - auto col_name = schild->name; - if (strcmp(child->name, col_name) == 0) { - if (ArrowAdapter::arrow_is_var_length_type(child->format)) { - type = TILEDB_STRING_ASCII; - } + if (strcmp(child->name, index_column_schema->children[i]->name) == + 0) { + if (strcmp(child->name, "soma_geometry") == 0 && + spatial_column_info.first.get() != nullptr) { + if (type_metadata.compare("WKB") != 0) { + throw std::runtime_error( + std::string( + "Unkwown type metadata for `soma_geometry`. " + "Expected 'WKB', found ") + + std::string(type_metadata.data())); + } - FilterList filter_list = ArrowAdapter::_create_dim_filter_list( - child->name, platform_config, soma_type, ctx); + for (int64_t j = 0; + j < spatial_column_info.second->n_children; + ++j) { + auto min_dim = tiledb_dimension_from_arrow_schema( + ctx, + spatial_column_info.second->children[j], + spatial_column_info.first->children[j], + soma_type, + type_metadata, + "tiledb__internal__", + "__min", + platform_config); + + auto max_dim = tiledb_dimension_from_arrow_schema( + ctx, + spatial_column_info.second->children[j], + spatial_column_info.first->children[j], + soma_type, + type_metadata, + "tiledb__internal__", + "__max", + platform_config); + + dims.insert({min_dim.first.name(), min_dim.first}); + dims.insert({max_dim.first.name(), max_dim.first}); + + use_current_domain &= min_dim.second; + use_current_domain &= max_dim.second; + } - if (achild->length == 3) { - use_current_domain = false; - } else if (achild->length == 5) { - // This is fine + // Do not set the `isattr` flag to false to add the + // `soma_geometry` as attribute } else { - throw TileDBSOMAError(fmt::format( - "ArrowAdapter: unexpected length {} for name {}", - achild->length, - col_name)); + auto dim = tiledb_dimension_from_arrow_schema( + ctx, + index_column_schema->children[i], + index_column_array->children[i], + soma_type, + type_metadata, + "", + "", + platform_config); + dims.insert({dim.first.name(), dim.first}); + use_current_domain &= dim.second; + isattr = false; } - const void* buff = achild->buffers[1]; - auto dim = ArrowAdapter::_create_dim( - type, child->name, buff, ctx); - dim.set_filter_list(filter_list); - dims.insert({child->name, dim}); - isattr = false; break; } } if (isattr) { - Attribute attr(*ctx, child->name, type); - - FilterList filter_list = ArrowAdapter::_create_attr_filter_list( - child->name, platform_config, ctx); - attr.set_filter_list(filter_list); - - if (child->flags & ARROW_FLAG_NULLABLE) { - attr.set_nullable(true); - } - - if (ArrowAdapter::arrow_is_var_length_type(child->format)) { - attr.set_cell_val_num(TILEDB_VAR_NUM); - } - - if (child->dictionary != nullptr) { - auto enmr_format = child->dictionary->format; - auto enmr_type = ArrowAdapter::to_tiledb_format(enmr_format); - auto enmr = Enumeration::create_empty( - *ctx, - child->name, - enmr_type, - ArrowAdapter::arrow_is_var_length_type(enmr_format) ? - TILEDB_VAR_NUM : - 1, - child->flags & ARROW_FLAG_DICTIONARY_ORDERED); - ArraySchemaExperimental::add_enumeration(*ctx, schema, enmr); - AttributeExperimental::set_enumeration_name( - *ctx, attr, child->name); - LOG_DEBUG(fmt::format( - "[ArrowAdapter] dictionary for {} as {} {}", - std::string(child->name), - tiledb::impl::type_to_str(enmr_type), - std::string(enmr_format))); + auto attr = tiledb_attribute_from_arrow_schema( + ctx, child, type_metadata, platform_config); + if (attr.second.has_value()) { + ArraySchemaExperimental::add_enumeration( + *ctx, schema, attr.second.value()); } LOG_DEBUG( fmt::format("[ArrowAdapter] adding attribute {}", child->name)); - schema.add_attribute(attr); + + schema.add_attribute(attr.first); } } for (int64_t i = 0; i < index_column_schema->n_children; ++i) { LOG_DEBUG(fmt::format("[ArrowAdapter] child {}", i)); auto col_name = index_column_schema->children[i]->name; - domain.add_dimension(dims.at(col_name)); + if (strcmp(col_name, "soma_geometry") == 0) { + for (auto& dim : dims) { + if (dim.first.substr(std::max(0ul, dim.first.size() - 5)) == + std::string("__min")) { + domain.add_dimension(dim.second); + } + } + + for (auto& dim : dims) { + if (dim.first.substr(std::max(0ul, dim.first.size() - 5)) == + std::string("__max")) { + domain.add_dimension(dim.second); + } + } + } else { + domain.add_dimension(dims.at(col_name)); + } } + LOG_DEBUG(fmt::format("[ArrowAdapter] set_domain")); schema.set_domain(domain); @@ -1081,7 +1100,28 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( continue; } - if (ArrowAdapter::arrow_is_var_length_type(child->format)) { + if (strcmp(child->name, "soma_geometry") == 0 && + spatial_column_info.first.get() != nullptr) { + for (int64_t j = 0; + j < spatial_column_info.first->n_children; + ++j) { + auto col_name = "tiledb__internal__" + + std::string(spatial_column_info.second + ->children[j] + ->name); + const void* buff = spatial_column_info.first + ->children[j] + ->buffers[1]; + auto type = ArrowAdapter::to_tiledb_format( + spatial_column_info.second->children[j]->format); + + _set_current_domain_slot( + type, buff, ndrect, col_name + "__min"); + _set_current_domain_slot( + type, buff, ndrect, col_name + "__max"); + } + } else if (ArrowAdapter::arrow_is_var_length_type( + child->format)) { // In the core API: // // * domain for strings must be set as (nullptr, nullptr) @@ -1150,6 +1190,104 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( return schema; } +std::pair> +ArrowAdapter::tiledb_attribute_from_arrow_schema( + std::shared_ptr ctx, + ArrowSchema* arrow_schema, + std::string_view type_metadata, + PlatformConfig platform_config) { + auto type = ArrowAdapter::to_tiledb_format(arrow_schema->format); + if (strcmp(arrow_schema->name, "soma_geometry") == 0) { + if (type_metadata.compare("WKB") == 0) { + type = TILEDB_GEOM_WKB; + } else { + throw std::runtime_error( + std::string("Unkwown type metadata for `soma_geometry`. " + "Expected 'WKB', found ") + + type_metadata.data()); + } + } + + Attribute attr(*ctx, arrow_schema->name, type); + + FilterList filter_list = ArrowAdapter::_create_attr_filter_list( + arrow_schema->name, platform_config, ctx); + attr.set_filter_list(filter_list); + + if (arrow_schema->flags & ARROW_FLAG_NULLABLE) { + attr.set_nullable(true); + } + + if (ArrowAdapter::arrow_is_var_length_type(arrow_schema->format)) { + attr.set_cell_val_num(TILEDB_VAR_NUM); + } + + std::optional enmr = std::nullopt; + + if (arrow_schema->dictionary != nullptr) { + auto enmr_format = arrow_schema->dictionary->format; + auto enmr_type = ArrowAdapter::to_tiledb_format(enmr_format); + enmr = Enumeration::create_empty( + *ctx, + arrow_schema->name, + enmr_type, + ArrowAdapter::arrow_is_var_length_type(enmr_format) ? + TILEDB_VAR_NUM : + 1, + arrow_schema->flags & ARROW_FLAG_DICTIONARY_ORDERED); + AttributeExperimental::set_enumeration_name( + *ctx, attr, arrow_schema->name); + LOG_DEBUG(fmt::format( + "[ArrowAdapter] dictionary for {} as {} {}", + std::string(arrow_schema->name), + tiledb::impl::type_to_str(enmr_type), + std::string(enmr_format))); + } + + return {attr, enmr}; +} + +std::pair ArrowAdapter::tiledb_dimension_from_arrow_schema( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowArray* array, + std::string soma_type, + std::string_view type_metadata, + std::string prefix, + std::string suffix, + PlatformConfig platform_config) { + bool use_current_domain = true; + + auto type = ArrowAdapter::to_tiledb_format(schema->format, type_metadata); + + if (ArrowAdapter::arrow_is_var_length_type(schema->format)) { + type = TILEDB_STRING_ASCII; + } + + auto col_name = prefix + std::string(schema->name) + suffix; + + FilterList filter_list = ArrowAdapter::_create_dim_filter_list( + col_name, platform_config, soma_type, ctx); + + if (array->length == 3) { + use_current_domain = false; + } else if (array->length == 5) { + // This is fine + } else { + throw TileDBSOMAError(fmt::format( + "ArrowAdapter: unexpected length {} for name " + "{}", + array->length, + col_name)); + } + + const void* buff = array->buffers[1]; + auto dim = ArrowAdapter::_create_dim(type, col_name, buff, ctx); + dim.set_filter_list(filter_list); + + return {dim, use_current_domain}; +} + std::pair ArrowAdapter::_get_data_and_length( Enumeration& enmr, const void* dst) { switch (enmr.type()) { @@ -1572,6 +1710,19 @@ std::unique_ptr ArrowAdapter::make_arrow_schema( i, dim_schema->format, dim_schema->name)); + + if (strcmp(dim_schema->name, "soma_geometry") == 0) { + nanoarrow::UniqueBuffer buffer; + ArrowMetadataBuilderInit(buffer.get(), nullptr); + ArrowMetadataBuilderAppend( + buffer.get(), + ArrowCharView("dtype"), + ArrowCharView( + tiledb_datatypes[i] == TILEDB_GEOM_WKB ? "WKB" : "WKT")); + ArrowSchemaSetMetadata( + dim_schema, + std::string((char*)buffer->data, buffer->size_bytes).c_str()); + } } return arrow_schema; diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 83afcce1f9..c3ab94209f 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -257,6 +257,37 @@ class ArrowAdapter { ArrowTable index_column_info, std::string soma_type, bool is_sparse = true, + PlatformConfig platform_config = PlatformConfig(), + ArrowTable spatial_column_info = { + std::unique_ptr(nullptr), + std::unique_ptr(nullptr)}); + + /** + * @brief Get a TileDB attribute with its enumeration from an Arrow schema. + * + * @return std::pair> + */ + static std::pair> + tiledb_attribute_from_arrow_schema( + std::shared_ptr ctx, + ArrowSchema* arrow_schema, + std::string_view type_metadata, + PlatformConfig platform_config = PlatformConfig()); + + /** + * @brief Get a TileDB dimension from an Arrow schema. + * + * @return std::pair The TileDB dimension with a boolean + * flag indicating whether or not the dimension uses `current domain`. + */ + static std::pair tiledb_dimension_from_arrow_schema( + std::shared_ptr ctx, + ArrowSchema* schema, + ArrowArray* array, + std::string soma_type, + std::string_view type_metadata, + std::string prefix = std::string(), + std::string suffix = std::string(), PlatformConfig platform_config = PlatformConfig()); /** @@ -344,6 +375,16 @@ class ArrowAdapter { return make_arrow_array_child_string(v); } + static ArrowArray* make_arrow_array_child_binary() { + // Use malloc here, not new, to match ArrowAdapter::release_array + auto arrow_array = (ArrowArray*)malloc(sizeof(ArrowArray)); + + ArrowArrayInitFromType( + arrow_array, ArrowType::NANOARROW_TYPE_LARGE_BINARY); + + return arrow_array; + } + template static ArrowArray* make_arrow_array_child(const std::vector& v) { // We're aware of template-specialization wherein we can diff --git a/libtiledbsoma/test/CMakeLists.txt b/libtiledbsoma/test/CMakeLists.txt index 8888b06b23..0427cbe03f 100644 --- a/libtiledbsoma/test/CMakeLists.txt +++ b/libtiledbsoma/test/CMakeLists.txt @@ -30,6 +30,7 @@ add_executable(unit_soma unit_soma_sparse_ndarray.cc unit_soma_collection.cc unit_soma_scene.cc + unit_soma_geometry_dataframe.cc unit_soma_point_cloud_dataframe.cc unit_soma_multiscale_image.cc test_indexer.cc diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc index c3f0e60197..4ce06e73b7 100644 --- a/libtiledbsoma/test/common.cc +++ b/libtiledbsoma/test/common.cc @@ -153,7 +153,25 @@ static std::unique_ptr _create_index_cols_info_schema( tiledb_datatypes[i] = dim_info.tiledb_datatype; } - return ArrowAdapter::make_arrow_schema(names, tiledb_datatypes); + auto schema = ArrowAdapter::make_arrow_schema(names, tiledb_datatypes); + + for (size_t i = 0; i < schema->n_children; ++i) { + if (strcmp(schema->children[i]->name, "soma_geometry")) { + nanoarrow::UniqueBuffer buffer; + ArrowMetadataBuilderInit(buffer.get(), nullptr); + ArrowMetadataBuilderAppend( + buffer.get(), + ArrowCharView("dtype"), + ArrowCharView( + dim_infos[i].tiledb_datatype == TILEDB_GEOM_WKB ? "WKB" : + "WKT")); + ArrowSchemaSetMetadata( + schema->children[i], + std::string((char*)buffer->data, buffer->size_bytes).c_str()); + } + } + + return schema; } static std::unique_ptr _create_index_cols_info_array( @@ -210,6 +228,25 @@ static std::unique_ptr _create_index_cols_info_array( std::vector dom({"", "", ""}); dim_array = ArrowAdapter::make_arrow_array_child_string(dom); } + } else if (info.tiledb_datatype == TILEDB_GEOM_WKB) { + // No domain can be set for WKB. The domain will be set to the + // individual spatial axes. + dim_array = ArrowAdapter::make_arrow_array_child_binary(); + } else if (info.tiledb_datatype == TILEDB_FLOAT64) { + if (info.use_current_domain) { + // domain big; current_domain small + std::vector dom( + {0, + (double_t)CORE_DOMAIN_MAX, + 1, + 0, + (double_t)info.dim_max}); + dim_array = ArrowAdapter::make_arrow_array_child(dom); + } else { + // domain small; current_domain feature not being used + std::vector dom({0, (double_t)info.dim_max, 1}); + dim_array = ArrowAdapter::make_arrow_array_child(dom); + } } if (dim_array == nullptr) { diff --git a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc new file mode 100644 index 0000000000..bfde07be64 --- /dev/null +++ b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc @@ -0,0 +1,149 @@ +/** + * @file unit_soma_geometry_dataframe.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file manages unit tests for the SOMAGeometryDataFrame class + */ + +#include +#include "../src/geometry/geometry.h" +#include "../src/geometry/operators/io/write.h" +#include "common.h" + +const int64_t SOMA_JOINID_DIM_MAX = 99; + +TEST_CASE("SOMAGeometryDataFrame: basic", "[SOMAGeometryDataFrame]") { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + auto ctx = std::make_shared(); + std::string uri{"mem://unit-test-geometry-basic"}; + PlatformConfig platform_config{}; + + std::vector dim_infos( + {helper::DimInfo( + {.name = "soma_joinid", + .tiledb_datatype = TILEDB_INT64, + .dim_max = SOMA_JOINID_DIM_MAX, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "soma_geometry", + .tiledb_datatype = TILEDB_GEOM_WKB, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain})}); + + std::vector spatial_dim_infos( + {helper::DimInfo( + {.name = "x", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 200, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "y", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain})}); + + std::vector attr_infos({helper::AttrInfo( + {.name = "quality", .tiledb_datatype = TILEDB_FLOAT64})}); + + // Check the point cloud doesn't exist yet. + REQUIRE(!SOMAGeometryDataFrame::exists(uri, ctx)); + + // Create the point cloud. + auto [schema, index_columns] = + helper::create_arrow_schema_and_index_columns( + dim_infos, attr_infos); + auto spatial_columns = helper::create_column_index_info( + spatial_dim_infos); + + SOMAGeometryDataFrame::create( + uri, + std::move(schema), + ArrowTable( + std::move(index_columns.first), + std::move(index_columns.second)), + ArrowTable( + std::move(spatial_columns.first), + std::move(spatial_columns.second)), + ctx, + platform_config, + std::nullopt); + + // Check the point cloud exists and it cannot be read as a different + // object. + REQUIRE(SOMAGeometryDataFrame::exists(uri, ctx)); + REQUIRE(!SOMASparseNDArray::exists(uri, ctx)); + REQUIRE(!SOMADenseNDArray::exists(uri, ctx)); + REQUIRE(!SOMADataFrame::exists(uri, ctx)); + + auto soma_geometry = SOMAGeometryDataFrame::open( + uri, + OpenMode::read, + ctx, + {}, // column_names, + ResultOrder::automatic, + std::nullopt); + REQUIRE(soma_geometry->uri() == uri); + REQUIRE(soma_geometry->ctx() == ctx); + REQUIRE(soma_geometry->type() == "SOMAGeometryDataFrame"); + std::vector expected_index_column_names = { + dim_infos[0].name, + "tiledb__internal__" + spatial_dim_infos[0].name + "__min", + "tiledb__internal__" + spatial_dim_infos[1].name + "__min", + "tiledb__internal__" + spatial_dim_infos[0].name + "__max", + "tiledb__internal__" + spatial_dim_infos[1].name + "__max"}; + + std::vector expected_spatial_column_names = { + spatial_dim_infos[0].name, spatial_dim_infos[1].name}; + REQUIRE( + soma_geometry->index_column_names() == expected_index_column_names); + REQUIRE( + soma_geometry->spatial_column_names() == + expected_spatial_column_names); + REQUIRE(soma_geometry->nnz() == 0); + soma_geometry->close(); + + auto soma_object = SOMAObject::open(uri, OpenMode::read, ctx); + REQUIRE(soma_object->uri() == uri); + REQUIRE(soma_object->type() == "SOMAGeometryDataFrame"); + soma_object->close(); + } +} \ No newline at end of file