From 97135fa6de856e4900850487e2f52fcda33ba5eb Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 29 Sep 2024 20:39:27 -0400 Subject: [PATCH] can_resize/can_upgrade_shape --- libtiledbsoma/src/soma/soma_array.cc | 151 +++++++++++++++- libtiledbsoma/src/soma/soma_array.h | 69 ++++++- .../test/unit_soma_sparse_ndarray.cc | 171 ++++++++++++++++++ 3 files changed, 389 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index fbcd1695f2..e47df21261 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -29,10 +29,10 @@ * This file defines the SOMAArray class. */ -#include "soma_array.h" #include #include "../utils/logger.h" #include "../utils/util.h" +#include "soma_array.h" namespace tiledbsoma { using namespace tiledb; @@ -1418,6 +1418,155 @@ std::vector SOMAArray::maxshape() { return _tiledb_domain(); } +// This is a helper for can_upgrade_domain and can_resize, which have +// much overlap. +std::pair SOMAArray::_can_set_shape_helper( + const std::vector& newshape, + bool is_resize, + std::string method_name_for_messages) { + // E.g. it's an error to try to upgrade_domain or resize specifying + // a 3-D shape on a 2-D array. + auto arg_ndim = newshape.size(); + auto array_ndim = arr_->schema().domain().ndim(); + if (array_ndim != arg_ndim) { + return std::pair( + false, + fmt::format( + "cannot {}: provided shape has ndim {}, while the array has {}", + method_name_for_messages, + arg_ndim, + array_ndim)); + } + + // Enforce the semantics that tiledbsoma_upgrade_domain must be called + // only on arrays that don't have a shape set, and resize must be called + // only on arrays that do. + bool has_shape = has_current_domain(); + if (is_resize) { + // They're trying to do resize on an array that doesn't already have a + // shape. + if (!has_shape) { + return std::pair( + false, + fmt::format( + "{}: array currently has no shape: please use " + "tiledbsoma_upgrade_shape.", + method_name_for_messages)); + } + } else { + // They're trying to do upgrade_shape on an array that already has a + // shape. + if (has_shape) { + return std::pair( + false, + fmt::format( + "{}: array already has a shape: please use resize rather " + "than " + "tiledbsoma_upgrade_shape.", + method_name_for_messages)); + } + } + + // * For old-style arrays without shape: core domain (soma maxdomain) may be + // small (like 100) or big (like 2 billionish). + // * For new-style arrays with shape: core current domain (soma domain) will + // probably be small and core domain (soma maxdomain) will be huge. + // + // In either case, we need to check that the user's requested shape isn't + // outside the core domain, which is immutable. For old-style arrays, + // + // if the requested shape fits in the array's core domain, it's good to go + // as a new shape. + auto domain_check = _can_set_shape_domainish_helper( + newshape, false, method_name_for_messages); + if (!domain_check.first) { + return domain_check; + } + + // For new-style arrays, we need to additionally that the the requested + // shape (core current domain) isn't a downsize of the current one. + if (has_shape) { + auto current_domain_check = _can_set_shape_domainish_helper( + newshape, true, method_name_for_messages); + if (!current_domain_check.first) { + return current_domain_check; + } + } + + return std::pair(true, ""); +} + +// This is a helper for _can_set_shape_helper: it's used for comparing +// the user's requested shape against the core current domain or core (max) +// domain. +std::pair SOMAArray::_can_set_shape_domainish_helper( + const std::vector& newshape, + bool check_current_domain, + std::string method_name_for_messages) { + Domain domain = arr_->schema().domain(); + + for (unsigned i = 0; i < domain.ndim(); i++) { + const auto& dim = domain.dimension(i); + + const std::string& dim_name = dim.name(); + + // These methods are only for SOMA NDArrays, and any other arrays for + // which the indices are entirely int64. SOMA DataFrame objects, with + // multi-type dims, need to go through upgrade_domain -- and this is + // library-internal code, it's not the user's fault if we got here. + if (dim.type() != TILEDB_INT64) { + throw TileDBSOMAError(fmt::format( + "{}: internal error: expected {} dim to " + "be {}; got {}", + method_name_for_messages, + dim_name, + tiledb::impl::type_to_str(TILEDB_INT64), + tiledb::impl::type_to_str(dim.type()))); + } + + if (check_current_domain) { + std::pair + cap = _core_current_domain_slot(dim_name); + int64_t old_dim_shape = cap.second + 1; + + if (newshape[i] < old_dim_shape) { + return std::pair( + false, + fmt::format( + "cannot {} for {}: new {} < existing shape {}", + method_name_for_messages, + dim_name, + newshape[i], + old_dim_shape)); + } + + } else { + std::pair cap = _core_domain_slot( + dim_name); + int64_t old_dim_shape = cap.second + 1; + + if (newshape[i] > old_dim_shape) { + return std::pair( + false, + fmt::format( + "cannot {} for {}: new {} < existing maxshape {}", + method_name_for_messages, + dim_name, + newshape[i], + old_dim_shape)); + } + } + } + return std::pair(true, ""); +} + +std::pair SOMAArray::can_set_shape_soma_joinid( + int64_t /*newshape*/, + bool /*is_resize*/, + std::string /*method_name_for_messages*/) { + return std::pair(false, "TBW"); +} + void SOMAArray::resize(const std::vector& newshape) { if (_get_current_domain().is_empty()) { throw TileDBSOMAError( diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 67903bbeb6..d3852a2987 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -1031,6 +1031,57 @@ class SOMAArray : public SOMAObject { */ std::vector maxshape(); + /** + * This wires up to Python/R to tell a user if they can call resize() on an + * array without error. For single arrays, they could just call resize() and + * take their chances -- but for experiment-level resize (e.g. append mode) + * it's crucial that we provide a can-we-do-them-all pass through all arrays + * in the experiment before attempting any of them. + * + * On failure, returns false and an error string suitable for showing + * to the user; on success, returns true and the empty string. + * + * Failure reasons: the requested shape's dimension-count doesn't match the + * arrays; the array doesn't have a shape set (they must call + * upgrade_shape), or the requested shape doesn't fit within the array's + * existing core domain. + */ + std::pair can_resize( + const std::vector& newshape) { + return _can_set_shape_helper(newshape, true, "resize"); + } + + /** + * This wires up to Python/R to tell a user if they can call + * upgrade_shape() on an array without error. For single dataframes, + * they could just call upgrade_shape() and take their chances -- but for + * experiment-level resize (e.g. append mode) it's crucial that we provide a + * can-we-do-them-all pass through all arrays in the experiment before + * attempting any of them. + * + * On failure, returns false and an error string suitable for showing + * to the user; on success, returns true and the empty string. + * + * Failure reasons: the requested shape's dimension-count doesn't match the + * arrays; the array already has a shape set (they must call resize), the + * requested shape doesn't fit within the array's existing core domain, or + * the requested shape is a downsize of the array's existing core current + * domain. + */ + std::pair can_upgrade_shape( + const std::vector& newshape) { + return _can_set_shape_helper( + newshape, false, "tiledbsoma_upgrade_shape"); + } + + /** + * XXX COMMENT + */ + std::pair can_set_shape_soma_joinid( + int64_t newshape, + bool require_has_shape, + std::string method_name_for_messages); + /** * @brief Resize the shape (what core calls "current domain") up to the * maxshape (what core calls "domain"). @@ -1125,7 +1176,23 @@ class SOMAArray : public SOMAObject { } /** - * Helper method for resize and upgrade_shape. + * This is a code-dedupe helper for can_resize and can_upgrade_domain. + */ + std::pair _can_set_shape_helper( + const std::vector& newshape, + bool is_resize, + std::string method_name_for_messages); + + /** + * This is a second-level code-dedupe helper for _can_set_shape_helper. + */ + std::pair _can_set_shape_domainish_helper( + const std::vector& newshape, + bool check_current_domain, + std::string method_name_for_messages); + + /** + * This is a code-dedupe helper method for resize and upgrade_shape. */ void _set_current_domain_from_shape(const std::vector& newshape); diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index bb6b2decc4..0fd09b1103 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -340,3 +340,174 @@ TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") { REQUIRE(soma_sparse->metadata_num() == 2); } } +void breakme() { +} + +TEST_CASE( + "SOMASparseNDArray: can_tiledbsoma_upgrade_shape", "[SOMASparseNDArray]") { + int64_t dim_max = 999; + + auto ctx = std::make_shared(); + std::string uri = "mem://unit-test-sparse-ndarray-upgrade-shape"; + + std::string dim_name = "soma_dim_0"; + tiledb_datatype_t dim_tiledb_datatype = TILEDB_INT64; + tiledb_datatype_t attr_tiledb_datatype = TILEDB_INT32; + std::string dim_arrow_format = ArrowAdapter::tdb_to_arrow_type( + dim_tiledb_datatype); + std::string attr_arrow_format = ArrowAdapter::tdb_to_arrow_type( + attr_tiledb_datatype); + + std::vector dim_infos( + {{.name = dim_name, + .tiledb_datatype = dim_tiledb_datatype, + .dim_max = dim_max, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = false}}); + + auto index_columns = helper::create_column_index_info(dim_infos); + + SOMASparseNDArray::create( + uri, + attr_arrow_format, + ArrowTable( + std::move(index_columns.first), std::move(index_columns.second)), + ctx); + + auto soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); + REQUIRE(soma_sparse->has_current_domain() == false); + + // For old-style arrays, from before the current-domain feature: + // * The shape specified at create becomes the core (max) domain + // o Recall that the core domain is immutable + // * There is no current domain set + // o A current domain can be applied to it, up to <= (max) domain + auto dom = soma_sparse->soma_domain_slot(dim_name); + auto mxd = soma_sparse->soma_maxdomain_slot(dim_name); + REQUIRE(dom == mxd); + REQUIRE(dom.first == 0); + REQUIRE(dom.second == dim_max); + + breakme(); + std::vector newshape_wrong_dims({dim_max, 12}); + std::vector newshape_too_big({dim_max + 10}); + std::vector newshape_good({40}); + + auto check = soma_sparse->can_upgrade_shape(newshape_wrong_dims); + REQUIRE(check.first == false); + REQUIRE( + check.second == + "cannot tiledbsoma_upgrade_shape: provided shape has ndim 2, while the " + "array has 1"); + + check = soma_sparse->can_upgrade_shape(newshape_too_big); + REQUIRE(check.first == false); + REQUIRE( + check.second == + "cannot tiledbsoma_upgrade_shape for soma_dim_0: new 1009 < existing " + "maxshape 1000"); + + check = soma_sparse->can_upgrade_shape(newshape_good); + REQUIRE(check.first == true); + REQUIRE(check.second == ""); +} + +TEST_CASE("SOMASparseNDArray: can_resize", "[SOMASparseNDArray]") { + int64_t dim_max = 999; + + auto ctx = std::make_shared(); + std::string uri = "mem://unit-test-sparse-ndarray-resize"; + + std::string dim_name = "soma_dim_0"; + tiledb_datatype_t dim_tiledb_datatype = TILEDB_INT64; + tiledb_datatype_t attr_tiledb_datatype = TILEDB_INT32; + std::string dim_arrow_format = ArrowAdapter::tdb_to_arrow_type( + dim_tiledb_datatype); + std::string attr_arrow_format = ArrowAdapter::tdb_to_arrow_type( + attr_tiledb_datatype); + + std::vector dim_infos( + {{.name = dim_name, + .tiledb_datatype = dim_tiledb_datatype, + .dim_max = dim_max, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = true}}); + + auto index_columns = helper::create_column_index_info(dim_infos); + + SOMASparseNDArray::create( + uri, + attr_arrow_format, + ArrowTable( + std::move(index_columns.first), std::move(index_columns.second)), + ctx); + + auto soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); + REQUIRE(soma_sparse->has_current_domain() == true); + + // For new-style arrays, with the current-domain feature: + // * The shape specified at create becomes the core current domain + // o Recall that the core current domain is mutable, up tp <= (max) domain + // * The core (max) domain is huge + // o Recall that the core max domain is immutable + auto dom = soma_sparse->soma_domain_slot(dim_name); + auto mxd = soma_sparse->soma_maxdomain_slot(dim_name); + REQUIRE(dom != mxd); + REQUIRE(dom.first == 0); + REQUIRE(dom.second == dim_max); + + std::vector newshape_wrong_dims({dim_max, 12}); + std::vector newshape_too_small({40}); + std::vector newshape_good({2000}); + + auto check = soma_sparse->can_resize(newshape_wrong_dims); + REQUIRE(check.first == false); + REQUIRE( + check.second == + "cannot resize: provided shape has ndim 2, while the array has 1"); + + check = soma_sparse->can_resize(newshape_too_small); + REQUIRE(check.first == false); + REQUIRE( + check.second == + "cannot resize for soma_dim_0: new 40 < existing shape 1000"); + + check = soma_sparse->can_resize(newshape_good); + REQUIRE(check.first == true); + REQUIRE(check.second == ""); +} + +// +//// array w shape +//// o set curdom say 1000 and maxdom huge +//// o test newshape.size() is wrong +//// o test doing tiledbsoma_upgrade_shape instead of resize +//// o test resize: +//// - test too small in any one slot +//// - test equal -- do it twice +//// - test larger +// +//// ================================================================ +//// OLD +//// ACC-SNDA URI data-snda-py <----------------- SLOTWISE NONE +//// ACC-SNDA NEW SHAPE False +//// ACC-SNDA SHAPE (2147483646, 2147483646) +//// ACC-SNDA MAXSHAPE (2147483646, 2147483646) +//// +//// ACC-SNDA URI data-snda-shape-py <----------------- SLOTWISE SPECIFIED +//// ACC-SNDA NEW SHAPE False +//// ACC-SNDA SHAPE (100, 200) +//// ACC-SNDA MAXSHAPE (100, 200) +//// +//// ================================================================ +//// ACC-SNDA URI data-snda-py <----------------- SLOTWISE NONE +//// ACC-SNDA NEW SHAPE True +//// ACC-SNDA SHAPE (2147483646, 2147483646) +//// ACC-SNDA MAXSHAPE (2147483646, 2147483646) +//// +//// ACC-SNDA URI data-snda-shape-py <----------------- SLOTWISE SPECIFIED +//// ACC-SNDA NEW SHAPE True +//// ACC-SNDA SHAPE (100, 200) +//// ACC-SNDA MAXSHAPE (2147483646, 2147483646)