Skip to content

Commit

Permalink
[python/r] DataFrame resizer (#3091)
Browse files Browse the repository at this point in the history
* Connect `resize_soma_joinid` at the `pybind11` layer

* Connect `resize_soma_joinid` at the Python UX layer

* Python unit-test cases

* Connect to R Rcpp and UX layers

* R unit testing

* code-review feedback

* fix bad rebase
  • Loading branch information
johnkerl authored Oct 1, 2024
1 parent 8885e31 commit 8e9be91
Show file tree
Hide file tree
Showing 12 changed files with 229 additions and 42 deletions.
12 changes: 12 additions & 0 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,18 @@ def tiledbsoma_has_upgraded_domain(self) -> bool:
"""
return self._handle.tiledbsoma_has_upgraded_domain

def resize_soma_joinid(self, newshape: int) -> None:
"""Increases the shape of the dataframe on the ``soma_joinid`` index
column, if it indeed is an index column, leaving all other index columns
as-is. If the ``soma_joinid`` is not an index column, no change is made.
This is a special case of ``upgrade_domain`` (WIP for 1.15), but simpler
to keystroke, and handles the most common case for dataframe domain
expansion. Raises an error if the dataframe doesn't already have a
domain: in that case please call ``tiledbsoma_upgrade_domain`` (WIP for
1.15).
"""
self._handle._handle.resize_soma_joinid(newshape)

def __len__(self) -> int:
"""Returns the number of rows in the dataframe. Same as ``df.count``."""
return self.count
Expand Down
16 changes: 16 additions & 0 deletions apis/python/src/tiledbsoma/_tdb_handles.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,10 @@ def tiledbsoma_upgrade_shape(self, newshape: Sequence[Union[int, None]]) -> None
"""Not implemented for DataFrame."""
raise NotImplementedError

def resize_soma_joinid(self, newshape: int) -> None:
"""Only implemented for DataFrame."""
raise NotImplementedError


class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]):
"""Wrapper around a Pybind11 SOMADataFrame handle."""
Expand Down Expand Up @@ -496,6 +500,18 @@ def tiledbsoma_has_upgraded_domain(self) -> bool:
"""
return cast(bool, self._handle.tiledbsoma_has_upgraded_domain)

def resize_soma_joinid(self, newshape: int) -> None:
"""Increases the shape of the dataframe on the ``soma_joinid`` index
column, if it indeed is an index column, leaving all other index columns
as-is. If the ``soma_joinid`` is not an index column, no change is made.
This is a special case of ``upgrade_domain`` (WIP for 1.15), but simpler
to keystroke, and handles the most common case for dataframe domain
expansion. Raises an error if the dataframe doesn't already have a
domain: in that case please call ``tiledbsoma_upgrade_domain`` (WIP for
1.15).
"""
self._handle.resize_soma_joinid(newshape)


class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]):
"""Wrapper around a Pybind11 DenseNDArrayWrapper handle."""
Expand Down
13 changes: 12 additions & 1 deletion apis/python/src/tiledbsoma/soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,17 @@ void load_soma_dataframe(py::module& m) {
"maybe_soma_joinid_maxshape",
&SOMADataFrame::maybe_soma_joinid_maxshape)
.def_property_readonly(
"tiledbsoma_has_upgraded_domain", &SOMAArray::has_current_domain);
"tiledbsoma_has_upgraded_domain", &SOMAArray::has_current_domain)

.def(
"resize_soma_joinid",
[](SOMADataFrame& sdf, int64_t newshape) {
try {
sdf.resize_soma_joinid(newshape);
} catch (const std::exception& e) {
throw TileDBSOMAError(e.what());
}
},
"newshape"_a);
}
} // namespace libtiledbsomacpp
60 changes: 51 additions & 9 deletions apis/python/tests/test_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def test_sparse_nd_array_basics(
table = pa.Table.from_pydict(dikt)
snda.write(table)

# Test reasonable resize
# Test resize
new_shape = tuple([arg_shape[i] + 50 for i in range(ndim)])
with tiledbsoma.SparseNDArray.open(uri, "w") as snda:
snda.resize(new_shape)
Expand Down Expand Up @@ -216,14 +216,14 @@ def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names):
]
)

data = pa.Table.from_pydict(
{
"soma_joinid": [0, 1, 2, 3],
"mystring": ["a", "b", "a", "b"],
"myint": [20, 30, 40, 50],
"myfloat": [1.0, 2.5, 4.0, 5.5],
}
)
data_dict = {
"soma_joinid": [0, 1, 2, 3],
"mystring": ["a", "b", "a", "b"],
"myint": [20, 30, 40, 50],
"myfloat": [1.0, 2.5, 4.0, 5.5],
}

data = pa.Table.from_pydict(data_dict)

domain_slots = {
"soma_joinid": soma_joinid_domain,
Expand All @@ -232,6 +232,8 @@ def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names):
"myfloat": (-999.5, 999.5),
}

has_soma_joinid_dim = "soma_joinid" in index_column_names

domain = tuple([domain_slots[name] for name in index_column_names])

soma_joinid_coords = data["soma_joinid"]
Expand Down Expand Up @@ -265,3 +267,43 @@ def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names):
assert sdf._maybe_soma_joinid_maxshape is None

assert len(sdf.non_empty_domain()) == len(index_column_names)

# This may be None if soma_joinid is not an index column
shape_at_create = sdf._maybe_soma_joinid_shape

if tiledbsoma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED:

# Test resize down
new_shape = 0
with tiledbsoma.DataFrame.open(uri, "w") as sdf:
if has_soma_joinid_dim:
# TODO: check draft spec
# with pytest.raises(ValueError):
with pytest.raises(tiledbsoma.SOMAError):
sdf.resize_soma_joinid(new_shape)
else:
sdf.resize_soma_joinid(new_shape)

with tiledbsoma.DataFrame.open(uri) as sdf:
assert sdf._maybe_soma_joinid_shape == shape_at_create

# Test writes out of bounds, before resize
offset = shape_at_create if has_soma_joinid_dim else 100
data_dict["soma_joinid"] = [e + offset for e in data_dict["soma_joinid"]]
data = pa.Table.from_pydict(data_dict)

with tiledbsoma.DataFrame.open(uri, "w") as sdf:
if has_soma_joinid_dim:
with pytest.raises(tiledbsoma.SOMAError):
sdf.write(data)
else:
sdf.write(data)

# Test resize
new_shape = 0 if shape_at_create is None else shape_at_create + 100
with tiledbsoma.DataFrame.open(uri, "w") as sdf:
sdf.resize_soma_joinid(new_shape)

# Test writes out of old bounds, within new bounds, after resize
with tiledbsoma.DataFrame.open(uri, "w") as sdf:
sdf.write(data)
4 changes: 4 additions & 0 deletions apis/r/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ resize <- function(uri, new_shape, ctxxp) {
invisible(.Call(`_tiledbsoma_resize`, uri, new_shape, ctxxp))
}

resize_soma_joinid <- function(uri, new_shape, ctxxp) {
invisible(.Call(`_tiledbsoma_resize_soma_joinid`, uri, new_shape, ctxxp))
}

tiledbsoma_upgrade_shape <- function(uri, new_shape, ctxxp) {
invisible(.Call(`_tiledbsoma_tiledbsoma_upgrade_shape`, uri, new_shape, ctxxp))
}
Expand Down
20 changes: 20 additions & 0 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,26 @@ SOMADataFrame <- R6::R6Class(
#' @return Logical
tiledbsoma_has_upgraded_domain = function() {
has_current_domain(self$uri, private$.soma_context)
},

#' @description Increases the shape of the dataframe on the ``soma_joinid``
#' index column, if it indeed is an index column, leaving all other index
#' columns as-is. If the ``soma_joinid`` is not an index column, no change is
#' made. This is a special case of ``upgrade_domain`` (WIP for 1.15), but
#' simpler to keystroke, and handles the most common case for dataframe
#' domain expansion. Raises an error if the dataframe doesn't already have a
#' domain: in that case please call ``tiledbsoma_upgrade_domain`` (WIP for
#' 1.15).
#' @param new_shape An integer, greater than or equal to 1 + the
#' `soma_joinid` domain slot.
#' @return No return value
resize_soma_joinid = function(new_shape) {

stopifnot("'new_shape' must be an integer" = rlang::is_integerish(new_shape, n = 1) ||
(bit64::is.integer64(new_shape) && length(new_shape) == 1)
)
# Checking slotwise new shape >= old shape, and <= max_shape, is already done in libtiledbsoma
invisible(resize_soma_joinid(self$uri, new_shape, private$.soma_context))
}

),
Expand Down
13 changes: 13 additions & 0 deletions apis/r/src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,18 @@ BEGIN_RCPP
return R_NilValue;
END_RCPP
}
// resize_soma_joinid
void resize_soma_joinid(const std::string& uri, Rcpp::NumericVector new_shape, Rcpp::XPtr<somactx_wrap_t> ctxxp);
RcppExport SEXP _tiledbsoma_resize_soma_joinid(SEXP uriSEXP, SEXP new_shapeSEXP, SEXP ctxxpSEXP) {
BEGIN_RCPP
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP);
Rcpp::traits::input_parameter< Rcpp::NumericVector >::type new_shape(new_shapeSEXP);
Rcpp::traits::input_parameter< Rcpp::XPtr<somactx_wrap_t> >::type ctxxp(ctxxpSEXP);
resize_soma_joinid(uri, new_shape, ctxxp);
return R_NilValue;
END_RCPP
}
// tiledbsoma_upgrade_shape
void tiledbsoma_upgrade_shape(const std::string& uri, Rcpp::NumericVector new_shape, Rcpp::XPtr<somactx_wrap_t> ctxxp);
RcppExport SEXP _tiledbsoma_tiledbsoma_upgrade_shape(SEXP uriSEXP, SEXP new_shapeSEXP, SEXP ctxxpSEXP) {
Expand Down Expand Up @@ -708,6 +720,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_tiledbsoma_ndim", (DL_FUNC) &_tiledbsoma_ndim, 2},
{"_tiledbsoma_c_dimnames", (DL_FUNC) &_tiledbsoma_c_dimnames, 2},
{"_tiledbsoma_resize", (DL_FUNC) &_tiledbsoma_resize, 3},
{"_tiledbsoma_resize_soma_joinid", (DL_FUNC) &_tiledbsoma_resize_soma_joinid, 3},
{"_tiledbsoma_tiledbsoma_upgrade_shape", (DL_FUNC) &_tiledbsoma_tiledbsoma_upgrade_shape, 3},
{"_tiledbsoma_sr_setup", (DL_FUNC) &_tiledbsoma_sr_setup, 10},
{"_tiledbsoma_sr_complete", (DL_FUNC) &_tiledbsoma_sr_complete, 1},
Expand Down
12 changes: 12 additions & 0 deletions apis/r/src/rinterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,18 @@ void resize(
sr->close();
}

// [[Rcpp::export]]
void resize_soma_joinid(
const std::string& uri,
Rcpp::NumericVector new_shape,
Rcpp::XPtr<somactx_wrap_t> ctxxp) {
// This function is solely for SOMADataFrame.
auto sr = tdbs::SOMADataFrame::open(uri, OpenMode::write, ctxxp->ctxptr);
std::vector<int64_t> new_shape_i64 = i64_from_rcpp_numeric(new_shape);
sr->resize_soma_joinid(new_shape_i64[0]);
sr->close();
}

// [[Rcpp::export]]
void tiledbsoma_upgrade_shape(
const std::string& uri,
Expand Down
75 changes: 69 additions & 6 deletions apis/r/tests/testthat/test-shape.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ test_that("SOMADataFrame shape", {
)

domain_at_create_choices = list(
list(soma_joinid = c(0, 1000)),
list(soma_joinid = c(0, 1000), int_column = c(-10000, 10000)),
list(soma_joinid = c(0, 1000), string_column = NULL),
list(soma_joinid = c(0, 999)),
list(soma_joinid = c(0, 999), int_column = c(-10000, 10000)),
list(soma_joinid = c(0, 999), string_column = NULL),
list(string_column = NULL, int_column = c(-10000, 10000)),
list(string_column = c("apple", "zebra"), int_column = c(-10000, 10000))
)
Expand Down Expand Up @@ -186,8 +186,8 @@ test_that("SOMADataFrame shape", {
expect_equal(int_mxd, int_dfc)
}
} else {
expect_true(int_mxd[[1]] < -2000000000)
expect_true(int_mxd[[2]] > 2000000000)
expect_true(int_mxd[[1]] < -2000000000)
expect_true(int_mxd[[2]] > 2000000000)
}
}

Expand All @@ -205,7 +205,7 @@ test_that("SOMADataFrame shape", {
expect_equal(str_dom, c("", ""))
} else {
if (is.null(str_dfc)) {
expect_equal(str_dom, c("", ""))
expect_equal(str_dom, c("", ""))
} else {
expect_equal(str_dom, str_dfc)
}
Expand All @@ -216,6 +216,69 @@ test_that("SOMADataFrame shape", {

sdf$close()

# Test resize for dataframes (more general upgrade_domain to be tested
# separately -- see https://github.com/single-cell-data/TileDB-SOMA/issues/2407)
if (.new_shape_feature_flag_is_enabled() && use_domain_at_create) {
has_soma_joinid_dim <- "soma_joinid" %in% index_column_names
sjid_dfc <- domain_for_create[["soma_joinid"]]

# Test resize down
new_shape <- 0
sdf <- SOMADataFrameOpen(uri, "WRITE")
if (has_soma_joinid_dim) {
# It's an error to downsize
expect_error(sdf$resize_soma_joinid(new_shape))
} else {
# There is no problem when soma_joinid is not a dim --
# sdf$resize_soma_joinid is a no-op in that case
expect_no_condition(sdf$resize_soma_joinid(new_shape))
}
sdf$close()

# Make sure the failed resize really didn't change the shape
if (has_soma_joinid_dim) {
sdf <- SOMADataFrameOpen(uri, "READ")
expect_equal(sdf$domain()[["soma_joinid"]], sjid_dfc)
sdf$close()
}

# Test writes out of bounds, before resize
old_shape <- 100
if (has_soma_joinid_dim) {
old_shape <- domain_for_create[["soma_joinid"]][[2]] + 1 + 100
}
new_shape <- old_shape + 100

tbl1 <- arrow::arrow_table(
int_column = 5L:8L,
soma_joinid = (old_shape+1L):(old_shape+4L),
float_column = 5.1:8.1,
string_column = c("egg", "flag", "geese", "hay"),
schema = asch)

sdf <- SOMADataFrameOpen(uri, "WRITE")
if (has_soma_joinid_dim) {
expect_error(sdf$write(tbl1))
} else {
expect_no_condition(sdf$write(tbl1))
}
sdf$close()

# Test resize
sdf <- SOMADataFrameOpen(uri, "WRITE")
sdf$resize_soma_joinid(new_shape)
sdf$close();

# Test writes out of old bounds, within new bounds, after resize
sdf <- SOMADataFrameOpen(uri, "WRITE")
expect_no_condition(sdf$write(tbl1))
sdf$close();

# To do: test readback

rm(tbl1)
}

rm(sdf, tbl0)

gc()
Expand Down
10 changes: 2 additions & 8 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1471,7 +1471,7 @@ void SOMAArray::_set_current_domain_from_shape(
schema_evolution.array_evolve(uri_);
}

void SOMAArray::maybe_resize_soma_joinid(const std::vector<int64_t>& newshape) {
void SOMAArray::resize_soma_joinid(int64_t newshape) {
if (mq_->query_type() != TILEDB_WRITE) {
throw TileDBSOMAError(
"[SOMAArray::resize] array must be opened in write mode");
Expand All @@ -1480,12 +1480,6 @@ void SOMAArray::maybe_resize_soma_joinid(const std::vector<int64_t>& newshape) {
ArraySchema schema = arr_->schema();
Domain domain = schema.domain();
unsigned ndim = domain.ndim();
if (newshape.size() != 1) {
throw TileDBSOMAError(fmt::format(
"[SOMAArray::resize]: newshape has dimension count {}; needed 1",
newshape.size(),
ndim));
}

auto tctx = ctx_->tiledb_ctx();
CurrentDomain old_current_domain = ArraySchemaExperimental::current_domain(
Expand All @@ -1498,7 +1492,7 @@ void SOMAArray::maybe_resize_soma_joinid(const std::vector<int64_t>& newshape) {
for (unsigned i = 0; i < ndim; i++) {
if (domain.dimension(i).name() == "soma_joinid") {
ndrect.set_range<int64_t>(
domain.dimension(i).name(), 0, newshape[0] - 1);
domain.dimension(i).name(), 0, newshape - 1);
}
}

Expand Down
2 changes: 1 addition & 1 deletion libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -1066,7 +1066,7 @@ class SOMAArray : public SOMAObject {
* @return Throws if the requested shape exceeds the array's create-time
* maxshape. Throws if the array does not have current-domain support.
*/
void maybe_resize_soma_joinid(const std::vector<int64_t>& newshape);
void resize_soma_joinid(int64_t newshape);

protected:
// These two are for use nominally by SOMADataFrame. This could be moved in
Expand Down
Loading

0 comments on commit 8e9be91

Please sign in to comment.