diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index fa97811f04..0ac54a0a72 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -155,6 +155,11 @@ PYBIND11_MODULE(pytiledbsoma, m) { *ctx->tiledb_ctx(), attr_name, ArrowAdapter::to_tiledb_format(attr_type)); + + if (ArrowAdapter::arrow_is_string_type(attr_type.c_str())) { + attr.set_cell_val_num(TILEDB_VAR_NUM); + } + FilterList filter_list(*ctx->tiledb_ctx()); filter_list.add_filter( Filter(*ctx->tiledb_ctx(), TILEDB_FILTER_ZSTD)); diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 8807ea2981..30b0861a63 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -1722,3 +1722,37 @@ def test_only_evolve_schema_when_enmr_is_extended(tmp_path): # subtract 1 for the __schema/__enumerations directory; # only looking at fragment files assert len(vfs.ls(os.path.join(uri, "__schema"))) - 1 == 3 + + +def test_fix_update_dataframe_with_var_strings(tmp_path): + uri = tmp_path.as_posix() + + tbl = pa.table( + { + "soma_joinid": pa.array([0, 1, 2, 3], pa.int64()), + "mystring": pa.array(["a", "bb", "ccc", "dddd"], pa.large_utf8()), + "myint": pa.array([33, 44, 55, 66], pa.int32()), + "myfloat": pa.array([4.5, 5.5, 6.5, 7.5], pa.float32()), + } + ) + + with soma.DataFrame.create(uri, schema=tbl.schema) as sdf: + sdf.write(tbl) + + with soma.DataFrame.open(uri, "r") as sdf: + updated_sdf = sdf.read().concat().to_pandas() + updated_sdf["newattr"] = np.array(["a", "b", "c", "d"]) + + with soma.DataFrame.open(uri, "w") as sdf: + soma.io.ingest._update_dataframe( + sdf, + updated_sdf, + "testing", + platform_config=None, + context=None, + default_index_name="mystring", + ) + + with soma.DataFrame.open(uri, "r") as sdf: + results = sdf.read().concat().to_pandas() + assert results.equals(updated_sdf) diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc 
b/libtiledbsoma/src/utils/arrow_adapter.cc index ac2ea971e1..a89dff8fe2 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -822,7 +822,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( auto schild = index_column_schema->children[i]; auto col_name = schild->name; if (strcmp(child->name, col_name) == 0) { - if (ArrowAdapter::_isvar(child->format)) { + if (ArrowAdapter::arrow_is_string_type(child->format)) { type = TILEDB_STRING_ASCII; } @@ -861,7 +861,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( attr.set_nullable(true); } - if (ArrowAdapter::_isvar(child->format)) { + if (ArrowAdapter::arrow_is_string_type(child->format)) { attr.set_cell_val_num(TILEDB_VAR_NUM); } @@ -872,7 +872,9 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( *ctx, child->name, enmr_type, - ArrowAdapter::_isvar(enmr_format) ? TILEDB_VAR_NUM : 1, + ArrowAdapter::arrow_is_string_type(enmr_format) ? + TILEDB_VAR_NUM : + 1, child->flags & ARROW_FLAG_DICTIONARY_ORDERED); ArraySchemaExperimental::add_enumeration(*ctx, schema, enmr); AttributeExperimental::set_enumeration_name( @@ -919,7 +921,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( continue; } - if (ArrowAdapter::_isvar(child->format)) { + if (ArrowAdapter::arrow_is_string_type(child->format)) { // In the core API: // // * domain for strings must be set as (nullptr, nullptr) @@ -1241,12 +1243,10 @@ ArrowAdapter::to_arrow(std::shared_ptr column) { return std::pair(std::move(array), std::move(schema)); } -bool ArrowAdapter::_isvar(const char* format) { - if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) || - (strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) { - return true; - } - return false; +bool ArrowAdapter::arrow_is_string_type(const char* format) { + /* Arrow formats: u/U are (large) utf8, z/Z are (large) binary. */ + const std::string_view fmt{format}; + return fmt == "u" || fmt == "U" || fmt == "z" || fmt == "Z"; } std::string_view 
ArrowAdapter::to_arrow_format( diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 1cac805cc7..41af7c1c56 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -260,6 +260,15 @@ class ArrowAdapter { static std::string_view to_arrow_format( tiledb_datatype_t tiledb_dtype, bool use_large = true); + /** + * @brief Determine whether an Arrow format string denotes a string, + * large string, binary, or large binary type. + * + * @param format Arrow data format string + * @return bool True when the format is one of those four types + */ + static bool arrow_is_string_type(const char* format); + /** * @brief Get TileDB datatype from Arrow format string. * @@ -673,8 +682,6 @@ class ArrowAdapter { return Dimension::create(*ctx, name, {b[0], b[1]}, b[2]); } - static bool _isvar(const char* format); - static FilterList _create_filter_list( std::string filters, std::shared_ptr ctx);