Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release-1.14] [c++/python] Check arrow_is_string_type in _update_dataframe #3225

Merged
merged 1 commit into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions apis/python/src/tiledbsoma/pytiledbsoma.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ PYBIND11_MODULE(pytiledbsoma, m) {
*ctx->tiledb_ctx(),
attr_name,
ArrowAdapter::to_tiledb_format(attr_type));

if (ArrowAdapter::arrow_is_string_type(attr_type.c_str())) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

FilterList filter_list(*ctx->tiledb_ctx());
filter_list.add_filter(
Filter(*ctx->tiledb_ctx(), TILEDB_FILTER_ZSTD));
Expand Down
34 changes: 34 additions & 0 deletions apis/python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1722,3 +1722,37 @@ def test_only_evolve_schema_when_enmr_is_extended(tmp_path):
# subtract 1 for the __schema/__enumerations directory;
# only looking at fragment files
assert len(vfs.ls(os.path.join(uri, "__schema"))) - 1 == 3


def test_fix_update_dataframe_with_var_strings(tmp_path):
uri = tmp_path.as_posix()

tbl = pa.table(
{
"soma_joinid": pa.array([0, 1, 2, 3], pa.int64()),
"mystring": pa.array(["a", "bb", "ccc", "dddd"], pa.large_utf8()),
"myint": pa.array([33, 44, 55, 66], pa.int32()),
"myfloat": pa.array([4.5, 5.5, 6.5, 7.5], pa.float32()),
}
)

with soma.DataFrame.create(uri, schema=tbl.schema) as sdf:
sdf.write(tbl)

with soma.DataFrame.open(uri, "r") as sdf:
updated_sdf = sdf.read().concat().to_pandas()
updated_sdf["newattr"] = np.array(["a", "b", "c", "d"])

with soma.DataFrame.open(uri, "w") as sdf:
soma.io.ingest._update_dataframe(
sdf,
updated_sdf,
"testing",
platform_config=None,
context=None,
default_index_name="mystring",
)

with soma.DataFrame.open(uri, "r") as sdf:
results = sdf.read().concat().to_pandas()
assert results.equals(updated_sdf)
20 changes: 10 additions & 10 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -763,7 +763,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
auto schild = index_column_schema->children[i];
auto col_name = schild->name;
if (strcmp(child->name, col_name) == 0) {
if (ArrowAdapter::_isvar(child->format)) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
type = TILEDB_STRING_ASCII;
}

Expand Down Expand Up @@ -802,7 +802,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
attr.set_nullable(true);
}

if (ArrowAdapter::_isvar(child->format)) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
attr.set_cell_val_num(TILEDB_VAR_NUM);
}

Expand All @@ -813,7 +813,9 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
*ctx,
child->name,
enmr_type,
ArrowAdapter::_isvar(enmr_format) ? TILEDB_VAR_NUM : 1,
ArrowAdapter::arrow_is_string_type(enmr_format) ?
TILEDB_VAR_NUM :
1,
child->flags & ARROW_FLAG_DICTIONARY_ORDERED);
ArraySchemaExperimental::add_enumeration(*ctx, schema, enmr);
AttributeExperimental::set_enumeration_name(
Expand Down Expand Up @@ -860,7 +862,7 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema(
continue;
}

if (ArrowAdapter::_isvar(child->format)) {
if (ArrowAdapter::arrow_is_string_type(child->format)) {
// In the core API:
//
// * domain for strings must be set as (nullptr, nullptr)
Expand Down Expand Up @@ -1149,12 +1151,10 @@ ArrowAdapter::to_arrow(std::shared_ptr<ColumnBuffer> column) {
return std::pair(std::move(array), std::move(schema));
}

bool ArrowAdapter::_isvar(const char* format) {
if ((strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0)) {
return true;
}
return false;
bool ArrowAdapter::arrow_is_string_type(const char* format) {
return (
(strcmp(format, "U") == 0) || (strcmp(format, "Z") == 0) ||
(strcmp(format, "u") == 0) || (strcmp(format, "z") == 0));
}

std::string_view ArrowAdapter::to_arrow_format(
Expand Down
11 changes: 9 additions & 2 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,15 @@ class ArrowAdapter {
static std::string_view to_arrow_format(
tiledb_datatype_t tiledb_dtype, bool use_large = true);

/**
* @brief Keystroke saver to determine whether Arrow type is of string,
* large string, binary, or large binary type.
*
* @param const char* Arrow data format
* @return bool Whether the Arrow type represents a string type
*/
static bool arrow_is_string_type(const char* format);

/**
* @brief Get TileDB datatype from Arrow format string.
*
Expand Down Expand Up @@ -363,8 +372,6 @@ class ArrowAdapter {
return Dimension::create<T>(*ctx, name, {b[0], b[1]}, b[2]);
}

static bool _isvar(const char* format);

static FilterList _create_filter_list(
std::string filters, std::shared_ptr<Context> ctx);

Expand Down
Loading