Skip to content

Commit

Permalink
[python] Append-mode enum bit-width mods (#2455) (#2512)
Browse files Browse the repository at this point in the history
* [python] Append-mode enum bit-width mods [WIP]

* lint

* Fix remaining issue

* unit-test updates

* complete the unit-test mod

Co-authored-by: John Kerl <kerl.john.r@gmail.com>
  • Loading branch information
github-actions[bot] and johnkerl authored May 6, 2024
1 parent 82f1468 commit 69cb851
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 2 deletions.
23 changes: 23 additions & 0 deletions apis/python/src/tiledbsoma/_arrow_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,4 +236,27 @@ def df_to_arrow(df: pd.DataFrame) -> pa.Table:
md.update(dict.fromkeys(null_fields, "nullable"))
arrow_table = arrow_table.replace_schema_metadata(md)

# For tiledbsoma.io (for which this method exists) _any_ dataset can be appended to
# later on. This means that on fresh ingest we must use a larger bit-width than
# the bare minimum necessary.
new_map = {}
for field in arrow_table.schema:
if pa.types.is_dictionary(field.type):
old_index_type = field.type.index_type
new_index_type = (
pa.int32()
if old_index_type in [pa.int8(), pa.int16()]
else old_index_type
)
new_map[field.name] = pa.dictionary(
new_index_type,
field.type.value_type,
field.type.ordered,
)
else:
new_map[field.name] = field.type
new_schema = pa.schema(new_map, metadata=arrow_table.schema.metadata)

arrow_table = pa.Table.from_pandas(df, schema=new_schema)

return arrow_table
82 changes: 81 additions & 1 deletion apis/python/tests/test_registration_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def _create_anndata(
X_value_base: int,
measurement_name: str,
raw_var_ids: Optional[Sequence[str]] = None,
X_density: float = 0.3,
):
n_obs = len(obs_ids)
n_var = len(var_ids)
Expand Down Expand Up @@ -1217,3 +1216,84 @@ def test_append_with_nonunique_field_values(
obs_field_name=obs_field_name,
var_field_name=var_field_name,
)


@pytest.mark.parametrize("all_at_once", [False, True])
@pytest.mark.parametrize("nobs_a", [50, 300])
@pytest.mark.parametrize("nobs_b", [60, 400])
def test_enum_bit_width_append(tmp_path, all_at_once, nobs_a, nobs_b):
    """Check that categorical obs columns of differing cardinality are appendable.

    Two AnnData objects are built, each with an ``enum`` categorical column
    having one category per obs row. Depending on ``nobs_a``/``nobs_b`` the
    category count is small enough that the dictionary index could naively be
    inferred as int8, or large enough that it could be inferred as int16.
    The second dataset is appended to the first, and the obs IDs are read
    back to confirm both ingests landed, whichever size was written first.

    With ``all_at_once`` both inputs are registered up front against no
    existing experiment; otherwise the first is ingested unregistered and the
    second is registered against the resulting experiment.
    """
    obs_field_name = "cell_id"
    var_field_name = "gene_id"
    measurement_name = "meas"
    var_ids = ["W", "X", "Y", "Z"]

    obs_ids_a = ["a_%08d" % i for i in range(nobs_a)]
    obs_ids_b = ["b_%08d" % i for i in range(nobs_b)]

    # Keyword arguments shared by both AnnData constructions.
    anndata_kwargs = dict(
        var_ids=var_ids,
        obs_field_name=obs_field_name,
        var_field_name=var_field_name,
        measurement_name=measurement_name,
    )
    adata = _create_anndata(obs_ids=obs_ids_a, X_value_base=0, **anndata_kwargs)
    bdata = _create_anndata(obs_ids=obs_ids_b, X_value_base=100, **anndata_kwargs)

    # One category per row, so the category count tracks nobs_a / nobs_b.
    adata.obs["enum"] = pd.Categorical(obs_ids_a, categories=obs_ids_a)
    bdata.obs["enum"] = pd.Categorical(obs_ids_b, categories=obs_ids_b)

    soma_uri = tmp_path.as_posix()

    # Keyword arguments shared by both registration flavours.
    registration_kwargs = dict(
        measurement_name=measurement_name,
        obs_field_name=obs_field_name,
        var_field_name=var_field_name,
    )

    if all_at_once:
        # Register both inputs before anything has been written.
        rd = tiledbsoma.io.register_anndatas(
            None, [adata, bdata], **registration_kwargs
        )
        tiledbsoma.io.from_anndata(
            soma_uri, adata, measurement_name=measurement_name, registration_mapping=rd
        )
    else:
        # Plain ingest of the first input, then register the second against it.
        tiledbsoma.io.from_anndata(soma_uri, adata, measurement_name=measurement_name)
        rd = tiledbsoma.io.register_anndatas(
            soma_uri, [bdata], **registration_kwargs
        )

    # In both modes the second input is appended using the registration mapping.
    tiledbsoma.io.from_anndata(
        soma_uri, bdata, measurement_name=measurement_name, registration_mapping=rd
    )

    with tiledbsoma.Experiment.open(soma_uri) as exp:
        obs = exp.obs.read().concat()

    cell_ids = obs[obs_field_name].to_pylist()

    assert cell_ids[:nobs_a] == obs_ids_a
    assert cell_ids[nobs_a:] == obs_ids_b
3 changes: 2 additions & 1 deletion apis/python/tests/test_update_dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,9 @@ def test_add(exp_path, new_obs, new_var):

assert o2.field("is_g1").type == pa.bool_()
assert o2.field("seq").type == pa.int32()
# tiledbsoma.io upgrades int8 and int16 to int32 for appendability
assert o2.field("parity").type == pa.dictionary(
index_type=pa.int8(), value_type=pa.string(), ordered=False
index_type=pa.int32(), value_type=pa.string(), ordered=False
)
assert obs["parity"][0] == "even"
assert obs["parity"][1] == "odd"
Expand Down

0 comments on commit 69cb851

Please sign in to comment.