Skip to content

Commit

Permalink
unit-test coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Oct 24, 2024
1 parent 9ea23eb commit 7787ee8
Show file tree
Hide file tree
Showing 2 changed files with 151 additions and 3 deletions.
35 changes: 32 additions & 3 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,18 @@
"""
Implementation of a SOMA DataFrame
"""
from typing import Any, List, Optional, Sequence, Tuple, Type, Union, cast
from typing import (
Any,
Dict,
Iterable,
List,
Optional,
Sequence,
Tuple,
Type,
Union,
cast,
)

import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -485,6 +496,11 @@ def _upgrade_or_change_domain_helper(
f"{function_name_for_messages}: requested domain has length {len(dim_names)} but the dataframe's schema has index-column count {len(newdomain)}"
)

if any([len(slot) != 2 for slot in newdomain]): # type: ignore
raise ValueError(
f"{function_name_for_messages}: requested domain must have low,high pairs in each slot"
)

# From the dataframe's schema, extract the subschema for only index columns (TileDB dimensions).
full_schema = self.schema
dim_schema_list = []
Expand All @@ -493,9 +509,22 @@ def _upgrade_or_change_domain_helper(
dim_schema = pa.schema(dim_schema_list)

# Convert the user's tuple of low/high pairs into a dict keyed by index-column name.
new_domain_dict = {}
new_domain_dict: Dict[str, Iterable[Any]] = {}
for i, dim_name in enumerate(dim_names):
new_domain_dict[dim_name] = newdomain[i]
# The domain can't be specified for string/binary index columns (a core constraint), so let callers simply pass None for that slot; it is replaced with the empty-string pair ("", "").
if (
dim_schema.field(dim_name).type
in [
pa.string(),
pa.large_string(),
pa.binary(),
pa.large_binary(),
]
and newdomain[i] is None
):
new_domain_dict[dim_name] = ("", "")
else:
new_domain_dict[dim_name] = tuple(newdomain[i]) # type: ignore

# Return this as a pyarrow table. This has n columns where n is the number of
# index columns, and two rows: one row for the low values and one for the high values.
Expand Down
119 changes: 119 additions & 0 deletions apis/python/tests/test_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,103 @@ def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names):
sdf.write(data)


def test_domain_mods(tmp_path):
    """Check ``DataFrame.change_domain`` on a dataframe with four index columns.

    The dataframe is indexed on an int64, a string, an int16, and a float32
    column. The test verifies, using ``check_only=True``, that:

    * re-requesting the current domain is accepted;
    * shrinking any numeric slot is rejected with "downsize is unsupported";
    * giving an explicit domain for the string column is rejected;
    * a genuine expansion is accepted, and after applying it the new domain
      is visible on re-open, with the string slot reported as ``("", "")``.
    """
    if not tiledbsoma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED:
        # Report a skip rather than returning: a bare return would be
        # counted as a pass even though nothing was verified.
        pytest.skip("NEW_SHAPE_FEATURE_FLAG_ENABLED is off")

    uri = tmp_path.as_posix()

    schema = pa.schema(
        [
            ("soma_joinid", pa.int64()),
            ("mystring", pa.string()),
            ("myint", pa.int16()),
            ("myfloat", pa.float32()),
            ("mybool", pa.bool_()),  # not supported as an index type
        ]
    )
    index_column_names = ["soma_joinid", "mystring", "myint", "myfloat"]

    # One slot per index column; None for the string column.
    domain_for_create = [
        [0, 3],
        None,
        [20, 50],
        [0.0, 6.0],
    ]

    data_dict = {
        "soma_joinid": [0, 1, 2, 3],
        "mystring": ["a", "b", "a", "b"],
        "myint": [20, 30, 40, 50],
        "myfloat": [1.0, 2.5, 4.0, 5.5],
        "mybool": [True, False, True, True],
    }

    data = pa.Table.from_pydict(data_dict)

    with tiledbsoma.DataFrame.create(
        uri,
        schema=schema,
        index_column_names=index_column_names,
        domain=domain_for_create,
    ) as sdf:
        sdf.write(data)

    # Check "expand" to same
    with tiledbsoma.DataFrame.open(uri, "w") as sdf:
        newdomain = [[0, 3], None, [20, 50], [0.0, 6.0]]
        ok, msg = sdf.change_domain(newdomain, check_only=True)
        assert ok
        assert msg == ""

    # Shrink: each candidate narrows exactly one numeric slot relative to
    # the domain the dataframe was created with.
    shrunken_domains = [
        [[0, 2], None, [20, 50], [0.0, 6.0]],  # soma_joinid upper bound lowered
        [[0, 3], None, [20, 40], [0.0, 6.0]],  # myint upper bound lowered
        [[0, 3], None, [20, 50], [1.0, 6.0]],  # myfloat lower bound raised
    ]
    for newdomain in shrunken_domains:
        with tiledbsoma.DataFrame.open(uri, "w") as sdf:
            ok, msg = sdf.change_domain(newdomain, check_only=True)
            assert not ok, newdomain
            assert "downsize is unsupported" in msg, newdomain

    # String domain cannot be specified
    with tiledbsoma.DataFrame.open(uri, "w") as sdf:
        newdomain = [
            [0, 3],
            ["a", "z"],
            [20, 50],
            [0.0, 6.0],
        ]
        ok, msg = sdf.change_domain(newdomain, check_only=True)
        assert not ok
        assert "domain cannot be set for string index columns" in msg

    # All clear: dry-run first, then actually apply the expansion.
    with tiledbsoma.DataFrame.open(uri, "w") as sdf:
        newdomain = [[0, 9], None, [0, 100], [-10.0, 10.0]]
        ok, msg = sdf.change_domain(newdomain, check_only=True)
        assert ok
        assert msg == ""
        sdf.change_domain(newdomain)

    # Check for success
    with tiledbsoma.DataFrame.open(uri, "r") as sdf:
        dom = sdf.domain
        assert dom[0] == (0, 9)
        assert dom[1] == ("", "")
        assert dom[2] == (0, 100)
        assert dom[3] == (-10.0, 10.0)


@pytest.mark.parametrize("has_shapes", [False, True])
def test_canned_experiments(tmp_path, has_shapes):
uri = tmp_path.as_posix()
Expand Down Expand Up @@ -435,6 +532,28 @@ def _check_ndarray(ndarray, has_shapes, expected_shape):
assert "[SparseNDArray] ms/RNA/obsp/distances" in body
assert "ms/RNA/obsm/X_draw_graph_fr" in body

# Check upgrade_domain for dataframes
with tiledbsoma.Experiment.open(uri, "w") as exp:

ok, msg = exp.obs.tiledbsoma_upgrade_domain([[10, 4]], check_only=True)
if has_shapes:
assert not ok
assert "dataframe already has a domain" in msg
else:
assert not ok
assert "new lower > new upper" in msg

ok, msg = exp.obs.tiledbsoma_upgrade_domain([[0, 1]], check_only=True)
if has_shapes:
assert not ok
assert "dataframe already has a domain" in msg
else:
assert ok
assert msg == ""

with pytest.raises(ValueError):
exp.obs.tiledbsoma_upgrade_domain([[0, 1, 2]], check_only=True)

# Check dry run of tiledbsoma.io.upgrade_experiment_shapes
handle = io.StringIO()
upgradeable = tiledbsoma.io.upgrade_experiment_shapes(
Expand Down

0 comments on commit 7787ee8

Please sign in to comment.