Skip to content

Commit

Permalink
[python] Min-sizing for dataframes/arrays
Browse files (browse the repository at this point in the history)
  • Loading branch information
johnkerl committed Oct 18, 2024
1 parent 58ca2d0 commit cd9ffee
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 40 deletions.
104 changes: 68 additions & 36 deletions apis/python/src/tiledbsoma/_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,10 +267,10 @@ def create(
)

(slot_core_current_domain, saturated_cd) = _fill_out_slot_soma_domain(
slot_soma_domain, index_column_name, pa_field.type, dtype
slot_soma_domain, False, index_column_name, pa_field.type, dtype
)
(slot_core_max_domain, saturated_md) = _fill_out_slot_soma_domain(
None, index_column_name, pa_field.type, dtype
None, True, index_column_name, pa_field.type, dtype
)

extent = _find_extent_for_domain(
Expand Down Expand Up @@ -824,6 +824,7 @@ def _canonicalize_schema(

def _fill_out_slot_soma_domain(
slot_domain: AxisDomain,
is_max_domain: bool,
index_column_name: str,
pa_type: pa.DataType,
dtype: Any,
Expand Down Expand Up @@ -873,47 +874,78 @@ def _fill_out_slot_soma_domain(
# will (and must) ignore these when creating the TileDB schema.
slot_domain = "", ""
elif np.issubdtype(dtype, NPInteger):
iinfo = np.iinfo(cast(NPInteger, dtype))
slot_domain = iinfo.min, iinfo.max - 1
# Here the slot_domain isn't specified by the user; we're setting it.
# The SOMA spec disallows negative soma_joinid.
if index_column_name == SOMA_JOINID:
slot_domain = (0, 2**63 - 2)
saturated_range = True
if is_max_domain or not NEW_SHAPE_FEATURE_FLAG_ENABLED:
# Core max domain is immutable. If unspecified, it should be as big
# as possible since it can never be resized.
iinfo = np.iinfo(cast(NPInteger, dtype))
slot_domain = iinfo.min, iinfo.max - 1
# Here the slot_domain isn't specified by the user; we're setting it.
# The SOMA spec disallows negative soma_joinid.
if index_column_name == SOMA_JOINID:
slot_domain = (0, 2**63 - 2)
saturated_range = True
else:
# Core current domain is mutable but not shrinkable. If
# unspecified, it should be as small as possible since it can only
# be grown, not shrunk.
#
# Core current-domain semantics are (lo, hi) with both inclusive,
# with lo <= hi. This means the smallest possible domain is (0, 0),
# which has shape 1, not 0.
slot_domain = 0, 0
elif np.issubdtype(dtype, NPFloating):
finfo = np.finfo(cast(NPFloating, dtype))
slot_domain = finfo.min, finfo.max
saturated_range = True
if is_max_domain or not NEW_SHAPE_FEATURE_FLAG_ENABLED:
finfo = np.finfo(cast(NPFloating, dtype))
slot_domain = finfo.min, finfo.max
saturated_range = True
else:
slot_domain = 0.0, 0.0

# The `iinfo.min+1` is necessary as of tiledb core 2.15 / tiledb-py 0.21.1 since
# `iinfo.min` maps to `NaT` (not a time), resulting in
# TypeError: invalid domain extent, domain cannot be safely cast to dtype dtype('<M8[s]')
# The `iinfo.min+1` is necessary as of tiledb core 2.15 / tiledb-py 0.21.1
# since `iinfo.min` maps to `NaT` (not a time), resulting in
#
# TypeError: invalid domain extent, domain cannot be safely cast to
# dtype dtype('<M8[s]')
#
# The `iinfo.max-delta` is necessary since with iinfo.min being bumped by 1, without subtracting
# we would get
# tiledb.cc.TileDBError: [TileDB::Dimension] Error: Tile extent check failed; domain max
# expanded to multiple of tile extent exceeds max value representable by domain type. Reduce
# domain max by 1 tile extent to allow for expansion.
# The `iinfo.max-delta` is necessary because, with iinfo.min bumped by 1,
# omitting the subtraction would give
#
# tiledb.cc.TileDBError: [TileDB::Dimension] Error: Tile extent check
# failed; domain max expanded to multiple of tile extent exceeds max
# value representable by domain type. Reduce domain max by 1 tile extent
# to allow for expansion.
elif dtype == "datetime64[s]":
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "s"), np.datetime64(
iinfo.max - 1000000, "s"
)
if is_max_domain or not NEW_SHAPE_FEATURE_FLAG_ENABLED:
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "s"), np.datetime64(
iinfo.max - 1000000, "s"
)
else:
slot_domain = np.datetime64(0, "s"), np.datetime64(0, "s")

Check warning on line 924 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L924

Added line #L924 was not covered by tests
elif dtype == "datetime64[ms]":
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "ms"), np.datetime64(
iinfo.max - 1000000, "ms"
)
if is_max_domain or not NEW_SHAPE_FEATURE_FLAG_ENABLED:
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "ms"), np.datetime64(
iinfo.max - 1000000, "ms"
)
else:
slot_domain = np.datetime64(0, "ms"), np.datetime64(0, "ms")

Check warning on line 932 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L932

Added line #L932 was not covered by tests
elif dtype == "datetime64[us]":
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "us"), np.datetime64(
iinfo.max - 1000000, "us"
)
if is_max_domain or not NEW_SHAPE_FEATURE_FLAG_ENABLED:
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "us"), np.datetime64(
iinfo.max - 1000000, "us"
)
else:
slot_domain = np.datetime64(0, "us"), np.datetime64(0, "us")

Check warning on line 940 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L940

Added line #L940 was not covered by tests
elif dtype == "datetime64[ns]":
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "ns"), np.datetime64(
iinfo.max - 1000000, "ns"
)
if is_max_domain or not NEW_SHAPE_FEATURE_FLAG_ENABLED:
iinfo = np.iinfo(cast(NPInteger, np.int64))
slot_domain = np.datetime64(iinfo.min + 1, "ns"), np.datetime64(
iinfo.max - 1000000, "ns"
)
else:
slot_domain = np.datetime64(0, "ns"), np.datetime64(0, "ns")

Check warning on line 948 in apis/python/src/tiledbsoma/_dataframe.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dataframe.py#L948

Added line #L948 was not covered by tests

else:
raise TypeError(f"Unsupported dtype {dtype}")
Expand Down
5 changes: 4 additions & 1 deletion apis/python/src/tiledbsoma/_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,10 @@ def create(
if dim_shape == 0:
raise ValueError("DenseNDArray shape slots must be at least 1")
if dim_shape is None:
dim_shape = dim_capacity
# Core current-domain semantics are (lo, hi) with both
# inclusive, with lo <= hi. This means the smallest possible
# domain is (0, 0), which has shape 1, not 0.
dim_shape = 1

Check warning on line 128 in apis/python/src/tiledbsoma/_dense_nd_array.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_dense_nd_array.py#L128

Added line #L128 was not covered by tests

index_column_data[pa_field.name] = [
0,
Expand Down
4 changes: 2 additions & 2 deletions apis/python/src/tiledbsoma/_point_cloud_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,10 @@ def create(
)

(slot_core_current_domain, saturated_cd) = _fill_out_slot_soma_domain(
slot_soma_domain, index_column_name, pa_field.type, dtype
slot_soma_domain, False, index_column_name, pa_field.type, dtype
)
(slot_core_max_domain, saturated_md) = _fill_out_slot_soma_domain(
None, index_column_name, pa_field.type, dtype
None, True, index_column_name, pa_field.type, dtype
)

extent = _find_extent_for_domain(
Expand Down
5 changes: 4 additions & 1 deletion apis/python/src/tiledbsoma/_sparse_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,10 @@ def create(
if dim_shape == 0:
raise ValueError("SparseNDArray shape slots must be at least 1")
if dim_shape is None:
dim_shape = dim_capacity
# Core current-domain semantics are (lo, hi) with both
# inclusive, with lo <= hi. This means the smallest possible
# domain is (0, 0), which has shape 1, not 0.
dim_shape = 1

Check warning on line 182 in apis/python/src/tiledbsoma/_sparse_nd_array.py

View check run for this annotation

Codecov / codecov/patch

apis/python/src/tiledbsoma/_sparse_nd_array.py#L182

Added line #L182 was not covered by tests

index_column_data[pa_field.name] = [
0,
Expand Down

0 comments on commit cd9ffee

Please sign in to comment.