
[python] Append-mode pre-check logic #1554

Merged
merged 14 commits on Jul 31, 2023
30 changes: 30 additions & 0 deletions apis/python/src/tiledbsoma/_arrow_types.py
@@ -30,6 +30,7 @@

import numpy as np
import numpy.typing as npt
import pandas as pd
import pyarrow as pa
import tiledb

@@ -159,3 +160,32 @@ def tiledb_schema_to_arrow(tdb_schema: tiledb.ArraySchema) -> pa.Schema:
arrow_schema_dict[name] = arrow_type_from_tiledb_dtype(attr.dtype, attr.isascii)

return pa.schema(arrow_schema_dict)


def df_to_arrow(df: pd.DataFrame) -> pa.Table:
"""
Categoricals are not yet well supported, so we flatten them to their categories' dtype.
We also replace NumPy/Pandas-style nulls with Arrow-style nulls.
"""
null_fields = set()
# Note: not `for name, col in df.items()`, since we need `df[k]` on the left-hand side of assignments below.
for k in df:
if df[k].dtype == "category":
df[k] = df[k].astype(df[k].cat.categories.dtype)
if df[k].isnull().any():
if df[k].isnull().all():
df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
else:
df[k].where(
df[k].notnull(),
pd.Series(pa.nulls(df[k].isnull().sum(), pa.infer_type(df[k]))),
inplace=True,
)
null_fields.add(k)
arrow_table = pa.Table.from_pandas(df)
if null_fields:
md = arrow_table.schema.metadata
md.update(dict.fromkeys(null_fields, "nullable"))
arrow_table = arrow_table.replace_schema_metadata(md)

return arrow_table
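
For context, here is a minimal usage sketch of the new helper (not part of the PR; the sample DataFrame and column names are hypothetical, and the exact null-replacement behavior may vary with the pandas/pyarrow versions in use):

import numpy as np
import pandas as pd

from tiledbsoma._arrow_types import df_to_arrow

# Hypothetical input: one categorical column and one partially-null column.
df = pd.DataFrame(
    {
        "soma_joinid": np.arange(3, dtype=np.int64),
        "cell_type": pd.Categorical(["B", "T", "B"]),  # flattened to its categories' dtype
        "score": [1.0, None, 2.5],                     # nulls become Arrow-style nulls
    }
)

tbl = df_to_arrow(df)

# "cell_type" should come through as a plain string column rather than
# dictionary-encoded, and "score" should be tagged "nullable" in the schema metadata.
print(tbl.schema)
print(tbl.schema.metadata)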
31 changes: 2 additions & 29 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -46,6 +46,7 @@
eta,
logging,
)
from .._arrow_types import df_to_arrow
from .._collection import AnyTileDBCollection
from .._common_nd_array import NDArray
from .._constants import SOMA_JOINID
@@ -641,34 +642,6 @@ def _create_or_open_coll(
return cls.create(uri, context=context)


def _df_to_arrow(df: pd.DataFrame) -> pa.Table:
"""
Categoricals are not yet well supported, so we must flatten.
Also replace Numpy/Pandas-style nulls with Arrow-style nulls.
"""
null_fields = set()
for k in df:
if df[k].dtype == "category":
df[k] = df[k].astype(df[k].cat.categories.dtype)
if df[k].isnull().any():
if df[k].isnull().all():
df[k] = pa.nulls(df.shape[0], pa.infer_type(df[k]))
else:
df[k].where(
df[k].notnull(),
pd.Series(pa.nulls(df[k].isnull().sum(), pa.infer_type(df[k]))),
inplace=True,
)
null_fields.add(k)
arrow_table = pa.Table.from_pandas(df)
if null_fields:
md = arrow_table.schema.metadata
md.update(dict.fromkeys(null_fields, "nullable"))
arrow_table = arrow_table.replace_schema_metadata(md)

return arrow_table


def _write_dataframe(
df_uri: str,
df: pd.DataFrame,
@@ -705,7 +678,7 @@ def _write_dataframe_impl(
s = _util.get_start_stamp()
logging.log_io(None, f"START WRITING {df_uri}")

arrow_table = _df_to_arrow(df)
arrow_table = df_to_arrow(df)

try:
soma_df = _factory.open(df_uri, "w", soma_type=DataFrame, context=context)