
Commit

Mixed raw/no-raw testing
johnkerl committed Jul 9, 2023
1 parent f1efa1d commit c2180d8
Showing 3 changed files with 27 additions and 98 deletions.
99 changes: 7 additions & 92 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -782,7 +782,7 @@ def _extract_new_values_for_append(
previous_table = previous_soma_dataframe.read().concat()
previous_df = previous_table.to_pandas()
previous_join_ids = set(
list(str(e) for e in get_dataframe_values(previous_df, id_column_name))
list(int(e) for e in get_dataframe_values(previous_df, SOMA_JOINID))
)
mask = [
e.as_py() not in previous_join_ids for e in arrow_table[SOMA_JOINID]
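For context, the hunk above changes _extract_new_values_for_append to key the append-mode dedup set on integer soma_joinid values rather than stringified user IDs. A minimal stand-alone sketch of that filtering idea in plain pyarrow, with hypothetical stand-in data (previous_df and arrow_table are invented here, and the final filter step is an assumption about what the truncated remainder of the function does):

import pandas as pd
import pyarrow as pa

# Hypothetical stand-ins for the previously written rows and the new batch.
previous_df = pd.DataFrame({"soma_joinid": [0, 1, 2], "obs_id": ["AAA", "BBB", "CCC"]})
arrow_table = pa.table({"soma_joinid": [1, 2, 3, 4], "obs_id": ["BBB", "CCC", "DDD", "EEE"]})

# As in the hunk: compare integer soma_joinid values, not stringified user IDs.
previous_join_ids = set(int(e) for e in previous_df["soma_joinid"])
mask = [e.as_py() not in previous_join_ids for e in arrow_table["soma_joinid"]]

# Keep only the genuinely new rows (joinids 3 and 4 here).
new_rows = arrow_table.filter(pa.array(mask))
print(new_rows.to_pandas())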
@@ -803,7 +803,7 @@ def _write_dataframe(
axis_mapping: AxisIDMapping,
) -> DataFrame:

s = _util.get_start_stamp()
_util.get_start_stamp()
logging.log_io(None, f"START WRITING {df_uri}")

df.reset_index(inplace=True)
@@ -818,6 +818,7 @@ def _write_dataframe(
return _write_dataframe_impl(
df,
df_uri,
id_column_name,
ingestion_params=ingestion_params,
platform_config=platform_config,
context=context,
@@ -827,6 +828,7 @@ def _write_dataframe_impl(
def _write_dataframe_impl(
df: pd.DataFrame,
df_uri: str,
id_column_name: Optional[str],
*,
ingestion_params: IngestionParams,
platform_config: Optional[PlatformConfig] = None,
@@ -1498,7 +1500,6 @@ def _ingest_uns_dict(
_maybe_set(parent, parent_key, coll, use_relative_uri=use_relative_uri)
coll.metadata["soma_tiledbsoma_type"] = "uns"
for key, value in dct.items():
<<<<<<< HEAD
_ingest_uns_node(
coll,
key,
Expand All @@ -1508,95 +1509,6 @@ def _ingest_uns_dict(
ingestion_params=ingestion_params,
use_relative_uri=use_relative_uri,
)
||||||| parent of b2691801 ([python] Append-mode sketching)
if isinstance(value, np.generic):
# This is some kind of numpy scalar value. Metadata entries
# only accept native Python types, so unwrap it.
value = value.item()
if isinstance(value, (int, float, str)):
# Primitives get set on the metadata.
coll.metadata[key] = value
continue
if isinstance(value, Mapping):
# Mappings are represented as sub-dictionaries.
_ingest_uns_dict(
coll,
key,
value,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
use_relative_uri=use_relative_uri,
)
continue
if isinstance(value, pd.DataFrame):
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
None,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
) as df:
_maybe_set(coll, key, df, use_relative_uri=use_relative_uri)
continue
if isinstance(value, list) or "numpy" in str(type(value)):
value = np.asarray(value)
if isinstance(value, np.ndarray):
if value.dtype.names is not None:
msg = (
f"Skipped {coll.uri}[{key!r}]"
" (uns): unsupported structured array"
)
# This is a structured array, which we do not support.
logging.log_io(msg, msg)
continue
=======
if isinstance(value, np.generic):
# This is some kind of numpy scalar value. Metadata entries
# only accept native Python types, so unwrap it.
value = value.item()
if isinstance(value, (int, float, str)):
# Primitives get set on the metadata.
coll.metadata[key] = value
continue
if isinstance(value, Mapping):
# Mappings are represented as sub-dictionaries.
_ingest_uns_dict(
coll,
key,
value,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
use_relative_uri=use_relative_uri,
)
continue
if isinstance(value, pd.DataFrame):
num_cols = value.shape[1]
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
None,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
axis_mapping=AxisIDMapping.identity(num_cols),
) as df:
_maybe_set(coll, key, df, use_relative_uri=use_relative_uri)
continue
if isinstance(value, list) or "numpy" in str(type(value)):
value = np.asarray(value)
if isinstance(value, np.ndarray):
if value.dtype.names is not None:
msg = (
f"Skipped {coll.uri}[{key!r}]"
" (uns): unsupported structured array"
)
# This is a structured array, which we do not support.
logging.log_io(msg, msg)
continue
>>>>>>> b2691801 ([python] Append-mode sketching)

msg = f"Wrote {coll.uri} (uns collection)"
logging.log_io(msg, msg)
@@ -1637,13 +1549,15 @@ def _ingest_uns_node(
return

if isinstance(value, pd.DataFrame):
num_cols = value.shape[1]
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
None,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
axis_mapping=AxisIDMapping.identity(num_cols),
) as df:
_maybe_set(coll, key, df, use_relative_uri=use_relative_uri)
return
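The uns-DataFrame hunks above now pass an identity axis mapping into _write_dataframe, sized by the frame's column count. A tiny illustration of the identity idea, using a hypothetical stand-in rather than the library's AxisIDMapping class: an identity mapping over n ids is simply 0..n-1 mapping to themselves.

import pandas as pd

value = pd.DataFrame({"gene": ["ENSG01", "ENSG02"], "score": [0.1, 0.9]})
num_cols = value.shape[1]

# Hypothetical stand-in for AxisIDMapping.identity(num_cols): each id maps to itself.
identity = {i: i for i in range(num_cols)}
print(identity)  # {0: 0, 1: 1}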
@@ -1726,6 +1640,7 @@ def _ingest_uns_string_array(
with _write_dataframe_impl(
df,
df_uri,
None,
ingestion_params=ingestion_params,
platform_config=platform_config,
context=context,
22 changes: 17 additions & 5 deletions (second changed file; path not shown in this capture)
@@ -246,11 +246,18 @@ def from_anndata_append_on_experiment(

var_maps = {measurement_name: var_map}

if adata.raw is not None:
raw_var_map = previous.var_axes["raw"].data
raw_var_next_soma_joinid = previous.var_axes[
"raw"
].get_next_start_soma_joinid()
if adata.raw is None:
if "raw" in previous.var_axes:
var_maps["raw"] = previous.var_axes["raw"].data

else:
# One input may not have a raw while the next may have one
raw_var_map = {}
raw_var_next_soma_joinid = 0
if "raw" in previous.var_axes:
raw_var_axis = previous.var_axes["raw"]
raw_var_map = raw_var_axis.data
raw_var_next_soma_joinid = raw_var_axis.get_next_start_soma_joinid()
raw_var_ids = get_dataframe_values(adata.raw.var, var_field_name)
for raw_var_id in raw_var_ids:
if raw_var_id not in raw_var_map:
@@ -329,3 +336,8 @@ def from_h5ad_appends_on_experiment(

tiledbsoma.logging.logger.info("Registration: complete.")
return registration_data

def show(self) -> None:
print(f"obs:{len(self.obs_axis.data)}")
for k, v in self.var_axes.items():
print(f"{k}/var:{len(v.data)}")
4 changes: 3 additions & 1 deletion apis/python/src/tiledbsoma/io/registration/id_mappings.py
@@ -56,4 +56,6 @@ def get_dataframe_values(df: pd.DataFrame, field_name: str) -> List[str]:
return list(df.index)
if df.index.name is None:
return list(df.index)
raise ValueError(f"could not find field name {field_name} in dataframe")
# XXX re-think
# raise ValueError(f"could not find field name {field_name} in dataframe")
return list(df.index)
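The id_mappings.py change above relaxes get_dataframe_values: when the requested field is neither a column nor the index name, it now falls back to the index instead of raising (with an XXX note to revisit). A small illustration of the fallback, written against a simplified re-statement of the post-commit behavior (the column-lookup branch is assumed, since the top of the function is not shown):

import pandas as pd

def get_dataframe_values(df, field_name):
    # Simplified re-statement of the behavior after this commit.
    if field_name in df.columns:
        return list(df[field_name])
    # Named-index match, unnamed index, or unknown field: all fall back to the index.
    return list(df.index)

df = pd.DataFrame({"count": [3, 5]}, index=pd.Index(["AAA", "BBB"], name="cell_id"))
print(get_dataframe_values(df, "obs_id"))  # no error; falls back to: ['AAA', 'BBB']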
