
Commit

Mixed raw/no-raw testing
johnkerl committed Jul 9, 2023
1 parent f1efa1d commit c2180d8
Showing 3 changed files with 27 additions and 98 deletions.
99 changes: 7 additions & 92 deletions apis/python/src/tiledbsoma/io/ingest.py
@@ -782,7 +782,7 @@ def _extract_new_values_for_append(
previous_table = previous_soma_dataframe.read().concat()
previous_df = previous_table.to_pandas()
previous_join_ids = set(
list(str(e) for e in get_dataframe_values(previous_df, id_column_name))
list(int(e) for e in get_dataframe_values(previous_df, SOMA_JOINID))
)
mask = [
e.as_py() not in previous_join_ids for e in arrow_table[SOMA_JOINID]
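For context, the hunk above changes _extract_new_values_for_append to key the append-mode dedup set on integer soma_joinid values rather than stringified user IDs. A minimal stand-alone sketch of that filtering idea in plain pyarrow, with hypothetical stand-in data (previous_df and arrow_table are invented here, and the final filter step is an assumption about what the truncated remainder of the function does):

import pandas as pd
import pyarrow as pa

# Hypothetical stand-ins for the previously written rows and the new batch.
previous_df = pd.DataFrame({"soma_joinid": [0, 1, 2], "obs_id": ["AAA", "BBB", "CCC"]})
arrow_table = pa.table({"soma_joinid": [1, 2, 3, 4], "obs_id": ["BBB", "CCC", "DDD", "EEE"]})

# As in the hunk: compare integer soma_joinid values, not stringified user IDs.
previous_join_ids = set(int(e) for e in previous_df["soma_joinid"])
mask = [e.as_py() not in previous_join_ids for e in arrow_table["soma_joinid"]]

# Keep only the genuinely new rows (joinids 3 and 4 here).
new_rows = arrow_table.filter(pa.array(mask))
print(new_rows.to_pandas())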
@@ -803,7 +803,7 @@ def _write_dataframe(
axis_mapping: AxisIDMapping,
) -> DataFrame:

s = _util.get_start_stamp()
_util.get_start_stamp()
logging.log_io(None, f"START WRITING {df_uri}")

df.reset_index(inplace=True)
@@ -818,6 +818,7 @@ def _write_dataframe(
return _write_dataframe_impl(
df,
df_uri,
id_column_name,
ingestion_params=ingestion_params,
platform_config=platform_config,
context=context,
@@ -827,6 +828,7 @@ def _write_dataframe_impl(
def _write_dataframe_impl(
df: pd.DataFrame,
df_uri: str,
id_column_name: Optional[str],
*,
ingestion_params: IngestionParams,
platform_config: Optional[PlatformConfig] = None,
@@ -1498,7 +1500,6 @@ def _ingest_uns_dict(
_maybe_set(parent, parent_key, coll, use_relative_uri=use_relative_uri)
coll.metadata["soma_tiledbsoma_type"] = "uns"
for key, value in dct.items():
<<<<<<< HEAD
_ingest_uns_node(
coll,
key,
Expand All @@ -1508,95 +1509,6 @@ def _ingest_uns_dict(
ingestion_params=ingestion_params,
use_relative_uri=use_relative_uri,
)
||||||| parent of b2691801 ([python] Append-mode sketching)
if isinstance(value, np.generic):
# This is some kind of numpy scalar value. Metadata entries
# only accept native Python types, so unwrap it.
value = value.item()
if isinstance(value, (int, float, str)):
# Primitives get set on the metadata.
coll.metadata[key] = value
continue
if isinstance(value, Mapping):
# Mappings are represented as sub-dictionaries.
_ingest_uns_dict(
coll,
key,
value,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
use_relative_uri=use_relative_uri,
)
continue
if isinstance(value, pd.DataFrame):
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
None,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
) as df:
_maybe_set(coll, key, df, use_relative_uri=use_relative_uri)
continue
if isinstance(value, list) or "numpy" in str(type(value)):
value = np.asarray(value)
if isinstance(value, np.ndarray):
if value.dtype.names is not None:
msg = (
f"Skipped {coll.uri}[{key!r}]"
" (uns): unsupported structured array"
)
# This is a structured array, which we do not support.
logging.log_io(msg, msg)
continue
=======
if isinstance(value, np.generic):
# This is some kind of numpy scalar value. Metadata entries
# only accept native Python types, so unwrap it.
value = value.item()
if isinstance(value, (int, float, str)):
# Primitives get set on the metadata.
coll.metadata[key] = value
continue
if isinstance(value, Mapping):
# Mappings are represented as sub-dictionaries.
_ingest_uns_dict(
coll,
key,
value,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
use_relative_uri=use_relative_uri,
)
continue
if isinstance(value, pd.DataFrame):
num_cols = value.shape[1]
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
None,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
axis_mapping=AxisIDMapping.identity(num_cols),
) as df:
_maybe_set(coll, key, df, use_relative_uri=use_relative_uri)
continue
if isinstance(value, list) or "numpy" in str(type(value)):
value = np.asarray(value)
if isinstance(value, np.ndarray):
if value.dtype.names is not None:
msg = (
f"Skipped {coll.uri}[{key!r}]"
" (uns): unsupported structured array"
)
# This is a structured array, which we do not support.
logging.log_io(msg, msg)
continue
>>>>>>> b2691801 ([python] Append-mode sketching)

msg = f"Wrote {coll.uri} (uns collection)"
logging.log_io(msg, msg)
@@ -1637,13 +1549,15 @@ def _ingest_uns_node(
return

if isinstance(value, pd.DataFrame):
num_cols = value.shape[1]
with _write_dataframe(
_util.uri_joinpath(coll.uri, key),
value,
None,
platform_config=platform_config,
context=context,
ingestion_params=ingestion_params,
axis_mapping=AxisIDMapping.identity(num_cols),
) as df:
_maybe_set(coll, key, df, use_relative_uri=use_relative_uri)
return
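The uns-DataFrame hunks above now pass an identity axis mapping into _write_dataframe, sized by the frame's column count. A tiny illustration of the identity idea, using a hypothetical stand-in rather than the library's AxisIDMapping class: an identity mapping over n ids is simply 0..n-1 mapping to themselves.

import pandas as pd

value = pd.DataFrame({"gene": ["ENSG01", "ENSG02"], "score": [0.1, 0.9]})
num_cols = value.shape[1]

# Hypothetical stand-in for AxisIDMapping.identity(num_cols): each id maps to itself.
identity = {i: i for i in range(num_cols)}
print(identity)  # {0: 0, 1: 1}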
@@ -1726,6 +1640,7 @@ def _ingest_uns_string_array(
with _write_dataframe_impl(
df,
df_uri,
None,
ingestion_params=ingestion_params,
platform_config=platform_config,
context=context,
22 changes: 17 additions & 5 deletions (second changed file; path not shown in this capture)
@@ -246,11 +246,18 @@ def from_anndata_append_on_experiment(

var_maps = {measurement_name: var_map}

if adata.raw is not None:
raw_var_map = previous.var_axes["raw"].data
raw_var_next_soma_joinid = previous.var_axes[
"raw"
].get_next_start_soma_joinid()
if adata.raw is None:
if "raw" in previous.var_axes:
var_maps["raw"] = previous.var_axes["raw"].data

else:
# One input may not have a raw while the next may have one
raw_var_map = {}
raw_var_next_soma_joinid = 0
if "raw" in previous.var_axes:
raw_var_axis = previous.var_axes["raw"]
raw_var_map = raw_var_axis.data
raw_var_next_soma_joinid = raw_var_axis.get_next_start_soma_joinid()
raw_var_ids = get_dataframe_values(adata.raw.var, var_field_name)
for raw_var_id in raw_var_ids:
if raw_var_id not in raw_var_map:
@@ -329,3 +336,8 @@ def from_h5ad_appends_on_experiment(

tiledbsoma.logging.logger.info("Registration: complete.")
return registration_data

def show(self) -> None:
print(f"obs:{len(self.obs_axis.data)}")
for k, v in self.var_axes.items():
print(f"{k}/var:{len(v.data)}")
4 changes: 3 additions & 1 deletion apis/python/src/tiledbsoma/io/registration/id_mappings.py
@@ -56,4 +56,6 @@ def get_dataframe_values(df: pd.DataFrame, field_name: str) -> List[str]:
return list(df.index)
if df.index.name is None:
return list(df.index)
raise ValueError(f"could not find field name {field_name} in dataframe")
# XXX re-think
# raise ValueError(f"could not find field name {field_name} in dataframe")
return list(df.index)
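The id_mappings.py change above relaxes get_dataframe_values: when the requested field is neither a column nor the index name, it now falls back to the index instead of raising (with an XXX note to revisit). A small illustration of the fallback, written against a simplified re-statement of the post-commit behavior (the column-lookup branch is assumed, since the top of the function is not shown):

import pandas as pd

def get_dataframe_values(df, field_name):
    # Simplified re-statement of the behavior after this commit.
    if field_name in df.columns:
        return list(df[field_name])
    # Named-index match, unnamed index, or unknown field: all fall back to the index.
    return list(df.index)

df = pd.DataFrame({"count": [3, 5]}, index=pd.Index(["AAA", "BBB"], name="cell_id"))
print(get_dataframe_values(df, "obs_id"))  # no error; falls back to: ['AAA', 'BBB']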
