-
Notifications
You must be signed in to change notification settings - Fork 541
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Intro SubsetRelationshipInspector (#99)
- Move RelationshipInspector to the base module
- Introduce SubsetRelationshipInspector as the default RelationshipInspector
- Loading branch information
Showing
11 changed files
with
190 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
from __future__ import annotations | ||
|
||
from itertools import chain | ||
from typing import TYPE_CHECKING, Any | ||
|
||
import pandas as pd | ||
|
||
from sdgx.data_models.inspectors.base import RelationshipInspector | ||
from sdgx.data_models.inspectors.extension import hookimpl | ||
from sdgx.data_models.relationship import Relationship | ||
|
||
if TYPE_CHECKING: | ||
from sdgx.data_models.metadata import Metadata | ||
|
||
|
||
class SubsetRelationshipInspector(RelationshipInspector):
    """
    Infer relationships between tables by testing key columns for subset containment.

    Each call to :meth:`fit` accumulates the values of one table's id and
    primary-key columns (as listed by its metadata).  :meth:`inspect` then
    compares those columns across every ordered pair of fitted tables: when
    every value of a child column also occurs in a parent column, the column
    pair is reported as a foreign key.

    Because the subset test is only meaningful over the complete set of values,
    all data of every table should be fitted before :meth:`inspect` is called.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # table name -> column name -> every value fitted so far for that column
        self.maybe_related_columns: dict[str, dict[str, pd.Series]] = {}

    def _is_related(self, p: pd.Series, c: pd.Series) -> bool:
        """
        Return True when every value of the child column ``c`` occurs in the parent column ``p``.

        NOTE: an empty child column is vacuously a subset, so it is reported as related.
        """
        # ``Series.all()`` yields a numpy bool; convert so the annotation holds.
        return bool(c.isin(p).all())

    def _build_relationship(self) -> list[Relationship]:
        """
        Compare the collected columns of every ordered (parent, child) table pair.

        Returns:
            One ``Relationship`` per table pair that has at least one related
            column pair.  A foreign key is given as ``(parent_col, child_col)``
            when the column names differ, or as the bare column name when both
            tables use the same name.
        """
        relationships = []
        for parent, parent_columns in self.maybe_related_columns.items():
            for child, child_columns in self.maybe_related_columns.items():
                if parent == child:
                    # A table is never compared against itself.
                    continue
                related_pairs = []
                for p_col, p_values in parent_columns.items():
                    for c_col, c_values in child_columns.items():
                        if self._is_related(p_values, c_values):
                            related_pairs.append((p_col, c_col) if p_col != c_col else p_col)
                if related_pairs:
                    relationships.append(Relationship.build(parent, child, related_pairs))
        return relationships

    def fit(
        self,
        raw_data: pd.DataFrame,
        name: str | None = None,
        metadata: Metadata | None = None,
        *args,
        **kwargs,
    ):
        """
        Accumulate the id and primary-key column values of one chunk of a table.

        Args:
            raw_data: Chunk of the table's data.
            name: Table name under which the column values are collected.
            metadata: Metadata of the table; its ``id_columns`` and
                ``primary_keys`` select the columns to collect.

        Raises:
            ValueError: If ``metadata`` is not provided.
        """
        if metadata is None:
            # Without metadata there is no way to know which columns are keys.
            raise ValueError("metadata is required to fit SubsetRelationshipInspector")

        # dict.fromkeys de-duplicates while keeping a deterministic column order
        # (a plain set would order names by string hash).
        key_columns = dict.fromkeys(chain(metadata.id_columns, metadata.primary_keys))
        if not key_columns:
            return

        table_columns = self.maybe_related_columns.setdefault(name, {})
        for column in key_columns:
            # Plain column selection always yields a Series; ``[[column]].squeeze()``
            # collapses a single-row chunk to a scalar, which ``pd.concat`` rejects.
            values = raw_data[column]
            seen = table_columns.get(column)
            if seen is None:
                # First chunk: store it directly rather than concatenating onto a
                # dtype-less empty Series (deprecated and can affect the dtype).
                table_columns[column] = values.reset_index(drop=True)
            else:
                table_columns[column] = pd.concat((seen, values), ignore_index=True)

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Build relationships from all fitted tables and return them under ``"relationships"``."""
        return {"relationships": self._build_relationship()}
|
||
|
||
@hookimpl
def register(manager):
    """Register ``SubsetRelationshipInspector`` with the given manager under its class name."""
    inspector_name = "SubsetRelationshipInspector"
    manager.register(inspector_name, SubsetRelationshipInspector)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import pandas as pd | ||
import pytest | ||
|
||
from sdgx.data_models.inspectors.subset_relationship import SubsetRelationshipInspector | ||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_models.relationship import Relationship | ||
|
||
|
||
@pytest.fixture
def inspector():
    """Provide a newly constructed ``SubsetRelationshipInspector``."""
    instance = SubsetRelationshipInspector()
    yield instance
|
||
|
||
@pytest.fixture
def dummy_data(demo_relational_table_path):
    """Yield ``(dataframe, table name, metadata)`` triples for the "parent" and "child" demo tables."""
    parent_csv, child_csv, _ = demo_relational_table_path
    parent_frame = pd.read_csv(parent_csv)
    child_frame = pd.read_csv(child_csv)

    parent_entry = (parent_frame, "parent", Metadata.from_dataframe(parent_frame))
    child_entry = (child_frame, "child", Metadata.from_dataframe(child_frame))
    yield [parent_entry, child_entry]
|
||
|
||
@pytest.fixture
def dummy_relationship(demo_relational_table_path):
    """Yield the relationship expected between the "parent" and "child" demo tables."""
    _parent_csv, _child_csv, foreign_key_pairs = demo_relational_table_path

    expected = Relationship.build(
        parent_table="parent",
        child_table="child",
        foreign_keys=foreign_key_pairs,
    )
    yield expected
|
||
|
||
def test_inspector(dummy_data, dummy_relationship, inspector: SubsetRelationshipInspector):
    """Fit every demo table, then check that exactly the expected relationship is found."""
    for table, table_name, table_metadata in dummy_data:
        inspector.fit(table, name=table_name, metadata=table_metadata)

    inspected = inspector.inspect()
    found = inspected["relationships"]
    expected = [dummy_relationship]

    assert found
    assert found == expected
|
||
|
||
if __name__ == "__main__":
    # Running this module directly hands it to pytest with verbose,
    # uncaptured output.
    pytest_args = ["-vv", "-s", __file__]
    pytest.main(pytest_args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters