Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Intro SubsetRelationshipInspector #99

Merged
merged 9 commits into from
Jan 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions sdgx/data_models/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,8 @@ def check(self):
def from_dataloader(
cls,
dataloaders: list[DataLoader],
max_chunk: int = 10,
metadata_from_dataloader_kwargs: None | dict = None,
relationshipe_inspector: None | str | type[Inspector] = "DefaultRelationshipInspector",
relationshipe_inspector: None | str | type[Inspector] = "SubsetRelationshipInspector",
relationships_inspector_kwargs: None | dict = None,
relationships: None | list[Relationship] = None,
):
Expand All @@ -91,7 +90,6 @@ def from_dataloader(
dataloaders = [dataloaders]

metadata_from_dataloader_kwargs = metadata_from_dataloader_kwargs or {}
metadata_from_dataloader_kwargs.setdefault("max_chunk", max_chunk)
named_metadata = {
d.identity: Metadata.from_dataloader(d, **metadata_from_dataloader_kwargs)
for d in dataloaders
Expand All @@ -105,10 +103,12 @@ def from_dataloader(
relationshipe_inspector, **relationships_inspector_kwargs
)
for d in dataloaders:
for i, chunk in enumerate(d.iter()):
inspector.fit(chunk, name=d.identity)
if inspector.ready or i > max_chunk:
break
for chunk in d.iter():
inspector.fit(
chunk,
name=d.identity,
metadata=named_metadata[d.identity],
)
relationships = inspector.inspect()["relationships"]

return cls(named_metadata=named_metadata, relationships=relationships)
Expand All @@ -119,7 +119,7 @@ def from_dataframe(
dataframes: list[pd.DataFrame],
names: list[str],
metadata_from_dataloader_kwargs: None | dict = None,
relationshipe_inspector: None | str | type[Inspector] = "DefaultRelationshipInspector",
relationshipe_inspector: None | str | type[Inspector] = "SubsetRelationshipInspector",
relationships_inspector_kwargs: None | dict = None,
relationships: None | list[Relationship] = None,
) -> "MetadataCombiner":
Expand Down Expand Up @@ -157,7 +157,11 @@ def from_dataframe(
relationshipe_inspector, **relationships_inspector_kwargs
)
for n, d in zip(names, dataframes):
inspector.fit(d, name=n)
inspector.fit(
d,
name=n,
metadata=named_metadata[n],
)
relationships = inspector.inspect()["relationships"]

return cls(named_metadata=named_metadata, relationships=relationships)
Expand Down
32 changes: 31 additions & 1 deletion sdgx/data_models/inspectors/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from __future__ import annotations

from typing import Any
from typing import TYPE_CHECKING, Any

import pandas as pd

if TYPE_CHECKING:
from sdgx.data_models.metadata import Metadata

from sdgx.data_models.relationship import Relationship


class Inspector:
"""
Expand All @@ -28,3 +33,28 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs):

def inspect(self, *args, **kwargs) -> dict[str, Any]:
"""Inspect raw data and generate metadata."""


class RelationshipInspector(Inspector):
    """
    Base class for inspectors that discover relationships between tables.

    On its own it reports no relationships. Subclasses are expected to
    override `_build_relationship` and `fit`.
    """

    def _build_relationship(self) -> list[Relationship]:
        """Return the discovered relationships; the base class has none."""
        return list()

    def fit(
        self,
        raw_data: pd.DataFrame,
        name: str | None = None,
        metadata: "Metadata" | None = None,
        *args,
        **kwargs,
    ):
        """Accept one table (or one chunk of it); the base class ignores it."""
        return None

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Wrap the result of `_build_relationship` under the "relationships" key."""
        found = self._build_relationship()
        return {"relationships": found}
26 changes: 0 additions & 26 deletions sdgx/data_models/inspectors/relationship.py

This file was deleted.

70 changes: 70 additions & 0 deletions sdgx/data_models/inspectors/subset_relationship.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from __future__ import annotations

from itertools import chain
from typing import TYPE_CHECKING, Any

import pandas as pd

from sdgx.data_models.inspectors.base import RelationshipInspector
from sdgx.data_models.inspectors.extension import hookimpl
from sdgx.data_models.relationship import Relationship

if TYPE_CHECKING:
from sdgx.data_models.metadata import Metadata


class SubsetRelationshipInspector(RelationshipInspector):
    """
    Infer relationships between tables by testing column containment.

    A column of one table is assumed to reference a column of another table
    when every value of the former also occurs in the latter. Containment can
    only be decided on complete columns, so every chunk of every table must be
    passed to `fit` before `inspect` is called.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # table name -> column name -> all values of that column seen so far
        self.maybe_related_columns: dict[str, dict[str, pd.Series]] = {}

    def _is_related(self, p: pd.Series, c: pd.Series) -> bool:
        """
        Tell whether child column ``c`` is a subset of parent column ``p``.

        An empty child column is never considered related: ``all()`` over no
        values is vacuously true and would otherwise link an empty column to
        every column of every other table.
        """
        if c.empty:
            return False
        # Cast so callers get a plain ``bool`` rather than ``numpy.bool_``.
        return bool(c.isin(p).all())

    def _build_relationship(self) -> list[Relationship]:
        """
        Compare every ordered pair of distinct fitted tables.

        For each (parent, child) pair, every collected parent column is tested
        against every collected child column. A matching pair is recorded as
        ``(parent_column, child_column)``, or as the bare column name when both
        sides share it. One `Relationship` is built per table pair that has at
        least one match.
        """
        relationships = []
        for parent, parent_columns in self.maybe_related_columns.items():
            for child, child_columns in self.maybe_related_columns.items():
                if parent == child:
                    continue
                related_pairs = []
                for p_col, p_values in parent_columns.items():
                    for c_col, c_values in child_columns.items():
                        if self._is_related(p_values, c_values):
                            related_pairs.append((p_col, c_col) if p_col != c_col else p_col)
                if related_pairs:
                    relationships.append(Relationship.build(parent, child, related_pairs))
        return relationships

    def fit(
        self,
        raw_data: pd.DataFrame,
        name: str | None = None,
        metadata: "Metadata | None" = None,
        *args,
        **kwargs,
    ):
        """
        Accumulate the candidate key columns of one table (or one chunk of it).

        Args:
            raw_data: The table, or a chunk of it.
            name: Table name under which the columns are collected.
            metadata: Metadata of the table; its ``id_columns`` and
                ``primary_keys`` select which columns are collected.

        Raises:
            ValueError: If ``metadata`` is not given. Without it there is no
                way to choose candidate columns.
        """
        if metadata is None:
            raise ValueError(
                "SubsetRelationshipInspector.fit requires `metadata` to select candidate columns"
            )

        columns = set(chain(metadata.id_columns, metadata.primary_keys))
        if not columns:
            # Nothing to collect; do not register the table at all.
            return

        cur_map = self.maybe_related_columns.setdefault(name, {})
        for c in columns:
            # Plain column access always yields a Series. `raw_data[[c]].squeeze()`
            # collapses to a scalar on a one-row chunk, which breaks `pd.concat`.
            values = raw_data[c]
            if c in cur_map:
                values = pd.concat((cur_map[c], values), ignore_index=True)
            else:
                # First chunk: store it directly instead of concatenating onto an
                # empty Series (deprecated dtype handling for empty entries).
                values = values.reset_index(drop=True)
            cur_map[c] = values

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Return the inferred relationships under the "relationships" key."""
        return {"relationships": self._build_relationship()}


@hookimpl
def register(manager):
    """Hook implementation: register `SubsetRelationshipInspector` with ``manager`` under its class name."""
    manager.register("SubsetRelationshipInspector", SubsetRelationshipInspector)
5 changes: 3 additions & 2 deletions sdgx/data_models/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from pydantic import BaseModel

from sdgx.data_loader import DataLoader
from sdgx.data_models.inspectors.base import RelationshipInspector
from sdgx.data_models.inspectors.manager import InspectorManager
from sdgx.data_models.inspectors.relationship import RelationshipInspector
from sdgx.exceptions import MetadataInitError, MetadataInvalidError
from sdgx.utils import logger

Expand Down Expand Up @@ -243,7 +243,8 @@ def from_dataloader(
)
for i, chunk in enumerate(dataloader.iter()):
for inspector in inspectors:
inspector.fit(chunk)
if not inspector.ready:
inspector.fit(chunk)
if all(i.ready for i in inspectors) or i > max_chunk:
break

Expand Down
14 changes: 10 additions & 4 deletions sdgx/data_models/relationship.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from __future__ import annotations

import json
from collections import namedtuple
from pathlib import Path
from typing import Any, Iterable, List, Set, Tuple, Union
from typing import Any, Iterable, List, Union

from pydantic import BaseModel

from sdgx.exceptions import RelationshipInitError

KeyTuple = namedtuple("KeyTuple", ["parent", "child"])


class Relationship(BaseModel):
"""Relationship between tables
Expand All @@ -24,7 +27,7 @@ class Relationship(BaseModel):
parent_table: str
child_table: str

foreign_keys: List[Union[str, Tuple[str, str]]]
foreign_keys: List[KeyTuple]
"""
foreign keys.

Expand All @@ -36,7 +39,7 @@ def build(
cls,
parent_table: str,
child_table: str,
foreign_keys: Iterable[str | tuple[str, str]],
foreign_keys: Iterable[str | tuple[str, str] | KeyTuple],
) -> "Relationship":
"""
Build relationship from parent table, child table and foreign keys
Expand All @@ -52,7 +55,10 @@ def build(
if not child_table:
raise RelationshipInitError("child table cannot be empty")

foreign_keys = list(foreign_keys)
foreign_keys = [
KeyTuple(key, key) if isinstance(key, str) else KeyTuple(*key) for key in foreign_keys
]

if not foreign_keys:
raise RelationshipInitError("foreign keys cannot be empty")
if parent_table == child_table:
Expand Down
46 changes: 46 additions & 0 deletions tests/data_models/inspector/test_subset_relationship.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pandas as pd
import pytest

from sdgx.data_models.inspectors.subset_relationship import SubsetRelationshipInspector
from sdgx.data_models.metadata import Metadata
from sdgx.data_models.relationship import Relationship


@pytest.fixture
def inspector():
    """Provide a `SubsetRelationshipInspector` constructed with no arguments."""
    yield SubsetRelationshipInspector()


@pytest.fixture
def dummy_data(demo_relational_table_path):
    """
    Provide ``(dataframe, table_name, metadata)`` triples for two demo tables.

    The first two items of ``demo_relational_table_path`` are read as CSV files;
    the first is labelled "parent" and the second "child".
    """
    table_path_a, table_path_b, _ = demo_relational_table_path
    df_a = pd.read_csv(table_path_a)
    df_b = pd.read_csv(table_path_b)

    yield [
        (df_a, "parent", Metadata.from_dataframe(df_a)),
        (df_b, "child", Metadata.from_dataframe(df_b)),
    ]


@pytest.fixture
def dummy_relationship(demo_relational_table_path):
    """
    Provide the expected "parent" -> "child" relationship.

    Its foreign keys are the third item of ``demo_relational_table_path``.
    """
    _, _, pairs = demo_relational_table_path

    yield Relationship.build(
        parent_table="parent",
        child_table="child",
        foreign_keys=pairs,
    )


def test_inspector(dummy_data, dummy_relationship, inspector: SubsetRelationshipInspector):
    """Fit both demo tables and expect exactly the one known relationship."""
    for raw_data, name, metadata in dummy_data:
        inspector.fit(raw_data, name=name, metadata=metadata)
    relationships = inspector.inspect()["relationships"]
    # Guard against an empty result before comparing the full list.
    assert relationships
    assert relationships == [dummy_relationship]


if __name__ == "__main__":
pytest.main(["-vv", "-s", __file__])
3 changes: 1 addition & 2 deletions tests/data_models/test_combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sdgx.data_connectors.csv_connector import CsvConnector
from sdgx.data_loader import DataLoader
from sdgx.data_models.combiner import MetadataCombiner
from sdgx.data_models.inspectors.relationship import RelationshipInspector
from sdgx.data_models.inspectors.base import RelationshipInspector
from sdgx.data_models.relationship import Relationship


Expand All @@ -32,7 +32,6 @@ def test_from_dataloader(demo_relational_table_path, tmp_path):

combiner = MetadataCombiner.from_dataloader(
dataloaders=[dl_a, dl_b],
max_chunk=10,
metadata_from_dataloader_kwargs={},
relationshipe_inspector=MockInspector,
relationships_inspector_kwargs=dict(dummy_data=[relationship]),
Expand Down
21 changes: 13 additions & 8 deletions tests/data_models/test_relationship.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,26 @@

import pytest

from sdgx.data_models.relationship import Relationship
from sdgx.data_models.relationship import KeyTuple, Relationship
from sdgx.exceptions import RelationshipInitError


@pytest.mark.parametrize(
"parent_table, child_table, foreign_keys, exception",
[
("parent", "child", ["parent_id"], None),
("parent", "child", ["parent_id", "child_id"], None),
("parent", "child", [("parent_id", "p_id_in_child")], None),
("parent", "parent", ["parent_id"], RelationshipInitError),
("parent", "child", [KeyTuple("parent_id", "parent_id")], None),
(
"parent",
"child",
[KeyTuple("parent_id", "parent_id"), KeyTuple("child_id", "child_id")],
None,
),
("parent", "child", [KeyTuple("parent_id", "p_id_in_child")], None),
("parent", "parent", [KeyTuple("parent_id", "parent_id")], RelationshipInitError),
("parent", "parent", [], RelationshipInitError),
("", "child", ["parent_id"], RelationshipInitError),
("parent", "", ["parent_id"], RelationshipInitError),
("", "", ["parent_id"], RelationshipInitError),
("", "child", [KeyTuple("parent_id", "parent_id")], RelationshipInitError),
("parent", "", [KeyTuple("parent_id", "parent_id")], RelationshipInitError),
("", "", [KeyTuple("parent_id", "parent_id")], RelationshipInitError),
("", "", [], RelationshipInitError),
],
)
Expand Down
4 changes: 3 additions & 1 deletion tests/test_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,9 @@ def test_fit(synthesizer):


def test_sample(synthesizer):
assert synthesizer.sample(10) is not None
assert len(synthesizer.sample(10)) == 10
for df in synthesizer.sample(10, chunksize=5):
assert len(df) == 5


def test_save_and_load(synthesizer, save_dir):
Expand Down