Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add base model for multi-table statistic model, change single-table base class location #102

Merged
merged 27 commits into from
Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
12fd97a
Create base.py for multi-table statistic models
MooooCat Jan 10, 2024
df2e570
Update base.py
MooooCat Jan 10, 2024
f5fd7cb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 11, 2024
3108dee
update statistic single-table base class
MooooCat Jan 11, 2024
d5b8e10
update multi-table base class
MooooCat Jan 11, 2024
eab3143
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 11, 2024
980941e
Merge branch 'refactoring-base-model-partitial' of github.com:hitsz-i…
MooooCat Jan 12, 2024
689fe28
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 12, 2024
00c6f42
Merge branch 'main' into refactoring-base-model-partitial
MooooCat Jan 12, 2024
9819db2
add functions (still draft)
MooooCat Jan 12, 2024
535bde1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 12, 2024
dc7a265
Update base.py
MooooCat Jan 13, 2024
870d7db
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 13, 2024
9bbec2c
fix dict typo
MooooCat Jan 13, 2024
a72748c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 13, 2024
157d387
fix type hint typo
MooooCat Jan 13, 2024
47125f6
Update base.py
MooooCat Jan 15, 2024
cc8bcf9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 15, 2024
8de02cc
add multi-table test fixture
MooooCat Jan 15, 2024
7cc304c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 15, 2024
7dd91fd
modify check settings in metadata
MooooCat Jan 15, 2024
39cc1df
update multi-table base class
MooooCat Jan 15, 2024
0603baf
add test cases
MooooCat Jan 15, 2024
f0caac8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 15, 2024
4ec72ee
Apply reviewer's suggestions.
MooooCat Jan 16, 2024
71380ed
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2024
01b1749
Merge branch 'main' into refactoring-base-model-partitial
MooooCat Jan 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion sdgx/data_models/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class MetadataCombiner(BaseModel):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.check()

def check(self):
"""Do necessary checks:
Expand Down
8 changes: 6 additions & 2 deletions sdgx/data_models/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ def from_dataloader(
include_inspectors: Iterable[str] | None = None,
exclude_inspectors: Iterable[str] | None = None,
inspector_init_kwargs: dict[str, Any] | None = None,
check: bool = False,
) -> "Metadata":
"""Initialize a metadata from DataLoader and Inspectors

Expand Down Expand Up @@ -257,7 +258,8 @@ def from_dataloader(
if not primary_keys:
metadata.update_primary_key(metadata.id_columns)

metadata.check()
if check:
metadata.check()
return metadata

@classmethod
Expand All @@ -267,6 +269,7 @@ def from_dataframe(
include_inspectors: list[str] | None = None,
exclude_inspectors: list[str] | None = None,
inspector_init_kwargs: dict[str, Any] | None = None,
check: bool = False,
) -> "Metadata":
"""Initialize a metadata from DataFrame and Inspectors

Expand Down Expand Up @@ -294,7 +297,8 @@ def from_dataframe(
metadata = Metadata(primary_keys=[df.columns[0]], column_list=set(df.columns))
for inspector in inspectors:
metadata.update(inspector.inspect())
metadata.check()
if check:
metadata.check()
return metadata

def _dump_json(self):
Expand Down
151 changes: 151 additions & 0 deletions sdgx/models/statistics/multi_tables/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from __future__ import annotations

import time
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from pydantic import BaseModel

from sdgx.data_loader import DataLoader
from sdgx.data_models.combiner import MetadataCombiner
from sdgx.log import logger
from sdgx.utils import DataAccessType


class MultiTableSynthesizerModel(BaseModel):
    """Base class of multi-table statistic synthesizer models.

    Holds every table's metadata and the inter-table relationships via
    ``metadata_combiner``, plus per-table data (as DataFrames or DataLoaders)
    and per-table synthesizers. Concrete models implement ``fit`` / ``sample``.
    """

    data_access_method: DataAccessType = DataAccessType.pd_data_frame
    """
    The type of the data access, now support pandas.DataFrame or sdgx.DataLoader.
    """

    metadata_combiner: MetadataCombiner = None
    """
    metadata_combiner is a sdgx builtin class, it stores all tables' metadata and relationships.
    """

    tables_data_frame: Dict[str, Any] = defaultdict()
    """
    tables_data_frame is a dict contains every table's csv data frame.
    For a small amount of data, this scheme can be used.
    """

    tables_data_loader: Dict[str, Any] = defaultdict()
    """
    tables_data_loader is a dict contains every table's data loader.
    """

    _parent_id: List = []
    """
    _parent_id is used to store all parent table's primary keys in list.
    """

    _table_synthesizers: Dict[str, Any] = {}
    """
    _table_synthesizers is a dict to store model for each table.
    """

    parent_map: Dict = defaultdict()
    """
    The mapping from all child tables to their parent table.
    """

    child_map: Dict = defaultdict()
    """
    The mapping from all parent tables to their child table.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Derive parent_map / child_map from the combiner's relationships,
        # then run the (currently unimplemented) sanity checks.
        self._calculate_parent_and_child_map()

        self.check()

    def _calculate_parent_and_child_map(self):
        """Populate ``self.parent_map`` and ``self.child_map``.

        - ``parent_map``: child table name (str) -> parent table name (str);
        - ``child_map``: parent table name (str) -> child table name (str).

        NOTE(review): each map keeps a single entry per key, so a table that
        appears in several relationships only retains the last one seen —
        confirm this is the intended behavior for multi-parent schemas.
        """
        relationships = self.metadata_combiner.relationships
        for each_relationship in relationships:
            parent_table = each_relationship.parent_table
            child_table = each_relationship.child_table
            self.parent_map[child_table] = parent_table
            self.child_map[parent_table] = child_table

    def _get_foreign_keys(self, parent_table, child_table):
        """Return the foreign key list of the relationship between
        ``parent_table`` and ``child_table``, or ``[]`` when no such
        relationship exists."""
        relationships = self.metadata_combiner.relationships
        for each_relationship in relationships:
            # find the exact relationship and return foreign keys
            if (
                each_relationship.parent_table == parent_table
                and each_relationship.child_table == child_table
            ):
                return each_relationship.foreign_keys
        return []

    def _get_all_foreign_keys(self, child_table):
        """Given a child table, return the foreign keys of ALL relationships
        in which it is the child (a list of foreign-key lists)."""
        all_foreign_keys = []
        relationships = self.metadata_combiner.relationships
        for each_relationship in relationships:
            # collect the foreign keys of every relationship with this child
            if each_relationship.child_table == child_table:
                all_foreign_keys.append(each_relationship.foreign_keys)

        return all_foreign_keys

    def _finalize(self):
        """Finalize the model after training. Subclasses must override."""
        raise NotImplementedError

    def check(self, check_circular=True):
        """Execute necessary checks

        - validate circular relationships
        - validate child map_circular relationship
        - validate all tables connect relationship
        - validate column relationships foreign keys

        TODO(review): the checks listed above are not implemented yet.
        """

        pass

    def fit(self, dataloader: DataLoader, *args, **kwargs):
        """
        Fit the model using the given dataloader.

        Args:
            dataloader (DataLoader): The dataloader to use.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def sample(self, count: int, *args, **kwargs) -> pd.DataFrame:
        """
        Sample data from the model.

        Args:
            count (int): The number of samples to generate.

        Returns:
            pd.DataFrame: The generated data.

        Raises:
            NotImplementedError: always; subclasses must override.
        """

        raise NotImplementedError

    def save(self, save_dir: str | Path):
        """Persist the model to ``save_dir``. Not implemented yet."""
        pass

    @classmethod
    def load(cls, target_path: str | Path):
        """Load a model from ``target_path``. Not implemented yet.

        Fix: ``cls`` was missing from this classmethod's signature, so
        ``MultiTableSynthesizerModel.load(path)`` would have bound the class
        object to ``target_path`` and rejected the actual path argument.
        """
        pass
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ class SynthesizerModel:
random_states = None

def __init__(self, transformer=None, sampler=None) -> None:
# 以下几个变量都需要在初始化 model 时进行更改
self.model = None # 存放模型
self.model = None
self.status = "UNFINED"
self.model_type = "MODEL_TYPE_UNDEFINED"
# self.epochs = epochs
Expand Down
2 changes: 1 addition & 1 deletion sdgx/models/statistics/single_table/copula.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
unflatten_dict,
validate_numerical_distributions,
)
from sdgx.models.statistics.base import SynthesizerModel
from sdgx.models.statistics.single_table.base import SynthesizerModel

LOGGER = logging.getLogger(__name__)

Expand Down
10 changes: 10 additions & 0 deletions sdgx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import urllib.request
import warnings
from contextlib import closing
from enum import Enum
from pathlib import Path
from typing import Callable

Expand Down Expand Up @@ -41,6 +42,15 @@
}


class DataAccessType(Enum):
    """
    Type of data access.

    Used by multi-table synthesizer models to select how table data is
    supplied: ``pd_data_frame`` for in-memory pandas DataFrames,
    ``sdgx_data_loader`` for sdgx DataLoader objects.
    """

    pd_data_frame = 1
    sdgx_data_loader = 2


def find_free_port():
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(("", 0))
Expand Down
23 changes: 23 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@

from sdgx.data_connectors.csv_connector import CsvConnector
from sdgx.data_loader import DataLoader
from sdgx.data_models.combiner import MetadataCombiner
from sdgx.data_models.metadata import Metadata
from sdgx.data_models.relationship import Relationship
from sdgx.utils import download_demo_data, download_multi_table_demo_data

_HERE = os.path.dirname(__file__)
Expand Down Expand Up @@ -158,3 +160,24 @@ def demo_multi_table_data_loader(demo_multi_table_data_connector, cacher_kwargs)
yield loader_dict
for each_table in demo_multi_table_data_connector.keys():
demo_multi_table_data_connector[each_table].finalize()


@pytest.fixture
def demo_multi_data_relationship():
    # Single store -> train relationship joined on the "Store" column.
    relationship = Relationship.build(
        parent_table="store", child_table="train", foreign_keys=["Store"]
    )
    yield relationship


@pytest.fixture
def demo_multi_table_data_metadata_combiner(
    demo_multi_table_data_loader, demo_multi_data_relationship
):
    # 1. build one Metadata per table from its data loader
    metadata_dict = {
        table_name: Metadata.from_dataloader(demo_multi_table_data_loader[table_name])
        for table_name in demo_multi_table_data_loader
    }
    # 2. the relationship is supplied pre-built by the sibling fixture
    # 3. combine metadata and relationship into one combiner
    combiner = MetadataCombiner(
        named_metadata=metadata_dict, relationships=[demo_multi_data_relationship]
    )

    yield combiner
33 changes: 33 additions & 0 deletions tests/models/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

from collections import defaultdict, namedtuple

import pytest

from sdgx.models.statistics.multi_tables.base import MultiTableSynthesizerModel
from sdgx.utils import DataAccessType


@pytest.fixture
def demo_base_multi_table_synthesizer(
    demo_multi_table_data_metadata_combiner, demo_multi_table_data_loader
):
    # Bare base-class instance wired to the demo multi-table fixtures,
    # accessing data through sdgx DataLoaders.
    model = MultiTableSynthesizerModel(
        metadata_combiner=demo_multi_table_data_metadata_combiner,
        data_access_method=DataAccessType.sdgx_data_loader,
        tables_data_loader=demo_multi_table_data_loader,
    )
    yield model


def test_base_multi_table_synthesizer(demo_base_multi_table_synthesizer):
    """The base synthesizer derives its maps and foreign keys from the
    store/train relationship declared in the metadata combiner."""
    synthesizer = demo_base_multi_table_synthesizer
    KeyTuple = namedtuple("KeyTuple", ["parent", "child"])

    expected_parent_map = defaultdict(None, {"train": "store"})
    expected_child_map = defaultdict(None, {"store": "train"})
    assert synthesizer.parent_map == expected_parent_map
    assert synthesizer.child_map == expected_child_map

    first_foreign_key = synthesizer._get_all_foreign_keys("train")[0][0]
    assert first_foreign_key == KeyTuple(parent="Store", child="Store")


# Allow running this test module directly: python tests/models/test_base.py
if __name__ == "__main__":
    pytest.main(["-vv", "-s", __file__])