Skip to content

Commit

Permalink
[0.1.0] Metadata and Inspector (#67)
Browse files Browse the repository at this point in the history
- Introduce `Metadata` (not yet implemented)
- Introduce `Inspector` for generating metadata from `DataLoader`
- Implement a `GeneratorConnector` for connecting `ProcessedData`
- WIP: more logic in `Synthesizer`
  • Loading branch information
Wh1isper committed Dec 16, 2023
1 parent 8800c62 commit 35beee7
Show file tree
Hide file tree
Showing 44 changed files with 738 additions and 101 deletions.
1 change: 1 addition & 0 deletions .github/workflows/extension.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
python -m pip install -e .[test]
- name: Install all packages in example/extension
run: |
python -m pip install -e example/extension/dummymetadatainspector[test]
python -m pip install -e example/extension/dummycache[test]
python -m pip install -e example/extension/dummydataconnector[test]
python -m pip install -e example/extension/dummydataprocessor[test]
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# End of https://www.toptal.com/developers/gitignore/api/macos,emacs,python

Expand Down
10 changes: 10 additions & 0 deletions docs/source/api_reference/data_connectors/generator_connector.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
GeneratorConnector
=====================================


.. autoclass:: sdgx.data_connectors.generator_connector.GeneratorConnector
:members:
:undoc-members:
:inherited-members:
:show-inheritance:
:private-members:
1 change: 1 addition & 0 deletions docs/source/api_reference/data_connectors/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Built-in DataConnector

DataConnector <base>
CsvConnector <csv_connector>
GeneratorConnector <generator_connector>

Custom DataConnector Relevant
-----------------------------
Expand Down
2 changes: 1 addition & 1 deletion example/2_guassian_copula_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# ipython -i example/2_guassian_copula_example.py
# and inspect the sampled_data variable

from sdgx.statistics.single_table.copula import GaussianCopulaSynthesizer
from sdgx.models.statistics.single_table.copula import GaussianCopulaSynthesizer
from sdgx.utils.io.csv_utils import *

# For small-scale data in CSV format
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "0.1.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

from sdgx.data_models.inspectors.base import Inspector
from sdgx.data_models.inspectors.extension import hookimpl


class MyOwnInspector(Inspector):
    """
    Dummy inspector used by the extension example and its tests.

    It adds no behaviour of its own — it only proves that third-party
    inspectors are discoverable through the plugin entry point.
    """

    ...


@hookimpl
def register(manager):
    # Entry point discovered via the ``sdgx.metadata.inspector`` pluggy hook:
    # expose MyOwnInspector to sdgx under the name "DummyInspector".
    manager.register("DummyInspector", MyOwnInspector)
27 changes: 27 additions & 0 deletions example/extension/dummymetadatainspector/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "dummymetadatainspector"
dependencies = ["sdgx"]
dynamic = ["version"]
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
]
[project.optional-dependencies]
test = ["pytest"]

[tool.check-manifest]
ignore = [".*"]

[tool.hatch.version]
path = "dummymetadatainspector/__init__.py"

[project.entry-points."sdgx.metadata.inspector"]
dummymetadatainspector = "dummymetadatainspector.inspector"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

from sdgx.data_models.inspectors.manager import InspectorManager


@pytest.fixture
def manager():
    # Fresh InspectorManager per test so plugin-registration state never
    # leaks between test cases.
    yield InspectorManager()


def test_registered_inspector(manager: InspectorManager):
    """The dummy extension's inspector is discovered via its entry point.

    Renamed from ``test_registed_cacher``: this file tests the
    InspectorManager, not a cacher (name was copy-pasted from the
    dummycache example), and "registed" was a typo.
    """
    # ``registed_inspectors`` is the (typo'd) public attribute of the
    # project's InspectorManager — left untouched here.
    assert manager._normalize_name("DummyInspector") in manager.registed_inspectors


if __name__ == "__main__":
    pytest.main(["-vv", "-s", __file__])
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"pluggy",
"loguru",
"pyarrow",
"pydantic>=2"
]
dynamic = ["version"]
classifiers = [
Expand Down
6 changes: 6 additions & 0 deletions sdgx/cachers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ def load_all(self, data_connector: DataConnector) -> pd.DataFrame:

raise NotImplementedError

    def clear_cache(self):
        """
        Clear all cache.

        Default implementation is a no-op; subclasses that actually
        persist data (e.g. disk-based cachers) should override this.
        """
        return

def clear_invalid_cache(self):
"""
Clear invalid cache.
Expand Down
10 changes: 9 additions & 1 deletion sdgx/cachers/disk_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,21 @@ def __init__(
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)

def clear_invalid_cache(self):
    def clear_cache(self):
        """
        Clear all cache: remove every cached chunk (``*.parquet``)
        under ``cache_dir``.
        """
        for f in self.cache_dir.glob("*.parquet"):
            f.unlink()

    def clear_invalid_cache(self):
        """
        Clear invalid cache.

        There is currently no per-chunk validity tracking, so this simply
        drops the entire cache directory contents.
        TODO: Improve cache invalidation
        """
        return self.clear_cache()

def _get_cache_filename(self, offset: int) -> Path:
"""
Get cache filename
Expand Down
8 changes: 7 additions & 1 deletion sdgx/data_connectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def iter(self, offset=0, chunksize=0) -> Generator[pd.DataFrame, None, None]:
Returns:
Generator[pd.DataFrame, None, None]: Generator/Iterator for readed dataframe
"""
raise NotImplementedError
return self._iter(offset, chunksize)

def read(self, offset=0, limit=None) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -82,3 +82,9 @@ def keys(self) -> list[str]:
Same as ``columns``.
"""
return self.columns()

    def finalize(self):
        """
        Finalize the data connector.

        Hook for releasing resources (file handles, connections, ...).
        The base implementation does nothing; subclasses override as needed.
        """
        pass
6 changes: 5 additions & 1 deletion sdgx/data_connectors/csv_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ def _columns(self) -> list[str]:
).columns.tolist()
return d

def iter(self, offset=0, chunksize=1000) -> Generator[pd.DataFrame, None, None]:
def _iter(self, offset=0, chunksize=1000) -> Generator[pd.DataFrame, None, None]:
if chunksize is None:
yield self._read(offset=offset)
return

for d in pd.read_csv(
self.path,
sep=self.sep,
Expand Down
60 changes: 60 additions & 0 deletions sdgx/data_connectors/generator_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

from typing import Callable, Generator

import pandas as pd

from sdgx.data_connectors.base import DataConnector


class GeneratorConnector(DataConnector):
    """
    A virtual data connector that wraps a `Generator <https://docs.python.org/3/glossary.html#term-generator>`_
    into a DataConnector.

    Passing ``offset=0`` to ``read`` will reset the generator.

    Warning:
        ``offset`` and ``limit`` are ignored, as ``Generator`` does not
        support random access.

    Note:
        This connector is not registered by default, so it can only be used
        programmatically (the "library way"), not via the plugin manager.
    """

    @property
    def identity(self) -> str:
        # The id() of the generator factory uniquely identifies this
        # connector instance, e.g. for cache keying.
        return f"{id(self.generator_caller)}"

    def __init__(
        self,
        generator_caller: Callable[[], Generator[pd.DataFrame, None, None]],
        *args,
        **kwargs,
    ):
        """
        Args:
            generator_caller: Zero-argument factory returning a fresh
                DataFrame generator each time it is called.
        """
        super().__init__(*args, **kwargs)
        self.generator_caller = generator_caller
        # Live generator consumed by sequential ``_read`` calls; recreated
        # whenever ``_read`` is asked for offset 0.
        self._generator = self.generator_caller()

    def _read(self, offset=0, limit=None) -> pd.DataFrame | None:
        """
        Ignore ``limit`` and allow sequential reading.

        Returns the next chunk from the generator, or ``None`` once the
        generator is exhausted. ``offset == 0`` restarts the generator.
        """
        if offset == 0:
            self._generator = self.generator_caller()

        try:
            return next(self._generator)
        except StopIteration:
            return None

    def _columns(self) -> list[str]:
        # Peek at the first chunk of a fresh generator to learn the columns.
        # NOTE(review): implicitly returns None if the generator yields
        # nothing — callers appear to assume a non-empty generator; confirm.
        for df in self._iter():
            return list(df.columns)

    def _iter(self, offset=0, chunksize=0) -> Generator[pd.DataFrame, None, None]:
        """
        Iterate over a fresh generator.

        ``offset`` and ``chunksize`` are ignored (no random access); the
        chunking is whatever the underlying generator chooses to yield.
        """
        return self.generator_caller()
18 changes: 14 additions & 4 deletions sdgx/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from typing import Any, Generator

import pandas as pd
Expand All @@ -24,7 +26,7 @@ class DataLoader:
def __init__(
self,
data_connector: DataConnector,
chunksize: int = 1000,
chunksize: int = 10000,
cacher: Cacher | None = None,
cache_mode: str = "DiskCache",
cacher_kwargs: None | dict[str, Any] = None,
Expand All @@ -33,11 +35,11 @@ def __init__(
self.chunksize = chunksize
self.cache_manager = CacherManager()

if not cacher_kwargs:
cacher_kwargs = {}
cacher_kwargs.setdefault("blocksize", self.chunksize)
cacher_kwargs.setdefault("identity", self.data_connector.identity)
if not cacher:
self.cacher = self.cache_manager.init_cacher(cache_mode, **cacher_kwargs)
self.cacher = cacher
self.cacher = cacher or self.cache_manager.init_cacher(cache_mode, **cacher_kwargs)

self.cacher.clear_invalid_cache()

Expand Down Expand Up @@ -68,3 +70,11 @@ def load_all(self) -> pd.DataFrame:
Load all data from cache.
"""
return self.cacher.load_all(self.data_connector)

    def finalize(self, clear_cache=False) -> None:
        """
        Finalize the dataloader.

        Args:
            clear_cache: When True, also drop all cached chunks after the
                underlying data connector has been finalized.
        """
        self.data_connector.finalize()
        if clear_cache:
            self.cacher.clear_cache()
File renamed without changes.
25 changes: 25 additions & 0 deletions sdgx/data_models/inspectors/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd

from sdgx.data_models.inspectors.inspect_meta import InspectMeta


class Inspector:
    """
    Base Inspector class.

    Inspector is used to inspect data and generate metadata automatically.
    """

    # True once the inspector has seen enough data (all fields fitted) to
    # produce metadata via ``inspect``.
    ready: bool

    def fit(self, raw_data: pd.DataFrame):
        """Fit the inspector.

        Default is a no-op; stateful inspectors override this to gather
        statistics from the data.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        return

    def inspect(self) -> InspectMeta:
        """Inspect raw data and generate metadata.

        The original stub silently returned ``None`` despite the declared
        ``InspectMeta`` return type; raising keeps it consistent with the
        project's other abstract bases (e.g. ``Cacher.load_all``).

        Raises:
            NotImplementedError: Subclasses must implement this.
        """
        raise NotImplementedError
56 changes: 56 additions & 0 deletions sdgx/data_models/inspectors/extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations

from typing import Any

import pluggy

project_name = "sdgx.metadata.inspector"
"""
The entry-point name of this extension.
Should be used in ``pyproject.toml`` as ``[project.entry-points."{project_name}"]``
"""
hookimpl = pluggy.HookimplMarker(project_name)
"""
Hookimpl marker for this extension, extension module should use this marker
Example:
.. code-block:: python
@hookimpl
def register(manager):
...
"""

hookspec = pluggy.HookspecMarker(project_name)


@hookspec
def register(manager):
    """
    Hook specification: register your Inspector with the manager.

    For more information about this function, please check the :ref:`manager`.

    We provide an example package in
    ``{project_root}/example/extension/dummymetadatainspector``.

    Example:

        .. code-block:: python

            class MyOwnInspector(Inspector):
                ...


            from sdgx.data_models.inspectors.extension import hookimpl


            @hookimpl
            def register(manager):
                manager.register("DummyInspector", MyOwnInspector)

    Configure ``project.entry-points`` so that we can find it:

    .. code-block:: toml

        [project.entry-points."sdgx.metadata.inspector"]
        {whatever-name} = "{package}.{path}.{to}.{file-with-hookimpl-function}"
    """
5 changes: 5 additions & 0 deletions sdgx/data_models/inspectors/inspect_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from pydantic import BaseModel


class InspectMeta(BaseModel):
    """
    Metadata produced by an ``Inspector``.

    Placeholder pydantic model — fields will be added as Metadata support is
    implemented. The original declared a plain class, leaving the
    ``BaseModel`` import (and the ``pydantic>=2`` dependency added in the
    same commit) unused; inheriting from ``BaseModel`` matches the evident
    intent. NOTE(review): confirm no caller relies on a no-arg plain class.
    """
Loading

0 comments on commit 35beee7

Please sign in to comment.