Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[0.1.0] Metadata and Inspector #67

Merged
merged 14 commits into from
Dec 16, 2023
1 change: 1 addition & 0 deletions .github/workflows/extension.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
python -m pip install -e .[test]
- name: Install all packages in example/extension
run: |
python -m pip install -e example/extension/dummymetadatainspector[test]
python -m pip install -e example/extension/dummycache[test]
python -m pip install -e example/extension/dummydataconnector[test]
python -m pip install -e example/extension/dummydataprocessor[test]
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# End of https://www.toptal.com/developers/gitignore/api/macos,emacs,python

Expand Down
10 changes: 10 additions & 0 deletions docs/source/api_reference/data_connectors/generator_connector.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
GeneratorConnector
=====================================


.. autoclass:: sdgx.data_connectors.generator_connector.GeneratorConnector
:members:
:undoc-members:
:inherited-members:
:show-inheritance:
:private-members:
1 change: 1 addition & 0 deletions docs/source/api_reference/data_connectors/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Built-in DataConnector

DataConnector <base>
CsvConnector <csv_connector>
GeneratorConnector <generator_connector>

Custom DataConnector Relevant
-----------------------------
Expand Down
2 changes: 1 addition & 1 deletion example/2_guassian_copula_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# ipython -i example/2_guassian_copula_example.py
# 并查看 sampled_data 变量

from sdgx.statistics.single_table.copula import GaussianCopulaSynthesizer
from sdgx.models.statistics.single_table.copula import GaussianCopulaSynthesizer
from sdgx.utils.io.csv_utils import *

# 针对 csv 格式的小规模数据
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "0.1.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

from sdgx.data_models.inspectors.base import Inspector
from sdgx.data_models.inspectors.extension import hookimpl


class MyOwnInspector(Inspector):
    # Minimal placeholder Inspector: exists only to demonstrate how an
    # extension package registers a custom inspector with sdgx.
    ...


@hookimpl
def register(manager):
    """Hook implementation: register ``MyOwnInspector`` under the name "DummyInspector"."""
    manager.register("DummyInspector", MyOwnInspector)
27 changes: 27 additions & 0 deletions example/extension/dummymetadatainspector/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "dummymetadatainspector"
dependencies = ["sdgx"]
dynamic = ["version"]
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
]
[project.optional-dependencies]
test = ["pytest"]

[tool.check-manifest]
ignore = [".*"]

[tool.hatch.version]
path = "dummymetadatainspector/__init__.py"

[project.entry-points."sdgx.metadata.inspector"]
dummymetadatainspector = "dummymetadatainspector.inspector"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

from sdgx.data_models.inspectors.manager import InspectorManager


@pytest.fixture
def manager():
    # Yield a fresh InspectorManager per test so registration state is isolated.
    yield InspectorManager()


def test_registed_cacher(manager: InspectorManager):
    # The dummy extension's entry point should have registered "DummyInspector";
    # compare through _normalize_name to match the manager's naming scheme.
    # NOTE(review): the function name says "cacher" but this checks inspectors —
    # looks copy/pasted from the dummycache example; confirm before renaming.
    assert manager._normalize_name("DummyInspector") in manager.registed_inspectors


if __name__ == "__main__":
    # Allow running this test module directly, outside a pytest invocation.
    pytest.main(["-vv", "-s", __file__])
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dependencies = [
"pluggy",
"loguru",
"pyarrow",
"pydantic>=2"
]
dynamic = ["version"]
classifiers = [
Expand Down
6 changes: 6 additions & 0 deletions sdgx/cachers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ def load_all(self, data_connector: DataConnector) -> pd.DataFrame:

raise NotImplementedError

def clear_cache(self):
    """
    Clear all cache.

    The base implementation is deliberately a no-op so cachers without
    persistent storage need not override it; storage-backed subclasses
    (e.g. the disk cacher) override this to delete their cached data.
    """
    return

def clear_invalid_cache(self):
"""
Clear invalid cache.
Expand Down
10 changes: 9 additions & 1 deletion sdgx/cachers/disk_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,21 @@ def __init__(
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)

def clear_invalid_cache(self):
def clear_cache(self):
    """Delete every cached parquet file under ``cache_dir``."""
    for cached_file in self.cache_dir.glob("*.parquet"):
        cached_file.unlink()

def clear_invalid_cache(self):
    """
    Clear invalid cache in ``cache_dir``.

    TODO: Improve cache invalidation — per-entry validity is not tracked
    yet, so this currently wipes the whole cache directory.
    """
    # Delegate to clear_cache; both return None.
    return self.clear_cache()

def _get_cache_filename(self, offset: int) -> Path:
"""
Get cache filename
Expand Down
8 changes: 7 additions & 1 deletion sdgx/data_connectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def iter(self, offset=0, chunksize=0) -> Generator[pd.DataFrame, None, None]:
Returns:
Generator[pd.DataFrame, None, None]: Generator/Iterator for readed dataframe
"""
raise NotImplementedError
return self._iter(offset, chunksize)

def read(self, offset=0, limit=None) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -82,3 +82,9 @@ def keys(self) -> list[str]:
Same as ``columns``.
"""
return self.columns()

def finalize(self):
    """
    Finalize the data connector.

    Hook for releasing any resources the connector holds. The base
    implementation is a no-op; subclasses with resources to release
    should override it. Invoked by ``DataLoader.finalize``.
    """
    pass
6 changes: 5 additions & 1 deletion sdgx/data_connectors/csv_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ def _columns(self) -> list[str]:
).columns.tolist()
return d

def iter(self, offset=0, chunksize=1000) -> Generator[pd.DataFrame, None, None]:
def _iter(self, offset=0, chunksize=1000) -> Generator[pd.DataFrame, None, None]:
if chunksize is None:
yield self._read(offset=offset)
return

for d in pd.read_csv(
self.path,
sep=self.sep,
Expand Down
60 changes: 60 additions & 0 deletions sdgx/data_connectors/generator_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

from typing import Callable, Generator

import pandas as pd

from sdgx.data_connectors.base import DataConnector


class GeneratorConnector(DataConnector):
    """
    A virtual data connector that wraps a `Generator
    <https://docs.python.org/3/glossary.html#term-generator>`_ into a
    DataConnector.

    Passing ``offset=0`` to ``read`` will reset the generator.

    Warning:
        ``offset`` and ``limit`` are ignored because a ``Generator`` does
        not support random access.

    Note:
        This connector is not registered by default, so it can only be used
        programmatically (the "library way").
    """

    @property
    def identity(self) -> str:
        # Identify the connector by the id() of the wrapped callable, so two
        # connectors built around the same callable share one identity.
        return f"{id(self.generator_caller)}"

    def __init__(
        self,
        generator_caller: Callable[[], Generator[pd.DataFrame, None, None]],
        *args,
        **kwargs,
    ):
        """
        Args:
            generator_caller: Zero-argument callable that returns a *fresh*
                generator of ``pd.DataFrame`` chunks on each call. It must be
                re-callable, since ``_read(offset=0)`` and ``_iter`` restart
                iteration by calling it again.
        """
        super().__init__(*args, **kwargs)
        self.generator_caller = generator_caller
        self._generator = self.generator_caller()

    def _read(self, offset=0, limit=None) -> pd.DataFrame:
        """
        Ignore ``limit`` and allow sequential reading.

        ``offset == 0`` restarts the underlying generator; any other offset
        simply continues from the current position. Returns ``None`` once
        the generator is exhausted.
        """
        if offset == 0:
            self._generator = self.generator_caller()

        try:
            return next(self._generator)
        except StopIteration:
            return None

    def _columns(self) -> list[str]:
        # Peek at the first chunk to discover the column names.
        for df in self._iter():
            return list(df.columns)
        # Fix: an empty generator previously fell through and implicitly
        # returned None, violating the declared list[str] return type.
        return []

    def _iter(self, offset=0, chunksize=0) -> Generator[pd.DataFrame, None, None]:
        """
        Return a fresh generator over all chunks.

        ``offset`` and ``chunksize`` are ignored (no random access).
        """
        return self.generator_caller()
18 changes: 14 additions & 4 deletions sdgx/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

from typing import Any, Generator

import pandas as pd
Expand All @@ -24,7 +26,7 @@ class DataLoader:
def __init__(
self,
data_connector: DataConnector,
chunksize: int = 1000,
chunksize: int = 10000,
cacher: Cacher | None = None,
cache_mode: str = "DiskCache",
cacher_kwargs: None | dict[str, Any] = None,
Expand All @@ -33,11 +35,11 @@ def __init__(
self.chunksize = chunksize
self.cache_manager = CacherManager()

if not cacher_kwargs:
cacher_kwargs = {}
cacher_kwargs.setdefault("blocksize", self.chunksize)
cacher_kwargs.setdefault("identity", self.data_connector.identity)
if not cacher:
self.cacher = self.cache_manager.init_cacher(cache_mode, **cacher_kwargs)
self.cacher = cacher
self.cacher = cacher or self.cache_manager.init_cacher(cache_mode, **cacher_kwargs)

self.cacher.clear_invalid_cache()

Expand Down Expand Up @@ -68,3 +70,11 @@ def load_all(self) -> pd.DataFrame:
Load all data from cache.
"""
return self.cacher.load_all(self.data_connector)

def finalize(self, clear_cache=False) -> None:
    """
    Finalize the dataloader.

    Delegates to ``data_connector.finalize()`` to release connector
    resources.

    Args:
        clear_cache (bool): If True, also delete all cached data via
            ``cacher.clear_cache()``. Defaults to False.
    """
    self.data_connector.finalize()
    if clear_cache:
        self.cacher.clear_cache()
25 changes: 25 additions & 0 deletions sdgx/data_models/inspectors/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd

from sdgx.data_models.inspectors.inspect_meta import InspectMeta


class Inspector:
    """
    Base Inspector class.

    Inspector is used to inspect data and generate metadata automatically.
    Subclasses implement :meth:`fit` to consume raw data and :meth:`inspect`
    to produce the resulting metadata.
    """

    ready: bool
    """Ready to inspect, maybe all fields are fitted."""

    def fit(self, raw_data: pd.DataFrame):
        """Fit the inspector.

        The base implementation is a no-op; subclasses override it to
        extract whatever they need from the raw data.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        return

    def inspect(self) -> "InspectMeta":
        """Inspect raw data and generate metadata.

        Raises:
            NotImplementedError: always — subclasses must override.
                The previous implicit ``return None`` silently violated the
                declared return type; raising matches the convention of the
                other base classes in this project (``Cacher.load_all``,
                ``DataConnector.iter``).
        """
        raise NotImplementedError
56 changes: 56 additions & 0 deletions sdgx/data_models/inspectors/extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations

from typing import Any

import pluggy

project_name = "sdgx.metadata.inspector"
"""
The entry-point name of this extension.
Should be used in ``pyproject.toml`` as ``[project.entry-points."{project_name}"]``
"""
hookimpl = pluggy.HookimplMarker(project_name)
"""
Hookimpl marker for this extension, extension module should use this marker
Example:
.. code-block:: python
@hookimpl
def register(manager):
...
"""

hookspec = pluggy.HookspecMarker(project_name)


@hookspec
def register(manager):
    """
    Hook specification: register custom Inspector classes with the manager.

    For more information about this function, please check the :ref:`manager`.

    We provided an example package for you in
    ``{project_root}/example/extension/dummymetadatainspector``.

    Example:

    .. code-block:: python

        class MyOwnInspector(Inspector):
            ...

        from sdgx.data_models.inspectors.extension import hookimpl

        @hookimpl
        def register(manager):
            manager.register("DummyInspector", MyOwnInspector)

    Config ``project.entry-points`` so that we can find it:

    .. code-block:: toml

        [project.entry-points."sdgx.metadata.inspector"]
        {whatever-name} = "{package}.{path}.{to}.{file-with-hookimpl-function}"
    """
5 changes: 5 additions & 0 deletions sdgx/data_models/inspectors/inspect_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from pydantic import BaseModel


class InspectMeta:
    # NOTE(review): ``BaseModel`` is imported above but never used — this was
    # presumably meant to be ``class InspectMeta(BaseModel):`` (the PR adds a
    # ``pydantic>=2`` dependency). Confirm intent before changing: subclassing
    # BaseModel alters construction/validation behavior for all callers.
    pass
Loading