Skip to content

Commit

Permalink
Add dict support on metadata, optimize datetime format judgment rules…
Browse files Browse the repository at this point in the history
…, add eq for combiner (#135)

* Update metadata.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add dict support in metadata

* update datetime fmt detect

still draft

* Update datetime.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add __eq__ for combiner

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update datetime.py

* add datetime_format and add indent

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add tag/format fields

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update metadata.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Apply suggestions from code review

Co-authored-by: Zhongsheng Ji <9573586@qq.com>

* Update datetime.py

* Update datetime.py

* Update sdgx/data_models/inspectors/datetime.py

* Update sdgx/data_models/inspectors/datetime.py

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Zhongsheng Ji <9573586@qq.com>
  • Loading branch information
3 people committed Feb 7, 2024
1 parent 782bcfe commit 775e605
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 10 deletions.
25 changes: 25 additions & 0 deletions sdgx/data_models/combiner.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from collections.abc import Iterable
from itertools import chain
from pathlib import Path
from typing import Dict, List

Expand Down Expand Up @@ -258,3 +260,26 @@ def upgrade(
"""

pass

@property
def fields(self) -> Iterable[str]:
    """
    Yield the names of every tag field on MetadataCombiner.

    A tag field is any pydantic model field whose name ends with
    ``"_columns"``.
    """
    column_fields = (name for name in self.model_fields if name.endswith("_columns"))
    return chain(column_fields)

def __eq__(self, other):
    """
    Compare two MetadataCombiner objects field by field.

    Non-MetadataCombiner operands fall back to the parent class
    comparison.  Two combiners are equal when they share the same
    version, the same value for every tag field either side declares,
    and the same set of tag-field names.
    """
    if not isinstance(other, MetadataCombiner):
        return super().__eq__(other)

    # Guard clauses mirror the original and-chain's short-circuit order.
    if self.version != other.version:
        return False
    all_keys = set(chain(self.fields, other.fields))
    if not all(self.get(key) == other.get(key) for key in all_keys):
        return False
    return set(self.fields) == set(other.fields)
26 changes: 22 additions & 4 deletions sdgx/data_models/inspectors/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,19 @@ class DatetimeInspector(Inspector):
Often, difficult-to-recognize date or datetime objects are also recognized as discrete types by DatetimeInspector, causing the column to be marked repeatedly.
"""
PRESET_FORMAT_STRINGS = ["%Y/%m/%d", "%Y-%m-%d", "%d %b %Y"]

_format_match_rate = 0.9
"""
When specifically checking the datetime format, problems caused by missing values and incorrect values will inevitably occur.
To fix this, we discard the .any() method and use the `match_rate` to increase the robustness of this inspector.
"""

PRESET_FORMAT_STRINGS = [
"%Y-%m-%d",
"%d %b %Y",
"%b-%Y",
"%Y/%m/%d",
]

def __init__(self, user_formats: list[str] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -85,19 +97,25 @@ def detect_datetime_format(self, series: pd.Series):
Returns:
str: The datetime format that can parse all dates in the series, or None if no such format is found.
"""

def _is_series_fit_format(parsed_series, match_rate):
length = len(parsed_series)
false_num = len(list(i for i in parsed_series if i is False))
false_rate = false_num / length
return false_rate >= match_rate

for fmt in self.user_defined_formats + self.PRESET_FORMAT_STRINGS:
try:
# Check if all dates in the series can be parsed with this format
parsed_series = series.apply(
lambda x: pd.to_datetime(x, format=fmt, errors="coerce")
)
if not parsed_series.isnull().any():
# if fit return format, return
if _is_series_fit_format(parsed_series.isnull(), self._format_match_rate):
return fmt
except ValueError:
continue

self.ready = True

def inspect(self, *args, **kwargs) -> dict[str, Any]:
"""Inspect raw data and generate metadata."""

Expand Down
50 changes: 44 additions & 6 deletions sdgx/data_models/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class Metadata(BaseModel):
bool_columns: Set[str] = set()
discrete_columns: Set[str] = set()
datetime_columns: Set[str] = set()
datetime_format: Dict = defaultdict(str)

# version info
version: str = "1.0"
Expand All @@ -78,7 +79,18 @@ def tag_fields(self) -> Iterable[str]:

return chain(
(k for k in self.model_fields if k.endswith("_columns")),
self._extend.keys(),
(k for k in self._extend.keys() if k.endswith("_columns")),
)

@property
def format_fields(self) -> Iterable[str]:
    """
    Return all format fields in this metadata.

    A format field is any field whose name ends with ``"_format"``
    (e.g. ``datetime_format``), collected from both the declared
    pydantic model fields and the ``_extend`` dict.
    """

    return chain(
        (k for k in self.model_fields if k.endswith("_format")),
        (k for k in self._extend.keys() if k.endswith("_format")),
    )

def __eq__(self, other):
Expand All @@ -90,6 +102,10 @@ def __eq__(self, other):
self.get(key) == other.get(key)
for key in set(chain(self.tag_fields, other.tag_fields))
)
and all(
self.get(key) == other.get(key)
for key in set(chain(self.format_fields, other.format_fields))
)
and self.version == other.version
)

Expand Down Expand Up @@ -149,7 +165,11 @@ def set(self, key: str, value: Any):
raise MetadataInitError("Cannot set _extend directly")

old_value = self.get(key)
if key in self.model_fields and key not in self.tag_fields:
if (
key in self.model_fields
and key not in self.tag_fields
and key not in self.format_fields
):
raise MetadataInitError(
f"Set {key} not in tag_fields, try set it directly as m.{key} = value"
)
Expand Down Expand Up @@ -181,12 +201,28 @@ def add(self, key: str, values: str | Iterable[str]):
m.add("id_columns", "ticket_id")
# OR
m.add("id_columns", ["user_id", "ticket_id"])
# OR
# add datetime format
m.add('datetime_format',{"col_1": "%Y-%m-%d %H:%M:%S", "col_2": "%d %b %Y"})
"""

values = (
values if isinstance(values, Iterable) and not isinstance(values, str) else [values]
)

# dict support, this prevents the value in the key-value pair from being discarded
if isinstance(values, dict):
# already in fields that contains dict
if key in list(self.format_fields):
self.get(key).update(values)

# in extend
if self._extend.get(key, None) is None:
self._extend[key] = values
else:
self._extend[key].update(values)
return

for value in values:
self.get(key).add(value)

Expand Down Expand Up @@ -274,7 +310,8 @@ def from_dataloader(
metadata.update({"pii_columns": inspect_res[each_key]})
# update inspect level
for each_key in inspect_res:
metadata.column_inspect_level[each_key] = inspector.inspect_level
if "columns" in each_key:
metadata.column_inspect_level[each_key] = inspector.inspect_level

if not primary_keys:
metadata.update_primary_key(metadata.id_columns)
Expand Down Expand Up @@ -326,14 +363,15 @@ def from_dataframe(
metadata.update({"pii_columns": inspect_res[each_key]})
# update inspect level
for each_key in inspect_res:
metadata.column_inspect_level[each_key] = inspector.inspect_level
if "columns" in each_key:
metadata.column_inspect_level[each_key] = inspector.inspect_level

if check:
metadata.check()
return metadata

def _dump_json(self):
return self.model_dump_json()
def _dump_json(self) -> str:
    """Serialize this metadata to a 4-space-indented JSON string via pydantic's ``model_dump_json``."""
    return self.model_dump_json(indent=4)

def save(self, path: str | Path):
"""
Expand Down

0 comments on commit 775e605

Please sign in to comment.