
Add dict support on metadata, optimize datetime format judgment rules, add eq for combiner #135

Merged
merged 21 commits into from
Feb 7, 2024
25 changes: 25 additions & 0 deletions sdgx/data_models/combiner.py
@@ -1,5 +1,7 @@
from __future__ import annotations

from collections.abc import Iterable
from itertools import chain
from pathlib import Path
from typing import Dict, List

@@ -258,3 +260,26 @@ def upgrade(
"""

pass

@property
def fields(self) -> Iterable[str]:
"""
Return all column fields (those ending in "_columns") in this MetadataCombiner.
"""

return chain(
(k for k in self.model_fields if k.endswith("_columns")),
)

def __eq__(self, other):
if not isinstance(other, MetadataCombiner):
return super().__eq__(other)

# equal only when both have the same version, the same set of column fields, and equal values for every field
return (
self.version == other.version
and all(
self.get(key) == other.get(key) for key in set(chain(self.fields, other.fields))
)
and set(self.fields) == set(other.fields)
)
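For illustration, here is a standalone sketch (not part of the PR, and not the sdgx API) of the equality rule the new __eq__ implements: two combiners compare equal only when their versions match, their sets of "*_columns" fields match, and every field holds the same value. The TinyCombiner class below is a hypothetical stand-in used only to demonstrate the rule.

from itertools import chain

class TinyCombiner:
    version = "1.0"

    def __init__(self, **columns):
        # e.g. TinyCombiner(id_columns={"user_id"})
        self._columns = columns

    @property
    def fields(self):
        # mirrors MetadataCombiner.fields: only "*_columns" entries count
        return (k for k in self._columns if k.endswith("_columns"))

    def get(self, key):
        return self._columns.get(key)

    def __eq__(self, other):
        return (
            self.version == other.version
            and all(
                self.get(k) == other.get(k)
                for k in set(chain(self.fields, other.fields))
            )
            and set(self.fields) == set(other.fields)
        )

a = TinyCombiner(id_columns={"user_id"}, datetime_columns={"created_at"})
b = TinyCombiner(id_columns={"user_id"}, datetime_columns={"created_at"})
c = TinyCombiner(id_columns={"user_id"})
assert a == b  # same version, same fields, same values
assert a != c  # c is missing datetime_columns, so they differ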
28 changes: 24 additions & 4 deletions sdgx/data_models/inspectors/datetime.py
@@ -17,7 +17,19 @@ class DatetimeInspector(Inspector):

Often, difficult-to-recognize date or datetime objects are also recognized as discrete types by DatetimeInspector, causing the column to be marked repeatedly.
"""
PRESET_FORMAT_STRINGS = ["%Y/%m/%d", "%Y-%m-%d", "%d %b %Y"]

_format_match_rate = 0.9
"""
When checking the datetime format explicitly, problems caused by missing or incorrect values will inevitably occur.
To handle this, we discard the `.any()` check and use the `match_rate` threshold to make this inspector more robust.
"""

PRESET_FORMAT_STRINGS = [
"%Y-%m-%d",
"%d %b %Y",
"%b-%Y",
"%Y/%m/%d",
]

def __init__(self, user_formats: list[str] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -85,19 +97,27 @@ def detect_datetime_format(self, series: pd.Series):
Returns:
str: The datetime format that can parse all dates in the series, or None if no such format is found.
"""

def _is_series_fit_format(parsed_series, match_rate):
    # `parsed_series` is a boolean series where True marks a value that
    # failed to parse (i.e. the result of `.isnull()`).
    # A format is accepted when the share of successfully parsed values
    # reaches `match_rate`, which tolerates a few missing or bad values.
    length = len(parsed_series)
    parsed_num = sum(1 for is_null in parsed_series if not is_null)
    return parsed_num / length >= match_rate

for fmt in self.user_defined_formats + self.PRESET_FORMAT_STRINGS:
try:
# Check if all dates in the series can be parsed with this format
parsed_series = series.apply(
lambda x: pd.to_datetime(x, format=fmt, errors="coerce")
)
if not parsed_series.isnull().any():
# if enough values fit this format, return it
if _is_series_fit_format(parsed_series.isnull(), self._format_match_rate):
return fmt
except ValueError:
continue

self.ready = True

def inspect(self, *args, **kwargs) -> dict[str, Any]:
"""Inspect raw data and generate metadata."""

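A small usage sketch of the relaxed matching rule above (not part of the PR). It assumes DatetimeInspector can be constructed with no arguments, as the __init__ in this diff suggests; with _format_match_rate = 0.9, one unparseable value out of twenty (a 95% match) no longer prevents the format from being reported, whereas the old .any()-based check rejected it.

import pandas as pd

from sdgx.data_models.inspectors.datetime import DatetimeInspector

# 19 well-formed dates plus one missing value -> 95% of values parse
values = ["2024-01-%02d" % day for day in range(1, 20)] + [None]
series = pd.Series(values)

inspector = DatetimeInspector()
fmt = inspector.detect_datetime_format(series)
print(fmt)  # expected: "%Y-%m-%d"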
50 changes: 44 additions & 6 deletions sdgx/data_models/metadata.py
@@ -62,6 +62,7 @@ class Metadata(BaseModel):
bool_columns: Set[str] = set()
discrete_columns: Set[str] = set()
datetime_columns: Set[str] = set()
datetime_format: Dict = defaultdict(str)

# version info
version: str = "1.0"
@@ -78,7 +79,18 @@ def tag_fields(self) -> Iterable[str]:

return chain(
(k for k in self.model_fields if k.endswith("_columns")),
self._extend.keys(),
(k for k in self._extend.keys() if k.endswith("_columns")),
)

@property
def format_fields(self) -> Iterable[str]:
"""
Return all format fields in this metadata.
"""

return chain(
(k for k in self.model_fields if k.endswith("_format")),
(k for k in self._extend.keys() if k.endswith("_format")),
)

def __eq__(self, other):
@@ -90,6 +102,10 @@ def __eq__(self, other):
self.get(key) == other.get(key)
for key in set(chain(self.tag_fields, other.tag_fields))
)
and all(
self.get(key) == other.get(key)
for key in set(chain(self.format_fields, other.format_fields))
)
and self.version == other.version
)

@@ -149,7 +165,11 @@ def set(self, key: str, value: Any):
raise MetadataInitError("Cannot set _extend directly")

old_value = self.get(key)
if key in self.model_fields and key not in self.tag_fields:
if (
key in self.model_fields
and key not in self.tag_fields
and key not in self.format_fields
):
raise MetadataInitError(
f"Set {key} not in tag_fields, try set it directly as m.{key} = value"
)
@@ -181,12 +201,28 @@ def add(self, key: str, values: str | Iterable[str]):
m.add("id_columns", "ticket_id")
# OR
m.add("id_columns", ["user_id", "ticket_id"])
# OR
# add datetime format
m.add("datetime_format", {"col_1": "%Y-%m-%d %H:%M:%S", "col_2": "%d %b %Y"})
"""

values = (
values if isinstance(values, Iterable) and not isinstance(values, str) else [values]
)

# dict support: merge the key-value pairs so their values are not discarded
if isinstance(values, dict):
    # the key is already a dict-valued field (e.g. datetime_format)
    if key in list(self.format_fields):
        self.get(key).update(values)

    # also record the mapping in _extend, merging with any existing dict
    if self._extend.get(key, None) is None:
        self._extend[key] = values
    else:
        self._extend[key].update(values)
    return

for value in values:
self.get(key).add(value)

@@ -274,7 +310,8 @@ def from_dataloader(
metadata.update({"pii_columns": inspect_res[each_key]})
# update inspect level
for each_key in inspect_res:
metadata.column_inspect_level[each_key] = inspector.inspect_level
if "columns" in each_key:
metadata.column_inspect_level[each_key] = inspector.inspect_level

if not primary_keys:
metadata.update_primary_key(metadata.id_columns)
@@ -326,14 +363,15 @@ def from_dataframe(
metadata.update({"pii_columns": inspect_res[each_key]})
# update inspect level
for each_key in inspect_res:
metadata.column_inspect_level[each_key] = inspector.inspect_level
if "columns" in each_key:
metadata.column_inspect_level[each_key] = inspector.inspect_level

if check:
metadata.check()
return metadata

def _dump_json(self):
return self.model_dump_json()
def _dump_json(self) -> str:
return self.model_dump_json(indent=4)

def save(self, path: str | Path):
"""
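A minimal sketch of the new dict support on Metadata.add (not part of the PR). It assumes Metadata() can be constructed with its defaults, as the field declarations above suggest; in practice the object usually comes from Metadata.from_dataframe(...) or Metadata.from_dataloader(...).

from sdgx.data_models.metadata import Metadata

m = Metadata()
m.add("datetime_columns", ["create_time", "update_time"])

# dict values are merged into the "*_format" field instead of being discarded
m.add("datetime_format", {"create_time": "%Y-%m-%d %H:%M:%S"})
m.add("datetime_format", {"update_time": "%d %b %Y"})
print(m.get("datetime_format"))  # prints the merged mapping for both columns

# __eq__ now also compares the "*_format" fields, so two otherwise identical
# metadata objects with different datetime formats no longer compare equal.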