Skip to content

Commit

Permalink
Add dict support on metadata, optimize datetime format judgment rules…
Browse files Browse the repository at this point in the history
…, add eq for combiner (#135)

* Update metadata.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add dict support in metadata

* update datetime fmt detect

still draft

* Update datetime.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add __eq__ for combiner

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update datetime.py

* add datetime_format and add indent

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add tag/format fields

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update metadata.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Apply suggestions from code review

Co-authored-by: Zhongsheng Ji <9573586@qq.com>

* Update datetime.py

* Update datetime.py

* Update sdgx/data_models/inspectors/datetime.py

* Update sdgx/data_models/inspectors/datetime.py

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Zhongsheng Ji <9573586@qq.com>
  • Loading branch information
3 people committed Feb 7, 2024
1 parent 782bcfe commit 775e605
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 10 deletions.
25 changes: 25 additions & 0 deletions sdgx/data_models/combiner.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from collections.abc import Iterable
from itertools import chain
from pathlib import Path
from typing import Dict, List

Expand Down Expand Up @@ -258,3 +260,26 @@ def upgrade(
"""

pass

@property
def fields(self) -> Iterable[str]:
    """
    Yield the names of every tag field on MetadataCombiner.

    A tag field is any pydantic model field whose name ends with
    ``"_columns"``.
    """
    column_fields = (name for name in self.model_fields if name.endswith("_columns"))
    return chain(column_fields)

def __eq__(self, other):
    """
    Compare two MetadataCombiner objects field by field.

    Non-MetadataCombiner operands fall back to the parent class
    comparison.  Two combiners are equal when they share the same
    version, the same value for every tag field either side declares,
    and the same set of tag-field names.
    """
    if not isinstance(other, MetadataCombiner):
        return super().__eq__(other)

    # Guard clauses mirror the original and-chain's short-circuit order.
    if self.version != other.version:
        return False
    all_keys = set(chain(self.fields, other.fields))
    if not all(self.get(key) == other.get(key) for key in all_keys):
        return False
    return set(self.fields) == set(other.fields)
26 changes: 22 additions & 4 deletions sdgx/data_models/inspectors/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,19 @@ class DatetimeInspector(Inspector):
Often, difficult-to-recognize date or datetime objects are also recognized as discrete types by DatetimeInspector, causing the column to be marked repeatedly.
"""
PRESET_FORMAT_STRINGS = ["%Y/%m/%d", "%Y-%m-%d", "%d %b %Y"]

_format_match_rate = 0.9
"""
When specifically checking the datetime format, problems caused by missing values and incorrect values will inevitably occur.
To fix this, we discard the .any() method and use the `match_rate` to increase the robustness of this inspector.
"""

PRESET_FORMAT_STRINGS = [
"%Y-%m-%d",
"%d %b %Y",
"%b-%Y",
"%Y/%m/%d",
]

def __init__(self, user_formats: list[str] = None, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -85,19 +97,25 @@ def detect_datetime_format(self, series: pd.Series):
Returns:
str: The datetime format that can parse all dates in the series, or None if no such format is found.
"""

def _is_series_fit_format(parsed_series, match_rate):
length = len(parsed_series)
false_num = len(list(i for i in parsed_series if i is False))
false_rate = false_num / length
return false_rate >= match_rate

for fmt in self.user_defined_formats + self.PRESET_FORMAT_STRINGS:
try:
# Check if all dates in the series can be parsed with this format
parsed_series = series.apply(
lambda x: pd.to_datetime(x, format=fmt, errors="coerce")
)
if not parsed_series.isnull().any():
# if fit return format, return
if _is_series_fit_format(parsed_series.isnull(), self._format_match_rate):
return fmt
except ValueError:
continue

self.ready = True

def inspect(self, *args, **kwargs) -> dict[str, Any]:
"""Inspect raw data and generate metadata."""

Expand Down
50 changes: 44 additions & 6 deletions sdgx/data_models/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class Metadata(BaseModel):
bool_columns: Set[str] = set()
discrete_columns: Set[str] = set()
datetime_columns: Set[str] = set()
datetime_format: Dict = defaultdict(str)

# version info
version: str = "1.0"
Expand All @@ -78,7 +79,18 @@ def tag_fields(self) -> Iterable[str]:

return chain(
(k for k in self.model_fields if k.endswith("_columns")),
self._extend.keys(),
(k for k in self._extend.keys() if k.endswith("_columns")),
)

@property
def format_fields(self) -> Iterable[str]:
    """
    Return all format fields in this metadata.

    A format field is any field whose name ends with ``"_format"``
    (e.g. ``datetime_format``), collected from both the declared
    pydantic model fields and the ``_extend`` dict.
    """

    return chain(
        (k for k in self.model_fields if k.endswith("_format")),
        (k for k in self._extend.keys() if k.endswith("_format")),
    )

def __eq__(self, other):
Expand All @@ -90,6 +102,10 @@ def __eq__(self, other):
self.get(key) == other.get(key)
for key in set(chain(self.tag_fields, other.tag_fields))
)
and all(
self.get(key) == other.get(key)
for key in set(chain(self.format_fields, other.format_fields))
)
and self.version == other.version
)

Expand Down Expand Up @@ -149,7 +165,11 @@ def set(self, key: str, value: Any):
raise MetadataInitError("Cannot set _extend directly")

old_value = self.get(key)
if key in self.model_fields and key not in self.tag_fields:
if (
key in self.model_fields
and key not in self.tag_fields
and key not in self.format_fields
):
raise MetadataInitError(
f"Set {key} not in tag_fields, try set it directly as m.{key} = value"
)
Expand Down Expand Up @@ -181,12 +201,28 @@ def add(self, key: str, values: str | Iterable[str]):
m.add("id_columns", "ticket_id")
# OR
m.add("id_columns", ["user_id", "ticket_id"])
# OR
# add datetime format
m.add('datetime_format',{"col_1": "%Y-%m-%d %H:%M:%S", "col_2": "%d %b %Y"})
"""

values = (
values if isinstance(values, Iterable) and not isinstance(values, str) else [values]
)

# dict support, this prevents the value in the key-value pair from being discarded
if isinstance(values, dict):
# already in fields that contains dict
if key in list(self.format_fields):
self.get(key).update(values)

# in extend
if self._extend.get(key, None) is None:
self._extend[key] = values
else:
self._extend[key].update(values)
return

for value in values:
self.get(key).add(value)

Expand Down Expand Up @@ -274,7 +310,8 @@ def from_dataloader(
metadata.update({"pii_columns": inspect_res[each_key]})
# update inspect level
for each_key in inspect_res:
metadata.column_inspect_level[each_key] = inspector.inspect_level
if "columns" in each_key:
metadata.column_inspect_level[each_key] = inspector.inspect_level

if not primary_keys:
metadata.update_primary_key(metadata.id_columns)
Expand Down Expand Up @@ -326,14 +363,15 @@ def from_dataframe(
metadata.update({"pii_columns": inspect_res[each_key]})
# update inspect level
for each_key in inspect_res:
metadata.column_inspect_level[each_key] = inspector.inspect_level
if "columns" in each_key:
metadata.column_inspect_level[each_key] = inspector.inspect_level

if check:
metadata.check()
return metadata

def _dump_json(self):
return self.model_dump_json()
def _dump_json(self) -> str:
    """Serialize this metadata to a 4-space-indented JSON string via pydantic's ``model_dump_json``."""
    return self.model_dump_json(indent=4)

def save(self, path: str | Path):
"""
Expand Down

0 comments on commit 775e605

Please sign in to comment.