Add Regex Inspector and Email Inspector example. (#115)

* add InspectorInitError * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [Sweep GHA Fix] The GitHub Actions run failed with... (#116) * feat: Add new_system.rst to document the design of * feat: Updated LICENSE * feat: Updated docs/source/design/motivation.rst * feat: Updated docs/source/developer_guides/extensi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: sweep-ai[bot] <128439645+sweep-ai[bot]@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * add regex inspector (still draft) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add regex base inspector * add some personal info inspector * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix hookimpl * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add _inspect_level * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * discard weird change from sweep * fix typo in sweep commits * add PII attribute * add email test case (still draft) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add personal info inspector * add test cases (still draft) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add domain_verification * update localized inspectors * add test cases * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * discard sweep change * fix col name typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix expection type Now we have InspectorInitError, which can replace DataModelError when 
initializing an inspector * add corner test cases * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix version typo * add inspector manager testcase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: sweep-ai[bot] <128439645+sweep-ai[bot]@users.noreply.github.com>
hitsz-ids · Jan 29, 2024 · c358110 · c358110
1 parent fc5201e
commit c358110
Show file tree

Hide file tree

Showing 8 changed files with 547 additions and 20 deletions.
diff --git a/sdgx/data_models/inspectors/base.py b/sdgx/data_models/inspectors/base.py
@@ -8,7 +8,7 @@
     from sdgx.data_models.metadata import Metadata
 
 from sdgx.data_models.relationship import Relationship
-from sdgx.exceptions import DataModelError
+from sdgx.exceptions import InspectorInitError
 
 
 class Inspector:
@@ -28,7 +28,7 @@ class Inspector:
 
     _inspect_level: int = 10
     """
-    Inspected level is a concept newly introduced in version 0.1.5. Since a single column in the table may be marked by different inspectors at the same time (for example: the email column may be recognized as email, but it may also be recognized as the id column, and it may also be recognized by different inspectors at the same time identified as a discrete column, which will cause confusion in subsequent processing), the inspect_leve is used when determining the specific type of a column.
+    Inspected level is a concept newly introduced in version 0.1.6. Since a single column in the table may be marked by different inspectors at the same time (for example: the email column may be recognized as email, but it may also be recognized as the id column, and it may also be identified as a discrete column by yet another inspector, which will cause confusion in subsequent processing), the inspect_level is used when determining the specific type of a column.
 
     We will preset different inspector levels for different inspectors, usually more specific inspectors will get higher levels, and general inspectors (like discrete) will have inspect_level.
 
@@ -44,7 +44,7 @@ def inspect_level(self, value: int):
         if value > 0 and value <= 100:
             self._inspect_level = value
         else:
-            raise DataModelError("The inspect_level should be set in [1, 100].")
+            raise InspectorInitError("The inspect_level should be set in [1, 100].")
 
     def __init__(self, inspect_level=None, *args, **kwargs):
         self.ready: bool = False

diff --git a/sdgx/data_models/inspectors/personal.py b/sdgx/data_models/inspectors/personal.py
@@ -0,0 +1,87 @@
+import re
+
+from sdgx.data_models.inspectors.extension import hookimpl
+from sdgx.data_models.inspectors.regex import RegexInspector
+
+
+class EmailInspector(RegexInspector):
+    pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
+
+    data_type_name = "email"
+
+    _inspect_level = 30
+
+    pii = True
+
+
+class ChinaMainlandIDInspector(RegexInspector):
+    pattern = (
+        r"^[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]$"
+    )
+
+    data_type_name = "china_mainland_id"
+
+    _inspect_level = 30
+
+    pii = True
+
+
+class ChinaMainlandMobilePhoneInspector(RegexInspector):
+    pattern = r"^1[3-9]\d{9}$"
+
+    data_type_name = "china_mainland_mobile_phone"
+
+    _inspect_level = 30
+
+    pii = True
+
+
+# Postal code (邮编)
+class ChinaMainlandPostCode(RegexInspector):
+    pattern = r"^[0-9]{6}$"
+
+    _match_percentage = 0.95
+    """
+    Since postcodes are indistinguishable from six-digit integers, we raise match_percentage here to prevent plain integer columns from being recognized as postcodes.
+    """
+
+    data_type_name = "china_mainland_postcode"
+
+    _inspect_level = 20
+
+    pii = False
+
+
+# Unified Social Credit Code (统一社会信用代码)
+class ChinaMainlandUnifiedSocialCreditCode(RegexInspector):
+    pattern = r"^[0-9A-HJ-NPQRTUWXY]{2}\d{6}[0-9A-HJ-NPQRTUWXY]{10}$"
+
+    data_type_name = "unified_social_credit_code"
+
+    _inspect_level = 30
+
+    pii = True
+
+    pattern_ID = (
+        r"^[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]$"
+    )
+
+    p_id = re.compile(pattern_ID)
+
+    def domain_verification(self, each_sample):
+        if re.match(self.p_id, each_sample):
+            return False
+        return True
+
+
+@hookimpl
+def register(manager):
+    manager.register("EmailInspector", EmailInspector)
+
+    manager.register("ChinaMainlandIDInspector", ChinaMainlandIDInspector)
+
+    manager.register("ChinaMainlandMobilePhoneInspector", ChinaMainlandMobilePhoneInspector)
+
+    manager.register("ChinaMainlandPostCode", ChinaMainlandPostCode)
+
+    manager.register("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode)
diff --git a/sdgx/data_models/inspectors/regex.py b/sdgx/data_models/inspectors/regex.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+import re
+from typing import Any
+
+import pandas as pd
+
+from sdgx.data_models.inspectors.base import Inspector
+from sdgx.exceptions import InspectorInitError
+
+# By default, we do not register the RegexInspector directly with the Inspector Manager.
+# Instead, use it as a base class or instantiate it with a user-defined regex, then register it with the Inspector Manager or use it alone.
+
+
+class RegexInspector(Inspector):
+    """RegexInspector
+    RegexInspector is an sdgx inspector that uses regular expression rules to detect column data types. It can be initialized with a custom expression, or it can be inherited and applied to specific data types, such as email, US address, HKID etc.
+    """
+
+    pattern: str = None
+    """
+    pattern is the regular expression string of current inspector.
+    """
+
+    data_type_name: str = None
+    """
+    data_type_name is the name of the data type, such as email, US address, HKID etc.
+    """
+
+    _match_percentage: float = 0.8
+    """
+    match_percentage should be > 0.5 and <= 1.
+
+    Due to the existence of empty data, wrong data, etc., the match_percentage is the proportion of values in a column that must match the current regular expression. When the proportion of matching values is higher than this ratio, the column can be considered to fit the current data type.
+    """
+
+    @property
+    def match_percentage(self):
+        return self._match_percentage
+
+    @match_percentage.setter
+    def match_percentage(self, value):
+        if value > 0.5 and value <= 1:
+            self._match_percentage = value
+        else:
+            raise InspectorInitError("The match_percentage should be set in (0.5, 1].")
+
+    def __init__(
+        self,
+        pattern: str = None,
+        data_type_name: str = None,
+        match_percentage: float = None,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.regex_columns: set[str] = set()
+
+        # this pattern should be a re pattern
+        if pattern:
+            self.pattern = pattern
+        # check pattern
+        if self.pattern is None:
+            raise InspectorInitError("Regular expression NOT found.")
+        self.p = re.compile(self.pattern)
+
+        # set data_type_name
+        if data_type_name:
+            if data_type_name.endswith("_columns"):
+                self.data_type_name = data_type_name[:-8]
+            else:
+                self.data_type_name = data_type_name
+        elif not self.data_type_name:
+            self.data_type_name = f"regex_{self.pattern}_columns"
+        # then check the data type name
+        if self.data_type_name is None:
+            raise InspectorInitError("Inspector's data type undefined.")
+
+        # set percentage
+        if match_percentage:
+            self.match_percentage = match_percentage
+
+    def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
+        """Fit the inspector.
+
+        Finds the list of regex columns from the raw data.
+
+        Args:
+            raw_data (pd.DataFrame): Raw data
+        """
+        for each_col in raw_data.columns:
+            each_match_rate = self._fit_column(raw_data[each_col])
+            if each_match_rate > self.match_percentage:
+                self.regex_columns.add(each_col)
+
+        self.ready = True
+
+    def domain_verification(self, each_sample):
+        return True
+
+    def _fit_column(self, column_data: pd.Series):
+        """
+        Regular expression matching for a single column, returning the matching ratio.
+        """
+        length = len(column_data)
+        match_cnt = 0
+        for i in column_data:
+            m = re.match(self.p, str(i))
+            d = self.domain_verification(str(i))
+            if m and d:
+                match_cnt += 1
+        return match_cnt / length
+
+    def inspect(self, *args, **kwargs) -> dict[str, Any]:
+        """Inspect raw data and generate metadata."""
+
+        return {self.data_type_name + "_columns": list(self.regex_columns)}
diff --git a/sdgx/exceptions.py b/sdgx/exceptions.py
@@ -137,3 +137,7 @@ class MetadataCombinerInvalidError(MetadataCombinerError):
 
 class MetadataCombinerInitError(MetadataCombinerError):
     ERROR_CODE = 9006
+
+
+class InspectorInitError(DataModelError):
+    ERROR_CODE = 9007
diff --git a/tests/data_models/inspector/test_bool.py b/tests/data_models/inspector/test_bool.py
@@ -3,7 +3,7 @@
 import pytest
 
 from sdgx.data_models.inspectors.bool import BoolInspector
-from sdgx.exceptions import DataModelError
+from sdgx.exceptions import InspectorInitError
 
 
 @pytest.fixture
@@ -39,23 +39,13 @@ def test_inspector_demo_data(inspector: BoolInspector, raw_data):
     assert not inspector.bool_columns
     assert sorted(inspector.inspect()["bool_columns"]) == sorted([])
     assert inspector.inspect_level == 10
-    # test inspect_level.setter
-    try:
-        inspector.inspect_level = 120
-    except Exception as e:
-        assert type(e) == DataModelError
 
 
 def test_inspector_generated_data(inspector: BoolInspector, bool_test_df: pd.DataFrame):
     # use generated id data
     inspector.fit(bool_test_df)
     assert inspector.bool_columns
     assert sorted(inspector.inspect()["bool_columns"]) == sorted(["bool_random"])
-    assert inspector.inspect_level == 10
-    try:
-        inspector.inspect_level = 0
-    except Exception as e:
-        assert type(e) == DataModelError
 
 
 if __name__ == "__main__":