Skip to content

Commit

Permalink
Add Regex Inspector and Email Inspector example. (#115)
Browse files Browse the repository at this point in the history
* add InspectorInitError

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [Sweep GHA Fix] The GitHub Actions run failed with... (#116)

* feat: Add new_system.rst to document the design of

* feat: Updated LICENSE

* feat: Updated docs/source/design/motivation.rst

* feat: Updated docs/source/developer_guides/extensi

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: sweep-ai[bot] <128439645+sweep-ai[bot]@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* add regex inspector (still draft)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add regex base inspector

* add some personal info inspector

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix hookimpl

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add _inspect_level

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* discard weird change from sweep

* fix typo in sweep commits

* add PII attribute

* add email test case (still draft)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add personal info inspector

* add test cases (still draft)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add domain_verification

* update localized inspectors

* add test cases

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* discard sweep change

* fix col name typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix exception type

Now we have InspectorInitError, which can replace DataModelError when initializing an inspector

* add corner test cases

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix version typo

* add inspector manager testcase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: sweep-ai[bot] <128439645+sweep-ai[bot]@users.noreply.github.com>
  • Loading branch information
3 people committed Jan 29, 2024
1 parent fc5201e commit c358110
Show file tree
Hide file tree
Showing 8 changed files with 547 additions and 20 deletions.
6 changes: 3 additions & 3 deletions sdgx/data_models/inspectors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sdgx.data_models.metadata import Metadata

from sdgx.data_models.relationship import Relationship
from sdgx.exceptions import DataModelError
from sdgx.exceptions import InspectorInitError


class Inspector:
Expand All @@ -28,7 +28,7 @@ class Inspector:

_inspect_level: int = 10
"""
Inspected level is a concept newly introduced in version 0.1.5. Since a single column in the table may be marked by different inspectors at the same time (for example: the email column may be recognized as email, but it may also be recognized as the id column, and it may also be recognized by different inspectors at the same time identified as a discrete column, which will cause confusion in subsequent processing), the inspect_leve is used when determining the specific type of a column.
Inspected level is a concept newly introduced in version 0.1.6. Since a single column in the table may be marked by different inspectors at the same time (for example: the email column may be recognized as email, but it may also be recognized as the id column, and it may also be identified as a discrete column by yet another inspector, which will cause confusion in subsequent processing), the inspect_level is used when determining the specific type of a column.
We will preset different inspect levels for different inspectors; usually more specific inspectors will get higher levels, and general inspectors (like discrete) will have lower inspect_level.
Expand All @@ -44,7 +44,7 @@ def inspect_level(self, value: int):
if value > 0 and value <= 100:
self._inspect_level = value
else:
raise DataModelError("The inspect_level should be set in [1, 100].")
raise InspectorInitError("The inspect_level should be set in [1, 100].")

def __init__(self, inspect_level=None, *args, **kwargs):
self.ready: bool = False
Expand Down
87 changes: 87 additions & 0 deletions sdgx/data_models/inspectors/personal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import re

from sdgx.data_models.inspectors.extension import hookimpl
from sdgx.data_models.inspectors.regex import RegexInspector


class EmailInspector(RegexInspector):
    """Regex-based inspector for columns whose values look like email addresses."""

    data_type_name = "email"

    # One or more local-part characters, "@", a domain label, a literal dot,
    # then the rest of the domain (letters, digits, "-" and ".").
    pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"

    # Presumably flags this data type as personally identifiable information;
    # the consumer of this attribute is not visible here.
    pii = True

    _inspect_level = 30


class ChinaMainlandIDInspector(RegexInspector):
    """Regex-based inspector for 18-character China mainland ID numbers."""

    data_type_name = "china_mainland_id"

    # The adjacent raw strings are concatenated by the parser into one pattern.
    pattern = (
        r"^[1-9]\d{5}"  # six digits, the first one non-zero
        r"(18|19|20)\d{2}"  # four digits starting with 18, 19 or 20 (presumably the birth year)
        r"((0[1-9])|(1[0-2]))"  # 01-12 (presumably the month)
        r"(([0-2][1-9])|10|20|30|31)"  # 01-31 (presumably the day; not validated against the month)
        r"\d{3}"  # three digits
        r"[0-9Xx]$"  # final digit or X/x; no checksum is verified here
    )

    # Presumably flags this data type as personally identifiable information;
    # the consumer of this attribute is not visible here.
    pii = True

    _inspect_level = 30


class ChinaMainlandMobilePhoneInspector(RegexInspector):
    """Regex-based inspector for 11-digit China mainland mobile phone numbers."""

    data_type_name = "china_mainland_mobile_phone"

    # Eleven digits in total: a leading 1, a second digit in 3-9, then nine more digits.
    pattern = r"^1[3-9]\d{9}$"

    # Presumably flags this data type as personally identifiable information;
    # the consumer of this attribute is not visible here.
    pii = True

    _inspect_level = 30


# Postal code
class ChinaMainlandPostCode(RegexInspector):
    """Regex-based inspector for six-digit China mainland postal codes."""

    data_type_name = "china_mainland_postcode"

    # Exactly six decimal digits.
    pattern = r"^[0-9]{6}$"

    # Since zip codes and six-digit integers look the same, the match
    # percentage is raised here to prevent some pure integer columns from
    # being recognized as postal codes.
    _match_percentage = 0.95

    pii = False

    _inspect_level = 20


# Unified Social Credit Code
class ChinaMainlandUnifiedSocialCreditCode(RegexInspector):
    """Regex-based inspector for 18-character unified social credit codes.

    A sample that also matches the ID-number pattern kept in ``pattern_ID`` is
    rejected by ``domain_verification`` and therefore not counted as a match.
    """

    data_type_name = "unified_social_credit_code"

    # Two characters from the allowed alphabet, six digits, then ten more
    # characters from the allowed alphabet.
    pattern = r"^[0-9A-HJ-NPQRTUWXY]{2}\d{6}[0-9A-HJ-NPQRTUWXY]{10}$"

    pii = True

    _inspect_level = 30

    # Same expression as ChinaMainlandIDInspector.pattern, used to exclude ID numbers.
    pattern_ID = (
        r"^[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]$"
    )

    # Compiled once at class-definition time and shared by all instances.
    p_id = re.compile(pattern_ID)

    def domain_verification(self, each_sample):
        """Return False when ``each_sample`` matches the ID-number pattern, True otherwise."""
        return self.p_id.match(each_sample) is None


@hookimpl
def register(manager):
    """Register every inspector defined in this module with ``manager``.

    Each inspector class is registered under a name equal to its class name,
    in the order listed below.
    """
    inspectors = (
        ("EmailInspector", EmailInspector),
        ("ChinaMainlandIDInspector", ChinaMainlandIDInspector),
        ("ChinaMainlandMobilePhoneInspector", ChinaMainlandMobilePhoneInspector),
        ("ChinaMainlandPostCode", ChinaMainlandPostCode),
        ("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode),
    )
    for registered_name, inspector_cls in inspectors:
        manager.register(registered_name, inspector_cls)
117 changes: 117 additions & 0 deletions sdgx/data_models/inspectors/regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

import re
from typing import Any

import pandas as pd

from sdgx.data_models.inspectors.base import Inspector
from sdgx.exceptions import InspectorInitError

# By default, we will not directly register the RegexInspector to the Inspector Manager
# Instead, use it as a baseclass or user-defined regex, then put it into the Inspector Manager or use it alone


class RegexInspector(Inspector):
    """Inspector that detects column data types with a regular expression.

    RegexInspector is a sdgx inspector that uses regular expression rules to
    detect column data types. It can be initialized with a custom expression,
    or it can be inherited and applied to specific data types, such as email,
    US address, HKID etc.

    Args:
        pattern: Regular expression string. When omitted (or empty), the
            class-level ``pattern`` is used.
        data_type_name: Name of the detected data type. A trailing
            ``"_columns"`` is stripped because ``inspect`` appends it again.
            When omitted, the class-level ``data_type_name`` is used, or a
            name derived from the pattern if that is unset as well.
        match_percentage: Optional override of the match threshold; must lie
            in (0.5, 1].
        *args: Forwarded to ``Inspector.__init__``.
        **kwargs: Forwarded to ``Inspector.__init__``.

    Raises:
        InspectorInitError: If no pattern is available, or if
            ``match_percentage`` is given but lies outside (0.5, 1].
    """

    pattern: str = None
    """
    pattern is the regular expression string of current inspector.
    """

    data_type_name: str = None
    """
    data_type_name is the name of the data type, such as email, US address, HKID etc.
    """

    _match_percentage: float = 0.8
    """
    match_percentage should be in (0.5, 1].
    Due to the existence of empty data, wrong data, etc., the match_percentage is the
    proportion of samples in a column that must conform to the regular expression.
    When the proportion of conforming samples is higher than this ratio, the column
    is considered to fit the current data type.
    """

    @property
    def match_percentage(self):
        """Threshold that a column's match ratio must exceed to be reported."""
        return self._match_percentage

    @match_percentage.setter
    def match_percentage(self, value):
        # NOTE(review): fit() compares with a strict ">", so a threshold of
        # exactly 1 is accepted here but can never be exceeded by any column.
        if value > 0.5 and value <= 1:
            self._match_percentage = value
        else:
            raise InspectorInitError("The match_percentage should be set in (0.5, 1].")

    def __init__(
        self,
        pattern: str = None,
        data_type_name: str = None,
        match_percentage: float = None,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.regex_columns: set[str] = set()

        # An explicit pattern overrides the class-level one.
        if pattern:
            self.pattern = pattern
        if self.pattern is None:
            raise InspectorInitError("Regular expression NOT found.")
        # Compile once; every sample of every column is matched against it.
        self.p = re.compile(self.pattern)

        # Resolve the data type name.
        suffix = "_columns"
        if data_type_name:
            # inspect() appends "_columns" itself, so drop a trailing one here.
            if data_type_name.endswith(suffix):
                self.data_type_name = data_type_name[: -len(suffix)]
            else:
                self.data_type_name = data_type_name
        elif not self.data_type_name:
            # NOTE(review): this default already ends with "_columns", so the
            # key produced by inspect() ends with "_columns_columns". Kept
            # unchanged so existing metadata keys stay stable.
            self.data_type_name = f"regex_{self.pattern}_columns"
        # Defensive check; the branches above always assign a name.
        if self.data_type_name is None:
            raise InspectorInitError("Inspector's data type undefined.")

        # Compare against None (not truthiness) so that an invalid 0 is
        # rejected by the setter instead of being silently ignored.
        if match_percentage is not None:
            self.match_percentage = match_percentage

    def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
        """Fit the inspector.

        Finds the regex columns in the raw data: a column is recorded when its
        match ratio is strictly greater than ``match_percentage``.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        for each_col in raw_data.columns:
            each_match_rate = self._fit_column(raw_data[each_col])
            if each_match_rate > self.match_percentage:
                self.regex_columns.add(each_col)

        self.ready = True

    def domain_verification(self, each_sample):
        """Extra per-sample check applied on top of the regex.

        The base implementation accepts every sample; subclasses may override
        it to reject samples that match the pattern but are not valid values.
        """
        return True

    def _fit_column(self, column_data: pd.Series) -> float:
        """
        Regular expression matching for a single column, returning the matching ratio.

        Each value is converted with ``str`` before matching. An empty column
        yields 0.0 instead of raising ZeroDivisionError.
        """
        length = len(column_data)
        if length == 0:
            return 0.0

        match_cnt = 0
        for each_value in column_data:
            sample = str(each_value)
            # Only samples that pass the regex are handed to domain_verification.
            if self.p.match(sample) and self.domain_verification(sample):
                match_cnt += 1
        return match_cnt / length

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Inspect raw data and generate metadata.

        Returns:
            A single-entry dict mapping ``"<data_type_name>_columns"`` to the
            list of columns recorded by ``fit``.
        """

        return {self.data_type_name + "_columns": list(self.regex_columns)}
4 changes: 4 additions & 0 deletions sdgx/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,7 @@ class MetadataCombinerInvalidError(MetadataCombinerError):

class MetadataCombinerInitError(MetadataCombinerError):
    """Metadata combiner error with error code 9006.

    Presumably raised when a metadata combiner cannot be initialized; the
    raising code is not visible here.
    """

    ERROR_CODE = 9006


class InspectorInitError(DataModelError):
    """Data model error (code 9007) raised when an inspector is configured with invalid settings.

    Examples are an out-of-range ``inspect_level`` or ``match_percentage``, or
    a regex inspector created without a pattern.
    """

    ERROR_CODE = 9007
12 changes: 1 addition & 11 deletions tests/data_models/inspector/test_bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from sdgx.data_models.inspectors.bool import BoolInspector
from sdgx.exceptions import DataModelError
from sdgx.exceptions import InspectorInitError


@pytest.fixture
Expand Down Expand Up @@ -39,23 +39,13 @@ def test_inspector_demo_data(inspector: BoolInspector, raw_data):
assert not inspector.bool_columns
assert sorted(inspector.inspect()["bool_columns"]) == sorted([])
assert inspector.inspect_level == 10
# test inspect_level.setter
try:
inspector.inspect_level = 120
except Exception as e:
assert type(e) == DataModelError


def test_inspector_generated_data(inspector: BoolInspector, bool_test_df: pd.DataFrame):
# use generated id data
inspector.fit(bool_test_df)
assert inspector.bool_columns
assert sorted(inspector.inspect()["bool_columns"]) == sorted(["bool_random"])
assert inspector.inspect_level == 10
try:
inspector.inspect_level = 0
except Exception as e:
assert type(e) == DataModelError


if __name__ == "__main__":
Expand Down
Loading

0 comments on commit c358110

Please sign in to comment.