-
Notifications
You must be signed in to change notification settings - Fork 541
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Regex Inspector and Email Inspector example. (#115)
* add InspectorInitError * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [Sweep GHA Fix] The GitHub Actions run failed with... (#116) * feat: Add new_system.rst to document the design of * feat: Updated LICENSE * feat: Updated docs/source/design/motivation.rst * feat: Updated docs/source/developer_guides/extensi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: sweep-ai[bot] <128439645+sweep-ai[bot]@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * add regex inspector (still draft) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add regex base inspector * add some personal info inspector * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix hookimpl * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add _inspect_level * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * discard weird change from sweep * fix typo in sweep commits * add PII attribute * add email test case (still draft) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add personal info inspector * add test cases (still draft) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add domain_verification * update localized inspectors * add test cases * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * discard sweep change * fix col name typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix expection type Now we have InspectorInitError, which can replace DataModelError when 
initialize an inspector * add corner test cases * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix version typo * add inspector manager testcase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: sweep-ai[bot] <128439645+sweep-ai[bot]@users.noreply.github.com>
- Loading branch information
1 parent
fc5201e
commit c358110
Showing
8 changed files
with
547 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import re | ||
|
||
from sdgx.data_models.inspectors.extension import hookimpl | ||
from sdgx.data_models.inspectors.regex import RegexInspector | ||
|
||
|
||
class EmailInspector(RegexInspector):
    """Regex inspector that reports columns whose values look like email addresses."""

    # Name used to build the key of the inspection result.
    data_type_name = "email"

    # Columns detected by this inspector are marked as personal information.
    pii = True

    _inspect_level = 30

    # One or more local-part characters, "@", a domain label, ".", and a suffix.
    pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
|
||
|
||
class ChinaMainlandIDInspector(RegexInspector):
    """Regex inspector for 18-character China mainland ID numbers."""

    # Name used to build the key of the inspection result.
    data_type_name = "china_mainland_id"

    # Columns detected by this inspector are marked as personal information.
    pii = True

    _inspect_level = 30

    # The adjacent literals below concatenate into a single expression.
    pattern = (
        r"^[1-9]\d{5}"  # six leading digits, the first one non-zero
        r"(18|19|20)\d{2}"  # four-digit year starting with 18, 19 or 20
        r"((0[1-9])|(1[0-2]))"  # month, 01-12
        r"(([0-2][1-9])|10|20|30|31)"  # day, 01-31
        r"\d{3}[0-9Xx]$"  # three digits followed by a digit or X/x
    )
|
||
|
||
class ChinaMainlandMobilePhoneInspector(RegexInspector):
    """Regex inspector for 11-digit China mainland mobile phone numbers."""

    # Name used to build the key of the inspection result.
    data_type_name = "china_mainland_mobile_phone"

    # Columns detected by this inspector are marked as personal information.
    pii = True

    _inspect_level = 30

    # A leading 1, a second digit in 3-9, then nine more digits.
    pattern = r"^1[3-9]\d{9}$"
|
||
|
||
# Postal code
class ChinaMainlandPostCode(RegexInspector):
    """Regex inspector for six-digit China mainland postal codes."""

    # Name used to build the key of the inspection result.
    data_type_name = "china_mainland_postcode"

    # Postal codes are not marked as personal information.
    pii = False

    _inspect_level = 20

    # Exactly six decimal digits.
    pattern = r"^[0-9]{6}$"

    # Postal codes are indistinguishable from ordinary six-digit integers, so
    # the required match ratio is raised above the base class default to keep
    # plain integer columns from being recognized as postal codes.
    _match_percentage = 0.95
|
||
|
||
# Unified Social Credit Code
class ChinaMainlandUnifiedSocialCreditCode(RegexInspector):
    """Regex inspector for 18-character unified social credit codes.

    A value that also matches the China mainland ID number expression is
    rejected by ``domain_verification`` so that ID numbers are not counted as
    credit codes.
    """

    # Name used to build the key of the inspection result.
    data_type_name = "unified_social_credit_code"

    # Columns detected by this inspector are marked as personal information.
    pii = True

    _inspect_level = 30

    # Two code characters, six digits, then ten more code characters.
    pattern = r"^[0-9A-HJ-NPQRTUWXY]{2}\d{6}[0-9A-HJ-NPQRTUWXY]{10}$"

    # Expression for China mainland ID numbers, compiled once at class creation.
    pattern_ID = (
        r"^[1-9]\d{5}(18|19|20)\d{2}((0[1-9])|(1[0-2]))(([0-2][1-9])|10|20|30|31)\d{3}[0-9Xx]$"
    )
    p_id = re.compile(pattern_ID)

    def domain_verification(self, each_sample):
        """Return False when ``each_sample`` matches the ID number expression, else True."""
        looks_like_id_number = self.p_id.match(each_sample) is not None
        return not looks_like_id_number
|
||
|
||
@hookimpl
def register(manager):
    """Register every inspector defined in this module with ``manager``.

    Args:
        manager: Object exposing ``register(name, inspector_class)``.
    """
    # Registration order is kept as a tuple of (name, class) pairs.
    inspectors = (
        ("EmailInspector", EmailInspector),
        ("ChinaMainlandIDInspector", ChinaMainlandIDInspector),
        ("ChinaMainlandMobilePhoneInspector", ChinaMainlandMobilePhoneInspector),
        ("ChinaMainlandPostCode", ChinaMainlandPostCode),
        ("ChinaMainlandUnifiedSocialCreditCode", ChinaMainlandUnifiedSocialCreditCode),
    )
    for inspector_name, inspector_cls in inspectors:
        manager.register(inspector_name, inspector_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from __future__ import annotations | ||
|
||
import re | ||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from sdgx.data_models.inspectors.base import Inspector | ||
from sdgx.exceptions import InspectorInitError | ||
|
||
# By default, RegexInspector is not registered directly with the Inspector Manager. | ||
# Instead, use it as a base class or instantiate it with a user-defined regex, then register it with the Inspector Manager or use it on its own. | ||
|
||
|
||
class RegexInspector(Inspector):
    """Detect columns whose values match a regular expression.

    RegexInspector is a sdgx inspector that uses regular expression rules to
    detect column data types. It can be initialized with a custom expression,
    or it can be inherited and applied to specific data types, such as email,
    US address, HKID etc.
    """

    pattern: str = None
    """
    pattern is the regular expression string of current inspector.
    """

    data_type_name: str = None
    """
    data_type_name is the name of the data type, such as email, US address, HKID etc.
    """

    _match_percentage: float = 0.8
    """
    match_percentage should be > 0.5 and <= 1.
    Due to the existence of empty data, wrong data, etc., a column rarely matches
    the regular expression completely. match_percentage is the proportion of
    values that must match: when the ratio of matching values in a column is
    higher than this threshold, the column is considered to fit the current
    data type.
    """

    @property
    def match_percentage(self):
        """Ratio of matching values a column must exceed to be selected."""
        return self._match_percentage

    @match_percentage.setter
    def match_percentage(self, value):
        """Set the threshold after validating it.

        Raises:
            InspectorInitError: If ``value`` is outside ``(0.5, 1]``.
        """
        # NOTE(review): ``fit`` compares with a strict ``>``, so a threshold of
        # exactly 1 is accepted here but can never be exceeded by any column.
        if 0.5 < value <= 1:
            self._match_percentage = value
        else:
            raise InspectorInitError("The match_percentage should be set in (0.5, 1].")

    def __init__(
        self,
        pattern: str = None,
        data_type_name: str = None,
        match_percentage: float = None,
        *args,
        **kwargs,
    ):
        """Create the inspector and compile its regular expression.

        Args:
            pattern: Regular expression string. When empty or omitted, the
                class-level ``pattern`` is used.
            data_type_name: Name of the detected data type. A trailing
                ``"_columns"`` is stripped. When omitted, the class-level value
                is used, or a name derived from the pattern.
            match_percentage: Optional threshold in ``(0.5, 1]``. When omitted,
                the class-level default is kept.

        Raises:
            InspectorInitError: If no pattern is available, or if
                ``match_percentage`` is given but outside ``(0.5, 1]``.
        """
        super().__init__(*args, **kwargs)
        self.regex_columns: set[str] = set()

        # An explicit pattern overrides the class-level one.
        if pattern:
            self.pattern = pattern
        if self.pattern is None:
            raise InspectorInitError("Regular expression NOT found.")
        self.p = re.compile(self.pattern)

        # Resolve the data type name; ``inspect`` appends "_columns" itself, so
        # a user-supplied suffix is removed here.
        if data_type_name:
            if data_type_name.endswith("_columns"):
                self.data_type_name = data_type_name[:-8]
            else:
                self.data_type_name = data_type_name
        elif not self.data_type_name:
            # NOTE(review): this default already ends with "_columns" and
            # ``inspect`` appends the suffix again, so the reported key ends in
            # "_columns_columns". Kept as is because callers may rely on the key.
            self.data_type_name = f"regex_{self.pattern}_columns"
        # Defensive check on the resolved name.
        if self.data_type_name is None:
            raise InspectorInitError("Inspector's data type undefined.")

        # Compare against None so that an explicit 0 is validated (and
        # rejected) by the setter instead of being silently ignored.
        if match_percentage is not None:
            self.match_percentage = match_percentage

    def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
        """Fit the inspector.

        Finds the regex columns in the raw data: every column whose match
        ratio is strictly greater than ``match_percentage`` is added to
        ``regex_columns``. Columns found by earlier calls are kept.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        for each_col in raw_data.columns:
            each_match_rate = self._fit_column(raw_data[each_col])
            if each_match_rate > self.match_percentage:
                self.regex_columns.add(each_col)

        self.ready = True

    def domain_verification(self, each_sample):
        """Extra per-value check applied to values that match the regex.

        The base implementation accepts every value; subclasses can override
        it to reject values that match the expression but belong to another
        data type.
        """
        return True

    def _fit_column(self, column_data: pd.Series):
        """
        Regular expression matching for a single column, returning the matching ratio.

        Every value is converted with ``str`` before matching. An empty column
        yields a ratio of 0.0 instead of dividing by zero.
        """
        length = len(column_data)
        if length == 0:
            return 0.0

        match_cnt = 0
        for value in column_data:
            sample = str(value)
            # The domain check only runs for values that already match.
            if self.p.match(sample) and self.domain_verification(sample):
                match_cnt += 1
        return match_cnt / length

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Inspect raw data and generate metadata.

        Returns:
            A single-entry dict mapping ``"<data_type_name>_columns"`` to the
            list of column names collected by ``fit``.
        """
        return {self.data_type_name + "_columns": list(self.regex_columns)}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.