-
Notifications
You must be signed in to change notification settings - Fork 541
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[0.2.0] Metadata Implementation (#81)
* Update file structure * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update single_table.py * Update metadata (still draft) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update metadata and reset file structure * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add numeric inspector * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix metadata initialization * fix type hints error in py38 * add inspectors * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update relationship.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update multi-table-combiner * add composite key list in relationship * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Use a single list for single or composite primary key. Co-authored-by: Zhongsheng Ji <9573586@qq.com> * Apply suggestions from code review Keys are described using a list, which is compatible with single or composite foreign key. Co-authored-by: Zhongsheng Ji <9573586@qq.com> * use a single list for primary key(s) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update exceptions * Update check functions Unit testing also needs to be implemented. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Co-authored-by: Zhongsheng Ji <9573586@qq.com> * Apply suggestions from code review Unit testing also needs to be implemented. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update some test case still some cases not completed yet. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update test cases * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update metadata save and load test cases. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Zhongsheng Ji <9573586@qq.com>
- Loading branch information
1 parent
897e252
commit 9b4c683
Showing
13 changed files
with
673 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
from pandas._libs.tslibs.parsing import DateParseError | ||
|
||
from sdgx.data_models.inspectors.base import Inspector | ||
from sdgx.data_models.inspectors.extension import hookimpl | ||
|
||
|
||
class BoolInspector(Inspector):
    """Inspector that records which columns of a table have a boolean dtype."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Column names recognised as boolean; accumulates across fit() calls.
        self.bool_columns: set[str] = set()

    def fit(self, raw_data: pd.DataFrame):
        """Collect the boolean columns found in ``raw_data``.

        The frame is passed through ``infer_objects()`` before the dtype
        filter is applied, and the matching column names are merged into
        ``self.bool_columns``.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        inferred = raw_data.infer_objects()
        bool_frame = inferred.select_dtypes(include=["bool"])
        self.bool_columns = self.bool_columns | set(bool_frame.columns)

        # Flag that fit() has been run.
        self.ready = True

    def inspect(self) -> dict[str, Any]:
        """Return the detected column names under the ``"bool_columns"`` key."""
        found = list(self.bool_columns)
        return {"bool_columns": found}
|
||
|
||
@hookimpl
def register(manager):
    """Register ``BoolInspector`` with ``manager`` under the name "BoolInspector".

    Args:
        manager: Object exposing a ``register(name, cls)`` method.
    """
    inspector_cls = BoolInspector
    manager.register("BoolInspector", inspector_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
from pandas._libs.tslibs.parsing import DateParseError | ||
|
||
from sdgx.data_models.inspectors.base import Inspector | ||
from sdgx.data_models.inspectors.extension import hookimpl | ||
|
||
|
||
class DatetimeInspector(Inspector):
    """Inspector that records which columns of a table hold datetime data."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Column names recognised as datetime; accumulates across fit() calls.
        self.datetime_columns: set[str] = set()

    @classmethod
    def can_convert_to_datetime(cls, input_col: pd.Series):
        """Whether a df column can be converted to datetime.

        Args:
            input_col(pd.Series): A column of a dataframe.

        Returns:
            bool: True if ``pd.to_datetime`` accepts the whole column,
            False if the conversion raises.
        """
        try:
            pd.to_datetime(input_col)
        except Exception:
            # Best-effort probe: any conversion failure (DateParseError and
            # other ValueErrors, TypeError, overflow, ...) means "not a
            # datetime column". ``Exception`` is used instead of a bare
            # ``except`` so KeyboardInterrupt / SystemExit still propagate.
            return False
        return True

    def fit(self, raw_data: pd.DataFrame):
        """Fit the inspector.

        Collects the columns whose dtype is datetime64 after
        ``infer_objects()``, plus every object-dtype column that
        ``can_convert_to_datetime`` accepts. Results are merged into
        ``self.datetime_columns``.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        self.datetime_columns = self.datetime_columns.union(
            set(raw_data.infer_objects().select_dtypes(include=["datetime64"]).columns)
        )

        # Some columns containing dates are still marked as object after
        # inference, so each object column is probed individually.
        candidate_columns = set(raw_data.select_dtypes(include=["object"]).columns)
        for col_name in candidate_columns:
            # Call through ``self`` so a subclass override of the probe is honoured.
            if self.can_convert_to_datetime(raw_data[col_name]):
                self.datetime_columns.add(col_name)

        # Flag that fit() has been run.
        self.ready = True

    def inspect(self) -> dict[str, Any]:
        """Inspect raw data and generate metadata.

        Returns:
            dict[str, Any]: The detected column names under ``"datetime_columns"``.
        """
        return {"datetime_columns": list(self.datetime_columns)}
|
||
|
||
@hookimpl
def register(manager):
    """Register ``DatetimeInspector`` with ``manager`` under the name "DatetimeInspector".

    Args:
        manager: Object exposing a ``register(name, cls)`` method.
    """
    inspector_cls = DatetimeInspector
    manager.register("DatetimeInspector", inspector_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from sdgx.data_models.inspectors.base import Inspector | ||
from sdgx.data_models.inspectors.extension import hookimpl | ||
|
||
|
||
class IDInspector(Inspector):
    """Inspector that flags columns whose values are all distinct as ID columns."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Column names recognised as IDs; accumulates across fit() calls.
        self.ID_columns: set[str] = set()

    def fit(self, raw_data: pd.DataFrame):
        """Find the object/int64 columns that have one distinct value per row.

        A column is recorded when the number of distinct values it contains
        equals the number of rows in ``raw_data``.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        n_rows = len(raw_data)
        candidates = raw_data.select_dtypes(include=["object", "int64"])

        for name in set(candidates.columns):
            distinct_values = set(raw_data[name])
            if len(distinct_values) != n_rows:
                # At least one repeated value: not an ID column.
                continue
            self.ID_columns.add(name)

        # Flag that fit() has been run.
        self.ready = True

    def inspect(self) -> dict[str, Any]:
        """Return the detected column names under the ``"id_columns"`` key."""
        found = list(self.ID_columns)
        return {"id_columns": found}
|
||
|
||
@hookimpl
def register(manager):
    """Register ``IDInspector`` with ``manager`` under the name "IDInspector".

    Args:
        manager: Object exposing a ``register(name, cls)`` method.
    """
    inspector_cls = IDInspector
    manager.register("IDInspector", inspector_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from sdgx.data_models.inspectors.base import Inspector | ||
from sdgx.data_models.inspectors.extension import hookimpl | ||
|
||
|
||
class NumericInspector(Inspector):
    """Inspector that records which columns of a table are float64 or int64."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Column names recognised as numeric; accumulates across fit() calls.
        self.numeric_columns: set[str] = set()

    def fit(self, raw_data: pd.DataFrame):
        """Collect the float64 and int64 columns found in ``raw_data``.

        The matching column names are merged into ``self.numeric_columns``.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        numeric_frame = raw_data.select_dtypes(include=["float64", "int64"])
        self.numeric_columns = self.numeric_columns | set(numeric_frame.columns)

        # Flag that fit() has been run.
        self.ready = True

    def inspect(self) -> dict[str, Any]:
        """Return the detected column names under the ``"numeric_columns"`` key."""
        found = list(self.numeric_columns)
        return {"numeric_columns": found}
|
||
|
||
@hookimpl
def register(manager):
    """Register ``NumericInspector`` with ``manager`` under the name "NumericInspector".

    Args:
        manager: Object exposing a ``register(name, cls)`` method.
    """
    inspector_cls = NumericInspector
    manager.register("NumericInspector", inspector_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.