-
Notifications
You must be signed in to change notification settings - Fork 541
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enhance Data Handling with Empty Column Inspector and Transformer (#197)
* add EmptyInspector * add testcase * fix typo in test case * add EmptyTransformer * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Create test_transformers_empty.py * fix typo, which may cause bug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
ab99a8e
commit 7c338ad
Showing
5 changed files
with
295 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from sdgx.data_models.inspectors.base import Inspector | ||
from sdgx.data_models.inspectors.extension import hookimpl | ||
|
||
|
||
class EmptyInspector(Inspector):
    """
    Identify columns in a DataFrame that have a high rate of missing values.

    A column is tagged as empty when the fraction of its values reported as null by
    ``DataFrame.isnull`` is greater than or equal to ``empty_rate_threshold``.
    The tagged column names are published under the ``empty_columns`` key of the
    inspection result so that later processing steps can treat them specially.

    Attributes:
        empty_rate_threshold (float): Missing-value rate at or above which a column is considered empty, default = 0.9.
        empty_columns (set[str]): Names of the columns whose missing-value rate reached the threshold in the last ``fit``.

    Methods:
        __init__(self, *args, **kwargs): Initializes the EmptyInspector instance, optionally setting the empty_rate_threshold.
        fit(self, raw_data: pd.DataFrame, *args, **kwargs): Fits the inspector to the raw data, identifying columns whose missing rate reaches the threshold.
        inspect(self, *args, **kwargs) -> dict[str, Any]: Returns a dictionary containing the list of columns identified as empty.
    """

    empty_rate_threshold = 0.9
    """
    float: The missing-value rate at or above which a column is considered empty.
    Default is 0.9, meaning a column with at least 90% of its values missing will be considered empty.
    """

    empty_columns: set[str] = set()
    """
    set[str]: A set of column names whose missing-value rate reached the empty_rate_threshold.
    This class-level value is only a default; every instance gets its own set in ``__init__``.
    """

    _inspect_level = 90
    """
    int: The inspection level for the EmptyInspector, set to a quite high value (90) to prioritize the identification and handling of empty columns.
    This high value is chosen because empty columns contain no information and should not be considered for any other type of inspection or processing.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the inspector.

        Args:
            *args: Forwarded unchanged to the base ``Inspector``.
            **kwargs: Forwarded unchanged to the base ``Inspector``. If it contains
                ``empty_rate_threshold``, that value overrides the class default for this instance.
        """
        super().__init__(*args, **kwargs)

        # Give each instance its own set so that mutating one inspector's result
        # before ``fit`` can never leak into other inspectors through the class attribute.
        self.empty_columns = set()

        if "empty_rate_threshold" in kwargs:
            self.empty_rate_threshold = kwargs["empty_rate_threshold"]

    def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
        """Fit the inspector.

        Computes the missing-value rate of every column and remembers the columns
        whose rate is greater than or equal to ``empty_rate_threshold``.

        Args:
            raw_data (pd.DataFrame): Raw data
        """
        # Fraction of null cells per column (mean of the boolean null mask).
        empty_rate = raw_data.isnull().mean()

        # ``>=`` is deliberate: a column sitting exactly on the threshold is tagged as empty.
        self.empty_columns = set(empty_rate[empty_rate >= self.empty_rate_threshold].index)

        self.ready = True

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """Return the inspection result.

        Returns:
            dict[str, Any]: ``{"empty_columns": [...]}`` with the column names found by the
            last ``fit`` (an empty list if ``fit`` has not been called). The list order is
            not defined because the names are stored in a set.
        """

        return {"empty_columns": list(self.empty_columns)}
|
||
|
||
@hookimpl
def register(manager):
    """Plugin hook: register ``EmptyInspector`` with ``manager`` under the name "EmptyInspector"."""
    inspector_cls = EmptyInspector
    manager.register("EmptyInspector", inspector_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_processors.extension import hookimpl | ||
from sdgx.data_processors.transformers.base import Transformer | ||
from sdgx.utils import logger | ||
|
||
|
||
class EmptyTransformer(Transformer):
    """
    A transformer that handles empty columns in a DataFrame.

    The columns to handle are taken from the ``empty_columns`` entry of the metadata passed to ``fit``.
    ``convert`` removes those columns from the data and ``reverse_convert`` adds them back,
    filled with ``None``.

    Attributes:
        empty_columns (list): A list of column names that are identified as empty.

    Methods:
        fit(metadata: Metadata | None = None, **kwargs: dict[str, Any]):
            Fits the transformer by reading the empty columns from the provided metadata.
        convert(raw_data: pd.DataFrame) -> pd.DataFrame:
            Converts the raw data by removing the identified empty columns.
        reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame:
            Reverses the conversion by restoring the previously removed empty columns.
    """

    empty_columns: list = []
    """
    List of column names that are identified as empty. This attribute is populated during the fitting process
    and is used to remove these columns during the conversion process and restore them during the reverse conversion process.
    ``fit`` always rebinds it to a new list, so the class-level default is never mutated.
    """

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Fit method for the transformer.

        Remember the empty_columns recorded in the metadata.

        Args:
            metadata (Metadata | None): The metadata containing information about the data, including empty columns.
                When it is ``None``, or when it has no ``empty_columns`` entry, the transformer
                is fitted with no columns to handle.
            **kwargs (dict[str, Any]): Additional keyword arguments.

        Returns:
            None
        """

        # ``metadata`` defaults to None in the signature, so it must not be dereferenced blindly.
        recorded = metadata.get("empty_columns") if metadata is not None else None

        # NOTE(review): assumes ``Metadata.get`` returns None for a missing key - confirm.
        self.empty_columns = list(recorded) if recorded is not None else []

        logger.info("EmptyTransformer Fitted.")

        self.fitted = True

        return

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Converts the raw data by removing the identified empty columns.

        Args:
            raw_data (pd.DataFrame): The input DataFrame containing the raw data.

        Returns:
            pd.DataFrame: The processed DataFrame with empty columns removed.
        """
        processed_data = raw_data

        logger.info("Converting data using EmptyTransformer...")

        for each_col in self.empty_columns:
            processed_data = self.remove_columns(processed_data, [each_col])

        # The previous message ended with "(No action)", which was wrong whenever columns were removed.
        logger.info("Converting data using EmptyTransformer... Finished.")

        return processed_data

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Reverses the conversion by restoring the previously removed empty columns.

        Each restored column holds ``None`` in every row.

        Args:
            processed_data (pd.DataFrame): The input DataFrame containing the processed data.

        Returns:
            pd.DataFrame: The DataFrame with previously removed empty columns restored.
        """

        df_length = processed_data.shape[0]

        for each_col_name in self.empty_columns:
            # Build the placeholder on the input's own index so it lines up row-for-row
            # with ``processed_data`` even when that index is not the default 0..n-1 range.
            each_empty_df = pd.DataFrame(
                {each_col_name: [None] * df_length},
                index=processed_data.index,
            )
            processed_data = self.attach_columns(processed_data, each_empty_df)

        logger.info("Data reverse-converted by EmptyTransformer.")

        return processed_data
|
||
|
||
@hookimpl
def register(manager):
    """Plugin hook: register ``EmptyTransformer`` with ``manager`` under the name "EmptyTransformer"."""
    transformer_cls = EmptyTransformer
    manager.register("EmptyTransformer", transformer_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import pandas as pd | ||
import pytest | ||
|
||
from sdgx.data_models.inspectors.empty import EmptyInspector | ||
|
||
|
||
@pytest.fixture
def inspector():
    """Provide an ``EmptyInspector`` built with its default settings."""
    default_inspector = EmptyInspector()
    yield default_inspector
|
||
|
||
@pytest.fixture
def raw_data(demo_single_table_path):
    """Read the CSV file at ``demo_single_table_path`` into a DataFrame."""
    demo_table = pd.read_csv(demo_single_table_path)
    yield demo_table
|
||
|
||
@pytest.fixture
def test_empty_data(raw_data: pd.DataFrame):
    """Yield ``raw_data`` with the ``age`` and ``fnlwgt`` columns made entirely missing.

    Both columns end up as float columns containing only NaN, which is what the
    empty-column detection is expected to pick up.
    """
    # Assign through the DataFrame instead of writing into ``Series.values``:
    # the array returned by ``.values`` is not guaranteed to be a writable view
    # of the column (it is read-only under pandas Copy-on-Write), so the old
    # in-place write could raise or silently leave the data untouched.
    for column in ("age", "fnlwgt"):
        raw_data[column] = float("nan")

    yield raw_data
|
||
|
||
def test_inspector(inspector: EmptyInspector, test_empty_data):
    """Fitting on data with two all-missing columns must report exactly those columns."""
    expected_columns = sorted(["age", "fnlwgt"])

    inspector.fit(test_empty_data)

    # The inspector must flag itself as ready and must have found something.
    assert inspector.ready
    assert inspector.empty_columns

    # Order of the reported list is not defined, so compare sorted copies.
    reported_columns = inspector.inspect()["empty_columns"]
    assert sorted(reported_columns) == expected_columns

    assert inspector.inspect_level == 90
|
||
|
||
if __name__ == "__main__":
    # Running this file directly executes only the tests it contains,
    # verbosely and without output capturing.
    pytest_args = ["-vv", "-s", __file__]
    pytest.main(pytest_args)
70 changes: 70 additions & 0 deletions
70
tests/data_processors/transformers/test_transformers_empty.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
|
||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_processors.transformers.empty import EmptyTransformer | ||
|
||
|
||
@pytest.fixture
def raw_data(demo_single_table_path):
    """Read the CSV file at ``demo_single_table_path`` into a DataFrame."""
    demo_table = pd.read_csv(demo_single_table_path)
    yield demo_table
|
||
|
||
@pytest.fixture
def test_empty_data(raw_data: pd.DataFrame):
    """Yield ``raw_data`` with the ``age`` and ``fnlwgt`` columns made entirely missing.

    Both columns end up as float columns containing only NaN, which is what the
    empty-column handling under test is expected to pick up.
    """
    # Assign through the DataFrame instead of writing into ``Series.values``:
    # the array returned by ``.values`` is not guaranteed to be a writable view
    # of the column (it is read-only under pandas Copy-on-Write), so the old
    # in-place write could raise or silently leave the data untouched.
    for column in ("age", "fnlwgt"):
        raw_data[column] = float("nan")

    yield raw_data
|
||
|
||
def test_nan_handling_test_df(test_empty_data: pd.DataFrame):
    """
    Round-trip a DataFrame containing empty columns through EmptyTransformer.

    Steps checked:
        1. A new transformer is not fitted; after ``fit`` it is, and it has picked up
           the ``age`` and ``fnlwgt`` columns from the metadata.
        2. After ``convert`` the metadata built from the result reports no empty columns.
        3. After ``reverse_convert`` the metadata built from the result reports
           ``age`` and ``fnlwgt`` as empty again.

    Parameters:
        test_empty_data (pd.DataFrame): The DataFrame to test.
    """
    expected_empty = ["age", "fnlwgt"]

    metadata = Metadata.from_dataframe(test_empty_data)

    # A newly created transformer must start out unfitted.
    transformer = EmptyTransformer()
    assert transformer.fitted is False

    # Fitting reads the empty columns from the metadata.
    transformer.fit(metadata)
    assert transformer.fitted
    assert sorted(transformer.empty_columns) == expected_empty

    # Forward conversion: the empty columns must no longer be detected.
    transformed_df = transformer.convert(test_empty_data)
    metadata_after_convert = Metadata.from_dataframe(transformed_df)
    assert not metadata_after_convert.get("empty_columns")

    # Reverse conversion: the same columns must be detected as empty again.
    restored_df = transformer.reverse_convert(transformed_df)
    metadata_after_restore = Metadata.from_dataframe(restored_df)
    assert metadata_after_restore.get("empty_columns") == {"age", "fnlwgt"}