Skip to content

Commit

Permalink
Enhance Data Handling with Empty Column Inspector and Transformer (#197)
Browse files Browse the repository at this point in the history
* add EmptyInspector

* add testcase

* fix typo in test case

* add EmptyTransformer

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Create test_transformers_empty.py

* fix typo, which may cause bug

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
MooooCat and pre-commit-ci[bot] committed Jul 11, 2024
1 parent ab99a8e commit 7c338ad
Show file tree
Hide file tree
Showing 5 changed files with 295 additions and 1 deletion.
76 changes: 76 additions & 0 deletions sdgx/data_models/inspectors/empty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from __future__ import annotations

from typing import Any

import pandas as pd

from sdgx.data_models.inspectors.base import Inspector
from sdgx.data_models.inspectors.extension import hookimpl


class EmptyInspector(Inspector):
    """
    Identify columns in a DataFrame that have a high rate of missing values.

    Columns tagged as empty will be removed during the training process and reinserted into their
    original positions after the model sampling process is complete.

    Attributes:
        empty_rate_threshold (float): The rate of missing values at or above which a column is
            considered empty, default = 0.9.
        empty_columns (set[str]): Names of the columns whose missing rate reached the threshold
            during the last call to ``fit``.

    Methods:
        __init__(self, *args, **kwargs): Initializes the inspector, optionally overriding
            ``empty_rate_threshold`` through a keyword argument of the same name.
        fit(self, raw_data: pd.DataFrame, *args, **kwargs): Computes the per-column missing rate
            of ``raw_data`` and records the columns that reach the threshold.
        inspect(self, *args, **kwargs) -> dict[str, Any]: Returns a dictionary containing the list
            of columns identified as empty.
    """

    empty_rate_threshold = 0.9
    """
    float: The rate of missing values at or above which a column is considered empty.
    Default is 0.9, meaning a column with at least 90% of its values missing is considered empty.
    """

    empty_columns: set[str] = set()
    """
    set[str]: Column names whose missing rate is at or above ``empty_rate_threshold``.
    This class-level value is only a default; every instance gets its own set in ``__init__``.
    """

    _inspect_level = 90
    """
    int: The inspection level for the EmptyInspector, set to a quite high value (90) to prioritize the
    identification and handling of empty columns.
    This high value is chosen because empty columns contain no information and should not be considered
    for any other type of inspection or processing.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the inspector.

        Args:
            *args: Positional arguments forwarded to the base ``Inspector``.
            **kwargs: Keyword arguments forwarded to the base ``Inspector``. When
                ``empty_rate_threshold`` is present, it replaces the default threshold.
        """
        super().__init__(*args, **kwargs)

        # Per-instance state: never share the mutable class-level default between inspectors.
        self.empty_columns = set()

        if "empty_rate_threshold" in kwargs:
            self.empty_rate_threshold = kwargs["empty_rate_threshold"]

    def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
        """
        Fit the inspector by collecting the empty columns of the raw data.

        Args:
            raw_data (pd.DataFrame): Raw data.
        """
        # Fraction of null cells in each column.
        empty_rate = raw_data.isnull().mean()

        # Keep the columns whose null fraction reaches the threshold (inclusive comparison).
        self.empty_columns = set(empty_rate[empty_rate >= self.empty_rate_threshold].index)

        self.ready = True

    def inspect(self, *args, **kwargs) -> dict[str, Any]:
        """
        Return the inspection result.

        Returns:
            dict[str, Any]: A dictionary with the single key ``"empty_columns"`` holding the
            recorded column names as a list (in no particular order, since they come from a set).
        """
        return {"empty_columns": list(self.empty_columns)}


@hookimpl
def register(manager):
    """Register ``EmptyInspector`` with ``manager`` under the name ``"EmptyInspector"``."""
    manager.register("EmptyInspector", EmptyInspector)
2 changes: 1 addition & 1 deletion sdgx/data_processors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class DataProcessorManager(Manager):
"IntValueFormatter",
"DatetimeFormatter",
]
] + ["ColumnOrderTransformer".lower()]
] + ["EmptyTransformer".lower(), "ColumnOrderTransformer".lower()]
"""
preset_defalut_processors list stores the lowercase names of the transformers loaded by default. When using the synthesizer, they will be loaded by default to facilitate user operations.
Expand Down
104 changes: 104 additions & 0 deletions sdgx/data_processors/transformers/empty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

from typing import Any

import pandas as pd

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.transformers.base import Transformer
from sdgx.utils import logger


class EmptyTransformer(Transformer):
    """
    A transformer that handles empty columns in a DataFrame.

    The names of the empty columns are read from the metadata during ``fit``. ``convert`` removes
    those columns from the data and ``reverse_convert`` attaches them again, filled with ``None``.

    Attributes:
        empty_columns (list): A list of column names that are identified as empty.

    Methods:
        fit(metadata: Metadata | None = None, **kwargs: dict[str, Any]):
            Remembers the empty columns listed in the provided metadata.
        convert(raw_data: pd.DataFrame) -> pd.DataFrame:
            Converts the raw data by removing the identified empty columns.
        reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame:
            Reverses the conversion by restoring the previously removed empty columns.
    """

    # Class-level default only: ``fit`` always rebinds a fresh list on the instance and
    # nothing in this class mutates this shared list in place.
    empty_columns: list = []
    """
    List of column names that are identified as empty. This attribute is populated during the fitting process
    and is used to remove these columns during the conversion process and restore them during the reverse conversion process.
    """

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Fit the transformer by remembering the empty columns recorded in the metadata.

        Args:
            metadata (Metadata | None): The metadata containing information about the data,
                including empty columns. When it is ``None``, or when its ``"empty_columns"``
                lookup yields ``None``, the transformer is fitted with no empty columns.
            **kwargs (dict[str, Any]): Additional keyword arguments (unused).

        Returns:
            None
        """
        # The signature allows ``metadata=None``; guard it instead of failing on ``None.get``.
        recorded = metadata.get("empty_columns") if metadata is not None else None
        self.empty_columns = list(recorded) if recorded is not None else []

        logger.info("EmptyTransformer Fitted.")

        self.fitted = True

        return

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Convert the raw data by removing the identified empty columns.

        Args:
            raw_data (pd.DataFrame): The input DataFrame containing the raw data.

        Returns:
            pd.DataFrame: The processed DataFrame with empty columns removed.
        """
        processed_data = raw_data

        logger.info("Converting data using EmptyTransformer...")

        # Columns are removed one at a time through the base-class helper.
        for each_col in self.empty_columns:
            processed_data = self.remove_columns(processed_data, [each_col])

        logger.info("Converting data using EmptyTransformer... Finished.")

        return processed_data

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Reverse the conversion by restoring the previously removed empty columns.

        Args:
            processed_data (pd.DataFrame): The input DataFrame containing the processed data.

        Returns:
            pd.DataFrame: The DataFrame with the previously removed empty columns attached again,
            each filled with ``None``.
        """
        df_length = processed_data.shape[0]
        # Build the placeholder columns on the incoming index so they line up row-for-row with
        # the data however ``attach_columns`` combines the frames.
        # NOTE(review): presumably a column-wise concat — confirm in the base class.
        row_index = processed_data.index

        for each_col_name in self.empty_columns:
            each_empty_df = pd.DataFrame({each_col_name: [None] * df_length}, index=row_index)
            processed_data = self.attach_columns(processed_data, each_empty_df)

        logger.info("Data reverse-converted by EmptyTransformer.")

        return processed_data


@hookimpl
def register(manager):
    """Register ``EmptyTransformer`` with ``manager`` under the name ``"EmptyTransformer"``."""
    manager.register("EmptyTransformer", EmptyTransformer)
44 changes: 44 additions & 0 deletions tests/data_models/inspector/test_empty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
import pytest

from sdgx.data_models.inspectors.empty import EmptyInspector


@pytest.fixture
def inspector():
    """Provide an ``EmptyInspector`` created with its default settings."""
    default_inspector = EmptyInspector()
    yield default_inspector


@pytest.fixture
def raw_data(demo_single_table_path):
    """Provide the CSV at ``demo_single_table_path`` loaded as a DataFrame."""
    demo_table = pd.read_csv(demo_single_table_path)
    yield demo_table


@pytest.fixture
def test_empty_data(raw_data: pd.DataFrame):
    """
    Provide the demo table with the ``age`` and ``fnlwgt`` columns fully emptied.

    Args:
        raw_data (pd.DataFrame): The demo table; it is modified in place.

    Yields:
        pd.DataFrame: The same DataFrame, where both columns are float columns holding only NaN.
    """
    # Replace the whole column instead of writing through ``Series.values``: that array may be a
    # copy, and it is read-only when pandas Copy-on-Write is active, so the write is not reliable.
    for column_name in ("age", "fnlwgt"):
        raw_data[column_name] = float("nan")

    yield raw_data


def test_inspector(inspector: EmptyInspector, test_empty_data):
    """Check that fitting on the emptied table reports exactly ``age`` and ``fnlwgt``."""
    inspector.fit(test_empty_data)

    # The inspector must be marked ready and must have recorded at least one column.
    assert inspector.ready
    assert inspector.empty_columns

    reported_columns = sorted(inspector.inspect()["empty_columns"])
    expected_columns = sorted(["age", "fnlwgt"])
    assert reported_columns == expected_columns

    assert inspector.inspect_level == 90


# Allow running this test module directly as a script: verbose output, no output capturing.
if __name__ == "__main__":
    pytest.main(["-vv", "-s", __file__])
70 changes: 70 additions & 0 deletions tests/data_processors/transformers/test_transformers_empty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import numpy as np
import pandas as pd
import pytest

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.transformers.empty import EmptyTransformer


@pytest.fixture
def raw_data(demo_single_table_path):
    """Provide the CSV at ``demo_single_table_path`` loaded as a DataFrame."""
    demo_table = pd.read_csv(demo_single_table_path)
    yield demo_table


@pytest.fixture
def test_empty_data(raw_data: pd.DataFrame):
    """
    Provide the demo table with the ``age`` and ``fnlwgt`` columns fully emptied.

    Args:
        raw_data (pd.DataFrame): The demo table; it is modified in place.

    Yields:
        pd.DataFrame: The same DataFrame, where both columns are float columns holding only NaN.
    """
    # Replace the whole column instead of writing through ``Series.values``: that array may be a
    # copy, and it is read-only when pandas Copy-on-Write is active, so the write is not reliable.
    for column_name in ("age", "fnlwgt"):
        raw_data[column_name] = float("nan")

    yield raw_data


def test_nan_handling_test_df(test_empty_data: pd.DataFrame):
    """
    Round-trip a table with two empty columns through ``EmptyTransformer``.

    The transformer is fitted on metadata built from the table, then the test checks that
    ``convert`` leaves no empty columns in the metadata of its output and that
    ``reverse_convert`` brings ``age`` and ``fnlwgt`` back as empty columns.

    Parameters:
        test_empty_data (pd.DataFrame): The table whose ``age`` and ``fnlwgt`` columns are empty.
    """
    source_metadata = Metadata.from_dataframe(test_empty_data)

    # A new transformer starts out unfitted.
    transformer = EmptyTransformer()
    assert transformer.fitted is False

    # Fitting flips the flag and records the two empty columns.
    transformer.fit(source_metadata)
    assert transformer.fitted
    assert sorted(transformer.empty_columns) == ["age", "fnlwgt"]

    # After conversion, metadata rebuilt from the output reports no empty columns.
    stripped_df = transformer.convert(test_empty_data)
    stripped_metadata = Metadata.from_dataframe(stripped_df)
    assert not stripped_metadata.get("empty_columns")

    # After reverse conversion, both columns are back and detected as empty again.
    restored_df = transformer.reverse_convert(stripped_df)
    restored_metadata = Metadata.from_dataframe(restored_df)
    assert restored_metadata.get("empty_columns") == {"age", "fnlwgt"}

0 comments on commit 7c338ad

Please sign in to comment.