Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci
  • Loading branch information
pre-commit-ci[bot] committed Aug 22, 2024
1 parent 974367e commit d438011
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 49 deletions.
34 changes: 17 additions & 17 deletions sdgx/data_models/inspectors/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ class NumericInspector(Inspector):
A set of column names that contain float values.
"""

positive_columns: set = set()
positive_columns: set = set()
"""
A set of column names that contain only positive numeric values.
"""

negative_columns: set = set()
negative_columns: set = set()
"""
A set of column names that contain only negative numeric values.
"""
Expand Down Expand Up @@ -69,20 +69,20 @@ def _is_int_column(self, col_series: pd.Series):
bool: True if the column is predominantly integer, False otherwise.
"""
# Convert the column series to numeric values, coercing errors to NaN and dropping them
numeric_values = pd.to_numeric(col_series, errors='coerce').dropna()
numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()

# Count how many of the numeric values are integers
int_cnt = (numeric_values == numeric_values.astype(int)).sum()

# Calculate the ratio of integer values to the total numeric values
int_rate = int_cnt / len(numeric_values)

# Return True if the integer rate is greater than the predefined threshold
return int_rate > self._int_rate



def _is_positive_or_negative_column(self, col_series: pd.Series, threshold: float, comparison_func) -> bool:
def _is_positive_or_negative_column(
self, col_series: pd.Series, threshold: float, comparison_func
) -> bool:
"""
Determine if a column contains predominantly positive or negative values.
Expand All @@ -98,7 +98,7 @@ def _is_positive_or_negative_column(self, col_series: pd.Series, threshold: floa
bool: True if the column satisfies the condition, False otherwise.
"""
# Convert the column series to numeric values, coercing errors to NaN and dropping NaN values
numeric_values = pd.to_numeric(col_series, errors='coerce').dropna()
numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()

# Apply the comparison function to the numeric values and sum the results
count = comparison_func(numeric_values).sum()
Expand All @@ -109,7 +109,6 @@ def _is_positive_or_negative_column(self, col_series: pd.Series, threshold: floa
# Return True if the proportion meets or exceeds the threshold, otherwise False
return proportion >= threshold


def _is_positive_column(self, col_series: pd.Series) -> bool:
"""
Determine if a column contains predominantly positive values.
Expand Down Expand Up @@ -138,8 +137,10 @@ def _is_negative_column(self, col_series: pd.Series) -> bool:
Returns:
bool: True if the column is predominantly negative, False otherwise.
"""
return self._is_positive_or_negative_column(col_series, self.negative_threshold, lambda x: x < 0)

return self._is_positive_or_negative_column(
col_series, self.negative_threshold, lambda x: x < 0
)

def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
"""Fit the inspector.
Expand Down Expand Up @@ -187,20 +188,19 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
# Mark the inspector as ready
self.ready = True


def inspect(self, *args, **kwargs) -> dict[str, Any]:
    """Inspect raw data and generate metadata.

    Positional and keyword arguments are accepted but not used.

    Returns:
        dict[str, Any]: A dict with three keys:
            - ``int_columns``: list built from ``self.int_columns``.
            - ``float_columns``: list built from ``self.float_columns``.
            - ``numeric_format``: dict whose ``positive`` and ``negative``
              entries are the sorted names from ``self.positive_columns``
              and ``self.negative_columns``.
    """
    # Positive and negative columns should not be strictly considered as label columns,
    # so they are reported through the format dict instead.
    # sorted() accepts any iterable, so no intermediate list is needed.
    numeric_format: dict = {
        "positive": sorted(self.positive_columns),
        "negative": sorted(self.negative_columns),
    }

    return {
        "int_columns": list(self.int_columns),
        "float_columns": list(self.float_columns),
        "numeric_format": numeric_format,
    }


Expand Down
1 change: 1 addition & 0 deletions sdgx/data_processors/filter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from sdgx.data_processors.base import DataProcessor


class Filter(DataProcessor):
"""
Base class for all data filters.
Expand Down
60 changes: 33 additions & 27 deletions sdgx/data_processors/filter/positive_negative.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
from __future__ import annotations

from typing import Any

import pandas as pd

from sdgx.data_processors.filter.base import Filter
from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.filter.base import Filter
from sdgx.utils import logger


class PositiveNegativeFilter(Filter):
"""
A data processor for filtering positive and negative values.
This filter is used to ensure that values in specific columns remain positive or negative.
During the reverse conversion process, rows that do not meet the expected positivity or
This filter is used to ensure that values in specific columns remain positive or negative.
During the reverse conversion process, rows that do not meet the expected positivity or
negativity will be removed.
Attributes:
Expand All @@ -32,35 +35,33 @@ class PositiveNegativeFilter(Filter):
A set of column names that contain float values.
"""

positive_columns: set = set()
'''
positive_columns: set = set()
"""
A set of column names that are identified as containing positive numeric values.
'''
"""

negative_columns: set = set()
'''
negative_columns: set = set()
"""
A set of column names that are identified as containing negative numeric values.
'''

"""

def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
    """
    Fit method for the data filter.

    Copies the column classification from ``metadata`` onto the filter so
    that ``reverse_convert`` knows which columns to check.

    Args:
        metadata: Source of ``int_columns``, ``float_columns`` and
            ``numeric_format``. Although the signature defaults to ``None``,
            a value is required.
        **kwargs: Accepted but not used.

    Raises:
        ValueError: If ``metadata`` is ``None``.
    """
    # Fail early with a clear message instead of an AttributeError on None.
    if metadata is None:
        raise ValueError("PositiveNegativeFilter.fit() requires a Metadata object, got None.")

    # record int and float data
    self.int_columns = metadata.int_columns
    self.float_columns = metadata.float_columns

    # record pos and neg
    self.positive_columns = set(metadata.numeric_format["positive"])
    self.negative_columns = set(metadata.numeric_format["negative"])

    # Log only once the state above has actually been recorded.
    logger.info("PositiveNegativeFilter Fitted.")

def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
"""
Convert method for data filter (No Action).
Convert method for data filter (No Action).
"""

logger.info("Converting data using PositiveNegativeFilter... Finished (No Action)")
Expand All @@ -70,32 +71,37 @@ def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
    """
    Reverse_convert method for the pos_neg data filter.

    Drops every row that holds a value below zero in one of
    ``self.positive_columns`` or a value above zero in one of
    ``self.negative_columns``. Columns named in those sets but absent from
    ``processed_data`` are ignored.

    Args:
        processed_data: The data to filter.

    Returns:
        pd.DataFrame: The rows of ``processed_data`` that passed every check.
    """
    logger.info(
        f"Data reverse-converted by PositiveNegativeFilter Start with Shape: {processed_data.shape}."
    )

    available = processed_data.columns

    # Start by keeping every row, then narrow the selection column by column.
    keep = pd.Series(True, index=processed_data.index)

    for name in self.positive_columns:
        if name not in available:
            continue
        keep = keep & (processed_data[name] >= 0)

    for name in self.negative_columns:
        if name not in available:
            continue
        keep = keep & (processed_data[name] <= 0)

    result = processed_data[keep]

    logger.info(
        f"Data reverse-converted by PositiveNegativeFilter with Output Shape: {result.shape}."
    )

    return result


@hookimpl
def register(manager):
    """Register ``PositiveNegativeFilter`` with ``manager`` under its class name."""
    filter_cls = PositiveNegativeFilter
    manager.register("PositiveNegativeFilter", filter_cls)
4 changes: 2 additions & 2 deletions tests/data_models/inspector/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def test_inspector(inspector: NumericInspector, raw_data):
assert not inspector.float_columns
assert inspector.inspect_level == 10
assert inspector.negative_columns == set()
assert inspector.positive_columns == {'age', 'hours-per-week', 'fnlwgt','educational-num'}
assert set(inspector.inspect().keys()) == {'int_columns','float_columns', 'numeric_format'}
assert inspector.positive_columns == {"age", "hours-per-week", "fnlwgt", "educational-num"}
assert set(inspector.inspect().keys()) == {"int_columns", "float_columns", "numeric_format"}


if __name__ == "__main__":
Expand Down
17 changes: 14 additions & 3 deletions tests/data_processors/filter/test_filters_pos_neg.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,15 @@ def pos_neg_test_df():
mixed_float = np.random.uniform(-50, 50, size=row_cnt)

X = [
[int_id[i], pos_int[i], neg_int[i], pos_float[i], neg_float[i], mixed_int[i], mixed_float[i]]
[
int_id[i],
pos_int[i],
neg_int[i],
pos_float[i],
neg_float[i],
mixed_int[i],
mixed_float[i],
]
for i in range(row_cnt)
]

Expand Down Expand Up @@ -60,10 +68,13 @@ def test_positive_negative_filter(pos_neg_test_df: pd.DataFrame):

# Check: whether mixed columns remained unchanged
pd.testing.assert_series_equal(pos_neg_test_df["mixed_int"], reverse_converted_df["mixed_int"])
pd.testing.assert_series_equal(pos_neg_test_df["mixed_float"], reverse_converted_df["mixed_float"])
pd.testing.assert_series_equal(
pos_neg_test_df["mixed_float"], reverse_converted_df["mixed_float"]
)

# Check if reverse_convert correctly filtered out non-compliant rows (samples)
assert reverse_converted_df.shape[0] <= pos_neg_test_df.shape[0]


if __name__ == "__main__":
pytest.main(["-vv", "-s", __file__])
pytest.main(["-vv", "-s", __file__])

0 comments on commit d438011

Please sign in to comment.