Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement datetime_formats in DatetimeInspector #125

Merged
merged 1 commit into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 42 additions & 2 deletions sdgx/data_models/inspectors/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@ class DatetimeInspector(Inspector):

Often, difficult-to-recognize date or datetime objects are also recognized as discrete types by DatetimeInspector, causing the column to be marked repeatedly.
"""
PRESET_FORMAT_STRINGS = ["%Y/%m/%d", "%Y-%m-%d", "%d %b %Y"]

def __init__(self, *args, **kwargs):
def __init__(self, user_formats: list[str] = None, *args, **kwargs):
    """Create the inspector, optionally with extra datetime format strings.

    Args:
        user_formats: Format strings to try before ``PRESET_FORMAT_STRINGS``
            when detecting a column's datetime format. ``None`` or an empty
            collection means only the preset formats are used.
        *args: Forwarded unchanged to the parent initializer.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    super().__init__(*args, **kwargs)
    # Names of the columns recognised as datetime columns.
    self.datetime_columns: set[str] = set()
    # Store a private list copy: this keeps later changes to the caller's
    # object from leaking in, and lets a tuple (or other iterable) be
    # concatenated with the preset list during format detection.
    self.user_defined_formats = list(user_formats) if user_formats else []
    # Maps a datetime column name to the format string detected for it.
    self.column_formats: dict[str, str] = {}

@classmethod
@ignore_warnings(category=UserWarning)
Expand Down Expand Up @@ -59,12 +62,49 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
if DatetimeInspector.can_convert_to_datetime(each_col):
self.datetime_columns.add(col_name)

# Process for detecting format strings
for col_name in self.datetime_columns:
each_col = raw_data[col_name]
datetime_format = self.detect_datetime_format(each_col)
if datetime_format:
self.column_formats[col_name] = datetime_format

self.ready = True

def detect_datetime_format(self, series: pd.Series):
    """Detect a single datetime format that parses every value in a Series.

    Candidate formats are tried in order: the user-defined formats first,
    then ``PRESET_FORMAT_STRINGS``. The first format under which no value
    fails to parse is returned.

    Args:
        series (pd.Series): The pandas Series to detect the datetime format of.

    Returns:
        str: The first format that parses all values in the series, or None
        if no candidate format does.
    """
    candidates = [*self.user_defined_formats, *self.PRESET_FORMAT_STRINGS]
    for fmt in candidates:
        try:
            # errors="coerce" turns every unparseable value into NaT, so a
            # single vectorised call tells us whether the whole column fits.
            parsed = pd.to_datetime(series, format=fmt, errors="coerce")
        except ValueError:
            # This candidate cannot be applied at all; try the next one.
            continue
        if not parsed.isnull().any():
            return fmt

    # Detection is a pure query: it must not flip ``self.ready`` (fit() sets
    # that flag itself), so a miss is reported with an explicit None.
    return None

def inspect(self, *args, **kwargs) -> dict[str, Any]:
    """Inspect raw data and generate metadata.

    Returns:
        A dict with two entries: ``"datetime_columns"``, a list of the column
        names recorded in ``self.datetime_columns``, and
        ``"datetime_formats"``, a mapping of column name to the format string
        recorded in ``self.column_formats``.
    """
    return {
        "datetime_columns": list(self.datetime_columns),
        # Hand out a copy so callers cannot mutate the inspector's state.
        "datetime_formats": dict(self.column_formats),
    }


@hookimpl
Expand Down
24 changes: 24 additions & 0 deletions tests/data_models/inspector/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,29 @@ def test_inspector_generated_data(inspector: DatetimeInspector, datetime_test_df
assert inspector.inspect_level == 20


def test_custom_format_detection(datetime_test_df: pd.DataFrame):
    """A user-supplied format is detected alongside the preset formats."""
    custom_format = "%Y-%m-%d %H:%M:%S"

    # Build an inspector that knows the custom format, then fit it.
    custom_inspector = DatetimeInspector(user_formats=[custom_format])
    custom_inspector.fit(datetime_test_df)

    # Only the per-column formats of the generated metadata matter here.
    detected_formats = custom_inspector.inspect()["datetime_formats"]

    # Each column must map to the expected format string.
    assert detected_formats["simple_datetime"] == custom_format
    assert detected_formats["simple_datetime_2"] == "%d %b %Y"
    assert detected_formats["date_with_time"] == custom_format
    assert custom_inspector.inspect_level == 20


def test_detect_datetime_format_partial_and_no_match(inspector):
    """No format is reported unless one parses every value in the series."""
    # One valid date plus one invalid value: no format covers both.
    partial_match_series = pd.Series(["2023-03-17", "invalid-date"])
    # Nothing in this series is a date at all.
    no_match_series = pd.Series(["not-a-date"])

    # Compare against the None singleton by identity, not equality.
    assert inspector.detect_datetime_format(partial_match_series) is None
    assert inspector.detect_datetime_format(no_match_series) is None
    assert inspector.inspect_level == 20


if __name__ == "__main__":
pytest.main(["-vv", "-s", __file__])
Loading