Skip to content

Commit

Permalink
Implement datetime_formats in DatetimeInspector (#125)
Browse files Browse the repository at this point in the history
  • Loading branch information
Femi-lawal committed Jan 31, 2024
1 parent c358110 commit 73e9476
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 2 deletions.
44 changes: 42 additions & 2 deletions sdgx/data_models/inspectors/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@ class DatetimeInspector(Inspector):
Often, difficult-to-recognize date or datetime objects are also recognized as discrete types by DatetimeInspector, causing the column to be marked repeatedly.
"""
PRESET_FORMAT_STRINGS = ["%Y/%m/%d", "%Y-%m-%d", "%d %b %Y"]

def __init__(self, user_formats: list[str] = None, *args, **kwargs):
    """Initialise the inspector with optional user-supplied datetime formats.

    Args:
        user_formats: Extra format strings to consider when detecting a
            column's datetime format. ``None`` or an empty sequence means
            no user-defined formats are stored.
        *args: Forwarded unchanged to the parent initialiser.
        **kwargs: Forwarded unchanged to the parent initialiser.
    """
    super().__init__(*args, **kwargs)
    # Names of the columns identified as datetime columns.
    self.datetime_columns: set[str] = set()
    # Copy the formats so that later mutation of the caller's list cannot
    # silently change the candidates used by this inspector.
    self.user_defined_formats = list(user_formats) if user_formats else []
    # Maps a column name to the format string detected for it.
    self.column_formats: dict[str, str] = {}

@classmethod
@ignore_warnings(category=UserWarning)
Expand Down Expand Up @@ -59,12 +62,49 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
if DatetimeInspector.can_convert_to_datetime(each_col):
self.datetime_columns.add(col_name)

# Process for detecting format strings
for col_name in self.datetime_columns:
each_col = raw_data[col_name]
datetime_format = self.detect_datetime_format(each_col)
if datetime_format:
self.column_formats[col_name] = datetime_format

self.ready = True

def detect_datetime_format(self, series: pd.Series):
    """Detect a datetime format string that parses every value of a Series.

    The user-defined formats are tried first, followed by
    ``PRESET_FORMAT_STRINGS``. The first candidate that parses all values
    in the series is returned. This method only inspects the data; it does
    not modify the inspector's state.

    Note that an empty series has no unparseable values, so it matches the
    first candidate format.

    Args:
        series (pd.Series): The pandas Series to detect the datetime format of.

    Returns:
        The first format string that parses all values in the series, or
        ``None`` if no candidate format does.
    """
    for fmt in self.user_defined_formats + self.PRESET_FORMAT_STRINGS:
        try:
            # Parse element-wise; with errors="coerce" a value that does not
            # match the format yields a null entry instead of raising.
            parsed_series = series.apply(
                lambda x: pd.to_datetime(x, format=fmt, errors="coerce")
            )
        except ValueError:
            # This candidate could not be applied at all; try the next one.
            continue
        if not parsed_series.isnull().any():
            return fmt

    return None

def inspect(self, *args, **kwargs) -> dict[str, Any]:
    """Inspect raw data and generate metadata.

    Returns:
        dict[str, Any]: A mapping with two entries:
            ``datetime_columns`` - a list of the column names held in
            ``self.datetime_columns``;
            ``datetime_formats`` - a mapping of column name to format
            string, taken from ``self.column_formats``.
    """
    # Both values are copies, so callers cannot mutate the inspector's
    # internal state through the returned metadata.
    return {
        "datetime_columns": list(self.datetime_columns),
        "datetime_formats": dict(self.column_formats),
    }


@hookimpl
Expand Down
24 changes: 24 additions & 0 deletions tests/data_models/inspector/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,5 +87,29 @@ def test_inspector_generated_data(inspector: DatetimeInspector, datetime_test_df
assert inspector.inspect_level == 20


def test_custom_format_detection(datetime_test_df: pd.DataFrame):
    """Fit an inspector given a custom format and check the reported formats."""
    # Build the inspector with one user-defined format and fit it on the fixture data.
    custom_inspector = DatetimeInspector(user_formats=["%Y-%m-%d %H:%M:%S"])
    custom_inspector.fit(datetime_test_df)

    metadata = custom_inspector.inspect()

    # Expected format per column, checked in the same order as listed here.
    expected_formats = {
        "simple_datetime": "%Y-%m-%d %H:%M:%S",
        "simple_datetime_2": "%d %b %Y",
        "date_with_time": "%Y-%m-%d %H:%M:%S",
    }
    for column_name, expected_format in expected_formats.items():
        assert metadata["datetime_formats"][column_name] == expected_format
    assert custom_inspector.inspect_level == 20


def test_detect_datetime_format_partial_and_no_match(inspector):
    """No format is reported when some, or all, values fail to parse."""
    # One valid date plus one invalid value: no single format parses both.
    partial_match_series = pd.Series(["2023-03-17", "invalid-date"])
    # No value resembles a date at all.
    no_match_series = pd.Series(["not-a-date"])

    # Compare to None by identity; `== None` can be satisfied by objects
    # that define a permissive __eq__.
    assert inspector.detect_datetime_format(partial_match_series) is None
    assert inspector.detect_datetime_format(no_match_series) is None
    assert inspector.inspect_level == 20


if __name__ == "__main__":
pytest.main(["-vv", "-s", __file__])

0 comments on commit 73e9476

Please sign in to comment.