SNOW-1677892, SNOW-1677897: Add support for DataFrame.tz_localize and Series.tz_localize #2398

Merged · 3 commits · Oct 4, 2024

Changes from 2 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
- Added support for applying `rolling().count()` and `expanding().count()` to `Timedelta` series and columns.
- Added support for `tz` in both `pd.date_range` and `pd.bdate_range`.
- Added support for `Series.items`.
- Added support for `DataFrame.tz_localize` and `Series.tz_localize`.

#### Improvements

Expand Down
3 changes: 2 additions & 1 deletion docs/source/modin/supported/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,8 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``tz_convert`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``tz_localize`` | N | | |
| ``tz_localize`` | P | ``axis``, ``level``, ``copy``, | |
| | | ``ambiguous``, ``nonexistent`` | |
Reviewer (Contributor):

qq: how did we determine what parameters are going to be skipped (not implemented)? what's the process / criteria?

Author (Contributor):

Good question. Initially, we want to make sure we support all the default values (for parameters that have them). The parameter tz here, for instance, has no default value, so it must be fully supported.

We can later add more parameter coverage, which usually happens based on customer requests. The rationale is that default values are typically the ones users rely on most. (A sketch of this "defaults first" pattern follows this file's diff.)

+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``unstack`` | P | ``sort`` | ``N`` for non-integer ``level``. |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
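
[Editor's note] To illustrate the "defaults first" criterion from the review thread above, here is a minimal, hypothetical sketch: the full pandas signature is accepted, but any argument that deviates from its pandas default raises NotImplementedError. The function name and messages are illustrative, not the plugin's actual code.

    def tz_localize_sketch(tz, axis=0, level=None, copy=True,
                           ambiguous="raise", nonexistent="raise"):
        # Map each optional parameter to whether the caller deviated
        # from its pandas default.
        deviations = {
            "axis": axis != 0,
            "level": level is not None,
            "copy": copy is not True,
            "ambiguous": ambiguous != "raise",
            "nonexistent": nonexistent != "raise",
        }
        for name, deviates in deviations.items():
            if deviates:
                raise NotImplementedError(
                    f"'tz_localize' doesn't yet support non-default '{name}'"
                )
        return f"localize index to {tz}"  # placeholder for the real work

    print(tz_localize_sketch("CET"))  # works: tz given, everything else default
    # tz_localize_sketch("CET", ambiguous="infer")  # -> NotImplementedError
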
3 changes: 2 additions & 1 deletion docs/source/modin/supported/series_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,8 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``tz_convert`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``tz_localize`` | N | | |
| ``tz_localize`` | P | ``axis``, ``level``, ``copy``, | |
| | | ``ambiguous``, ``nonexistent`` | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``unique`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
27 changes: 20 additions & 7 deletions src/snowflake/snowpark/modin/plugin/_internal/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1255,25 +1255,38 @@ def update_snowflake_quoted_identifiers_with_expressions(
def apply_snowpark_function_to_columns(
self,
snowpark_func: Callable[[Any], SnowparkColumn],
include_data: bool = True,
include_index: bool = False,
return_type: Optional[SnowparkPandasType] = None,
) -> "InternalFrame":
"""
Apply snowpark function callable to all data columns of an InternalFrame. If
include_index is True also apply this function to all index columns. The
snowflake quoted identifiers are preserved.
Apply snowpark function callable to all data columns and/or all index columns of an InternalFrame.
If include_data is True, apply the function to all data columns.
If include_index is True, apply the function to all index columns.
Raise an error if both include_data and include_index are False.
The snowflake quoted identifiers are preserved.

Arguments:
snowpark_func: Snowpark function to apply to columns of underlying snowpark df.
return_type: The optional SnowparkPandasType for the new column.
include_data: Whether to apply the function to data columns.
include_index: Whether to apply the function to index columns as well.
return_type: The optional SnowparkPandasType for the new column.

Returns:
InternalFrame with snowpark_func applies to columns of original frame, all other columns remain unchanged.
InternalFrame with snowpark_func applied to columns of original frame, all other columns remain unchanged.
"""
snowflake_ids = self.data_column_snowflake_quoted_identifiers
if include_index:

assert (
include_data or include_index
), "Internal error: Cannot exclude both data columns and index columns"
if include_data and include_index:
snowflake_ids = self.data_column_snowflake_quoted_identifiers
snowflake_ids.extend(self.index_column_snowflake_quoted_identifiers)
elif include_data:
snowflake_ids = self.data_column_snowflake_quoted_identifiers
else:
assert include_index
snowflake_ids = self.index_column_snowflake_quoted_identifiers

return self.update_snowflake_quoted_identifiers_with_expressions(
{col_id: snowpark_func(col(col_id)) for col_id in snowflake_ids},
Expand Down
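
[Editor's note] A self-contained sketch (hypothetical names, not the plugin's code) of the column-selection logic that include_data/include_index now drive in apply_snowpark_function_to_columns. Frame-level tz_localize passes include_data=False, include_index=True because pandas' tz_localize localizes only the Index.

    from typing import List

    def select_target_columns(
        data_ids: List[str],
        index_ids: List[str],
        include_data: bool = True,
        include_index: bool = False,
    ) -> List[str]:
        # Mirrors the assert in apply_snowpark_function_to_columns:
        # at least one of the two column groups must be selected.
        assert include_data or include_index, (
            "Internal error: Cannot exclude both data columns and index columns"
        )
        ids: List[str] = []
        if include_data:
            ids.extend(data_ids)
        if include_index:
            ids.extend(index_ids)
        return ids

    # Frame-level tz_localize touches only the index columns:
    print(select_target_columns(['"A"', '"B"'], ['"__index__"'],
                                include_data=False, include_index=True))
    # ['"__index__"']
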
Original file line number Diff line number Diff line change
Expand Up @@ -11684,7 +11684,7 @@ def dt_property(

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
property_function, include_index
property_function, include_index=include_index
)
)

Expand Down Expand Up @@ -17243,7 +17243,7 @@ def dt_tz_localize(
return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
lambda column: tz_localize_column(column, tz),
include_index,
include_index=include_index,
)
)

Expand All @@ -17265,7 +17265,7 @@ def dt_tz_convert(
return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
lambda column: tz_convert_column(column, tz),
include_index,
include_index=include_index,
)
)

Expand Down Expand Up @@ -17346,7 +17346,9 @@ def ceil_func(column: SnowparkColumn) -> SnowparkColumn:

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
ceil_func, include_index, return_type
ceil_func,
include_index=include_index,
return_type=return_type,
)
)

Expand Down Expand Up @@ -17505,7 +17507,9 @@ def round_func(column: SnowparkColumn) -> SnowparkColumn:

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
round_func, include_index, return_type
round_func,
include_index=include_index,
return_type=return_type,
)
)

Expand Down Expand Up @@ -17578,7 +17582,9 @@ def floor_func(column: SnowparkColumn) -> SnowparkColumn:

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
floor_func, include_index, return_type
floor_func,
include_index=include_index,
return_type=return_type,
),
)

Expand All @@ -17600,7 +17606,8 @@ def normalize_column(column: SnowparkColumn) -> SnowparkColumn:

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
normalize_column, include_index
normalize_column,
include_index=include_index,
)
)

Expand Down Expand Up @@ -17633,7 +17640,8 @@ def month_name_func(column: SnowparkColumn) -> SnowparkColumn:

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
month_name_func, include_index
month_name_func,
include_index=include_index,
)
)

Expand Down Expand Up @@ -17666,7 +17674,8 @@ def day_name_func(column: SnowparkColumn) -> SnowparkColumn:

return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
day_name_func, include_index
day_name_func,
include_index=include_index,
)
)

Expand All @@ -17688,7 +17697,7 @@ def dt_total_seconds(self, include_index: bool = False) -> "SnowflakeQueryCompil
self._modin_frame.apply_snowpark_function_to_columns(
# Cast the column to decimal of scale 9 to ensure no precision loss.
lambda x: x.cast(DecimalType(scale=9)) / 1_000_000_000,
include_index,
include_index=include_index,
)
)
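
[Editor's note] An aside on the dt_total_seconds hunk above: the cast to a scale-9 decimal exists because dividing raw nanosecond integers in floating point can silently round for large timedeltas. A plain-Python illustration of the same precision concern (not the plugin's code):

    from decimal import Decimal

    nanos = 1_234_567_890_123_456_789   # a large Timedelta value, in nanoseconds

    # float64 carries roughly 16 significant digits, so the final digits round away:
    print(nanos / 1_000_000_000)        # ~1234567890.1234567 (inexact)

    # fixed-point division preserves all nine fractional digits:
    print(Decimal(nanos) / Decimal(1_000_000_000))  # 1234567890.123456789 (exact)
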

Expand Down Expand Up @@ -18766,11 +18775,77 @@ def compare(

return result

def tz_convert(self, *args: Any, **kwargs: Any) -> None:
ErrorMessage.method_not_implemented_error("tz_convert", "BasePandasDataset")
def tz_localize(
self,
tz: Union[str, tzinfo],
axis: int = 0,
level: Optional[Level] = None,
copy: bool = True,
ambiguous: str = "raise",
nonexistent: str = "raise",
) -> "SnowflakeQueryCompiler":
"""
Localize tz-naive index of a Series or DataFrame to target time zone.

This operation localizes the Index. To localize the values in a timezone-naive Series, use Series.dt.tz_localize().

Parameters
----------
tz : str or tzinfo or None
Time zone to localize. Passing None will remove the time zone information and preserve local time.
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
The axis to localize
level : int, str, default None
If axis is a MultiIndex, localize a specific level. Otherwise must be None.
copy : bool, default True
Also make a copy of the underlying data.
ambiguous : ‘infer’, bool-ndarray, ‘NaT’, default ‘raise’
When clocks moved backward due to DST, ambiguous times may arise. For example in Central European Time (UTC+01), when going from 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the ambiguous parameter dictates how ambiguous times should be handled.
- ‘infer’ will attempt to infer fall dst-transition hours based on order
- bool-ndarray where True signifies a DST time, False designates a non-DST time (note that this flag is only applicable for ambiguous times)
- ‘NaT’ will return NaT where there are ambiguous times
- ‘raise’ will raise an AmbiguousTimeError if there are ambiguous times.
nonexistent : str, default ‘raise’
A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. Valid values are:
- ‘shift_forward’ will shift the nonexistent time forward to the closest existing time
- ‘shift_backward’ will shift the nonexistent time backward to the closest existing time
- ‘NaT’ will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- ‘raise’ will raise a NonExistentTimeError if there are nonexistent times.

Returns
-------
SnowflakeQueryCompiler
The result of applying time zone localization.
"""
if axis in (1, "columns"):
ErrorMessage.not_implemented(
f"Snowpark pandas 'tz_localize' method doesn't yet support 'axis={axis}'"
)
if level is not None:
ErrorMessage.not_implemented(
"Snowpark pandas 'tz_localize' method doesn't yet support the 'level' parameter"
)
if copy is not True:
ErrorMessage.not_implemented(
"Snowpark pandas 'tz_localize' method doesn't support 'copy=False'"
)
if not isinstance(ambiguous, str) or ambiguous != "raise":
ErrorMessage.not_implemented(
"Snowpark pandas 'tz_localize' method doesn't yet support the 'ambiguous' parameter"
)
if not isinstance(nonexistent, str) or nonexistent != "raise":
ErrorMessage.not_implemented(
"Snowpark pandas 'tz_localize' method doesn't yet support the 'nonexistent' parameter"
)

def tz_localize(self, *args: Any, **kwargs: Any) -> None:
ErrorMessage.method_not_implemented_error("tz_convert", "BasePandasDataset")
return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(
lambda column: tz_localize_column(column, tz),
include_data=False,
include_index=True,
)
)
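
[Editor's note] A hedged usage sketch of the new method at the pandas API level, assuming an active Snowpark pandas session: only the defaults of axis/level/copy/ambiguous/nonexistent are accepted, so the calls below should succeed, while any non-default argument hits the NotImplementedError guards above.

    import modin.pandas as pd
    import snowflake.snowpark.modin.plugin  # noqa: F401  (registers the Snowflake backend)

    s = pd.Series([1, 2], index=pd.DatetimeIndex(
        ["2018-09-15 01:30:00", "2018-09-15 02:30:00"]))

    localized = s.tz_localize("CET")      # supported: tz plus all defaults
    naive = localized.tz_localize(None)   # supported: dropping tz info

    # Non-default arguments raise NotImplementedError, e.g.:
    # s.tz_localize("CET", ambiguous="infer")
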

def timedelta_property(
self, property_name: str, include_index: bool = False
Expand Down Expand Up @@ -18815,5 +18890,8 @@ def timedelta_property(
f"Snowpark pandas doesn't yet support the property '{class_prefix}.{property_name}'"
) # pragma: no cover
return SnowflakeQueryCompiler(
self._modin_frame.apply_snowpark_function_to_columns(func, include_index)
self._modin_frame.apply_snowpark_function_to_columns(
func,
include_index=include_index,
)
)
111 changes: 111 additions & 0 deletions src/snowflake/snowpark/modin/plugin/docstrings/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4179,6 +4179,117 @@ def to_timestamp():
Cast to DatetimeIndex of timestamps, at *beginning* of period.
"""

def tz_localize():
"""
Localize tz-naive index of a Series or DataFrame to target time zone.

This operation localizes the Index. To localize the values in a timezone-naive Series, use Series.dt.tz_localize().

Parameters
----------
tz : str or tzinfo or None
Time zone to localize. Passing None will remove the time zone information and preserve local time.
axis : {0 or ‘index’, 1 or ‘columns’}, default 0
The axis to localize
level : int, str, default None
If axis is a MultiIndex, localize a specific level. Otherwise must be None.
copy : bool, default True
Also make a copy of the underlying data.
ambiguous : ‘infer’, bool-ndarray, ‘NaT’, default ‘raise’
When clocks moved backward due to DST, ambiguous times may arise. For example in Central European Time (UTC+01), when going from 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the ambiguous parameter dictates how ambiguous times should be handled.
- ‘infer’ will attempt to infer fall dst-transition hours based on order
- bool-ndarray where True signifies a DST time, False designates a non-DST time (note that this flag is only applicable for ambiguous times)
- ‘NaT’ will return NaT where there are ambiguous times
- ‘raise’ will raise an AmbiguousTimeError if there are ambiguous times.
nonexistent : str, default ‘raise’
A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. Valid values are:
- ‘shift_forward’ will shift the nonexistent time forward to the closest existing time
- ‘shift_backward’ will shift the nonexistent time backward to the closest existing time
- ‘NaT’ will return NaT where there are nonexistent times
- timedelta objects will shift nonexistent times by the timedelta
- ‘raise’ will raise a NonExistentTimeError if there are nonexistent times.

Returns
-------
Series/DataFrame
Same type as the input.

Raises
------
TypeError
If the TimeSeries is tz-aware and tz is not None.

Examples
--------
Localize local times:

>>> s = pd.Series(
... [1],
... index=pd.DatetimeIndex(['2018-09-15 01:30:00']),
... )
>>> s.tz_localize('CET')
2018-09-15 01:30:00+02:00 1
Freq: None, dtype: int64

Pass None to convert to tz-naive index and preserve local time:

>>> s = pd.Series([1],
... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
>>> s.tz_localize(None)
2018-09-15 01:30:00 1
Freq: None, dtype: int64

Be careful with DST changes. When there is sequential data, pandas can infer the DST time:

>>> s = pd.Series(range(7),
... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
... '2018-10-28 02:00:00',
... '2018-10-28 02:30:00',
... '2018-10-28 02:00:00',
... '2018-10-28 02:30:00',
... '2018-10-28 03:00:00',
... '2018-10-28 03:30:00']))
>>> s.tz_localize('CET', ambiguous='infer') # doctest: +SKIP
2018-10-28 01:30:00+02:00 0
2018-10-28 02:00:00+02:00 1
2018-10-28 02:30:00+02:00 2
2018-10-28 02:00:00+01:00 3
2018-10-28 02:30:00+01:00 4
2018-10-28 03:00:00+01:00 5
2018-10-28 03:30:00+01:00 6
dtype: int64

In some cases, inferring the DST is impossible. In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly

>>> s = pd.Series(range(3),
... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
... '2018-10-28 02:36:00',
... '2018-10-28 03:46:00']))
>>> s.tz_localize('CET', ambiguous=np.array([True, True, False])) # doctest: +SKIP
2018-10-28 01:20:00+02:00 0
2018-10-28 02:36:00+02:00 1
2018-10-28 03:46:00+01:00 2
dtype: int64

If the DST transition causes nonexistent times, you can shift these dates forward or backward with a timedelta object or ‘shift_forward’ or ‘shift_backward’.

>>> s = pd.Series(range(2),
... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
... '2015-03-29 03:30:00']))
>>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward') # doctest: +SKIP
2015-03-29 03:00:00+02:00 0
2015-03-29 03:30:00+02:00 1
dtype: int64
>>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward') # doctest: +SKIP
2015-03-29 01:59:59.999999999+01:00 0
2015-03-29 03:30:00+02:00 1
dtype: int64
>>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h')) # doctest: +SKIP
2015-03-29 03:30:00+02:00 0
2015-03-29 03:30:00+02:00 1
dtype: int64
"""

def truediv():
"""
Get floating division of ``DataFrame`` and `other`, element-wise (binary operator `truediv`).
Expand Down