From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 01/43] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From 98f61271b5631c08b2fae3f6f13b65e8e24a7634 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 31 Dec 2019 17:06:22 +0100 Subject: [PATCH 02/43] fix issue 3729 --- pandas/core/algorithms.py | 9 ++- pandas/core/generic.py | 8 ++ pandas/core/groupby/groupby.py | 8 ++ pandas/core/groupby/grouper.py | 15 +++- pandas/tests/groupby/test_groupby.py | 115 +++++++++++++++++++++++++++ pandas/tests/test_algos.py | 96 ++++++++++++++++++++++ 6 files changed, 248 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 42cfd9d54ac19..83b26a116d505 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -596,7 +596,11 @@ def _factorize_array( ) @Appender(_shared_docs["factorize"]) def factorize( - values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None + values, + sort: bool = False, + na_sentinel: int = -1, + size_hint: Optional[int] = None, + dropna: Optional[bool] = None, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -630,6 +634,9 @@ def factorize( uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) + if dropna is False and (codes == na_sentinel).any(): + uniques = np.append(uniques, [np.nan]) + codes = np.where(codes == na_sentinel, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2b108d3997235..19580b1e2cddc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7311,6 +7311,7 @@ def groupby( group_keys: bool_t = True, squeeze: bool_t = False, observed: bool_t = False, + dropna: Optional[bool_t] = None, ): """ Group DataFrame or Series using a mapper or by a Series of columns. @@ -7355,6 +7356,12 @@ def groupby( If False: show all values for categorical groupers. .. versionadded:: 0.23.0 + dropna : bool or None, default None + If None or True, and if group keys contain NaN values, NaN values together + with row/column will be dropped. + If False, NaN values will also be treated as the key in groups + + .. versionadded:: 1.0.0 Returns ------- @@ -7433,6 +7440,7 @@ def groupby( group_keys=group_keys, squeeze=squeeze, observed=observed, + dropna=dropna, ) def asfreq( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8ff055ff4c1be..a004c909c4160 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -373,6 +373,7 @@ def __init__( squeeze: bool = False, observed: bool = False, mutated: bool = False, + dropna: Optional[bool] = None, ): self._selection = selection @@ -396,6 +397,8 @@ def __init__( self.observed = observed self.mutated = mutated + self.dropna = dropna if dropna is not None else True + if grouper is None: from pandas.core.groupby.grouper import get_grouper @@ -407,6 +410,7 @@ def __init__( sort=sort, observed=observed, mutated=self.mutated, + dropna=self.dropna, ) self.obj = obj @@ -2540,6 +2544,7 @@ def get_groupby( squeeze: bool = False, observed: bool = False, mutated: bool = False, + dropna: Optional[bool] = None, ): klass: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] @@ -2554,6 +2559,8 @@ def get_groupby( else: raise TypeError(f"invalid type: {obj}") + dropna = dropna if dropna is not None else True + return klass( obj=obj, keys=by, @@ -2568,4 +2575,5 @@ def get_groupby( squeeze=squeeze, observed=observed, mutated=mutated, + dropna=dropna, ) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2c224a1bef338..e2c8df07aca6d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -100,7 +100,9 @@ def __new__(cls, *args, **kwargs): cls = TimeGrouper return super().__new__(cls) - def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): + def __init__( + self, key=None, level=None, freq=None, axis=0, sort=False, dropna=None + ): self.key = key self.level = level self.freq = freq @@ -112,6 +114,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): self.indexer = None self.binner = None self._grouper = None + self.dropna = dropna if dropna is not None else True @property def ax(self): @@ -138,6 +141,7 @@ def _get_grouper(self, obj, validate: bool = True): level=self.level, sort=self.sort, validate=validate, + dropna=self.dropna, ) return self.binner, self.grouper, self.obj @@ -250,6 +254,7 @@ def __init__( sort: bool = True, observed: bool = False, in_axis: bool = False, + dropna: Optional[bool] = None, ): self.name = name self.level = level @@ -261,6 +266,8 @@ def __init__( self.observed = observed self.in_axis = in_axis + self.dropna = dropna if dropna is not None else True + # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name @@ -413,7 +420,9 @@ def _make_codes(self) -> None: codes = self.grouper.codes_info uniques = self.grouper.result_index else: - codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) + codes, uniques = algorithms.factorize( + self.grouper, sort=self.sort, dropna=self.dropna + ) uniques = Index(uniques, name=self.name) self._codes = codes self._group_index = uniques @@ -432,6 +441,7 @@ def get_grouper( observed: bool = False, mutated: bool = False, validate: bool = True, + dropna: Optional[bool] = None, ) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": """ Create and return a BaseGrouper, which is an internal @@ -621,6 +631,7 @@ def is_in_obj(gpr) -> bool: sort=sort, observed=observed, in_axis=in_axis, + dropna=dropna, ) if not isinstance(gpr, Grouping) else gpr diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8f88f68c69f2b..b69a9ac853704 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2011,3 +2011,118 @@ def test_groupby_crash_on_nunique(axis): expected = expected.T tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + None, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, + ), + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"]], + { + "c": [13.0, 12.3, 123.23], + "d": [13.0, 233.0, 123.0], + "e": [13.0, 12.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe(dropna, tuples, outputs): + # GH 3729 + df_list = [ + ["A", "B", 12, 12, 12], + ["A", None, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + ["A", "B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + grouped = df.groupby(["a", "b"], dropna=dropna).sum() + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected, check_index_type=False) + + +@pytest.mark.parametrize( + "dropna, idx, outputs", + [ + (None, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), + (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), + ( + False, + ["A", "B", np.nan], + { + "b": [123.23, 13.0, 12.3], + "c": [123.0, 13.0, 233.0], + "d": [1.0, 13.0, 12.0], + }, + ), + ], +) +def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): + # GH 3729 + df_list = [ + ["B", 12, 12, 12], + [None, 12.3, 233.0, 12], + ["A", 123.23, 123, 1], + ["B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) + grouped = df.groupby("a", dropna=dropna).sum() + + expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + + tm.assert_frame_equal(grouped, expected, check_index_type=False) + + +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + None, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]}, + ), + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"]], + { + "c": [13.0, 12.3, 123.23], + "d": [12.0, 233.0, 123.0], + "e": [1.0, 12.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): + # GH 3729 + df_list = [ + ["A", "B", 12, 12, 12], + ["A", None, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + ["A", "B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + agg_dict = {"c": sum, "d": max, "e": "min"} + grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected, check_index_type=False) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 82f647c9385b2..42b5a16aea707 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,6 +326,102 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): else: tm.assert_extension_array_equal(uniques, expected_uniques) + @pytest.mark.parametrize( + "data, dropna, expected_codes, expected_uniques", + [ + ( + ["a", None, "b", "a"], + None, + np.array([0, -1, 1, 0]), + np.array(["a", "b"], dtype=object), + ), + ( + ["a", np.nan, "b", "a"], + None, + np.array([0, -1, 1, 0]), + np.array(["a", "b"], dtype=object), + ), + ( + ["a", None, "b", "a"], + True, + np.array([0, -1, 1, 0]), + np.array(["a", "b"], dtype=object), + ), + ( + ["a", np.nan, "b", "a"], + True, + np.array([0, -1, 1, 0]), + np.array(["a", "b"], dtype=object), + ), + ( + ["a", None, "b", "a"], + False, + np.array([0, 2, 1, 0]), + np.array(["a", "b", np.nan], dtype=object), + ), + ( + ["a", np.nan, "b", "a"], + False, + np.array([0, 2, 1, 0]), + np.array(["a", "b", np.nan], dtype=object), + ), + ], + ) + def test_object_factorize_dropna( + self, data, dropna, expected_codes, expected_uniques + ): + codes, uniques = algos.factorize(data, dropna=dropna) + + tm.assert_numpy_array_equal(uniques, expected_uniques) + tm.assert_numpy_array_equal(codes, expected_codes) + + @pytest.mark.parametrize( + "data, dropna, expected_codes, expected_uniques", + [ + ( + [1, None, 1, 2], + None, + np.array([0, -1, 0, 1]), + np.array([1, 2], dtype="O"), + ), + ( + [1, np.nan, 1, 2], + None, + np.array([0, -1, 0, 1]), + np.array([1, 2], dtype=np.float64), + ), + ( + [1, None, 1, 2], + True, + np.array([0, -1, 0, 1]), + np.array([1, 2], dtype="O"), + ), + ( + [1, np.nan, 1, 2], + True, + np.array([0, -1, 0, 1]), + np.array([1, 2], dtype=np.float64), + ), + ( + [1, None, 1, 2], + False, + np.array([0, 2, 0, 1]), + np.array([1, 2, np.nan], dtype="O"), + ), + ( + [1, np.nan, 1, 2], + False, + np.array([0, 2, 0, 1]), + np.array([1, 2, np.nan], dtype=np.float64), + ), + ], + ) + def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques): + codes, uniques = algos.factorize(data, dropna=dropna) + + tm.assert_numpy_array_equal(uniques, expected_uniques) + tm.assert_numpy_array_equal(codes, expected_codes) + class TestUnique: def test_ints(self): From eb717ec405a843d6a17904e80964e0532d602784 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 31 Dec 2019 17:51:24 +0100 Subject: [PATCH 03/43] not check type --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 42b5a16aea707..5e74c7745dc60 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -373,7 +373,7 @@ def test_object_factorize_dropna( codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) - tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(codes, expected_codes, check_dtype=False) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", @@ -420,7 +420,7 @@ def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniqu codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) - tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(codes, expected_codes, check_dtype=False) class TestUnique: From de2ee5d5b2f4d21a8ba4001d42aeac68ff1ff1ca Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 31 Dec 2019 19:35:40 +0100 Subject: [PATCH 04/43] Add groupby test for Series --- pandas/tests/groupby/test_groupby.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 723a00b668ae8..1abe79dec17e7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2100,6 +2100,25 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): tm.assert_frame_equal(grouped, expected, check_index_type=False) +@pytest.mark.parametrize( + "dropna, idx, expected", + [ + (None, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), + (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), + ( + False, + ["a", "a", "b", np.nan], + pd.Series([3, 3, 3], index=["a", "b", np.nan]), + ), + ], +) +def test_groupby_dropna_series(dropna, idx, expected): + ser = pd.Series([1, 2, 3, 3], index=idx) + + result = ser.groupby(level=0, dropna=dropna).sum() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ From def05cc5910a7859003110556ba54e9768a4876e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 31 Dec 2019 19:47:11 +0100 Subject: [PATCH 05/43] Add whatsnew note --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e8663853b7684..2638fbab2f08a 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -549,6 +549,7 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). +- :meth:`DataFrame.groupby` and :meth:`Series.groupby` have gained ``dropna`` argument in order to allow ``NaN`` values in group keys (:issue:`3729`) .. _whatsnew_1000.api.documentation: From 28888078c0a41a09742316c555d9ebde1b194a5b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 1 Jan 2020 10:30:46 +0100 Subject: [PATCH 06/43] Code change based on JR review --- doc/source/whatsnew/v1.0.0.rst | 11 ++++++++++- pandas/core/algorithms.py | 4 ++-- pandas/core/generic.py | 6 +++--- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/grouper.py | 6 +++--- pandas/tests/groupby/test_groupby.py | 12 ------------ pandas/tests/test_algos.py | 24 ------------------------ 7 files changed, 21 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2638fbab2f08a..30f6786cd1113 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -191,6 +191,16 @@ method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate indices used for each window during the rolling aggregation. For more details and example usage, see the :ref:`custom window rolling documentation ` +.. _whatsnew_1000.groupby_key: + +Allow NaN in groupby key +^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to +allow ``NaN`` values in group keys. Users can define ``dropna`` to ``False`` if they want to include +``NaN`` values in groupby keys. The default is set to ``True`` for ``dropna`` to keep backwards +compatibility (:issue:`3729`) + .. _whatsnew_1000.enhancements.other: Other enhancements @@ -549,7 +559,6 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). -- :meth:`DataFrame.groupby` and :meth:`Series.groupby` have gained ``dropna`` argument in order to allow ``NaN`` values in group keys (:issue:`3729`) .. _whatsnew_1000.api.documentation: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 83b26a116d505..4d6f91d054740 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -600,7 +600,7 @@ def factorize( sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None, - dropna: Optional[bool] = None, + dropna: bool = True, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) @@ -634,7 +634,7 @@ def factorize( uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) - if dropna is False and (codes == na_sentinel).any(): + if not dropna and (codes == na_sentinel).any(): uniques = np.append(uniques, [np.nan]) codes = np.where(codes == na_sentinel, len(uniques) - 1, codes) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 19580b1e2cddc..d97a4235c75b7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7311,7 +7311,7 @@ def groupby( group_keys: bool_t = True, squeeze: bool_t = False, observed: bool_t = False, - dropna: Optional[bool_t] = None, + dropna: bool_t = True, ): """ Group DataFrame or Series using a mapper or by a Series of columns. @@ -7356,8 +7356,8 @@ def groupby( If False: show all values for categorical groupers. .. versionadded:: 0.23.0 - dropna : bool or None, default None - If None or True, and if group keys contain NaN values, NaN values together + dropna : bool, default True + If True, and if group keys contain NaN values, NaN values together with row/column will be dropped. If False, NaN values will also be treated as the key in groups diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e39d684c0eb49..a7f98aca7011e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -373,7 +373,7 @@ def __init__( squeeze: bool = False, observed: bool = False, mutated: bool = False, - dropna: Optional[bool] = None, + dropna: bool = True, ): self._selection = selection @@ -397,7 +397,7 @@ def __init__( self.observed = observed self.mutated = mutated - self.dropna = dropna if dropna is not None else True + self.dropna = dropna if grouper is None: from pandas.core.groupby.grouper import get_grouper @@ -2547,7 +2547,7 @@ def get_groupby( squeeze: bool = False, observed: bool = False, mutated: bool = False, - dropna: Optional[bool] = None, + dropna: bool = True, ) -> GroupBy: klass: Type[GroupBy] diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9a40ed1c319a1..919602fb3d728 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -254,7 +254,7 @@ def __init__( sort: bool = True, observed: bool = False, in_axis: bool = False, - dropna: Optional[bool] = None, + dropna: bool = True, ): self.name = name self.level = level @@ -266,7 +266,7 @@ def __init__( self.observed = observed self.in_axis = in_axis - self.dropna = dropna if dropna is not None else True + self.dropna = dropna # right place for this? if isinstance(grouper, (Series, Index)) and name is None: @@ -441,7 +441,7 @@ def get_grouper( observed: bool = False, mutated: bool = False, validate: bool = True, - dropna: Optional[bool] = None, + dropna: bool = True, ) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": """ Create and return a BaseGrouper, which is an internal diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1abe79dec17e7..9f04dd2cd0309 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2030,11 +2030,6 @@ def test_groupby_crash_on_nunique(axis): @pytest.mark.parametrize( "dropna, tuples, outputs", [ - ( - None, - [["A", "B"], ["B", "A"]], - {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, - ), ( True, [["A", "B"], ["B", "A"]], @@ -2071,7 +2066,6 @@ def test_groupby_dropna_multi_index_dataframe(dropna, tuples, outputs): @pytest.mark.parametrize( "dropna, idx, outputs", [ - (None, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), ( False, @@ -2103,7 +2097,6 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): @pytest.mark.parametrize( "dropna, idx, expected", [ - (None, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), ( False, @@ -2122,11 +2115,6 @@ def test_groupby_dropna_series(dropna, idx, expected): @pytest.mark.parametrize( "dropna, tuples, outputs", [ - ( - None, - [["A", "B"], ["B", "A"]], - {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]}, - ), ( True, [["A", "B"], ["B", "A"]], diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 5e74c7745dc60..e880850ee18e1 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -329,18 +329,6 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ - ( - ["a", None, "b", "a"], - None, - np.array([0, -1, 1, 0]), - np.array(["a", "b"], dtype=object), - ), - ( - ["a", np.nan, "b", "a"], - None, - np.array([0, -1, 1, 0]), - np.array(["a", "b"], dtype=object), - ), ( ["a", None, "b", "a"], True, @@ -378,18 +366,6 @@ def test_object_factorize_dropna( @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ - ( - [1, None, 1, 2], - None, - np.array([0, -1, 0, 1]), - np.array([1, 2], dtype="O"), - ), - ( - [1, np.nan, 1, 2], - None, - np.array([0, -1, 0, 1]), - np.array([1, 2], dtype=np.float64), - ), ( [1, None, 1, 2], True, From dc4fef1b0b99fd7d2680e5f75e8cb257d7abcf4c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 1 Jan 2020 10:43:23 +0100 Subject: [PATCH 07/43] add forgotten commits --- pandas/core/frame.py | 2 ++ pandas/core/groupby/groupby.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fb1ba4f6f53f8..b232c4fadea7f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5661,6 +5661,7 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, + dropna: bool = True, ) -> "groupby_generic.DataFrameGroupBy": if level is None and by is None: @@ -5677,6 +5678,7 @@ def groupby( group_keys=group_keys, squeeze=squeeze, observed=observed, + dropna=dropna, ) _shared_docs[ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a7f98aca7011e..93501e1db6a30 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2562,8 +2562,6 @@ def get_groupby( else: raise TypeError(f"invalid type: {obj}") - dropna = dropna if dropna is not None else True - return klass( obj=obj, keys=by, From 25482ec111c3e83078c7ec710a5c58dd442fc88e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 1 Jan 2020 10:44:26 +0100 Subject: [PATCH 08/43] add forgotten commit --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 919602fb3d728..176e84923b700 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -101,7 +101,7 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) def __init__( - self, key=None, level=None, freq=None, axis=0, sort=False, dropna=None + self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True ): self.key = key self.level = level @@ -114,7 +114,7 @@ def __init__( self.indexer = None self.binner = None self._grouper = None - self.dropna = dropna if dropna is not None else True + self.dropna = dropna @property def ax(self): From 015336d5dbcfa00d9e2d1357411eeb7e49a85d2d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 1 Jan 2020 10:47:22 +0100 Subject: [PATCH 09/43] Add dropna for series --- pandas/core/series.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index aa5af9bb893fa..bb3125dea1b02 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1633,6 +1633,7 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, + dropna: bool = True, ) -> "groupby_generic.SeriesGroupBy": if level is None and by is None: @@ -1649,6 +1650,7 @@ def groupby( group_keys=group_keys, squeeze=squeeze, observed=observed, + dropna=dropna, ) # ---------------------------------------------------------------------- From ac2a79f51a37307a67d16aed4f5b7c6ccb611a8b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 1 Jan 2020 12:04:08 +0100 Subject: [PATCH 10/43] add doc example for Series --- pandas/core/series.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index bb3125dea1b02..d213408f0275d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1433,7 +1433,7 @@ def to_string( @Substitution(klass="Series") @Appender(generic._shared_docs["to_markdown"]) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs, + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs ) -> Optional[str]: return self.to_frame().to_markdown(buf, mode, **kwargs) @@ -1620,6 +1620,34 @@ def _set_name(self, name, inplace=False): Captive 210.0 Wild 185.0 Name: Max Speed, dtype: float64 + +We can also choose to include NaN in group keys or not by defining +`dropna` parameter: + +>>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) +>>> ser.groupby(level=0).sum() +a 3 +b 3 +dtype: int64 + +>>> ser.groupby(level=0, dropna=False).sum() +a 3 +b 3 +NaN 3 +dtype: int64 + +>>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot'] +>>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed") +>>> ser.groupby(["a", "b", "a", np.nan]).mean() +a 210.0 +b 350.0 +Name: Max Speed, dtype: float64 + +>>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean() +a 210.0 +b 350.0 +NaN 20.0 +Name: Max Speed, dtype: float64 """ ) @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) @@ -4480,9 +4508,7 @@ def to_period(self, freq=None, copy=True): hist = pandas.plotting.hist_series -Series._setup_axes( - ["index"], docs={"index": "The index (axis labels) of the Series."}, -) +Series._setup_axes(["index"], docs={"index": "The index (axis labels) of the Series."}) Series._add_numeric_operations() Series._add_series_or_dataframe_operations() From eb9a6f71ca2eca0ce0df3bbbccf632ea581c17d0 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 1 Jan 2020 12:05:14 +0100 Subject: [PATCH 11/43] Add level example for series groupby --- pandas/tests/groupby/test_groupby.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9f04dd2cd0309..54595c3df9a81 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2105,13 +2105,34 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): ), ], ) -def test_groupby_dropna_series(dropna, idx, expected): +def test_groupby_dropna_series_level(dropna, idx, expected): ser = pd.Series([1, 2, 3, 3], index=idx) result = ser.groupby(level=0, dropna=dropna).sum() tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "dropna, expected", + [ + (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")), + ( + False, + pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"), + ), + ], +) +def test_groupby_dropna_series_by(dropna, expected): + ser = pd.Series( + [390.0, 350.0, 30.0, 20.0], + index=["Falcon", "Falcon", "Parrot", "Parrot"], + name="Max Speed", + ) + + result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ From ffb70f8dd021b88d33a926200f273a0f24e6b700 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 1 Jan 2020 12:05:53 +0100 Subject: [PATCH 12/43] Add doc example for frame groupby --- pandas/core/frame.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b232c4fadea7f..249c37c4706e9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1988,7 +1988,7 @@ def to_feather(self, path): @Substitution(klass="DataFrame") @Appender(_shared_docs["to_markdown"]) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs, + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs ) -> Optional[str]: kwargs.setdefault("headers", "keys") kwargs.setdefault("tablefmt", "pipe") @@ -5648,6 +5648,41 @@ def update( Type Captive 210.0 Wild 185.0 + +We can also choose to include NaN in group keys or not by defining +`dropna` parameter: + +>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] +>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + +>>> df.groupby(by=["b"]).sum() + a c +b +1.0 2 3 +2.0 2 5 + +>>> df.groupby(by=["b"], dropna=False).sum() + a c +b +1.0 2 3 +2.0 2 5 +NaN 1 4 + +>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] +>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + +>>> df.groupby(by="a").sum() + b c +a +a 13.0 13.0 +b 12.3 123.0 + +>>> df.groupby(by="a", dropna=False).sum() + b c +a +a 13.0 13.0 +b 12.3 123.0 +NaN 12.3 33.0 """ ) @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) From b0e3cce4dc203ac1ccf6917e2169db21aa59035b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Jan 2020 19:08:20 +0100 Subject: [PATCH 13/43] Code change based on JR reviews --- pandas/core/algorithms.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 4 +- pandas/tests/groupby/test_groupby.py | 143 ------------ pandas/tests/groupby/test_groupby_dropna.py | 236 ++++++++++++++++++++ 5 files changed, 240 insertions(+), 147 deletions(-) create mode 100644 pandas/tests/groupby/test_groupby_dropna.py diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4d6f91d054740..f977959774eb4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -635,7 +635,7 @@ def factorize( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) if not dropna and (codes == na_sentinel).any(): - uniques = np.append(uniques, [np.nan]) + uniques = np.append(uniques, [None]) codes = np.where(codes == na_sentinel, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 249c37c4706e9..406b53cb2a4b1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5649,7 +5649,7 @@ def update( Captive 210.0 Wild 185.0 -We can also choose to include NaN in group keys or not by defining +We can also choose to include NaN in group keys or not by setting `dropna` parameter: >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 68b07e8855607..1e5a81c36a3ba 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7347,9 +7347,9 @@ def clip( .. versionadded:: 0.23.0 dropna : bool, default True - If True, and if group keys contain NaN values, NaN values together + If True, and if group keys contain NA values, NA values together with row/column will be dropped. - If False, NaN values will also be treated as the key in groups + If False, NA values will also be treated as the key in groups .. versionadded:: 1.0.0 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 54595c3df9a81..795f76d94cc25 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2025,146 +2025,3 @@ def test_groupby_crash_on_nunique(axis): expected = expected.T tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "dropna, tuples, outputs", - [ - ( - True, - [["A", "B"], ["B", "A"]], - {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, - ), - ( - False, - [["A", "B"], ["A", np.nan], ["B", "A"]], - { - "c": [13.0, 12.3, 123.23], - "d": [13.0, 233.0, 123.0], - "e": [13.0, 12.0, 1.0], - }, - ), - ], -) -def test_groupby_dropna_multi_index_dataframe(dropna, tuples, outputs): - # GH 3729 - df_list = [ - ["A", "B", 12, 12, 12], - ["A", None, 12.3, 233.0, 12], - ["B", "A", 123.23, 123, 1], - ["A", "B", 1, 1, 1.0], - ] - df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) - grouped = df.groupby(["a", "b"], dropna=dropna).sum() - - mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) - expected = pd.DataFrame(outputs, index=mi) - - tm.assert_frame_equal(grouped, expected, check_index_type=False) - - -@pytest.mark.parametrize( - "dropna, idx, outputs", - [ - (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), - ( - False, - ["A", "B", np.nan], - { - "b": [123.23, 13.0, 12.3], - "c": [123.0, 13.0, 233.0], - "d": [1.0, 13.0, 12.0], - }, - ), - ], -) -def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): - # GH 3729 - df_list = [ - ["B", 12, 12, 12], - [None, 12.3, 233.0, 12], - ["A", 123.23, 123, 1], - ["B", 1, 1, 1.0], - ] - df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) - grouped = df.groupby("a", dropna=dropna).sum() - - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) - - tm.assert_frame_equal(grouped, expected, check_index_type=False) - - -@pytest.mark.parametrize( - "dropna, idx, expected", - [ - (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), - ( - False, - ["a", "a", "b", np.nan], - pd.Series([3, 3, 3], index=["a", "b", np.nan]), - ), - ], -) -def test_groupby_dropna_series_level(dropna, idx, expected): - ser = pd.Series([1, 2, 3, 3], index=idx) - - result = ser.groupby(level=0, dropna=dropna).sum() - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "dropna, expected", - [ - (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")), - ( - False, - pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"), - ), - ], -) -def test_groupby_dropna_series_by(dropna, expected): - ser = pd.Series( - [390.0, 350.0, 30.0, 20.0], - index=["Falcon", "Falcon", "Parrot", "Parrot"], - name="Max Speed", - ) - - result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean() - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "dropna, tuples, outputs", - [ - ( - True, - [["A", "B"], ["B", "A"]], - {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]}, - ), - ( - False, - [["A", "B"], ["A", np.nan], ["B", "A"]], - { - "c": [13.0, 12.3, 123.23], - "d": [12.0, 233.0, 123.0], - "e": [1.0, 12.0, 1.0], - }, - ), - ], -) -def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): - # GH 3729 - df_list = [ - ["A", "B", 12, 12, 12], - ["A", None, 12.3, 233.0, 12], - ["B", "A", 123.23, 123, 1], - ["A", "B", 1, 1, 1.0], - ] - df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) - agg_dict = {"c": sum, "d": max, "e": "min"} - grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) - - mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) - expected = pd.DataFrame(outputs, index=mi) - - tm.assert_frame_equal(grouped, expected, check_index_type=False) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py new file mode 100644 index 0000000000000..2fe0c1cc7aac4 --- /dev/null +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -0,0 +1,236 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.mark.parametrize("na_value", [np.nan, None]) +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"]], + { + "c": [13.0, 12.3, 123.23], + "d": [13.0, 233.0, 123.0], + "e": [13.0, 12.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( + na_value, dropna, tuples, outputs +): + # GH 3729 this is to test that NA is in one group + df_list = [ + ["A", "B", 12, 12, 12], + ["A", na_value, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + ["A", "B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + grouped = df.groupby(["a", "b"], dropna=dropna).sum() + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected, check_index_type=False) + + +@pytest.mark.parametrize( + "na_value1, na_value2", [(np.nan, np.nan), (None, None), (np.nan, None)] +) +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]], + { + "c": [12.0, 13.3, 123.23, 1.0], + "d": [12.0, 234.0, 123.0, 1.0], + "e": [12.0, 13.0, 1.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( + na_value1, na_value2, dropna, tuples, outputs +): + # GH 3729 this is to test that NA in different groups with different representations + df_list = [ + ["A", "B", 12, 12, 12], + ["A", na_value1, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + [na_value2, "B", 1, 1, 1.0], + ["A", na_value2, 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + grouped = df.groupby(["a", "b"], dropna=dropna).sum() + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected, check_index_type=False) + + +@pytest.mark.parametrize( + "dropna, idx, outputs", + [ + (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), + ( + False, + ["A", "B", np.nan], + { + "b": [123.23, 13.0, 12.3], + "c": [123.0, 13.0, 233.0], + "d": [1.0, 13.0, 12.0], + }, + ), + ], +) +def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): + # GH 3729 + df_list = [ + ["B", 12, 12, 12], + [None, 12.3, 233.0, 12], + ["A", 123.23, 123, 1], + ["B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) + grouped = df.groupby("a", dropna=dropna).sum() + + expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + + tm.assert_frame_equal(grouped, expected, check_index_type=False) + + +@pytest.mark.parametrize( + "dropna, idx, expected", + [ + (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), + ( + False, + ["a", "a", "b", np.nan], + pd.Series([3, 3, 3], index=["a", "b", np.nan]), + ), + ], +) +def test_groupby_dropna_series_level(dropna, idx, expected): + ser = pd.Series([1, 2, 3, 3], index=idx) + + result = ser.groupby(level=0, dropna=dropna).sum() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dropna, expected", + [ + (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")), + ( + False, + pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"), + ), + ], +) +def test_groupby_dropna_series_by(dropna, expected): + ser = pd.Series( + [390.0, 350.0, 30.0, 20.0], + index=["Falcon", "Falcon", "Parrot", "Parrot"], + name="Max Speed", + ) + + result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"]], + { + "c": [13.0, 12.3, 123.23], + "d": [12.0, 233.0, 123.0], + "e": [1.0, 12.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): + # GH 3729 + df_list = [ + ["A", "B", 12, 12, 12], + ["A", None, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + ["A", "B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + agg_dict = {"c": sum, "d": max, "e": "min"} + grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected, check_index_type=False) + + +@pytest.mark.parametrize( + "na_value1, na_value2", + [ + (np.nan, pd.NaT), + (np.nan, np.nan), + (pd.NaT, pd.NaT), + (pd.NaT, None), + (None, None), + (None, np.nan), + ], +) +@pytest.mark.parametrize( + "dropna, values, indexes", + [ + (True, [12, 3], [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")]), + ( + False, + [12, 3, 6], + [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01"), pd.NaT], + ), + ], +) +def test_groupby_dropna_datetime_data(na_value1, na_value2, dropna, values, indexes): + # 3729 + df = pd.DataFrame( + { + "values": [1, 2, 3, 4, 5, 6], + "dt": [ + pd.Timestamp("2020-01-01"), + na_value1, + pd.Timestamp("2020-02-01"), + na_value2, + pd.Timestamp("2020-01-01"), + pd.Timestamp("2020-01-01"), + ], + } + ) + grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) + expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) + + tm.assert_frame_equal(grouped, expected) From a1d5510d790c98f768a1a5409e803b6ad0b251b7 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Jan 2020 19:17:25 +0100 Subject: [PATCH 14/43] add doc --- doc/source/user_guide/groupby.rst | 22 ++++++++++++++++++++++ doc/source/whatsnew/v1.0.0.rst | 18 ++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 8cd229070e365..8d20532fd893f 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -199,6 +199,28 @@ For example, the groups created by ``groupby()`` below are in the order they app df3.groupby(['X']).get_group('B') +.. _groupby.dropna: + +GroupBy dropna +^^^^^^^^^^^^^^ + +By default ``NA`` values are excluded from group keys during the ``groupby`` operation. However, +in case you want to include ``NA`` values in group keys, you could pass ``dropna=False`` to achieve it. + +.. ipython:: python + + l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + df = pd.DataFrame(l, columns=["a", "b", "c"]) + + df.groupby(by=["b"], dropna=False).sum() + +.. ipython:: python + + ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) + ser.groupby(level=0).sum() + + ser.groupby(level=0, dropna=False).sum() + .. _groupby.attributes: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0164c5fdd2f54..e962db5daf770 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -180,6 +180,20 @@ can yield significant performance gains if the apply function can operate on num the data set is larger (1 million rows or greater). For more details, see :ref:`rolling apply documentation ` (:issue:`28987`) +.. ipython:: python + + l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + df = pd.DataFrame(l, columns=["a", "b", "c"]) + + df.groupby(by=["b"], dropna=False).sum() + +.. ipython:: python + + ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) + ser.groupby(level=0).sum() + + ser.groupby(level=0, dropna=False).sum() + .. _whatsnew_1000.custom_window: Defining custom windows for rolling operations @@ -197,8 +211,8 @@ Allow NaN in groupby key ^^^^^^^^^^^^^^^^^^^^^^^^ We've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to -allow ``NaN`` values in group keys. Users can define ``dropna`` to ``False`` if they want to include -``NaN`` values in groupby keys. The default is set to ``True`` for ``dropna`` to keep backwards +allow ``NA`` values in group keys. Users can define ``dropna`` to ``False`` if they want to include +``NA`` values in groupby keys. The default is set to ``True`` for ``dropna`` to keep backwards compatibility (:issue:`3729`) .. _whatsnew_1000.enhancements.other: From 11ef56aa62de2730001eeb9292011edd4cc0a442 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Jan 2020 19:21:44 +0100 Subject: [PATCH 15/43] move doc --- doc/source/whatsnew/v1.0.0.rst | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e962db5daf770..9402ec59554ea 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -180,19 +180,6 @@ can yield significant performance gains if the apply function can operate on num the data set is larger (1 million rows or greater). For more details, see :ref:`rolling apply documentation ` (:issue:`28987`) -.. ipython:: python - - l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - df = pd.DataFrame(l, columns=["a", "b", "c"]) - - df.groupby(by=["b"], dropna=False).sum() - -.. ipython:: python - - ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) - ser.groupby(level=0).sum() - - ser.groupby(level=0, dropna=False).sum() .. _whatsnew_1000.custom_window: @@ -215,6 +202,21 @@ allow ``NA`` values in group keys. Users can define ``dropna`` to ``False`` if t ``NA`` values in groupby keys. The default is set to ``True`` for ``dropna`` to keep backwards compatibility (:issue:`3729`) +.. ipython:: python + + l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + df = pd.DataFrame(l, columns=["a", "b", "c"]) + + df.groupby(by=["b"], dropna=False).sum() + +.. ipython:: python + + ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) + ser.groupby(level=0).sum() + + ser.groupby(level=0, dropna=False).sum() + + .. _whatsnew_1000.enhancements.other: Other enhancements From b247a8b2c34cd830bc8b4478abf7f60528310709 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Jan 2020 19:22:23 +0100 Subject: [PATCH 16/43] NaN to NA --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9402ec59554ea..b238f17461e65 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -194,7 +194,7 @@ the :ref:`custom window rolling documentation ` .. _whatsnew_1000.groupby_key: -Allow NaN in groupby key +Allow NA in groupby key ^^^^^^^^^^^^^^^^^^^^^^^^ We've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to From d730c4a926cc6e974abb36c9a8a2b74c5f94295b Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Jan 2020 19:47:10 +0100 Subject: [PATCH 17/43] Fix linting --- doc/source/user_guide/groupby.rst | 4 ++-- doc/source/whatsnew/v1.0.0.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 8d20532fd893f..a0e66fe607c96 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -209,8 +209,8 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna .. ipython:: python - l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - df = pd.DataFrame(l, columns=["a", "b", "c"]) + df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + df = pd.DataFrame(df_list, columns=["a", "b", "c"]) df.groupby(by=["b"], dropna=False).sum() diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ebb8a78c029bb..bd8c0e79e0515 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -204,8 +204,8 @@ compatibility (:issue:`3729`) .. ipython:: python - l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - df = pd.DataFrame(l, columns=["a", "b", "c"]) + df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + df = pd.DataFrame(df_list, columns=["a", "b", "c"]) df.groupby(by=["b"], dropna=False).sum() From 42c4934aa743b922179a2ff91d66479e707a6bb3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Jan 2020 20:05:13 +0100 Subject: [PATCH 18/43] fix rst issue --- doc/source/user_guide/groupby.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index a0e66fe607c96..7d24345e62eae 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -210,9 +210,9 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna .. ipython:: python df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - df = pd.DataFrame(df_list, columns=["a", "b", "c"]) + df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) - df.groupby(by=["b"], dropna=False).sum() + df_dropna.groupby(by=["b"], dropna=False).sum() .. ipython:: python From 2ba79b9b0bc6da426122ff1a7f11f65997f7bb81 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 2 Jan 2020 20:05:53 +0100 Subject: [PATCH 19/43] fix rst issue --- doc/source/user_guide/groupby.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 7d24345e62eae..d5ded5d62fe5e 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -216,7 +216,7 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna .. ipython:: python - ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) + ser = pd.Series([1, 2, 3, 3], index=["a", "a", "b", np.nan]) ser.groupby(level=0).sum() ser.groupby(level=0, dropna=False).sum() diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index bd8c0e79e0515..57a7b6dc82969 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -211,7 +211,7 @@ compatibility (:issue:`3729`) .. ipython:: python - ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) + ser = pd.Series([1, 2, 3, 3], index=["a", "a", "b", np.nan]) ser.groupby(level=0).sum() ser.groupby(level=0, dropna=False).sum() From 8b79b6c31931048555c8a8af784fd463d84388ce Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 3 Jan 2020 08:40:21 +0100 Subject: [PATCH 20/43] refactor based on WA review --- pandas/core/algorithms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f977959774eb4..7ac2c68a74797 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -634,9 +634,11 @@ def factorize( uniques, codes = safe_sort( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) - if not dropna and (codes == na_sentinel).any(): + + code_is_na = codes == na_sentinel + if not dropna and code_is_na.any(): uniques = np.append(uniques, [None]) - codes = np.where(codes == na_sentinel, len(uniques) - 1, codes) + codes = np.where(code_is_na, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, dtype, original) From 4ac15e325b2baa853d31a0991abfd180029799c9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 18:41:24 +0100 Subject: [PATCH 21/43] remove blank --- doc/source/whatsnew/v1.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index d35e592e8413e..6597b764581a4 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -227,7 +227,6 @@ See :ref:`here ` for a description. .. _whatsnew_100.enhancements.other: - Other enhancements ~~~~~~~~~~~~~~~~~~ From 4ebbad31a6b1128a6fce5c06f9ccd853dc36520f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 18:59:00 +0100 Subject: [PATCH 22/43] code change on reviews --- pandas/core/algorithms.py | 3 ++- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- pandas/tests/test_algos.py | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index aadef9e474c77..f1cb1759de8e9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -647,7 +647,8 @@ def factorize( code_is_na = codes == na_sentinel if not dropna and code_is_na.any(): - uniques = np.append(uniques, [None]) + na_value = na_value_for_dtype(original.dtype) + uniques = np.append(uniques, [na_value]) codes = np.where(code_is_na, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bc9a89a6d7e8c..cd59db1dcd68b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5845,8 +5845,8 @@ def update( Captive 210.0 Wild 185.0 -We can also choose to include NaN in group keys or not by setting -`dropna` parameter: +We can also choose to include NA in group keys or not by setting +`dropna` parameter, the default setting is `True`: >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) diff --git a/pandas/core/series.py b/pandas/core/series.py index 17b5ec658b6fa..3227d597d26ed 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1638,8 +1638,8 @@ def _set_name(self, name, inplace=False) -> "Series": Wild 185.0 Name: Max Speed, dtype: float64 -We can also choose to include NaN in group keys or not by defining -`dropna` parameter: +We can also choose to include `NA` in group keys or not by defining +`dropna` parameter, the default setting is `True`: >>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) >>> ser.groupby(level=0).sum() diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8bc9fb40f1389..638947f76f9a6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -361,7 +361,7 @@ def test_object_factorize_dropna( codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) - tm.assert_numpy_array_equal(codes, expected_codes, check_dtype=False) + tm.assert_numpy_array_equal(codes, expected_codes) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", @@ -396,7 +396,7 @@ def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniqu codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) - tm.assert_numpy_array_equal(codes, expected_codes, check_dtype=False) + tm.assert_numpy_array_equal(codes, expected_codes) class TestUnique: From f141b80adb3cb217b3668707a2792832e8ea0bc0 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 19:00:22 +0100 Subject: [PATCH 23/43] use pd.testing --- pandas/tests/groupby/test_groupby_dropna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 2fe0c1cc7aac4..1ebf9a4de76de 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas.testing as tm @pytest.mark.parametrize("na_value", [np.nan, None]) From 23ad19bd9b5a8dcd81add0fd2eddb163e4fe5d67 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 19:02:31 +0100 Subject: [PATCH 24/43] linting --- doc/source/user_guide/groupby.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 0b9a3fe66aa8c..f47ed9d2b4a85 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -214,7 +214,7 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna df_dropna.groupby(by=["b"], dropna=False).sum() - df_dropna.groupby(by=["b"], dropna=False).sum() + df_dropna.groupby(by=["b"], dropna=True).sum() The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c0f7e3e27af75..c79e01dffc324 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -50,12 +50,12 @@ compatibility (:issue:`3729`) .. ipython:: python df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - df = pd.DataFrame(df_list, columns=["a", "b", "c"]) - - df.groupby(by=["b"], dropna=False).sum() + df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) df_dropna.groupby(by=["b"], dropna=False).sum() + df_dropna.groupby(by=["b"], dropna=True).sum() + The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. .. versionadded:: 1.1.0 From bafc4a5bb69c56ff22b4ecebfc037446202401fe Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 19:44:44 +0100 Subject: [PATCH 25/43] fixup --- pandas/core/algorithms.py | 6 +- pandas/tests/groupby/test_groupby_dropna.py | 86 +++++++++++++++++++++ 2 files changed, 89 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f1cb1759de8e9..3089fd13ed6d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -182,7 +182,8 @@ def _reconstruct_data(values, dtype, original): ------- Index for extension types, otherwise ndarray casted to dtype """ - + print(dtype) + print(values) if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) elif is_bool_dtype(dtype): @@ -647,8 +648,7 @@ def factorize( code_is_na = codes == na_sentinel if not dropna and code_is_na.any(): - na_value = na_value_for_dtype(original.dtype) - uniques = np.append(uniques, [na_value]) + uniques = np.append(uniques, [None]) codes = np.where(code_is_na, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1ebf9a4de76de..dc95a41b58bc7 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -234,3 +234,89 @@ def test_groupby_dropna_datetime_data(na_value1, na_value2, dropna, values, inde expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "na_value1, na_value2", + [ + (np.nan, pd.NaT), + (np.nan, np.nan), + (pd.NaT, pd.NaT), + (pd.NaT, None), + (None, None), + (None, np.nan), + ], +) +@pytest.mark.parametrize( + "dropna, values, indexes", + [ + (True, [3, 12], [pd.Timedelta("-2 days"), pd.Timedelta("-1 days")]), + ( + False, + [3, 12, 6], + [pd.Timedelta("-2 days"), pd.Timedelta("-1 days"), pd.NaT], + ), + ], +) +def test_groupby_dropna_timedelta_data(na_value1, na_value2, dropna, values, indexes): + # 3729 + df = pd.DataFrame( + { + "values": [1, 2, 3, 4, 5, 6], + "dt": [ + pd.Timedelta("-1 days"), + na_value1, + pd.Timedelta("-2 days"), + na_value2, + pd.Timedelta("-1 days"), + pd.Timedelta("-1 days"), + ], + } + ) + grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) + expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) + + tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "na_value1, na_value2", + [ + (np.nan, pd.NaT), + (np.nan, np.nan), + (pd.NaT, pd.NaT), + (pd.NaT, None), + (None, None), + (None, np.nan), + ], +) +@pytest.mark.parametrize( + "dropna, values, indexes", + [ + (True, [12, 3], [pd.Period("2020-01-01"), pd.Period("2020-02-01")]), + ( + False, + [12, 3, 6], + [pd.Period("2020-01-01"), pd.Period("2020-02-01"), pd.NaT], + ), + ], +) +def test_groupby_dropna_period_data(na_value1, na_value2, dropna, values, indexes): + # 3729 + df = pd.DataFrame( + { + "values": [1, 2, 3, 4, 5, 6], + "dt": [ + pd.Period("2020-01-01"), + na_value1, + pd.Period("2020-02-01"), + na_value2, + pd.Period("2020-01-01"), + pd.Period("2020-01-01"), + ], + } + ) + grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) + expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) + + tm.assert_frame_equal(grouped, expected) From c98bafe74bda50b707785e954a96af5dc7814e5c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 20:20:10 +0100 Subject: [PATCH 26/43] fixup --- pandas/core/algorithms.py | 2 -- pandas/tests/test_algos.py | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3089fd13ed6d8..e9a67c3deb37d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -182,8 +182,6 @@ def _reconstruct_data(values, dtype, original): ------- Index for extension types, otherwise ndarray casted to dtype """ - print(dtype) - print(values) if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) elif is_bool_dtype(dtype): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 638947f76f9a6..b99b17c410737 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,6 +326,10 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): else: tm.assert_extension_array_equal(uniques, expected_uniques) + @pytest.mark.xfail( + compat.is_platform_windows(), + reason="Windows will be coelced to int32 other than int64", + ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ @@ -363,6 +367,10 @@ def test_object_factorize_dropna( tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) + @pytest.mark.xfail( + compat.is_platform_windows(), + reason="Windows will be coelced to int32 other than int64", + ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ From 86a5958f362af30714267153716b6070b9f1fad1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 21:30:39 +0100 Subject: [PATCH 27/43] doc --- pandas/core/groupby/grouper.py | 2 +- pandas/tests/test_algos.py | 24 ++++++++---------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2db3ff8e5e572..1354f046b4dd2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -102,7 +102,7 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) def __init__( - self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True + self, key=None, level=None, freq=None, axis=0, sort=False, dropna: bool = True ): self.key = key self.level = level diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b99b17c410737..56345bc9bf58f 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,35 +326,31 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): else: tm.assert_extension_array_equal(uniques, expected_uniques) - @pytest.mark.xfail( - compat.is_platform_windows(), - reason="Windows will be coelced to int32 other than int64", - ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ ( ["a", None, "b", "a"], True, - np.array([0, -1, 1, 0]), + np.array([0, -1, 1, 0], dtype="int64"), np.array(["a", "b"], dtype=object), ), ( ["a", np.nan, "b", "a"], True, - np.array([0, -1, 1, 0]), + np.array([0, -1, 1, 0], dtype="int64"), np.array(["a", "b"], dtype=object), ), ( ["a", None, "b", "a"], False, - np.array([0, 2, 1, 0]), + np.array([0, 2, 1, 0], dtype="int64"), np.array(["a", "b", np.nan], dtype=object), ), ( ["a", np.nan, "b", "a"], False, - np.array([0, 2, 1, 0]), + np.array([0, 2, 1, 0], dtype="int64"), np.array(["a", "b", np.nan], dtype=object), ), ], @@ -367,35 +363,31 @@ def test_object_factorize_dropna( tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) - @pytest.mark.xfail( - compat.is_platform_windows(), - reason="Windows will be coelced to int32 other than int64", - ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ ( [1, None, 1, 2], True, - np.array([0, -1, 0, 1]), + np.array([0, -1, 0, 1], dtype="int64"), np.array([1, 2], dtype="O"), ), ( [1, np.nan, 1, 2], True, - np.array([0, -1, 0, 1]), + np.array([0, -1, 0, 1], dtype="int64"), np.array([1, 2], dtype=np.float64), ), ( [1, None, 1, 2], False, - np.array([0, 2, 0, 1]), + np.array([0, 2, 0, 1], dtype="int64"), np.array([1, 2, np.nan], dtype="O"), ), ( [1, np.nan, 1, 2], False, - np.array([0, 2, 0, 1]), + np.array([0, 2, 0, 1], dtype="int64"), np.array([1, 2, np.nan], dtype=np.float64), ), ], From 6cf31d7078fdb9a0b639b66b98c348b2ccc4a7f5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 21:51:06 +0100 Subject: [PATCH 28/43] validation --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 1354f046b4dd2..2db3ff8e5e572 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -102,7 +102,7 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) def __init__( - self, key=None, level=None, freq=None, axis=0, sort=False, dropna: bool = True + self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True ): self.key = key self.level = level From 2b77f3761db1afcd76a483b96eefe9e255f544fb Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 10 Feb 2020 22:40:22 +0100 Subject: [PATCH 29/43] xfail windows --- pandas/tests/test_algos.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 56345bc9bf58f..b99b17c410737 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,31 +326,35 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): else: tm.assert_extension_array_equal(uniques, expected_uniques) + @pytest.mark.xfail( + compat.is_platform_windows(), + reason="Windows will be coelced to int32 other than int64", + ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ ( ["a", None, "b", "a"], True, - np.array([0, -1, 1, 0], dtype="int64"), + np.array([0, -1, 1, 0]), np.array(["a", "b"], dtype=object), ), ( ["a", np.nan, "b", "a"], True, - np.array([0, -1, 1, 0], dtype="int64"), + np.array([0, -1, 1, 0]), np.array(["a", "b"], dtype=object), ), ( ["a", None, "b", "a"], False, - np.array([0, 2, 1, 0], dtype="int64"), + np.array([0, 2, 1, 0]), np.array(["a", "b", np.nan], dtype=object), ), ( ["a", np.nan, "b", "a"], False, - np.array([0, 2, 1, 0], dtype="int64"), + np.array([0, 2, 1, 0]), np.array(["a", "b", np.nan], dtype=object), ), ], @@ -363,31 +367,35 @@ def test_object_factorize_dropna( tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) + @pytest.mark.xfail( + compat.is_platform_windows(), + reason="Windows will be coelced to int32 other than int64", + ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ ( [1, None, 1, 2], True, - np.array([0, -1, 0, 1], dtype="int64"), + np.array([0, -1, 0, 1]), np.array([1, 2], dtype="O"), ), ( [1, np.nan, 1, 2], True, - np.array([0, -1, 0, 1], dtype="int64"), + np.array([0, -1, 0, 1]), np.array([1, 2], dtype=np.float64), ), ( [1, None, 1, 2], False, - np.array([0, 2, 0, 1], dtype="int64"), + np.array([0, 2, 0, 1]), np.array([1, 2, np.nan], dtype="O"), ), ( [1, np.nan, 1, 2], False, - np.array([0, 2, 0, 1], dtype="int64"), + np.array([0, 2, 0, 1]), np.array([1, 2, np.nan], dtype=np.float64), ), ], From 1089b1815dcb33988d23e6d2ac638d8100ad763a Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 22 Feb 2020 15:28:14 +0100 Subject: [PATCH 30/43] fixup based on WA review --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/algorithms.py | 5 ++++- pandas/core/generic.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5ae13b7a71af3..06e6bdf0f6a40 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -37,7 +37,7 @@ For example: ser.loc["May 2015"] -.. _whatsnew_1000.groupby_key: +.. _whatsnew_110.groupby_key: Allow NA in groupby key ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 85bf899565d63..c78bceab0f8a6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -639,7 +639,10 @@ def factorize( code_is_na = codes == na_sentinel if not dropna and code_is_na.any(): - uniques = np.append(uniques, [None]) + # na_value is set based on the dtype of uniques, and compat set to False is + # because we do not want na_value to be 0 for integers + na_value = na_value_for_dtype(uniques.dtype, compat=False) + uniques = np.append(uniques, [na_value]) codes = np.where(code_is_na, len(uniques) - 1, codes) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7919b81a1fe73..2248f91568592 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7355,7 +7355,7 @@ def clip( with row/column will be dropped. If False, NA values will also be treated as the key in groups - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 Returns ------- From 3f360a997a6e924c754906a6ca34b217bfe0c45f Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Apr 2020 23:07:46 +0200 Subject: [PATCH 31/43] reduce tests --- pandas/tests/groupby/test_groupby_dropna.py | 111 +++----------------- 1 file changed, 13 insertions(+), 98 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index dc95a41b58bc7..8bdcad2e25807 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -205,117 +205,32 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): ], ) @pytest.mark.parametrize( - "dropna, values, indexes", + "datetime1, datetime2", [ - (True, [12, 3], [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")]), - ( - False, - [12, 3, 6], - [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01"), pd.NaT], - ), - ], -) -def test_groupby_dropna_datetime_data(na_value1, na_value2, dropna, values, indexes): - # 3729 - df = pd.DataFrame( - { - "values": [1, 2, 3, 4, 5, 6], - "dt": [ - pd.Timestamp("2020-01-01"), - na_value1, - pd.Timestamp("2020-02-01"), - na_value2, - pd.Timestamp("2020-01-01"), - pd.Timestamp("2020-01-01"), - ], - } - ) - grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) - expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) - - tm.assert_frame_equal(grouped, expected) - - -@pytest.mark.parametrize( - "na_value1, na_value2", - [ - (np.nan, pd.NaT), - (np.nan, np.nan), - (pd.NaT, pd.NaT), - (pd.NaT, None), - (None, None), - (None, np.nan), + (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")), + (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")), + (pd.Period("2020-01-01"), pd.Period("2020-02-01")), ], ) @pytest.mark.parametrize( - "dropna, values, indexes", - [ - (True, [3, 12], [pd.Timedelta("-2 days"), pd.Timedelta("-1 days")]), - ( - False, - [3, 12, 6], - [pd.Timedelta("-2 days"), pd.Timedelta("-1 days"), pd.NaT], - ), - ], + "dropna, values", [(True, [12, 3]), (False, [12, 3, 6],),], ) -def test_groupby_dropna_timedelta_data(na_value1, na_value2, dropna, values, indexes): +def test_groupby_dropna_datetime_like_data( + na_value1, na_value2, dropna, values, datetime1, datetime2 +): # 3729 df = pd.DataFrame( { "values": [1, 2, 3, 4, 5, 6], - "dt": [ - pd.Timedelta("-1 days"), - na_value1, - pd.Timedelta("-2 days"), - na_value2, - pd.Timedelta("-1 days"), - pd.Timedelta("-1 days"), - ], + "dt": [datetime1, na_value1, datetime2, na_value2, datetime1, datetime1], } ) - grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) - expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) - - tm.assert_frame_equal(grouped, expected) + if dropna: + indexes = [datetime1, datetime2] + else: + indexes = [datetime1, datetime2, pd.NaT] -@pytest.mark.parametrize( - "na_value1, na_value2", - [ - (np.nan, pd.NaT), - (np.nan, np.nan), - (pd.NaT, pd.NaT), - (pd.NaT, None), - (None, None), - (None, np.nan), - ], -) -@pytest.mark.parametrize( - "dropna, values, indexes", - [ - (True, [12, 3], [pd.Period("2020-01-01"), pd.Period("2020-02-01")]), - ( - False, - [12, 3, 6], - [pd.Period("2020-01-01"), pd.Period("2020-02-01"), pd.NaT], - ), - ], -) -def test_groupby_dropna_period_data(na_value1, na_value2, dropna, values, indexes): - # 3729 - df = pd.DataFrame( - { - "values": [1, 2, 3, 4, 5, 6], - "dt": [ - pd.Period("2020-01-01"), - na_value1, - pd.Period("2020-02-01"), - na_value2, - pd.Period("2020-01-01"), - pd.Period("2020-01-01"), - ], - } - ) grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) From 5cabe4b71215f9a706f860776d360b759fab0946 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 7 Apr 2020 23:10:04 +0200 Subject: [PATCH 32/43] fix pep8 --- pandas/tests/groupby/test_groupby_dropna.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 8bdcad2e25807..98af0ebb20a5b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -213,7 +213,7 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): ], ) @pytest.mark.parametrize( - "dropna, values", [(True, [12, 3]), (False, [12, 3, 6],),], + "dropna, values", [(True, [12, 3]), (False, [12, 3, 6],)], ) def test_groupby_dropna_datetime_like_data( na_value1, na_value2, dropna, values, datetime1, datetime2 From 6c126c78abce67f0af68d829cf37498b3557f482 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 09:30:27 +0200 Subject: [PATCH 33/43] rebase and docs fixes --- doc/source/user_guide/groupby.rst | 10 ++++++---- doc/source/whatsnew/v1.1.0.rst | 7 ++++--- pandas/core/groupby/groupby.py | 1 - pandas/core/groupby/grouper.py | 1 - 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 98bcc14903b2c..d281f405caa20 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -199,6 +199,8 @@ For example, the groups created by ``groupby()`` below are in the order they app df3.groupby(['X']).get_group('B') +.. versionadded:: 1.1.0 + .. _groupby.dropna: GroupBy dropna @@ -212,13 +214,13 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) - df_dropna.groupby(by=["b"], dropna=False).sum() - + # Default `dropna` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() -The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. + # In order to allow NaN in keys, set `dropna` to False + df_dropna.groupby(by=["b"], dropna=False).sum() -.. versionadded:: 1.1.0 +The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. .. _groupby.attributes: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6560140937e31..3285730f6ad96 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -52,10 +52,12 @@ compatibility (:issue:`3729`) df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) - df_dropna.groupby(by=["b"], dropna=False).sum() - + # Default `dropna` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() + # In order to allow NaN in keys, set `dropna` to False + df_dropna.groupby(by=["b"], dropna=False).sum() + The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. .. versionadded:: 1.1.0 @@ -99,7 +101,6 @@ For example: pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z', utc=True) pd.to_datetime(tz_strs, format='%Y-%m-%d %H:%M:%S %z') - .. _whatsnew_110.enhancements.other: Other enhancements diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4404ab31b7c60..4380fd444f97a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -399,7 +399,6 @@ def __init__( self.squeeze = squeeze self.observed = observed self.mutated = mutated - self.dropna = dropna if grouper is None: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f42645d74abfa..0314045c964d1 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -299,7 +299,6 @@ def __init__( self.obj = obj self.observed = observed self.in_axis = in_axis - self.dropna = dropna # right place for this? From 6d61d6a4c9efc3d8c3df909e6b49f80918752de4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 09:32:00 +0200 Subject: [PATCH 34/43] fixup doc --- doc/source/user_guide/groupby.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index d281f405caa20..6e3a39ae41412 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -199,10 +199,10 @@ For example, the groups created by ``groupby()`` below are in the order they app df3.groupby(['X']).get_group('B') -.. versionadded:: 1.1.0 - .. _groupby.dropna: +.. versionadded:: 1.1.0 + GroupBy dropna ^^^^^^^^^^^^^^ From 3630e8b203a6dc72797e332acfd7e14bf5b1a821 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 10:36:41 +0200 Subject: [PATCH 35/43] remove inferred type --- pandas/tests/groupby/test_groupby_dropna.py | 41 +++++++++++++-------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 98af0ebb20a5b..6482c7e1cf453 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -5,7 +5,6 @@ import pandas.testing as tm -@pytest.mark.parametrize("na_value", [np.nan, None]) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -26,12 +25,12 @@ ], ) def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( - na_value, dropna, tuples, outputs + dropna, tuples, outputs, nulls_fixture ): # GH 3729 this is to test that NA is in one group df_list = [ ["A", "B", 12, 12, 12], - ["A", na_value, 12.3, 233.0, 12], + ["A", nulls_fixture, 12.3, 233.0, 12], ["B", "A", 123.23, 123, 1], ["A", "B", 1, 1, 1.0], ] @@ -39,14 +38,16 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( grouped = df.groupby(["a", "b"], dropna=dropna).sum() mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + + # Since right now, by default MI will drop NA from levels, so we need + # to set NA for level manually afterwards. + if not dropna: + mi = mi.set_levels(["A", "B", np.nan], level="b") expected = pd.DataFrame(outputs, index=mi) - tm.assert_frame_equal(grouped, expected, check_index_type=False) + tm.assert_frame_equal(grouped, expected) -@pytest.mark.parametrize( - "na_value1, na_value2", [(np.nan, np.nan), (None, None), (np.nan, None)] -) @pytest.mark.parametrize( "dropna, tuples, outputs", [ @@ -67,23 +68,28 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( ], ) def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( - na_value1, na_value2, dropna, tuples, outputs + dropna, tuples, outputs, nulls_fixture, nulls_fixture2 ): # GH 3729 this is to test that NA in different groups with different representations df_list = [ ["A", "B", 12, 12, 12], - ["A", na_value1, 12.3, 233.0, 12], + ["A", nulls_fixture, 12.3, 233.0, 12], ["B", "A", 123.23, 123, 1], - [na_value2, "B", 1, 1, 1.0], - ["A", na_value2, 1, 1, 1.0], + [nulls_fixture2, "B", 1, 1, 1.0], + ["A", nulls_fixture2, 1, 1, 1.0], ] df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) grouped = df.groupby(["a", "b"], dropna=dropna).sum() mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + + # Since right now, by default MI will drop NA from levels, so we need + # to set NA for level manually afterwards. + if not dropna: + mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]]) expected = pd.DataFrame(outputs, index=mi) - tm.assert_frame_equal(grouped, expected, check_index_type=False) + tm.assert_frame_equal(grouped, expected) @pytest.mark.parametrize( @@ -114,7 +120,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) - tm.assert_frame_equal(grouped, expected, check_index_type=False) + tm.assert_frame_equal(grouped, expected) @pytest.mark.parametrize( @@ -188,9 +194,14 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + + # Since right now, by default MI will drop NA from levels, so we need + # to set NA for level manually afterwards. + if not dropna: + mi = mi.set_levels(["A", "B", np.nan], level="b") expected = pd.DataFrame(outputs, index=mi) - tm.assert_frame_equal(grouped, expected, check_index_type=False) + tm.assert_frame_equal(grouped, expected) @pytest.mark.parametrize( @@ -229,7 +240,7 @@ def test_groupby_dropna_datetime_like_data( if dropna: indexes = [datetime1, datetime2] else: - indexes = [datetime1, datetime2, pd.NaT] + indexes = [datetime1, datetime2, np.nan] grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) From 1cec7f1f05c64dd432e22d571c75228cf608d559 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 10:38:50 +0200 Subject: [PATCH 36/43] better comment --- pandas/tests/groupby/test_groupby_dropna.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 6482c7e1cf453..38dd8a67d5020 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -39,8 +39,8 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) - # Since right now, by default MI will drop NA from levels, so we need - # to set NA for level manually afterwards. + # Since right now, by default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. if not dropna: mi = mi.set_levels(["A", "B", np.nan], level="b") expected = pd.DataFrame(outputs, index=mi) @@ -83,8 +83,8 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) - # Since right now, by default MI will drop NA from levels, so we need - # to set NA for level manually afterwards. + # Since right now, by default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. if not dropna: mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]]) expected = pd.DataFrame(outputs, index=mi) @@ -195,8 +195,8 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) - # Since right now, by default MI will drop NA from levels, so we need - # to set NA for level manually afterwards. + # Since right now, by default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. if not dropna: mi = mi.set_levels(["A", "B", np.nan], level="b") expected = pd.DataFrame(outputs, index=mi) From 1a1bb49e71b74d7b3b463b535b74f780fcd1e58e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 11:07:53 +0200 Subject: [PATCH 37/43] remove xfail --- pandas/tests/test_algos.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index bfa3f000c8074..e2fde8995f75d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,10 +326,6 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): else: tm.assert_extension_array_equal(uniques, expected_uniques) - @pytest.mark.xfail( - compat.is_platform_windows(), - reason="Windows will be coelced to int32 other than int64", - ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ @@ -367,10 +363,6 @@ def test_object_factorize_dropna( tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) - @pytest.mark.xfail( - compat.is_platform_windows(), - reason="Windows will be coelced to int32 other than int64", - ) @pytest.mark.parametrize( "data, dropna, expected_codes, expected_uniques", [ From 7ea2e797182b4b85bc92f42cda9623f4948bff9d Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 11:11:32 +0200 Subject: [PATCH 38/43] use fixture --- pandas/tests/groupby/test_groupby_dropna.py | 22 +++++++++------------ 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 38dd8a67d5020..1a525d306e9f5 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -204,17 +204,6 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): tm.assert_frame_equal(grouped, expected) -@pytest.mark.parametrize( - "na_value1, na_value2", - [ - (np.nan, pd.NaT), - (np.nan, np.nan), - (pd.NaT, pd.NaT), - (pd.NaT, None), - (None, None), - (None, np.nan), - ], -) @pytest.mark.parametrize( "datetime1, datetime2", [ @@ -227,13 +216,20 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): "dropna, values", [(True, [12, 3]), (False, [12, 3, 6],)], ) def test_groupby_dropna_datetime_like_data( - na_value1, na_value2, dropna, values, datetime1, datetime2 + dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2 ): # 3729 df = pd.DataFrame( { "values": [1, 2, 3, 4, 5, 6], - "dt": [datetime1, na_value1, datetime2, na_value2, datetime1, datetime1], + "dt": [ + datetime1, + unique_nulls_fixture, + datetime2, + unique_nulls_fixture2, + datetime1, + datetime1, + ], } ) From 13b1e9a5c67694e6dd835545e8aa0f6a58f1ea12 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 11:38:01 +0200 Subject: [PATCH 39/43] coelse type for windows build --- pandas/tests/test_algos.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index e2fde8995f75d..ce4543cb6bc9d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -361,6 +361,10 @@ def test_object_factorize_dropna( codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) + + # On windows builds, integer arrays are of type int32 + if compat.is_platform_windows(): + expected_codes = expected_codes.astype("int32") tm.assert_numpy_array_equal(codes, expected_codes) @pytest.mark.parametrize( @@ -396,6 +400,10 @@ def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniqu codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) + + # On windows builds, integer arrays are of type int32 + if compat.is_platform_windows(): + expected_codes = expected_codes.astype("int32") tm.assert_numpy_array_equal(codes, expected_codes) From 92a7eedc7d3d5b9d7a0aaac80ed7b642d80d5292 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 12:16:41 +0200 Subject: [PATCH 40/43] fixup --- pandas/tests/test_algos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ce4543cb6bc9d..2b03a54f0d9ff 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -364,7 +364,7 @@ def test_object_factorize_dropna( # On windows builds, integer arrays are of type int32 if compat.is_platform_windows(): - expected_codes = expected_codes.astype("int32") + expected_codes = expected_codes.astype("int64") tm.assert_numpy_array_equal(codes, expected_codes) @pytest.mark.parametrize( @@ -403,7 +403,7 @@ def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniqu # On windows builds, integer arrays are of type int32 if compat.is_platform_windows(): - expected_codes = expected_codes.astype("int32") + expected_codes = expected_codes.astype("int64") tm.assert_numpy_array_equal(codes, expected_codes) From 1315a9d0a1ebd092aa3e1c9fe0d82e6570b45d73 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 11 Apr 2020 12:19:06 +0200 Subject: [PATCH 41/43] fixup --- pandas/tests/test_algos.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 2b03a54f0d9ff..2538d42b270fd 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -332,25 +332,25 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): ( ["a", None, "b", "a"], True, - np.array([0, -1, 1, 0]), + np.array([0, -1, 1, 0], dtype=np.int64), np.array(["a", "b"], dtype=object), ), ( ["a", np.nan, "b", "a"], True, - np.array([0, -1, 1, 0]), + np.array([0, -1, 1, 0], dtype=np.int64), np.array(["a", "b"], dtype=object), ), ( ["a", None, "b", "a"], False, - np.array([0, 2, 1, 0]), + np.array([0, 2, 1, 0], dtype=np.int64), np.array(["a", "b", np.nan], dtype=object), ), ( ["a", np.nan, "b", "a"], False, - np.array([0, 2, 1, 0]), + np.array([0, 2, 1, 0], dtype=np.int64), np.array(["a", "b", np.nan], dtype=object), ), ], @@ -361,10 +361,6 @@ def test_object_factorize_dropna( codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) - - # On windows builds, integer arrays are of type int32 - if compat.is_platform_windows(): - expected_codes = expected_codes.astype("int64") tm.assert_numpy_array_equal(codes, expected_codes) @pytest.mark.parametrize( @@ -373,25 +369,25 @@ def test_object_factorize_dropna( ( [1, None, 1, 2], True, - np.array([0, -1, 0, 1]), + np.array([0, -1, 0, 1], dtype=np.int64), np.array([1, 2], dtype="O"), ), ( [1, np.nan, 1, 2], True, - np.array([0, -1, 0, 1]), + np.array([0, -1, 0, 1], dtype=np.int64), np.array([1, 2], dtype=np.float64), ), ( [1, None, 1, 2], False, - np.array([0, 2, 0, 1]), + np.array([0, 2, 0, 1], dtype=np.int64), np.array([1, 2, np.nan], dtype="O"), ), ( [1, np.nan, 1, 2], False, - np.array([0, 2, 0, 1]), + np.array([0, 2, 0, 1], dtype=np.int64), np.array([1, 2, np.nan], dtype=np.float64), ), ], @@ -400,10 +396,6 @@ def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniqu codes, uniques = algos.factorize(data, dropna=dropna) tm.assert_numpy_array_equal(uniques, expected_uniques) - - # On windows builds, integer arrays are of type int32 - if compat.is_platform_windows(): - expected_codes = expected_codes.astype("int64") tm.assert_numpy_array_equal(codes, expected_codes) From ffbae76ca431575ce88dd17bfc30c8830f0962d8 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 27 Apr 2020 08:54:41 +0200 Subject: [PATCH 42/43] Doc fixup --- doc/source/user_guide/groupby.rst | 4 ++++ doc/source/whatsnew/v1.1.0.rst | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 23cbb29f5d013..ddba3dc452e28 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -214,6 +214,10 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) + df_dropna + +.. ipython:: python + # Default `dropna` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9977a50ae02d1..81da97336e586 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -42,7 +42,7 @@ For example: Allow NA in groupby key ^^^^^^^^^^^^^^^^^^^^^^^^ -We've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to +With :ref:`groupby ` , we've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to allow ``NA`` values in group keys. Users can define ``dropna`` to ``False`` if they want to include ``NA`` values in groupby keys. The default is set to ``True`` for ``dropna`` to keep backwards compatibility (:issue:`3729`) @@ -52,6 +52,10 @@ compatibility (:issue:`3729`) df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) + df_dropna + +.. ipython:: python + # Default `dropna` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() From 4ea6aa019fdfe76f6ca3445d12aaf0cace9e0b35 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 7 May 2020 09:08:18 +0200 Subject: [PATCH 43/43] try merge master again --- doc/source/whatsnew/v1.1.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c74e2c040e1c1..55af0b218a2c7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -114,7 +114,6 @@ When applied to a `DataFrame`, they key is applied per-column to all columns or For more details, see examples and documentation in :meth:`DataFrame.sort_values`, :meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`. - .. _whatsnew_110.timestamp_fold_support: Fold argument support in Timestamp constructor