From b34fc3b92555190f1bc77d56d37e25a0e02566b2 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Sat, 19 Oct 2019 13:55:01 -0500 Subject: [PATCH 001/112] Replacing list concatenation with "+" --- pandas/tests/dtypes/test_common.py | 36 +++++++++++++++++------------- setup.cfg | 2 -- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 466b724f98770..726a177ee587c 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -324,11 +324,13 @@ def test_is_datetimelike(): @pytest.mark.parametrize( "dtype", - [pd.Series([1, 2])] - + ALL_INT_DTYPES - + to_numpy_dtypes(ALL_INT_DTYPES) - + ALL_EA_INT_DTYPES - + to_ea_dtypes(ALL_EA_INT_DTYPES), + [ + type(pd.Series([1, 2])), + *ALL_INT_DTYPES, + *to_numpy_dtypes(ALL_INT_DTYPES), + *ALL_EA_INT_DTYPES, + *to_ea_dtypes(ALL_EA_INT_DTYPES) + ] ) def test_is_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @@ -352,11 +354,13 @@ def test_is_not_integer_dtype(dtype): @pytest.mark.parametrize( "dtype", - [pd.Series([1, 2])] - + SIGNED_INT_DTYPES - + to_numpy_dtypes(SIGNED_INT_DTYPES) - + SIGNED_EA_INT_DTYPES - + to_ea_dtypes(SIGNED_EA_INT_DTYPES), + [ + type(pd.Series([1, 2])), + *SIGNED_INT_DTYPES, + *to_numpy_dtypes(SIGNED_INT_DTYPES), + *SIGNED_EA_INT_DTYPES, + *to_ea_dtypes(SIGNED_EA_INT_DTYPES) + ] ) def test_is_signed_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @@ -384,11 +388,13 @@ def test_is_not_signed_integer_dtype(dtype): @pytest.mark.parametrize( "dtype", - [pd.Series([1, 2], dtype=np.uint32)] - + UNSIGNED_INT_DTYPES - + to_numpy_dtypes(UNSIGNED_INT_DTYPES) - + UNSIGNED_EA_INT_DTYPES - + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), + [ + type(pd.Series([1, 2], dtype=np.uint32)), + *UNSIGNED_INT_DTYPES, + *to_numpy_dtypes(UNSIGNED_INT_DTYPES), + *UNSIGNED_EA_INT_DTYPES, + *to_ea_dtypes(UNSIGNED_EA_INT_DTYPES) + ] ) def test_is_unsigned_integer_dtype(dtype): assert com.is_unsigned_integer_dtype(dtype) diff --git a/setup.cfg b/setup.cfg index 3562ece5acad3..0674e68a101c4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -145,8 +145,6 @@ ignore_errors=True [mypy-pandas.tests.arrays.test_period] ignore_errors=True -[mypy-pandas.tests.dtypes.test_common] -ignore_errors=True [mypy-pandas.tests.dtypes.test_inference] ignore_errors=True From a266c6921f2d1ae924254e52a7ecd7d261af91e5 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Tue, 22 Oct 2019 18:56:41 -0500 Subject: [PATCH 002/112] Update test_common.py Apply black formatting to test_common.py --- pandas/tests/dtypes/test_common.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 726a177ee587c..edd0340b7ab5f 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -329,8 +329,8 @@ def test_is_datetimelike(): *ALL_INT_DTYPES, *to_numpy_dtypes(ALL_INT_DTYPES), *ALL_EA_INT_DTYPES, - *to_ea_dtypes(ALL_EA_INT_DTYPES) - ] + *to_ea_dtypes(ALL_EA_INT_DTYPES), + ], ) def test_is_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @@ -359,8 +359,8 @@ def test_is_not_integer_dtype(dtype): *SIGNED_INT_DTYPES, *to_numpy_dtypes(SIGNED_INT_DTYPES), *SIGNED_EA_INT_DTYPES, - *to_ea_dtypes(SIGNED_EA_INT_DTYPES) - ] + *to_ea_dtypes(SIGNED_EA_INT_DTYPES), + ], ) def test_is_signed_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @@ -393,8 +393,8 @@ def test_is_not_signed_integer_dtype(dtype): *UNSIGNED_INT_DTYPES, 
*to_numpy_dtypes(UNSIGNED_INT_DTYPES), *UNSIGNED_EA_INT_DTYPES, - *to_ea_dtypes(UNSIGNED_EA_INT_DTYPES) - ] + *to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), + ], ) def test_is_unsigned_integer_dtype(dtype): assert com.is_unsigned_integer_dtype(dtype) From 4d27ef2e8bfb61deb525ce2e2877394ab29df43f Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Fri, 25 Oct 2019 19:42:31 -0500 Subject: [PATCH 003/112] Update test_common.py Removed type(pd.Series[...]) --- pandas/tests/dtypes/test_common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index edd0340b7ab5f..cb9572d6a1c0d 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -325,7 +325,7 @@ def test_is_datetimelike(): @pytest.mark.parametrize( "dtype", [ - type(pd.Series([1, 2])), + pd.Series([1, 2]), *ALL_INT_DTYPES, *to_numpy_dtypes(ALL_INT_DTYPES), *ALL_EA_INT_DTYPES, @@ -355,7 +355,7 @@ def test_is_not_integer_dtype(dtype): @pytest.mark.parametrize( "dtype", [ - type(pd.Series([1, 2])), + pd.Series([1, 2]), *SIGNED_INT_DTYPES, *to_numpy_dtypes(SIGNED_INT_DTYPES), *SIGNED_EA_INT_DTYPES, @@ -389,7 +389,7 @@ def test_is_not_signed_integer_dtype(dtype): @pytest.mark.parametrize( "dtype", [ - type(pd.Series([1, 2], dtype=np.uint32)), + pd.Series([1, 2], dtype=np.uint32), *UNSIGNED_INT_DTYPES, *to_numpy_dtypes(UNSIGNED_INT_DTYPES), *UNSIGNED_EA_INT_DTYPES, From c208bbec85a2d23658e81b02f5658187168bf73e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Oct 2019 05:37:47 -0700 Subject: [PATCH 004/112] BUG: Fix TypeError in _cython_agg_blocks (#29035) --- pandas/core/groupby/generic.py | 21 +++++++++++++++++++-- pandas/core/groupby/groupby.py | 15 ++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8cd727e744519..8e53972c95275 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -970,6 +970,11 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # call our grouper again with only this block obj = self.obj[data.items[locs]] + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] + s = groupby(obj, self.grouper) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) @@ -978,17 +983,29 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): # continue and exclude the block deleted_items.append(locs) continue + + # unwrap DataFrame to get array + assert len(result._data.blocks) == 1 + result = result._data.blocks[0].values + if result.ndim == 1 and isinstance(result, np.ndarray): + result = result.reshape(1, -1) + finally: + assert not isinstance(result, DataFrame) + if result is not no_result: # see if we can cast the block back to the original dtype result = maybe_downcast_numeric(result, block.dtype) - if result.ndim == 1 and isinstance(result, np.ndarray): + if block.is_extension and isinstance(result, np.ndarray): # e.g. 
block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible result = type(block.values)._from_sequence( - result, dtype=block.values.dtype + result.ravel(), dtype=block.values.dtype ) except ValueError: # reshape to be valid for non-Extension Block diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6f2868482b798..fa65179469840 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1344,13 +1344,14 @@ def f(self, **kwargs): raise except DataError: pass - except Exception: - # TODO: the remaining test cases that get here are from: - # - AttributeError from _cython_agg_blocks bug passing - # DataFrame to make_block; see GH#28275 - # - TypeError in _cython_operation calling ensure_float64 - # on object array containing complex numbers; - # see test_groupby_complex, test_max_nan_bug + except (TypeError, NotImplementedError): + # TODO: + # - TypeError: this is reached via test_groupby_complex + # and can be fixed by implementing _group_add for + # complex dtypes + # - NotImplementedError: reached in test_max_nan_bug, + # raised in _get_cython_function and should probably + # be handled inside _cython_agg_blocks pass # apply a non-cython aggregation From 0e556c241a2b42645d3109ea78adcc7a95794078 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Oct 2019 11:25:47 -0700 Subject: [PATCH 005/112] CI: xfail on numpy 1.18 (#29057) * xfail on numpy 1.18 * CI: try using numpy wheel --- ci/build38.sh | 8 +------- pandas/compat/numpy/__init__.py | 1 + pandas/tests/series/test_analytics.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/ci/build38.sh b/ci/build38.sh index 5c798c17301e0..903016536d240 100644 --- a/ci/build38.sh +++ b/ci/build38.sh @@ -6,13 +6,7 @@ pip install --no-deps -U pip wheel setuptools pip install python-dateutil pytz cython pytest pytest-xdist hypothesis # Possible alternative for getting numpy: -# pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy -git clone https://github.com/numpy/numpy -cd numpy -python setup.py build_ext --inplace -python setup.py install -cd .. -rm -rf numpy +pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy python setup.py build_ext -inplace python -m pip install --no-build-isolation -e . 
diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index ce56c08d3ec14..402ed62f2df65 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -12,6 +12,7 @@ _np_version_under1p15 = _nlv < LooseVersion("1.15") _np_version_under1p16 = _nlv < LooseVersion("1.16") _np_version_under1p17 = _nlv < LooseVersion("1.17") +_np_version_under1p18 = _nlv < LooseVersion("1.18") _is_numpy_dev = ".dev" in str(_nlv) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d60cd3029e5a8..c8e1c04f3e3fb 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat.numpy import _np_version_under1p18 import pandas.util._test_decorators as td import pandas as pd @@ -160,6 +161,9 @@ def test_cummax(self, datetime_series): tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummin_datetime64(self): s = pd.Series( pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) @@ -179,6 +183,9 @@ def test_cummin_datetime64(self): result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummax_datetime64(self): s = pd.Series( pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) @@ -198,6 +205,9 @@ def test_cummax_datetime64(self): result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummin_timedelta64(self): s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) @@ -213,6 +223,9 @@ def test_cummin_timedelta64(self): result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) + @pytest.mark.xfail( + not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" + ) def test_cummax_timedelta64(self): s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) From 2cfc88fbf15fb17a105e3131b63c667037a2e919 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Oct 2019 15:17:13 -0700 Subject: [PATCH 006/112] Stop catching TypeError in groupby methods (#29060) --- pandas/_libs/groupby.pyx | 33 ++++++++++++++++++++++++--------- pandas/core/groupby/groupby.py | 21 ++++++++++----------- pandas/core/groupby/ops.py | 8 +++++++- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4f7488c88630b..68c21139e7384 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -8,7 +8,7 @@ import numpy as np cimport numpy as cnp from numpy cimport (ndarray, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t) + uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) cnp.import_array() @@ -421,16 +421,23 @@ def group_any_all(uint8_t[:] out, if values[i] == flag_val: out[lab] = flag_val + # ---------------------------------------------------------------------- # group_add, group_prod, group_var, group_mean, group_ohlc # ---------------------------------------------------------------------- +ctypedef fused complexfloating_t: + float64_t + float32_t + complex64_t + complex128_t + @cython.wraparound(False) 
@cython.boundscheck(False) -def _group_add(floating[:, :] out, +def _group_add(complexfloating_t[:, :] out, int64_t[:] counts, - floating[:, :] values, + complexfloating_t[:, :] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -438,13 +445,14 @@ def _group_add(floating[:, :] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - floating val, count - floating[:, :] sumx, nobs + complexfloating_t val, count + complexfloating_t[:, :] sumx + int64_t[:, :] nobs if len(values) != len(labels): raise ValueError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((len(out), out.shape[1]), dtype=np.int64) sumx = np.zeros_like(out) N, K = (values).shape @@ -462,7 +470,12 @@ def _group_add(floating[:, :] out, # not nan if val == val: nobs[lab, j] += 1 - sumx[lab, j] += val + if (complexfloating_t is complex64_t or + complexfloating_t is complex128_t): + # clang errors if we use += with these dtypes + sumx[lab, j] = sumx[lab, j] + val + else: + sumx[lab, j] += val for i in range(ncounts): for j in range(K): @@ -472,8 +485,10 @@ def _group_add(floating[:, :] out, out[i, j] = sumx[i, j] -group_add_float32 = _group_add['float'] -group_add_float64 = _group_add['double'] +group_add_float32 = _group_add['float32_t'] +group_add_float64 = _group_add['float64_t'] +group_add_complex64 = _group_add['float complex'] +group_add_complex128 = _group_add['double complex'] @cython.wraparound(False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fa65179469840..b27d5bb05ee8f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1340,19 +1340,18 @@ def f(self, **kwargs): # try a cython aggregation if we can try: return self._cython_agg_general(alias, alt=npfunc, **kwargs) - except AssertionError: - raise except DataError: pass - except (TypeError, NotImplementedError): - # TODO: - # - TypeError: this is reached via test_groupby_complex - # and can be fixed by implementing _group_add for - # complex dtypes - # - NotImplementedError: reached in test_max_nan_bug, - # raised in _get_cython_function and should probably - # be handled inside _cython_agg_blocks - pass + except NotImplementedError as err: + if "function is not implemented for this dtype" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for test_decimal:test_in_numeric_groupby + pass + else: + raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 27415a1bacdbd..e380cf5930f97 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -526,7 +526,13 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: - values = ensure_float64(values) + try: + values = ensure_float64(values) + except TypeError: + if lib.infer_dtype(values, skipna=False) == "complex": + values = values.astype(complex) + else: + raise func = self._get_cython_function(kind, how, values, is_numeric) else: raise From 7b8c7445b4f8b931ba0af3f639365052b2e7ea3c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 18 Oct 2019 07:43:34 -0500 Subject: [PATCH 007/112] API: Restore getting name from MultiIndex level (#29061) * API: Restore getting name from 
MultiIndex level xref https://issues.apache.org/jira/browse/ARROW-6922 / https://github.com/pandas-dev/pandas/pull/27242#issuecomment-543302582 / https://github.com/pandas-dev/pandas/issues/29032 No docs yet, since it isn't clear how this will eventually sort out. But we at least want to preserve this behavior for 1.0 * fixups --- pandas/core/indexes/multi.py | 7 ++++-- pandas/tests/frame/test_alter_axes.py | 2 +- .../tests/indexes/multi/test_constructor.py | 6 ++--- pandas/tests/indexes/multi/test_names.py | 23 ++++++++++++------- pandas/tests/indexes/multi/test_reindex.py | 4 ++-- pandas/tests/indexes/multi/test_reshape.py | 4 ++-- pandas/tests/reshape/test_concat.py | 6 +++-- pandas/tests/test_multilevel.py | 21 ++++++++--------- 8 files changed, 42 insertions(+), 31 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b0a1ed0650f7c..fda5c78a61e53 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -639,7 +639,10 @@ def from_frame(cls, df, sortorder=None, names=None): @property def levels(self): - return self._levels + result = [ + x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) + ] + return FrozenList(result) @property def _values(self): @@ -830,7 +833,7 @@ def _set_codes( if level is None: new_codes = FrozenList( _ensure_frozen(level_codes, lev, copy=copy)._shallow_copy() - for lev, level_codes in zip(self.levels, codes) + for lev, level_codes in zip(self._levels, codes) ) else: level = [self._get_level_number(l) for l in level] diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index b310335be5f65..017cbea7ec723 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -978,7 +978,7 @@ def test_reset_index(self, float_frame): ): values = lev.take(level_codes) name = names[i] - tm.assert_index_equal(values, Index(deleveled[name].rename(name=None))) + tm.assert_index_equal(values, Index(deleveled[name])) stacked.index.names = [None, None] deleveled2 = stacked.reset_index() diff --git a/pandas/tests/indexes/multi/test_constructor.py b/pandas/tests/indexes/multi/test_constructor.py index 993979f31a35b..ff98da85cfb2d 100644 --- a/pandas/tests/indexes/multi/test_constructor.py +++ b/pandas/tests/indexes/multi/test_constructor.py @@ -17,7 +17,7 @@ def test_constructor_single_level(): levels=[["foo", "bar", "baz", "qux"]], codes=[[0, 1, 2, 3]], names=["first"] ) assert isinstance(result, MultiIndex) - expected = Index(["foo", "bar", "baz", "qux"]) + expected = Index(["foo", "bar", "baz", "qux"], name="first") tm.assert_index_equal(result.levels[0], expected) assert result.names == ["first"] @@ -292,7 +292,7 @@ def test_from_arrays_empty(): # 1 level result = MultiIndex.from_arrays(arrays=[[]], names=["A"]) assert isinstance(result, MultiIndex) - expected = Index([]) + expected = Index([], name="A") tm.assert_index_equal(result.levels[0], expected) assert result.names == ["A"] @@ -440,7 +440,7 @@ def test_from_product_empty_zero_levels(): def test_from_product_empty_one_level(): result = MultiIndex.from_product([[]], names=["A"]) - expected = pd.Index([]) + expected = pd.Index([], name="A") tm.assert_index_equal(result.levels[0], expected) assert result.names == ["A"] diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 679e045a68f29..5c3a48c9dd481 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -27,7 +27,7 @@ def 
test_index_name_retained(): def test_changing_names(idx): - assert [level.name for level in idx.levels] == [None, None] + assert [level.name for level in idx.levels] == ["first", "second"] view = idx.view() copy = idx.copy() @@ -36,16 +36,16 @@ def test_changing_names(idx): # changing names should not change level names on object new_names = [name + "a" for name in idx.names] idx.names = new_names - check_level_names(idx, [None, None]) + check_level_names(idx, ["firsta", "seconda"]) # and not on copies - check_level_names(view, [None, None]) - check_level_names(copy, [None, None]) - check_level_names(shallow_copy, [None, None]) + check_level_names(view, ["first", "second"]) + check_level_names(copy, ["first", "second"]) + check_level_names(shallow_copy, ["first", "second"]) # and copies shouldn't change original shallow_copy.names = [name + "c" for name in shallow_copy.names] - check_level_names(idx, [None, None]) + check_level_names(idx, ["firsta", "seconda"]) def test_take_preserve_name(idx): @@ -81,7 +81,7 @@ def test_names(idx, index_names): # names are assigned in setup assert index_names == ["first", "second"] level_names = [level.name for level in idx.levels] - assert level_names == [None, None] + assert level_names == index_names # setting bad names on existing index = idx @@ -109,7 +109,7 @@ def test_names(idx, index_names): # names are assigned on index, but not transferred to the levels index.names = ["a", "b"] level_names = [level.name for level in index.levels] - assert level_names == [None, None] + assert level_names == ["a", "b"] def test_duplicate_level_names_access_raises(idx): @@ -117,3 +117,10 @@ def test_duplicate_level_names_access_raises(idx): idx.names = ["foo", "foo"] with pytest.raises(ValueError, match="name foo occurs multiple times"): idx._get_level_number("foo") + + +def test_get_names_from_levels(): + idx = pd.MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"]) + + assert idx.levels[0].name == "a" + assert idx.levels[1].name == "b" diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 970288e5747c7..513efa8941de8 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -10,13 +10,13 @@ def test_reindex(idx): result, indexer = idx.reindex(list(idx[:4])) assert isinstance(result, MultiIndex) assert result.names == ["first", "second"] - assert [level.name for level in result.levels] == [None, None] + assert [level.name for level in result.levels] == ["first", "second"] result, indexer = idx.reindex(list(idx)) assert isinstance(result, MultiIndex) assert indexer is None assert result.names == ["first", "second"] - assert [level.name for level in result.levels] == [None, None] + assert [level.name for level in result.levels] == ["first", "second"] def test_reindex_level(idx): diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index e79f212f30078..37df420e9ea2e 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -15,11 +15,11 @@ def test_insert(idx): # key not contained in all levels new_index = idx.insert(0, ("abc", "three")) - exp0 = Index(list(idx.levels[0]) + ["abc"]) + exp0 = Index(list(idx.levels[0]) + ["abc"], name="first") tm.assert_index_equal(new_index.levels[0], exp0) assert new_index.names == ["first", "second"] - exp1 = Index(list(idx.levels[1]) + ["three"]) + exp1 = Index(list(idx.levels[1]) + ["three"], name="second") 
tm.assert_index_equal(new_index.levels[1], exp1) assert new_index[0] == ("abc", "three") diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 33cbaaed1848d..eda7bc0ec4df7 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1219,7 +1219,7 @@ def test_concat_keys_specific_levels(self): names=["group_key"], ) - tm.assert_index_equal(result.columns.levels[0], Index(level)) + tm.assert_index_equal(result.columns.levels[0], Index(level, name="group_key")) tm.assert_index_equal(result.columns.levels[1], Index([0, 1, 2, 3])) assert result.columns.names == ["group_key", None] @@ -1412,7 +1412,9 @@ def test_concat_keys_and_levels(self): names=["first", "second"], ) assert result.index.names == ("first", "second", None) - tm.assert_index_equal(result.index.levels[0], Index(["baz", "foo"])) + tm.assert_index_equal( + result.index.levels[0], Index(["baz", "foo"], name="first") + ) def test_concat_keys_levels_no_overlap(self): # GH #1406 diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 76436f4480809..79c9fe2b60bd9 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -975,11 +975,11 @@ def test_count(self): series.index.names = ["a", "b"] result = series.count(level="b") - expect = self.series.count(level=1) + expect = self.series.count(level=1).rename_axis("b") tm.assert_series_equal(result, expect) result = series.count(level="a") - expect = self.series.count(level=0) + expect = self.series.count(level=0).rename_axis("a") tm.assert_series_equal(result, expect) msg = "Level x not found" @@ -1641,16 +1641,14 @@ def test_constructor_with_tz(self): result = MultiIndex.from_arrays([index, columns]) assert result.names == ["dt1", "dt2"] - # levels don't have names set, so set name of index/columns to None in checks - tm.assert_index_equal(result.levels[0], index.rename(name=None)) - tm.assert_index_equal(result.levels[1], columns.rename(name=None)) + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) result = MultiIndex.from_arrays([Series(index), Series(columns)]) assert result.names == ["dt1", "dt2"] - # levels don't have names set, so set name of index/columns to None in checks - tm.assert_index_equal(result.levels[0], index.rename(name=None)) - tm.assert_index_equal(result.levels[1], columns.rename(name=None)) + tm.assert_index_equal(result.levels[0], index) + tm.assert_index_equal(result.levels[1], columns) def test_set_index_datetime(self): # GH 3950 @@ -1672,17 +1670,18 @@ def test_set_index_datetime(self): df.index = df.index.tz_convert("US/Pacific") expected = pd.DatetimeIndex( - ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"] + ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"], + name="datetime", ) expected = expected.tz_localize("UTC").tz_convert("US/Pacific") df = df.set_index("label", append=True) tm.assert_index_equal(df.index.levels[0], expected) - tm.assert_index_equal(df.index.levels[1], Index(["a", "b"])) + tm.assert_index_equal(df.index.levels[1], Index(["a", "b"], name="label")) assert df.index.names == ["datetime", "label"] df = df.swaplevel(0, 1) - tm.assert_index_equal(df.index.levels[0], Index(["a", "b"])) + tm.assert_index_equal(df.index.levels[0], Index(["a", "b"], name="label")) tm.assert_index_equal(df.index.levels[1], expected) assert df.index.names == ["label", "datetime"] From 5fbbe9e7ffad16577f4481491b129aae6571e60a Mon Sep 17 00:00:00 2001 
From: Rajhans Jadhao Date: Fri, 18 Oct 2019 20:45:30 +0530 Subject: [PATCH 008/112] fixed issue of mypy for test_ujson (#29022) --- pandas/tests/io/json/test_ujson.py | 5 +---- setup.cfg | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index d6572ac7b7bfe..20e2690084e2a 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -1,10 +1,7 @@ -try: - import json -except ImportError: - import simplejson as json import calendar import datetime import decimal +import json import locale import math import re diff --git a/setup.cfg b/setup.cfg index 0674e68a101c4..489be28a60a7d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -194,9 +194,6 @@ ignore_errors=True [mypy-pandas.tests.indexing.test_loc] ignore_errors=True -[mypy-pandas.tests.io.json.test_ujson] -ignore_errors=True - [mypy-pandas.tests.series.test_constructors] ignore_errors=True From 5506b827b11be23d0b31383a53f90dc30beb9972 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 10:27:29 -0700 Subject: [PATCH 009/112] CLN: derivation of nogil param (#29047) --- pandas/_libs/algos_take_helper.pxi.in | 42 ++++++++++++++------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index f10061a417c03..e7ee212065c5b 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -12,26 +12,26 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil dtypes = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True), + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False, False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True), - ('object', 'object', 'object', 'object', '', '', False, False)] + 'True if ', ' > 0 else False', False), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), + ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), + ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), + ('int32', 'int32', 'int32_t', 
'int32_t', '', '', True), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), + ('object', 'object', 'object', 'object', '', '', False)] def get_dispatch(dtypes): @@ -118,7 +118,9 @@ def get_dispatch(dtypes): """ for (name, dest, c_type_in, c_type_out, preval, postval, - can_copy, nogil) in dtypes: + can_copy) in dtypes: + + nogil = c_type_out != "object" if nogil: nogil_str = "with nogil:" tab = ' ' From ad01c8bb5df3d988f1ed651948385a6e6b3fcbec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 10:28:03 -0700 Subject: [PATCH 010/112] REF: remove groupby_helper (#29040) --- pandas/_libs/groupby.pyx | 689 +++++++++++++++++++++++++++- pandas/_libs/groupby_helper.pxi.in | 693 ----------------------------- setup.py | 3 +- 3 files changed, 688 insertions(+), 697 deletions(-) delete mode 100644 pandas/_libs/groupby_helper.pxi.in diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 68c21139e7384..8a417d8fe3a92 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -11,6 +11,8 @@ from numpy cimport (ndarray, uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) cnp.import_array() +cdef extern from "numpy/npy_math.h": + float64_t NAN "NPY_NAN" from pandas._libs.util cimport numeric, get_nat @@ -21,6 +23,7 @@ from pandas._libs.algos import (take_2d_axis1_float64_float64, groupsort_indexer, tiebreakers) cdef int64_t NPY_NAT = get_nat() +_int64_max = np.iinfo(np.int64).max cdef float64_t NaN = np.NaN @@ -804,5 +807,687 @@ def group_quantile(ndarray[float64_t] out, grp_start += grp_sz -# generated from template -include "groupby_helper.pxi" +# ---------------------------------------------------------------------- +# group_nth, group_last, group_rank +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + float64_t + float32_t + int64_t + uint64_t + object + + +cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: + if rank_t is object: + # Should never be used, but we need to avoid the `val != val` below + # or else cython will raise about gil acquisition. 
+ raise NotImplementedError + + elif rank_t is int64_t: + return is_datetimelike and val == NPY_NAT + else: + return val != val + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if rank_t is int64_t: + out[i, j] = NPY_NAT + elif rank_t is uint64_t: + runtime_error = True + break + else: + out[i, j] = NAN + + else: + out[i, j] = resx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +group_last_float64 = group_last["float64_t"] +group_last_float32 = group_last["float32_t"] +group_last_int64 = group_last["int64_t"] +group_last_object = group_last["object"] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth(rank_t[:, :] out, + int64_t[:] counts, + rank_t[:, :] values, + const int64_t[:] labels, int64_t rank, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + rank_t val + ndarray[rank_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros((out).shape, dtype=np.int64) + if rank_t is object: + resx = np.empty((out).shape, dtype=object) + else: + resx = np.empty_like(out) + + N, K = (values).shape + + if rank_t is object: + # TODO: De-duplicate once conditional-nogil is available + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if val == val: + # NB: use _treat_as_na here once + # conditional-nogil is available. 
+                    nobs[lab, j] += 1
+                    if nobs[lab, j] == rank:
+                        resx[lab, j] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = resx[i, j]
+
+    else:
+        with nogil:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    if not _treat_as_na(val, True):
+                        # TODO: Sure we always want is_datetimelike=True?
+                        nobs[lab, j] += 1
+                        if nobs[lab, j] == rank:
+                            resx[lab, j] = val
+
+            for i in range(ncounts):
+                for j in range(K):
+                    if nobs[i, j] == 0:
+                        if rank_t is int64_t:
+                            out[i, j] = NPY_NAT
+                        elif rank_t is uint64_t:
+                            runtime_error = True
+                            break
+                        else:
+                            out[i, j] = NAN
+                    else:
+                        out[i, j] = resx[i, j]
+
+    if runtime_error:
+        # We cannot raise directly above because that is within a nogil
+        # block.
+        raise RuntimeError("empty group with uint64_t")
+
+
+group_nth_float64 = group_nth["float64_t"]
+group_nth_float32 = group_nth["float32_t"]
+group_nth_int64 = group_nth["int64_t"]
+group_nth_object = group_nth["object"]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_rank(float64_t[:, :] out,
+               rank_t[:, :] values,
+               const int64_t[:] labels,
+               bint is_datetimelike, object ties_method,
+               bint ascending, bint pct, object na_option):
+    """
+    Provides the rank of values within each group.
+
+    Parameters
+    ----------
+    out : array of float64_t values which this method will write its results to
+    values : array of rank_t values to be ranked
+    labels : array containing unique label for each group, with its ordering
+        matching up to the corresponding record in `values`
+    is_datetimelike : bool, default False
+        unused in this method but provided for call compatibility with other
+        Cython transformations
+    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
+        'average'
+        * average: average rank of group
+        * min: lowest rank in group
+        * max: highest rank in group
+        * first: ranks assigned in order they appear in the array
+        * dense: like 'min', but rank always increases by 1 between groups
+    ascending : boolean, default True
+        False for ranks by high (1) to low (N)
+    pct : boolean, default False
+        Compute percentage rank of data within each group
+    na_option : {'keep', 'top', 'bottom'}, default 'keep'
+        * keep: leave NA values where they are
+        * top: smallest rank if ascending
+        * bottom: smallest rank if descending
+
+    Notes
+    -----
+    This method modifies the `out` parameter rather than returning an object
+    """
+    cdef:
+        TiebreakEnumType tiebreak
+        Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
+        Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
+        ndarray[int64_t] _as
+        ndarray[float64_t, ndim=2] grp_sizes
+        ndarray[rank_t] masked_vals
+        ndarray[uint8_t] mask
+        bint keep_na
+        rank_t nan_fill_val
+
+    if rank_t is object:
+        raise NotImplementedError("Can't do nogil")
+
+    tiebreak = tiebreakers[ties_method]
+    keep_na = na_option == 'keep'
+    N, K = (values).shape
+    grp_sizes = np.ones_like(out)
+
+    # Copy values into new array in order to fill missing data
+    # with mask, without obfuscating location of missing data
+    # in values array
+    masked_vals = np.array(values[:, 0], copy=True)
+    if rank_t is int64_t:
+        mask = (masked_vals == NPY_NAT).astype(np.uint8)
+    else:
+        mask = np.isnan(masked_vals).astype(np.uint8)
+
+    if ascending ^ (na_option == 'top'):
+        if rank_t is int64_t:
+            nan_fill_val = np.iinfo(np.int64).max
+        elif rank_t is uint64_t:
+            nan_fill_val = 
np.iinfo(np.uint64).max
+        else:
+            nan_fill_val = np.inf
+        order = (masked_vals, mask, labels)
+    else:
+        if rank_t is int64_t:
+            nan_fill_val = np.iinfo(np.int64).min
+        elif rank_t is uint64_t:
+            nan_fill_val = 0
+        else:
+            nan_fill_val = -np.inf
+
+        order = (masked_vals, ~mask, labels)
+    np.putmask(masked_vals, mask, nan_fill_val)
+
+    # lexsort using labels, then mask, then actual values
+    # each label corresponds to a different group value,
+    # the mask helps you differentiate missing values before
+    # performing sort on the actual values
+    _as = np.lexsort(order).astype(np.int64, copy=False)
+
+    if not ascending:
+        _as = _as[::-1]
+
+    with nogil:
+        # Loop over the length of the value array
+        # each incremental i value can be looked up in the _as array
+        # that we sorted previously, which gives us the location of
+        # that sorted value for retrieval back from the original
+        # values / masked_vals arrays
+        for i in range(N):
+            # dups and sum_ranks will be incremented each loop where
+            # the value / group remains the same, and should be reset
+            # when either of those change
+            # Used to calculate tiebreakers
+            dups += 1
+            sum_ranks += i - grp_start + 1
+
+            # Update out only when there is a transition of values or labels.
+            # When a new value or group is encountered, go back #dups steps (
+            # the number of occurrences of the current value) and assign the
+            # ranks based on the starting index of the current group
+            # (grp_start) and the current index
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]]) or
+                    (labels[_as[i]] != labels[_as[i+1]])):
+                # if keep_na, check for missing values and assign back
+                # to the result where appropriate
+                if keep_na and mask[_as[i]]:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = NaN
+                    grp_na_count = dups
+                elif tiebreak == TIEBREAK_AVERAGE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = sum_ranks / dups
+                elif tiebreak == TIEBREAK_MIN:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start - dups + 2
+                elif tiebreak == TIEBREAK_MAX:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = i - grp_start + 1
+                elif tiebreak == TIEBREAK_FIRST:
+                    for j in range(i - dups + 1, i + 1):
+                        if ascending:
+                            out[_as[j], 0] = j + 1 - grp_start
+                        else:
+                            out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start
+                elif tiebreak == TIEBREAK_DENSE:
+                    for j in range(i - dups + 1, i + 1):
+                        out[_as[j], 0] = grp_vals_seen
+
+            # look forward to the next value (using the sorting in _as)
+            # if the value does not equal the current value then we need to
+            # reset the dups and sum_ranks, knowing that a new value is
+            # coming up. the conditional also needs to handle nan equality
+            # and the end of iteration
+            if (i == N - 1 or
+                    (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or
+                    (mask[_as[i]] ^ mask[_as[i+1]])):
+                dups = sum_ranks = 0
+                grp_vals_seen += 1
+                grp_tie_count += 1
+
+            # Similar to the previous conditional, check now if we are
+            # moving to a new group. If so, keep track of the index where
+            # the new group occurs, so the tiebreaker calculations can
+            # decrement that from their position. fill in the size of each
+            # group encountered (used by pct calculations later). 
also be + # sure to reset any of the items helping to calculate dups + if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (i - grp_start + 1 - + grp_na_count) + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count - + (grp_na_count > 0)) + dups = sum_ranks = 0 + grp_na_count = 0 + grp_tie_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 + + if pct: + for i in range(N): + # We don't include NaN values in percentage + # rankings, so we assign them percentages of NaN. + if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + elif grp_sizes[i, 0] != 0: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] + + +group_rank_float64 = group_rank["float64_t"] +group_rank_float32 = group_rank["float32_t"] +group_rank_int64 = group_rank["int64_t"] +group_rank_uint64 = group_rank["uint64_t"] +# Note: we do not have a group_rank_object because that would require a +# not-nogil implementation, see GH#19560 + + +# ---------------------------------------------------------------------- +# group_min, group_max +# ---------------------------------------------------------------------- + +# TODO: consider implementing for more dtypes +ctypedef fused groupby_t: + float64_t + float32_t + int64_t + uint64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] maxx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + if groupby_t is int64_t: + # Note: evaluated at compile-time + maxx[:] = -_int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + maxx[:] = 0 + else: + maxx[:] = -np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = maxx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. 
+ raise RuntimeError("empty group with uint64_t") + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min(groupby_t[:, :] out, + int64_t[:] counts, + groupby_t[:, :] values, + const int64_t[:] labels, + Py_ssize_t min_count=-1): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + groupby_t val, count, nan_val + ndarray[groupby_t, ndim=2] minx, nobs + bint runtime_error = False + + assert min_count == -1, "'min_count' only used in add and prod" + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + if groupby_t is int64_t: + minx[:] = _int64_max + nan_val = NPY_NAT + elif groupby_t is uint64_t: + # NB: We do not define nan_val because there is no such thing + # for uint64_t. We carefully avoid having to reference it in this + # case. + minx[:] = np.iinfo(np.uint64).max + else: + minx[:] = np.inf + nan_val = NAN + + N, K = (values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if not _treat_as_na(val, True): + # TODO: Sure we always want is_datetimelike=True? + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + if groupby_t is uint64_t: + runtime_error = True + break + out[i, j] = nan_val + else: + out[i, j] = minx[i, j] + + if runtime_error: + # We cannot raise directly above because that is within a nogil + # block. + raise RuntimeError("empty group with uint64_t") + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummin(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative minimum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummin in. + values : array + Values to take cummin of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. + + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = _int64_max + elif groupby_t is uint64_t: + accum[:] = np.iinfo(np.uint64).max + else: + accum[:] = np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val < mval: + accum[lab, j] = mval = val + out[i, j] = mval + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummax(groupby_t[:, :] out, + groupby_t[:, :] values, + const int64_t[:] labels, + int ngroups, + bint is_datetimelike): + """ + Cumulative maximum of columns of `values`, in row groups `labels`. + + Parameters + ---------- + out : array + Array to store cummax in. + values : array + Values to take cummax of. + labels : int64 array + Labels to group by. + ngroups : int + Number of groups, larger than all entries of `labels`. + is_datetimelike : bool + True if `values` contains datetime-like entries. 
+ + Notes + ----- + This method modifies the `out` parameter, rather than returning an object. + """ + + cdef: + Py_ssize_t i, j, N, K, size + groupby_t val, mval + ndarray[groupby_t, ndim=2] accum + int64_t lab + + N, K = (values).shape + accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) + if groupby_t is int64_t: + accum[:] = -_int64_max + elif groupby_t is uint64_t: + accum[:] = 0 + else: + accum[:] = -np.inf + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + + if _treat_as_na(val, is_datetimelike): + out[i, j] = val + else: + mval = accum[lab, j] + if val > mval: + accum[lab, j] = mval = val + out[i, j] = mval diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in deleted file mode 100644 index c837c6c5c6519..0000000000000 --- a/pandas/_libs/groupby_helper.pxi.in +++ /dev/null @@ -1,693 +0,0 @@ -""" -Template for each `dtype` helper function using groupby - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -cdef extern from "numpy/npy_math.h": - float64_t NAN "NPY_NAN" -_int64_max = np.iinfo(np.int64).max - -# ---------------------------------------------------------------------- -# group_nth, group_last, group_rank -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - float64_t - float32_t - int64_t - uint64_t - object - - -cdef inline bint _treat_as_na(rank_t val, bint is_datetimelike) nogil: - if rank_t is object: - # Should never be used, but we need to avoid the `val != val` below - # or else cython will raise about gil acquisition. - raise NotImplementedError - - elif rank_t is int64_t: - return is_datetimelike and val == NPY_NAT - else: - return val != val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? 
- nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - -group_last_float64 = group_last["float64_t"] -group_last_float32 = group_last["float32_t"] -group_last_int64 = group_last["int64_t"] -group_last_object = group_last["object"] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth(rank_t[:, :] out, - int64_t[:] counts, - rank_t[:, :] values, - const int64_t[:] labels, int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - rank_t val - ndarray[rank_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros((out).shape, dtype=np.int64) - if rank_t is object: - resx = np.empty((out).shape, dtype=object) - else: - resx = np.empty_like(out) - - N, K = (values).shape - - if rank_t is object: - # TODO: De-duplicate once conditional-nogil is available - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if val == val: - # NB: use _treat_as_na here once - # conditional-nogil is available. - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - else: - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if rank_t is int64_t: - out[i, j] = NPY_NAT - elif rank_t is uint64_t: - runtime_error = True - break - else: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -group_nth_float64 = group_nth["float64_t"] -group_nth_float32 = group_nth["float32_t"] -group_nth_int64 = group_nth["int64_t"] -group_nth_object = group_nth["object"] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_rank(float64_t[:, :] out, - rank_t[:, :] values, - const int64_t[:] labels, - bint is_datetimelike, object ties_method, - bint ascending, bint pct, object na_option): - """ - Provides the rank of values within each group. 
- - Parameters - ---------- - out : array of float64_t values which this method will write its results to - values : array of rank_t values to be ranked - labels : array containing unique label for each group, with its ordering - matching up to the corresponding record in `values` - is_datetimelike : bool, default False - unused in this method but provided for call compatibility with other - Cython transformations - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default - 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups - ascending : boolean, default True - False for ranks by high (1) to low (N) - na_option : {'keep', 'top', 'bottom'}, default 'keep' - pct : boolean, default False - Compute percentage rank of data within each group - na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending - - Notes - ----- - This method modifies the `out` parameter rather than returning an object - """ - cdef: - TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 - ndarray[int64_t] _as - ndarray[float64_t, ndim=2] grp_sizes - ndarray[rank_t] masked_vals - ndarray[uint8_t] mask - bint keep_na - rank_t nan_fill_val - - if rank_t is object: - raise NotImplementedError("Cant do nogil") - - tiebreak = tiebreakers[ties_method] - keep_na = na_option == 'keep' - N, K = (values).shape - grp_sizes = np.ones_like(out) - - # Copy values into new array in order to fill missing data - # with mask, without obfuscating location of missing data - # in values array - masked_vals = np.array(values[:, 0], copy=True) - if rank_t is int64_t: - mask = (masked_vals == NPY_NAT).astype(np.uint8) - else: - mask = np.isnan(masked_vals).astype(np.uint8) - - if ascending ^ (na_option == 'top'): - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max - elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max - else: - nan_fill_val = np.inf - order = (masked_vals, mask, labels) - else: - if rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).min - elif rank_t is uint64_t: - nan_fill_val = 0 - else: - nan_fill_val = -np.inf - - order = (masked_vals, ~mask, labels) - np.putmask(masked_vals, mask, nan_fill_val) - - # lexsort using labels, then mask, then actual values - # each label corresponds to a different group value, - # the mask helps you differentiate missing values before - # performing sort on the actual values - _as = np.lexsort(order).astype(np.int64, copy=False) - - if not ascending: - _as = _as[::-1] - - with nogil: - # Loop over the length of the value array - # each incremental i value can be looked up in the _as array - # that we sorted previously, which gives us the location of - # that sorted value for retrieval back from the original - # values / masked_vals arrays - for i in range(N): - # dups and sum_ranks will be incremented each loop where - # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers - dups += 1 - sum_ranks += i - grp_start + 1 - - # Update out only when there is a transition of values or labels. 
- # When a new value or group is encountered, go back #dups steps( - # the number of occurrence of current value) and assign the ranks - # based on the the starting index of the current group (grp_start) - # and the current index - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]]) or - (labels[_as[i]] != labels[_as[i+1]])): - # if keep_na, check for missing values and assign back - # to the result where appropriate - if keep_na and mask[_as[i]]: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - if ascending: - out[_as[j], 0] = j + 1 - grp_start - else: - out[_as[j], 0] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: - for j in range(i - dups + 1, i + 1): - out[_as[j], 0] = grp_vals_seen - - # look forward to the next value (using the sorting in _as) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality - # and the end of iteration - if (i == N - 1 or - (masked_vals[_as[i]] != masked_vals[_as[i+1]]) or - (mask[_as[i]] ^ mask[_as[i+1]])): - dups = sum_ranks = 0 - grp_vals_seen += 1 - grp_tie_count += 1 - - # Similar to the previous conditional, check now if we are - # moving to a new group. If so, keep track of the index where - # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be - # sure to reset any of the items helping to calculate dups - if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - if tiebreak != TIEBREAK_DENSE: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (i - grp_start + 1 - - grp_na_count) - else: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = (grp_tie_count - - (grp_na_count > 0)) - dups = sum_ranks = 0 - grp_na_count = 0 - grp_tie_count = 0 - grp_start = i + 1 - grp_vals_seen = 1 - - if pct: - for i in range(N): - # We don't include NaN values in percentage - # rankings, so we assign them percentages of NaN. 
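For reference, the pct branch above corresponds to the public rank(pct=True)
behavior: with na_option='keep', NaNs keep a NaN rank and are excluded from
the group size used as the denominator (the grp_na_count bookkeeping above).
A minimal sketch using only the public API; the frame and values here are
illustrative, not part of the patch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"g": ["a", "a", "a", "b"], "x": [1.0, 2.0, np.nan, 5.0]})
    # NaN keeps a NaN rank and does not count toward the pct denominator,
    # matching the grp_na_count handling in the Cython loop above.
    print(df.groupby("g")["x"].rank(method="average", pct=True).tolist())
    # -> [0.5, 1.0, nan, 1.0]
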
- if out[i, 0] != out[i, 0] or out[i, 0] == NAN: - out[i, 0] = NAN - elif grp_sizes[i, 0] != 0: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] - - -group_rank_float64 = group_rank["float64_t"] -group_rank_float32 = group_rank["float32_t"] -group_rank_int64 = group_rank["int64_t"] -group_rank_uint64 = group_rank["uint64_t"] -# Note: we do not have a group_rank_object because that would require a -# not-nogil implementation, see GH#19560 - - -# ---------------------------------------------------------------------- -# group_min, group_max -# ---------------------------------------------------------------------- - -# TODO: consider implementing for more dtypes -ctypedef fused groupby_t: - float64_t - float32_t - int64_t - uint64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - if groupby_t is int64_t: - # Note: evaluated at compile-time - maxx[:] = -_int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - maxx[:] = 0 - else: - maxx[:] = -np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = maxx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min(groupby_t[:, :] out, - int64_t[:] counts, - groupby_t[:, :] values, - const int64_t[:] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx, nobs - bint runtime_error = False - - assert min_count == -1, "'min_count' only used in add and prod" - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - if groupby_t is int64_t: - minx[:] = _int64_max - nan_val = NPY_NAT - elif groupby_t is uint64_t: - # NB: We do not define nan_val because there is no such thing - # for uint64_t. We carefully avoid having to reference it in this - # case. - minx[:] = np.iinfo(np.uint64).max - else: - minx[:] = np.inf - nan_val = NAN - - N, K = (values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - if not _treat_as_na(val, True): - # TODO: Sure we always want is_datetimelike=True? 
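Context for the _treat_as_na(val, True) calls and the TODO above: with
is_datetimelike=True, the int64 NaT sentinel is also treated as missing.
NaT is stored as the minimum int64 value, i.e. NPY_NAT. A quick NumPy check,
included only for illustration:

    import numpy as np

    # NaT's integer representation is the minimum int64 (the NPY_NAT
    # sentinel), which is what _treat_as_na screens out for datetimelikes.
    assert np.datetime64("NaT").astype("int64") == np.iinfo(np.int64).min
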
- nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - if groupby_t is uint64_t: - runtime_error = True - break - out[i, j] = nan_val - else: - out[i, j] = minx[i, j] - - if runtime_error: - # We cannot raise directly above because that is within a nogil - # block. - raise RuntimeError("empty group with uint64_t") - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative minimum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummin in. - values : array - Values to take cummin of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. - """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = _int64_max - elif groupby_t is uint64_t: - accum[:] = np.iinfo(np.uint64).max - else: - accum[:] = np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val < mval: - accum[lab, j] = mval = val - out[i, j] = mval - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, - const int64_t[:] labels, - int ngroups, - bint is_datetimelike): - """ - Cumulative maximum of columns of `values`, in row groups `labels`. - - Parameters - ---------- - out : array - Array to store cummax in. - values : array - Values to take cummax of. - labels : int64 array - Labels to group by. - ngroups : int - Number of groups, larger than all entries of `labels`. - is_datetimelike : bool - True if `values` contains datetime-like entries. - - Notes - ----- - This method modifies the `out` parameter, rather than returning an object. 
- """ - - cdef: - Py_ssize_t i, j, N, K, size - groupby_t val, mval - ndarray[groupby_t, ndim=2] accum - int64_t lab - - N, K = (values).shape - accum = np.empty((ngroups, K), dtype=np.asarray(values).dtype) - if groupby_t is int64_t: - accum[:] = -_int64_max - elif groupby_t is uint64_t: - accum[:] = 0 - else: - accum[:] = -np.inf - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - - if _treat_as_na(val, is_datetimelike): - out[i, j] = val - else: - mval = accum[lab, j] - if val > mval: - accum[lab, j] = mval = val - out[i, j] = mval diff --git a/setup.py b/setup.py index c35a0e75ecb80..2892cd0b2e294 100755 --- a/setup.py +++ b/setup.py @@ -88,7 +88,6 @@ def is_platform_mac(): "_libs/algos_take_helper.pxi.in", "_libs/algos_rank_helper.pxi.in", ], - "groupby": ["_libs/groupby_helper.pxi.in"], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", "_libs/hashtable_func_helper.pxi.in", @@ -564,7 +563,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ext_data = { "_libs.algos": {"pyxfile": "_libs/algos", "depends": _pxi_dep["algos"]}, - "_libs.groupby": {"pyxfile": "_libs/groupby", "depends": _pxi_dep["groupby"]}, + "_libs.groupby": {"pyxfile": "_libs/groupby"}, "_libs.hashing": {"pyxfile": "_libs/hashing", "include": [], "depends": []}, "_libs.hashtable": { "pyxfile": "_libs/hashtable", From 2d2f220d2e8cd1e70f4c5ba54b7217f63167e5af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon-Martin=20Schr=C3=B6der?= Date: Fri, 18 Oct 2019 19:28:57 +0200 Subject: [PATCH 011/112] ENH: Informative dtype message for for assert_series_equal (#28993) --- pandas/tests/util/test_assert_frame_equal.py | 2 +- pandas/tests/util/test_assert_series_equal.py | 2 +- pandas/util/testing.py | 7 +++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 9571e8027ccf7..86e5d506e0779 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -141,7 +141,7 @@ def test_empty_dtypes(check_dtype): df1["col1"] = df1["col1"].astype("int64") if check_dtype: - msg = "Attributes are different" + msg = r"Attributes of DataFrame\..* are different" with pytest.raises(AssertionError, match=msg): assert_frame_equal(df1, df2, **kwargs) else: diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index a12d9386eb159..bad3f2e67f8bb 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -179,7 +179,7 @@ def test_series_equal_values_mismatch(check_less_precise): def test_series_equal_categorical_mismatch(check_categorical): - msg = """Attributes are different + msg = """Attributes of Series are different Attribute "dtype" are different \\[left\\]: CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False\\) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 4cf2776f5aa7c..73535e55d4fa5 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1156,7 +1156,9 @@ def assert_series_equal( ): pass else: - assert_attr_equal("dtype", left, right) + assert_attr_equal( + "dtype", left, right, obj="Attributes of {obj}".format(obj=obj) + ) if check_exact: assert_numpy_array_equal( @@ -1315,8 +1317,9 @@ def assert_frame_equal( >>> assert_frame_equal(df1, df2) Traceback (most recent call last): - AssertionError: Attributes are different ... 
+ AssertionError: Attributes of DataFrame.iloc[:, 1] are different + Attribute "dtype" are different [left]: int64 [right]: float64 From 009ffc42c2a66c27a7d674a1871602e2725e7b10 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Fri, 18 Oct 2019 17:32:51 +0000 Subject: [PATCH 012/112] TST: add regression test for all-none-groupby (#29067) Closes #21624 --- pandas/tests/groupby/test_groupby.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6212a37472000..dff5baa9b5984 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1944,3 +1944,13 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): result = getattr(grouped, op)() expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz)) assert_frame_equal(result, expected) + + +def test_groupby_only_none_group(): + # see GH21624 + # this was crashing with "ValueError: Length of passed values is 1, index implies 0" + df = pd.DataFrame({"g": [None], "x": 1}) + actual = df.groupby("g")["x"].transform("sum") + expected = pd.Series([np.nan], name="x") + + assert_series_equal(actual, expected) From 394040124a0dd46ca8ccd3658982e99adc8cf483 Mon Sep 17 00:00:00 2001 From: Luke Date: Fri, 18 Oct 2019 11:56:06 -0600 Subject: [PATCH 013/112] Fix mypy errors for pandas\tests\*: test_convert_to.py (#28965) --- pandas/tests/frame/test_convert_to.py | 4 ++-- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 3f0768ad5bdac..c9a7507969f5b 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -575,9 +575,9 @@ def test_frame_to_dict_tz(self): ), ), ( - defaultdict(list), + defaultdict(dict), defaultdict( - list, + dict, { 0: {"int_col": 1, "float_col": 1.0}, 1: {"int_col": 2, "float_col": 2.0}, diff --git a/setup.cfg b/setup.cfg index 489be28a60a7d..0b43849ac1306 100644 --- a/setup.cfg +++ b/setup.cfg @@ -161,9 +161,6 @@ ignore_errors=True [mypy-pandas.tests.frame.test_constructors] ignore_errors=True -[mypy-pandas.tests.frame.test_convert_to] -ignore_errors=True - [mypy-pandas.tests.indexes.datetimes.test_datetimelike] ignore_errors=True From b106ef388cfff8876bbc5906a0d340a7c5d90c3d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 10:57:54 -0700 Subject: [PATCH 014/112] CLN: catch less in groupby (#29077) --- pandas/core/groupby/generic.py | 20 +++++++++++++++++--- pandas/core/resample.py | 19 ++++++++++++++++++- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8e53972c95275..8191c3519a36a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -889,9 +889,23 @@ def aggregate(self, func=None, *args, **kwargs): result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) - except AssertionError: - raise - except Exception: + except ValueError as err: + if "no results" not in str(err): + # raised directly by _aggregate_multiple_funcs + raise + result = self._aggregate_frame(func) + except NotImplementedError as err: + if "axis other than 0 is not supported" in str(err): + # raised directly by _aggregate_multiple_funcs + pass + elif "decimal does not support skipna=True" in str(err): + # FIXME: kludge for DecimalArray tests + pass + else: + raise + # FIXME: this is raised in a bunch of + # 
test_whitelist.test_regression_whitelist_methods tests, + # can be avoided result = self._aggregate_frame(func) else: result.columns = Index( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 5185d95cfac4c..d4ae3767f6157 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas.core.algorithms as algos +from pandas.core.base import DataError from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy @@ -362,7 +363,23 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): result = grouped.aggregate(how, *args, **kwargs) except AssertionError: raise - except Exception: + except DataError: + # we have a non-reducing function; try to evaluate + result = grouped.apply(how, *args, **kwargs) + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named + pass + elif "len(index) != len(labels)" in str(err): + # raised in libgroupby validation + pass + elif "No objects to concatenate" in str(err): + # raised in concat call + # In tests this is reached via either + # _apply_to_column_groupbys (ohlc) or DataFrameGroupBy.nunique + pass + else: + raise # we have a non-reducing function # try to evaluate From 429ed6e69aa1450cc3b3ec54f613fd355d40d7d5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 12:34:44 -0700 Subject: [PATCH 015/112] CLN: simplify take_2d_multi (#29065) --- pandas/core/algorithms.py | 79 ++++++++++++++++----------------------- pandas/core/generic.py | 2 +- 2 files changed, 33 insertions(+), 48 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2e5ab0d182aff..717c2eb26be8b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1304,7 +1304,7 @@ def get_indexer(current_indexer, other_indexer): return frame.sort_values(columns, ascending=ascending, kind="mergesort") -# ------- ## ---- # +# ---- # # take # # ---- # @@ -1712,59 +1712,44 @@ def take_nd( take_1d = take_nd -def take_2d_multi( - arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True -): +def take_2d_multi(arr, indexer, fill_value=np.nan): """ Specialized Cython take which sets NaN values in one pass """ - if indexer is None or (indexer[0] is None and indexer[1] is None): - row_idx = np.arange(arr.shape[0], dtype=np.int64) - col_idx = np.arange(arr.shape[1], dtype=np.int64) - indexer = row_idx, col_idx - dtype, fill_value = arr.dtype, arr.dtype.type() - else: - row_idx, col_idx = indexer - if row_idx is None: - row_idx = np.arange(arr.shape[0], dtype=np.int64) - else: - row_idx = ensure_int64(row_idx) - if col_idx is None: - col_idx = np.arange(arr.shape[1], dtype=np.int64) - else: - col_idx = ensure_int64(col_idx) - indexer = row_idx, col_idx - if not allow_fill: + # This is only called from one place in DataFrame._reindex_multi, + # so we know indexer is well-behaved. 
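The -1 entries in a reindexing indexer are the missing-label sentinel that
the mask/promotion logic below handles. For illustration, the same sentinel
is visible through the public Index.get_indexer; the labels here are
hypothetical:

    import pandas as pd

    idx = pd.Index(["a", "b"])
    # -1 marks labels not found in the index; take_2d_multi must then
    # promote the dtype (e.g. int64 -> float64) so the output can hold
    # the fill value at those positions.
    print(idx.get_indexer(["a", "c"]))  # -> [ 0 -1]
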
+ assert indexer is not None + assert indexer[0] is not None + assert indexer[1] is not None + + row_idx, col_idx = indexer + + row_idx = ensure_int64(row_idx) + col_idx = ensure_int64(col_idx) + indexer = row_idx, col_idx + mask_info = None + + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + + if not (row_needs or col_needs): + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - if mask_info is not None: - (row_mask, col_mask), (row_needs, col_needs) = mask_info - else: - row_mask = row_idx == -1 - col_mask = col_idx == -1 - row_needs = row_mask.any() - col_needs = col_mask.any() - mask_info = (row_mask, col_mask), (row_needs, col_needs) - if row_needs or col_needs: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() # at this point, it's guaranteed that dtype can hold both the arr values # and the fill_value - if out is None: - out_shape = len(row_idx), len(col_idx) - out = np.empty(out_shape, dtype=dtype) + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) if func is None and arr.dtype != out.dtype: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e97772a418982..e3e59639de56b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4606,7 +4606,7 @@ def _needs_reindex_multi(self, axes, method, level): ) def _reindex_multi(self, axes, copy, fill_value): - return NotImplemented + raise AbstractMethodError(self) def _reindex_with_indexers( self, reindexers, fill_value=None, copy=False, allow_dups=False From ac4469c4230bb06239fec6859c2df7852d52c77b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 14:29:56 -0700 Subject: [PATCH 016/112] REF: use fused types in algos_rank_helper (#29044) --- pandas/_libs/algos_rank_helper.pxi.in | 418 ++++++++++++++------------ 1 file changed, 231 insertions(+), 187 deletions(-) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 1ba1667b687be..d5a31b6a13010 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -8,24 +8,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # rank_1d, rank_2d # ---------------------------------------------------------------------- -{{py: - -# dtype ctype pos_nan_value neg_nan_value -dtypes = [('object', 'object', 'Infinity()', 'NegInfinity()'), - ('float64', 'float64_t', 'np.inf', 
'-np.inf'), - ('uint64', 'uint64_t', '', ''), - ('int64', 'int64_t', 'np.iinfo(np.int64).max', - 'np.iinfo(np.int64).min')] - -}} - -{{for dtype, ctype, pos_nan_value, neg_nan_value in dtypes}} +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t @cython.wraparound(False) @cython.boundscheck(False) -def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -33,85 +26,86 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', cdef: Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - {{if dtype == 'object'}} - ndarray sorted_data, values - {{else}} - ndarray[{{ctype}}] sorted_data, values - {{endif}} + ndarray[rank_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted ndarray[uint8_t, cast=True] sorted_mask - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 - bint isnan + bint isnan, condition float64_t count = 0.0 + tiebreak = tiebreakers[ties_method] - {{if dtype == 'float64'}} - values = np.asarray(in_arr).copy() - {{elif dtype == 'object'}} - values = np.array(in_arr, copy=True) + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) - if values.dtype != np.object_: - values = values.astype('O') - {{else}} - values = np.asarray(in_arr) - {{endif}} + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) keep_na = na_option == 'keep' - {{if dtype == 'object'}} - mask = missing.isnaobj(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - {{endif}} + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() # double sort first by mask and then by values to ensure nan values are # either at the beginning or the end. 
mask/(~mask) controls padding at # tail or the head - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - order = (values, mask) + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) else: - nan_value = {{neg_nan_value}} - order = (values, ~mask) - np.putmask(values, mask, nan_value) - {{else}} - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - {{endif}} + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) n = len(values) ranks = np.empty(n, dtype='f8') - {{if dtype == 'object'}} - _as = np.lexsort(keys=order) - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here + if rank_t is object: _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING else: - _as = np.lexsort(keys=order) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) if not ascending: _as = _as[::-1] @@ -122,38 +116,32 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', non_na_idx = _indices[0] if len(_indices) > 0 else -1 argsorted = _as.astype('i8') - {{if dtype == 'object'}} - if True: - {{else}} - with nogil: - {{endif}} - # TODO: why does the 2d version not have a nogil block? 
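The fused rank_t covers exactly four dtypes (object, float64, uint64, int64),
replacing the per-dtype template expansion above. Behavior for each
specialization is expected to be unchanged, which can be spot-checked from
the public API; a quick sketch, not part of the patch:

    import pandas as pd

    # The four rank_t specializations correspond to these Series dtypes;
    # ranking should be identical before and after the refactor.
    for dtype in ["float64", "int64", "uint64", "object"]:
        s = pd.Series([3, 1, 2], dtype=dtype)
        assert s.rank(method="average").tolist() == [3.0, 1.0, 2.0]
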
+ if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil for i in range(n): sum_ranks += i + 1 dups += 1 - {{if dtype == 'object'}} - val = util.get_value_at(sorted_data, i) - {{else}} val = sorted_data[i] - {{endif}} - {{if dtype != 'uint64'}} - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - {{endif}} + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue count += 1.0 - {{if dtype == 'object'}} - if (i == n - 1 or - are_diff(util.get_value_at(sorted_data, i + 1), val) or - i == non_na_idx): - {{else}} - if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx): - {{endif}} + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: if tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): @@ -165,13 +153,12 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported for ' - 'non-numeric data') - {{else}} - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - {{endif}} + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 @@ -180,6 +167,60 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? 
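+            # NB: the object branch above cannot release the GIL, since
+            # Python-object comparisons and refcounting require it, and
+            # Cython does not yet support conditional nogil (see the
+            # de-duplication TODO above), hence this duplicated loop.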
+ for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = (i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx) + else: + condition = (i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + if pct: if tiebreak == TIEBREAK_DENSE: return ranks / total_tie_count @@ -189,8 +230,14 @@ def rank_1d_{{dtype}}({{ctype}}[:] in_arr, ties_method='average', return ranks -def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): """ Fast NaN-friendly version of scipy.stats.rankdata """ @@ -198,29 +245,20 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', cdef: Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - {{if dtype == 'object'}} Py_ssize_t infs - {{endif}} ndarray[float64_t, ndim=2] ranks - {{if dtype == 'int64' or dtype == 'uint64'}} - ndarray[{{ctype}}, ndim=2, cast=True] values - {{else}} - ndarray[{{ctype}}, ndim=2] values - {{endif}} + ndarray[rank_t, ndim=2] values ndarray[int64_t, ndim=2] argsorted - {{if dtype == 'uint64'}} - {{ctype}} val - {{else}} - {{ctype}} val, nan_value - {{endif}} + rank_t val, nan_value float64_t sum_ranks = 0 int tiebreak = 0 bint keep_na = 0 float64_t count = 0.0 + bint condition, skip_condition tiebreak = tiebreakers[ties_method] @@ -231,103 +269,106 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', else: values = np.asarray(in_arr).copy() - {{if dtype == 'object'}} - if values.dtype != np.object_: - values = values.astype('O') - {{endif}} - - {{if dtype != 'uint64'}} - if ascending ^ (na_option == 'top'): - nan_value = {{pos_nan_value}} - else: - nan_value = {{neg_nan_value}} + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') - {{if dtype == 'object'}} - mask = missing.isnaobj2d(values) - {{elif dtype == 'float64'}} - mask = np.isnan(values) - {{elif dtype == 'int64'}} - mask = values == NPY_NAT - {{endif}} + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max - np.putmask(values, mask, nan_value) - {{endif}} + else: + 
if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT + + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + np.putmask(values, mask, nan_value) n, k = (values).shape ranks = np.empty((n, k), dtype='f8') - {{if dtype == 'object'}} - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - {{else}} - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks else: - _as = values.argsort(1) - {{endif}} + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) if not ascending: _as = _as[:, ::-1] - values = _take_2d_{{dtype}}(values, _as) + values = _take_2d(values, _as) argsorted = _as.astype('i8') for i in range(n): - {{if dtype == 'object'}} - dups = sum_ranks = infs = 0 - {{else}} - dups = sum_ranks = 0 - {{endif}} + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 total_tie_count = 0 count = 0.0 for j in range(k): - {{if dtype != 'object'}} - sum_ranks += j + 1 - dups += 1 - {{endif}} + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 val = values[i, j] - {{if dtype != 'uint64'}} - {{if dtype == 'object'}} - if (val is nan_value) and keep_na: - {{else}} - if (val == nan_value) and keep_na: - {{endif}} - ranks[i, argsorted[i, j]] = NaN + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN - {{if dtype == 'object'}} - infs += 1 - {{endif}} + if rank_t is object: + infs += 1 - continue - {{endif}} + continue count += 1.0 - {{if dtype == 'object'}} - sum_ranks += (j - infs) + 1 - dups += 1 - {{endif}} + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 - {{if dtype == 'object'}} - if j == k - 1 or are_diff(values[i, j + 1], val): - {{else}} - if j == k - 1 or values[i, j + 1] != val: - {{endif}} + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: if tiebreak == TIEBREAK_AVERAGE: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = sum_ranks / dups @@ -338,13 +379,12 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = j + 1 elif tiebreak == TIEBREAK_FIRST: - {{if dtype == 'object'}} - raise ValueError('first not supported ' - 'for non-numeric data') - {{else}} - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - {{endif}} + if rank_t is object: + raise ValueError('first not supported ' + 'for non-numeric data') + 
else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 @@ -363,4 +403,8 @@ def rank_2d_{{dtype}}({{ctype}}[:, :] in_arr, axis=0, ties_method='average', else: return ranks -{{endfor}} + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] From 193994215f9c287838fb3b9ef7c189c579bf3ec4 Mon Sep 17 00:00:00 2001 From: Javad Date: Sat, 19 Oct 2019 03:27:38 +0330 Subject: [PATCH 017/112] DOC: updated categorical docstring (#29068) --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d34cf3e576beb..795986127cde7 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -295,7 +295,7 @@ class Categorical(ExtensionArray, PandasObject): See Also -------- - api.types.CategoricalDtype : Type for categorical data. + CategoricalDtype : Type for categorical data. CategoricalIndex : An Index with an underlying ``Categorical``. Notes From 6ce48d97fbfb054c69cf9b78ad96f0e4fb9c6997 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Oct 2019 19:22:59 -0700 Subject: [PATCH 018/112] CLN: tighten exception catching in indexes (#29078) --- pandas/core/indexes/base.py | 17 ++++------------- pandas/core/indexes/period.py | 10 ++++++++-- pandas/core/indexes/timedeltas.py | 3 ++- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 526b2c2e2c412..1a08609ccd99a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3142,16 +3142,7 @@ def is_int(v): elif is_positional: indexer = key else: - try: - indexer = self.slice_indexer(start, stop, step, kind=kind) - except Exception: - if is_index_slice: - if self.is_integer(): - raise - else: - indexer = key - else: - raise + indexer = self.slice_indexer(start, stop, step, kind=kind) return indexer @@ -4676,11 +4667,11 @@ def get_value(self, series, key): raise InvalidIndexError(key) else: raise e1 - except Exception: # pragma: no cover + except Exception: raise e1 except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover + # e.g. 
"[False] is an invalid key" + if is_scalar(key): raise IndexError(key) raise InvalidIndexError(key) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0fc74f4e78c9f..f085dff84462d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -457,7 +457,11 @@ def __contains__(self, key): try: self.get_loc(key) return True - except Exception: + except (ValueError, TypeError, KeyError): + # TypeError can be reached if we pass a tuple that is not hashable + # ValueError can be reached if pass a 2-tuple and parse_time_string + # raises with the wrong number of return values + # TODO: the latter is a bug in parse_time_string return False @cache_readonly @@ -765,7 +769,9 @@ def _maybe_cast_slice_bound(self, label, side, kind): _, parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) return bounds[0 if side == "left" else 1] - except Exception: + except ValueError: + # string cannot be parsed as datetime-like + # TODO: we need tests for this case raise KeyError(label) elif is_integer(label) or is_float(label): self._invalid_indexer("slice", label) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 755992c881fe5..62a74fefa6577 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -630,7 +630,8 @@ def insert(self, loc, item): if _is_convertible_to_td(item): try: item = Timedelta(item) - except Exception: + except ValueError: + # e.g. str that can't be parsed to timedelta pass elif is_scalar(item) and isna(item): # GH 18295 From b03696750d7ab3a04f30917989898e4ff5cc5381 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Sat, 19 Oct 2019 17:50:56 +0200 Subject: [PATCH 019/112] [#22550] Remove TestData from series-tests test_quantile.py (#29096) --- pandas/tests/series/test_quantile.py | 51 +++++++++++++++------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index b001312fa37f3..1a4a3f523cbbe 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -8,24 +8,22 @@ from pandas.core.indexes.datetimes import Timestamp import pandas.util.testing as tm -from .common import TestData +class TestSeriesQuantile: + def test_quantile(self, datetime_series): -class TestSeriesQuantile(TestData): - def test_quantile(self): + q = datetime_series.quantile(0.1) + assert q == np.percentile(datetime_series.dropna(), 10) - q = self.ts.quantile(0.1) - assert q == np.percentile(self.ts.dropna(), 10) - - q = self.ts.quantile(0.9) - assert q == np.percentile(self.ts.dropna(), 90) + q = datetime_series.quantile(0.9) + assert q == np.percentile(datetime_series.dropna(), 90) # object dtype - q = Series(self.ts, dtype=object).quantile(0.9) - assert q == np.percentile(self.ts.dropna(), 90) + q = Series(datetime_series, dtype=object).quantile(0.9) + assert q == np.percentile(datetime_series.dropna(), 90) # datetime64[ns] dtype - dts = self.ts.index.to_series() + dts = datetime_series.index.to_series() q = dts.quantile(0.2) assert q == Timestamp("2000-01-10 19:12:00") @@ -41,20 +39,23 @@ def test_quantile(self): msg = "percentiles should all be in the interval \\[0, 1\\]" for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: with pytest.raises(ValueError, match=msg): - self.ts.quantile(invalid) + datetime_series.quantile(invalid) - def test_quantile_multi(self): + def test_quantile_multi(self, datetime_series): qs = [0.1, 0.9] - result 
= self.ts.quantile(qs) + result = datetime_series.quantile(qs) expected = pd.Series( - [np.percentile(self.ts.dropna(), 10), np.percentile(self.ts.dropna(), 90)], + [ + np.percentile(datetime_series.dropna(), 10), + np.percentile(datetime_series.dropna(), 90), + ], index=qs, - name=self.ts.name, + name=datetime_series.name, ) tm.assert_series_equal(result, expected) - dts = self.ts.index.to_series() + dts = datetime_series.index.to_series() dts.name = "xxx" result = dts.quantile((0.2, 0.2)) expected = Series( @@ -64,18 +65,20 @@ def test_quantile_multi(self): ) tm.assert_series_equal(result, expected) - result = self.ts.quantile([]) - expected = pd.Series([], name=self.ts.name, index=Index([], dtype=float)) + result = datetime_series.quantile([]) + expected = pd.Series( + [], name=datetime_series.name, index=Index([], dtype=float) + ) tm.assert_series_equal(result, expected) - def test_quantile_interpolation(self): + def test_quantile_interpolation(self, datetime_series): # see gh-10174 # interpolation = linear (default case) - q = self.ts.quantile(0.1, interpolation="linear") - assert q == np.percentile(self.ts.dropna(), 10) - q1 = self.ts.quantile(0.1) - assert q1 == np.percentile(self.ts.dropna(), 10) + q = datetime_series.quantile(0.1, interpolation="linear") + assert q == np.percentile(datetime_series.dropna(), 10) + q1 = datetime_series.quantile(0.1) + assert q1 == np.percentile(datetime_series.dropna(), 10) # test with and without interpolation keyword assert q == q1 From ac4518544cfeb9fde436224506b0fafdcb2f4578 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:03:50 -0700 Subject: [PATCH 020/112] comments, catch less (#29088) --- pandas/core/groupby/generic.py | 12 ++++-------- pandas/core/groupby/ops.py | 10 +++++++++- pandas/core/series.py | 1 + 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8191c3519a36a..a78857423e7e0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1067,14 +1067,9 @@ def _aggregate_frame(self, func, *args, **kwargs): result = OrderedDict() if axis != obj._info_axis_number: - try: - for name, data in self: - fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) - except AssertionError: - raise - except Exception: - return self._aggregate_item_by_item(func, *args, **kwargs) + for name, data in self: + fres = func(data, *args, **kwargs) + result[name] = self._try_cast(fres, data) else: for name in self.indices: data = self.get_group(name, obj=obj) @@ -1441,6 +1436,7 @@ def _choose_path(self, fast_path, slow_path, group): raise except Exception: # Hard to know ex-ante what exceptions `fast_path` might raise + # TODO: no test cases get here return path, res # verify fast path does not change columns (and names), otherwise diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e380cf5930f97..fcc646dec89d9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -655,7 +655,15 @@ def agg_series(self, obj, func): return self._aggregate_series_fast(obj, func) except AssertionError: raise - except Exception: + except ValueError as err: + if "No result." 
in str(err): + # raised in libreduction + pass + elif "Function does not reduce" in str(err): + # raised in libreduction + pass + else: + raise return self._aggregate_series_pure_python(obj, func) def _aggregate_series_fast(self, obj, func): diff --git a/pandas/core/series.py b/pandas/core/series.py index 1039e9af929d4..ea48b3603623a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1288,6 +1288,7 @@ def _set_with(self, key, value): else: if isinstance(key, tuple): try: + # TODO: no test cases that get here self._set_values(key, value) except Exception: pass From 6c93ef1eda37142e84a5db8c4ffcd523d55874b7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:05:31 -0700 Subject: [PATCH 021/112] REF: remove algos_rank_helper (#29086) --- pandas/_libs/algos.pyx | 415 +++++++++++++++++++++++++- pandas/_libs/algos_rank_helper.pxi.in | 410 ------------------------- setup.py | 1 - 3 files changed, 414 insertions(+), 412 deletions(-) delete mode 100644 pandas/_libs/algos_rank_helper.pxi.in diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0f91f612994c7..cab8bc8e799d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -771,7 +771,420 @@ def is_monotonic(ndarray[algos_t, ndim=1] arr, bint timelike): return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic +# ---------------------------------------------------------------------- +# rank_1d, rank_2d +# ---------------------------------------------------------------------- + +ctypedef fused rank_t: + object + float64_t + uint64_t + int64_t + + +@cython.wraparound(False) +@cython.boundscheck(False) +def rank_1d(rank_t[:] in_arr, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 + + ndarray[rank_t] sorted_data, values + + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + ndarray[uint8_t, cast=True] sorted_mask + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + bint isnan, condition + float64_t count = 0.0 + + tiebreak = tiebreakers[ties_method] + + if rank_t is float64_t: + values = np.asarray(in_arr).copy() + elif rank_t is object: + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + else: + values = np.asarray(in_arr) + + keep_na = na_option == 'keep' + + if rank_t is object: + mask = missing.isnaobj(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + # create copy in case of NPY_NAT + # values are mutated inplace + if mask.any(): + values = values.copy() + + # double sort first by mask and then by values to ensure nan values are + # either at the beginning or the end. 
mask/(~mask) controls padding at + # tail or the head + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + order = (values, mask) + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).min + + order = (values, ~mask) + np.putmask(values, mask, nan_value) + else: + mask = np.zeros(shape=len(values), dtype=bool) + order = (values, mask) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + if rank_t is object: + _as = np.lexsort(keys=order) + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = np.lexsort(keys=order) + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = np.lexsort(keys=order) + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + sorted_mask = mask.take(_as) + _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] + non_na_idx = _indices[0] if len(_indices) > 0 else -1 + argsorted = _as.astype('i8') + + if rank_t is object: + # TODO: de-duplicate once cython supports conditional nogil + for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = ( + i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx + ) + else: + condition = ( + i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx + ) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + + else: + with nogil: + # TODO: why does the 2d version not have a nogil block? 
+ for i in range(n): + sum_ranks += i + 1 + dups += 1 + + val = sorted_data[i] + + if rank_t is not uint64_t: + isnan = sorted_mask[i] + if isnan and keep_na: + ranks[argsorted[i]] = NaN + continue + + count += 1.0 + + if rank_t is object: + condition = ( + i == n - 1 or + are_diff(sorted_data[i + 1], val) or + i == non_na_idx + ) + else: + condition = ( + i == n - 1 or + sorted_data[i + 1] != val or + i == non_na_idx + ) + + if condition: + + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported for ' + 'non-numeric data') + else: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count + sum_ranks = dups = 0 + + if pct: + if tiebreak == TIEBREAK_DENSE: + return ranks / total_tie_count + else: + return ranks / count + else: + return ranks + + +rank_1d_object = rank_1d["object"] +rank_1d_float64 = rank_1d["float64_t"] +rank_1d_uint64 = rank_1d["uint64_t"] +rank_1d_int64 = rank_1d["int64_t"] + + +def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', + ascending=True, na_option='keep', pct=False): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 + + Py_ssize_t infs + + ndarray[float64_t, ndim=2] ranks + ndarray[rank_t, ndim=2] values + + ndarray[int64_t, ndim=2] argsorted + + rank_t val, nan_value + + float64_t sum_ranks = 0 + int tiebreak = 0 + bint keep_na = 0 + float64_t count = 0.0 + bint condition, skip_condition + + tiebreak = tiebreakers[ties_method] + + keep_na = na_option == 'keep' + + if axis == 0: + values = np.asarray(in_arr).T.copy() + else: + values = np.asarray(in_arr).copy() + + if rank_t is object: + if values.dtype != np.object_: + values = values.astype('O') + + if rank_t is not uint64_t: + if ascending ^ (na_option == 'top'): + if rank_t is object: + nan_value = Infinity() + elif rank_t is float64_t: + nan_value = np.inf + elif rank_t is int64_t: + nan_value = np.iinfo(np.int64).max + + else: + if rank_t is object: + nan_value = NegInfinity() + elif rank_t is float64_t: + nan_value = -np.inf + elif rank_t is int64_t: + nan_value = NPY_NAT + + if rank_t is object: + mask = missing.isnaobj2d(values) + elif rank_t is float64_t: + mask = np.isnan(values) + elif rank_t is int64_t: + mask = values == NPY_NAT + + np.putmask(values, mask, nan_value) + + n, k = (values).shape + ranks = np.empty((n, k), dtype='f8') + + if rank_t is object: + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) + if axis == 0: + return ranks.T + else: + return ranks + else: + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values 
= _take_2d(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + if rank_t is object: + dups = sum_ranks = infs = 0 + else: + dups = sum_ranks = 0 + + total_tie_count = 0 + count = 0.0 + for j in range(k): + if rank_t is not object: + sum_ranks += j + 1 + dups += 1 + + val = values[i, j] + + if rank_t is not uint64_t: + if rank_t is object: + skip_condition = (val is nan_value) and keep_na + else: + skip_condition = (val == nan_value) and keep_na + if skip_condition: + ranks[i, argsorted[i, j]] = NaN + + if rank_t is object: + infs += 1 + + continue + + count += 1.0 + + if rank_t is object: + sum_ranks += (j - infs) + 1 + dups += 1 + + if rank_t is object: + condition = j == k - 1 or are_diff(values[i, j + 1], val) + else: + condition = j == k - 1 or values[i, j + 1] != val + + if condition: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + if rank_t is object: + raise ValueError('first not supported ' + 'for non-numeric data') + else: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count + sum_ranks = dups = 0 + if pct: + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count + if axis == 0: + return ranks.T + else: + return ranks + + +rank_2d_object = rank_2d["object"] +rank_2d_float64 = rank_2d["float64_t"] +rank_2d_uint64 = rank_2d["uint64_t"] +rank_2d_int64 = rank_2d["int64_t"] + + # generated from template include "algos_common_helper.pxi" -include "algos_rank_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in deleted file mode 100644 index d5a31b6a13010..0000000000000 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ /dev/null @@ -1,410 +0,0 @@ -""" -Template for each `dtype` helper function for rank - -WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in -""" - -# ---------------------------------------------------------------------- -# rank_1d, rank_2d -# ---------------------------------------------------------------------- - -ctypedef fused rank_t: - object - float64_t - uint64_t - int64_t - - -@cython.wraparound(False) -@cython.boundscheck(False) -def rank_1d(rank_t[:] in_arr, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, n, dups = 0, total_tie_count = 0, non_na_idx = 0 - - ndarray[rank_t] sorted_data, values - - ndarray[float64_t] ranks - ndarray[int64_t] argsorted - ndarray[uint8_t, cast=True] sorted_mask - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - bint isnan, condition - float64_t count = 0.0 - - tiebreak = tiebreakers[ties_method] - - if rank_t is float64_t: - values = np.asarray(in_arr).copy() - elif rank_t is object: - values = np.array(in_arr, copy=True) - - if values.dtype != np.object_: - values = values.astype('O') - else: - values 
= np.asarray(in_arr) - - keep_na = na_option == 'keep' - - if rank_t is object: - mask = missing.isnaobj(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - # create copy in case of NPY_NAT - # values are mutated inplace - if mask.any(): - values = values.copy() - - # double sort first by mask and then by values to ensure nan values are - # either at the beginning or the end. mask/(~mask) controls padding at - # tail or the head - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - order = (values, mask) - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).min - - order = (values, ~mask) - np.putmask(values, mask, nan_value) - else: - mask = np.zeros(shape=len(values), dtype=bool) - order = (values, mask) - - n = len(values) - ranks = np.empty(n, dtype='f8') - - if rank_t is object: - _as = np.lexsort(keys=order) - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = np.lexsort(keys=order) - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = np.lexsort(keys=order) - - if not ascending: - _as = _as[::-1] - - sorted_data = values.take(_as) - sorted_mask = mask.take(_as) - _indices = np.diff(sorted_mask.astype(int)).nonzero()[0] - non_na_idx = _indices[0] if len(_indices) > 0 else -1 - argsorted = _as.astype('i8') - - if rank_t is object: - # TODO: de-duplicate once cython supports conditional nogil - for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - else: - with nogil: - # TODO: why does the 2d version not have a nogil block? 
- for i in range(n): - sum_ranks += i + 1 - dups += 1 - - val = sorted_data[i] - - if rank_t is not uint64_t: - isnan = sorted_mask[i] - if isnan and keep_na: - ranks[argsorted[i]] = NaN - continue - - count += 1.0 - - if rank_t is object: - condition = (i == n - 1 or - are_diff(sorted_data[i + 1], val) or - i == non_na_idx) - else: - condition = (i == n - 1 or - sorted_data[i + 1] != val or - i == non_na_idx) - - if condition: - - if tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = i + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') - else: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = j + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = 2 * i - j - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for j in range(i - dups + 1, i + 1): - ranks[argsorted[j]] = total_tie_count - sum_ranks = dups = 0 - - if pct: - if tiebreak == TIEBREAK_DENSE: - return ranks / total_tie_count - else: - return ranks / count - else: - return ranks - - -rank_1d_object = rank_1d["object"] -rank_1d_float64 = rank_1d["float64_t"] -rank_1d_uint64 = rank_1d["uint64_t"] -rank_1d_int64 = rank_1d["int64_t"] - - -def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', - ascending=True, na_option='keep', pct=False): - """ - Fast NaN-friendly version of scipy.stats.rankdata - """ - - cdef: - Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 - - Py_ssize_t infs - - ndarray[float64_t, ndim=2] ranks - ndarray[rank_t, ndim=2] values - - ndarray[int64_t, ndim=2] argsorted - - rank_t val, nan_value - - float64_t sum_ranks = 0 - int tiebreak = 0 - bint keep_na = 0 - float64_t count = 0.0 - bint condition, skip_condition - - tiebreak = tiebreakers[ties_method] - - keep_na = na_option == 'keep' - - if axis == 0: - values = np.asarray(in_arr).T.copy() - else: - values = np.asarray(in_arr).copy() - - if rank_t is object: - if values.dtype != np.object_: - values = values.astype('O') - - if rank_t is not uint64_t: - if ascending ^ (na_option == 'top'): - if rank_t is object: - nan_value = Infinity() - elif rank_t is float64_t: - nan_value = np.inf - elif rank_t is int64_t: - nan_value = np.iinfo(np.int64).max - - else: - if rank_t is object: - nan_value = NegInfinity() - elif rank_t is float64_t: - nan_value = -np.inf - elif rank_t is int64_t: - nan_value = NPY_NAT - - if rank_t is object: - mask = missing.isnaobj2d(values) - elif rank_t is float64_t: - mask = np.isnan(values) - elif rank_t is int64_t: - mask = values == NPY_NAT - - np.putmask(values, mask, nan_value) - - n, k = (values).shape - ranks = np.empty((n, k), dtype='f8') - - if rank_t is object: - try: - _as = values.argsort(1) - except TypeError: - values = in_arr - for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) - if axis == 0: - return ranks.T - else: - return ranks - else: - if tiebreak == TIEBREAK_FIRST: - # need to use a stable sort here - _as = values.argsort(axis=1, kind='mergesort') - if not ascending: - tiebreak = TIEBREAK_FIRST_DESCENDING - else: - _as = values.argsort(1) - - if not ascending: - _as = _as[:, ::-1] - - values = 
_take_2d(values, _as) - argsorted = _as.astype('i8') - - for i in range(n): - if rank_t is object: - dups = sum_ranks = infs = 0 - else: - dups = sum_ranks = 0 - - total_tie_count = 0 - count = 0.0 - for j in range(k): - if rank_t is not object: - sum_ranks += j + 1 - dups += 1 - - val = values[i, j] - - if rank_t is not uint64_t: - if rank_t is object: - skip_condition = (val is nan_value) and keep_na - else: - skip_condition = (val == nan_value) and keep_na - if skip_condition: - ranks[i, argsorted[i, j]] = NaN - - if rank_t is object: - infs += 1 - - continue - - count += 1.0 - - if rank_t is object: - sum_ranks += (j - infs) + 1 - dups += 1 - - if rank_t is object: - condition = j == k - 1 or are_diff(values[i, j + 1], val) - else: - condition = j == k - 1 or values[i, j + 1] != val - - if condition: - if tiebreak == TIEBREAK_AVERAGE: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j - dups + 2 - elif tiebreak == TIEBREAK_MAX: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = j + 1 - elif tiebreak == TIEBREAK_FIRST: - if rank_t is object: - raise ValueError('first not supported ' - 'for non-numeric data') - else: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = z + 1 - elif tiebreak == TIEBREAK_FIRST_DESCENDING: - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 - elif tiebreak == TIEBREAK_DENSE: - total_tie_count += 1 - for z in range(j - dups + 1, j + 1): - ranks[i, argsorted[i, z]] = total_tie_count - sum_ranks = dups = 0 - if pct: - if tiebreak == TIEBREAK_DENSE: - ranks[i, :] /= total_tie_count - else: - ranks[i, :] /= count - if axis == 0: - return ranks.T - else: - return ranks - - -rank_2d_object = rank_2d["object"] -rank_2d_float64 = rank_2d["float64_t"] -rank_2d_uint64 = rank_2d["uint64_t"] -rank_2d_int64 = rank_2d["int64_t"] diff --git a/setup.py b/setup.py index 2892cd0b2e294..0dd1980088db8 100755 --- a/setup.py +++ b/setup.py @@ -86,7 +86,6 @@ def is_platform_mac(): "algos": [ "_libs/algos_common_helper.pxi.in", "_libs/algos_take_helper.pxi.in", - "_libs/algos_rank_helper.pxi.in", ], "hashtable": [ "_libs/hashtable_class_helper.pxi.in", From a8cbd11880be80499a2ceff967915c4b575b906a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:06:21 -0700 Subject: [PATCH 022/112] CLN: Exception in DataFrame._reduce (#29085) --- pandas/core/frame.py | 52 +++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7880acb1b78da..c90bf4ba7151f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -68,7 +68,6 @@ infer_dtype_from_object, is_bool_dtype, is_datetime64_any_dtype, - is_datetime64tz_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -7784,20 +7783,9 @@ def _reduce( def f(x): return op(x, axis=axis, skipna=skipna, **kwds) - # exclude timedelta/datetime unless we are uniform types - if ( - axis == 1 - and self._is_datelike_mixed_type - and ( - not self._is_homogeneous_type - and not is_datetime64tz_dtype(self.dtypes[0]) - ) - ): - numeric_only = True - if numeric_only is None: + values = self.values try: - values = self.values result = f(values) if filter_type == "bool" and is_object_dtype(values) and axis is None: @@ -7809,27 +7797,23 @@ def f(x): # try by-column first if filter_type is None and axis == 0: - try: - - # this 
can end up with a non-reduction - # but not always. if the types are mixed - # with datelike then need to make sure a series - - # we only end up here if we have not specified - # numeric_only and yet we have tried a - # column-by-column reduction, where we have mixed type. - # So let's just do what we can - from pandas.core.apply import frame_apply - - opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True - ) - result = opa.get_result() - if result.ndim == self.ndim: - result = result.iloc[0] - return result - except Exception: - pass + # this can end up with a non-reduction + # but not always. if the types are mixed + # with datelike then need to make sure a series + + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. + # So let's just do what we can + from pandas.core.apply import frame_apply + + opa = frame_apply( + self, func=f, result_type="expand", ignore_failures=True + ) + result = opa.get_result() + if result.ndim == self.ndim: + result = result.iloc[0] + return result if filter_type is None or filter_type == "numeric": data = self._get_numeric_data() From 1f20f63f7214de3c078471bee017186e7e76e0ff Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Sat, 19 Oct 2019 19:07:11 +0200 Subject: [PATCH 023/112] [#22550] Remove TestData from series-tests test_operators.py (#29084) --- pandas/tests/series/test_operators.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 6bfcc02ca633a..942ab0db37a57 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -16,8 +16,6 @@ assert_series_equal, ) -from .common import TestData - class TestSeriesLogicalOps: @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor]) @@ -746,7 +744,7 @@ def test_comparison_flex_alignment_fill(self): assert_series_equal(left.gt(right, fill_value=0), exp) -class TestSeriesOperators(TestData): +class TestSeriesOperators: def test_operators_empty_int_corner(self): s1 = Series([], [], dtype=np.int32) s2 = Series({"x": 0.0}) @@ -768,12 +766,10 @@ def test_ops_datetimelike_align(self): result = (dt2.to_frame() - dt.to_frame())[0] assert_series_equal(result, expected) - def test_operators_corner(self): - series = self.ts - + def test_operators_corner(self, datetime_series): empty = Series([], index=Index([])) - result = series + empty + result = datetime_series + empty assert np.isnan(result).all() result = empty + Series([], index=Index([])) @@ -786,10 +782,12 @@ def test_operators_corner(self): # deltas = deltas + sub_deltas # float + int - int_ts = self.ts.astype(int)[:-5] - added = self.ts + int_ts + int_ts = datetime_series.astype(int)[:-5] + added = datetime_series + int_ts expected = Series( - self.ts.values[:-5] + int_ts.values, index=self.ts.index[:-5], name="ts" + datetime_series.values[:-5] + int_ts.values, + index=datetime_series.index[:-5], + name="ts", ) tm.assert_series_equal(added[:-5], expected) From d5f430adaf19faf660909434bda42c50c35febc7 Mon Sep 17 00:00:00 2001 From: qudade <7327644+qudade@users.noreply.github.com> Date: Sat, 19 Oct 2019 19:08:36 +0200 Subject: [PATCH 024/112] TST: 2d index when constructing dataframe (#25416). 
(#29083) --- pandas/tests/frame/test_constructors.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ebffeeaa3063e..583093af6d3e6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -424,6 +424,25 @@ def test_constructor_multi_index(self): df = DataFrame(index=mi, columns=mi) assert pd.isna(df).values.ravel().all() + def test_constructor_2d_index(self): + # GH 25416 + # handling of 2d index in construction + df = pd.DataFrame([[1]], columns=[[1]], index=[1, 2]) + expected = pd.DataFrame( + [1, 1], + index=pd.Int64Index([1, 2], dtype="int64"), + columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + ) + tm.assert_frame_equal(df, expected) + + df = pd.DataFrame([[1]], columns=[[1]], index=[[1, 2]]) + expected = pd.DataFrame( + [1, 1], + index=pd.MultiIndex(levels=[[1, 2]], codes=[[0, 1]]), + columns=pd.MultiIndex(levels=[[1]], codes=[[0]]), + ) + tm.assert_frame_equal(df, expected) + def test_constructor_error_msgs(self): msg = "Empty data passed with indices specified." # passing an empty array with columns specified. From 78d34206610265271387831675cfee19e36b703e Mon Sep 17 00:00:00 2001 From: qudade <7327644+qudade@users.noreply.github.com> Date: Sat, 19 Oct 2019 19:11:15 +0200 Subject: [PATCH 025/112] TST: regression test for groupby with datetime and timedelta (#15562) (#29063) --- pandas/tests/groupby/test_apply.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4d0063b773bc5..1af4768b7381e 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -543,6 +543,33 @@ def predictions(tool): tm.assert_series_equal(expected, result) +def test_apply_aggregating_timedelta_and_datetime(): + # Regression test for GH 15562 + # The following groupby caused ValueErrors and IndexErrors pre 0.20.0 + + df = pd.DataFrame( + { + "clientid": ["A", "B", "C"], + "datetime": [np.datetime64("2017-02-01 00:00:00")] * 3, + } + ) + df["time_delta_zero"] = df.datetime - df.datetime + result = df.groupby("clientid").apply( + lambda ddf: pd.Series( + dict(clientid_age=ddf.time_delta_zero.min(), date=ddf.datetime.min()) + ) + ) + expected = pd.DataFrame( + { + "clientid": ["A", "B", "C"], + "clientid_age": [np.timedelta64(0, "D")] * 3, + "date": [np.datetime64("2017-02-01 00:00:00")] * 3, + } + ).set_index("clientid") + + tm.assert_frame_equal(result, expected) + + def test_time_field_bug(): # Test a fix for the following error related to GH issue 11324 When # non-key fields in a group-by dataframe contained time-based fields From 0f93b07dda3ff28586d21357eb90e3d2840f99a0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 10:13:05 -0700 Subject: [PATCH 026/112] CLN: cython and docstring cleanups (#29089) --- pandas/_libs/algos_common_helper.pxi.in | 5 +- pandas/_libs/algos_take_helper.pxi.in | 62 ++++++++++++----------- pandas/_libs/hashtable_func_helper.pxi.in | 6 ++- pandas/_libs/join.pyx | 24 ++++----- pandas/_libs/sparse_op_helper.pxi.in | 2 +- pandas/_libs/window.pyx | 8 +-- pandas/core/arrays/base.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/core/indexes/base.py | 5 +- pandas/core/indexes/datetimes.py | 3 +- pandas/core/indexes/timedeltas.py | 4 +- pandas/core/missing.py | 2 +- pandas/core/ops/docstrings.py | 4 +- pandas/core/strings.py | 2 +- 
pandas/io/pytables.py | 2 +- pandas/tests/plotting/common.py | 4 +- 17 files changed, 67 insertions(+), 72 deletions(-) diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 91599fa223b57..eb6d689899073 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -17,12 +17,11 @@ dtypes = [('float64', 'float64_t', 'float64_t'), def get_dispatch(dtypes): for name, c_type, dest_type, in dtypes: - dest_name = dest_type[:-2] # i.e. strip "_t" - yield name, c_type, dest_type, dest_name + yield name, c_type, dest_type }} -{{for name, c_type, dest_type, dest_name +{{for name, c_type, dest_type in get_dispatch(dtypes)}} diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index e7ee212065c5b..bd5a488722f6d 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -10,28 +10,28 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: -# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil +# c_type_in, c_type_out, preval, postval dtypes = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True), - ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True), - ('object', 'object', 'object', 'object', '', '', False)] + ('uint8_t', 'uint8_t', '', ''), + ('uint8_t', 'object', 'True if ', ' > 0 else False'), + ('int8_t', 'int8_t', '', ''), + ('int8_t', 'int32_t', '', ''), + ('int8_t', 'int64_t', '', ''), + ('int8_t', 'float64_t', '', ''), + ('int16_t', 'int16_t', '', ''), + ('int16_t', 'int32_t', '', ''), + ('int16_t', 'int64_t', '', ''), + ('int16_t', 'float64_t', '', ''), + ('int32_t', 'int32_t', '', ''), + ('int32_t', 'int64_t', '', ''), + ('int32_t', 'float64_t', '', ''), + ('int64_t', 'int64_t', '', ''), + ('int64_t', 'float64_t', '', ''), + ('float32_t', 'float32_t', '', ''), + ('float32_t', 'float64_t', '', ''), + ('float64_t', 'float64_t', '', ''), + ('object', 'object', '', ''), +] def get_dispatch(dtypes): @@ -117,9 +117,9 @@ def get_dispatch(dtypes): out[i, j] = %(preval)svalues[i, idx]%(postval)s """ - for (name, dest, c_type_in, c_type_out, preval, postval, - can_copy) in dtypes: + for (c_type_in, c_type_out, preval, postval) in dtypes: + can_copy = c_type_in == c_type_out != "object" nogil = c_type_out != "object" if nogil: nogil_str = "with nogil:" @@ -128,6 +128,16 @@ def get_dispatch(dtypes): nogil_str = '' tab = '' + def get_name(dtype_name): + if dtype_name == "object": + return "object" + if dtype_name == 
"uint8_t": + return "bool" + return dtype_name[:-2] + + name = get_name(c_type_in) + dest = get_name(c_type_out) + args = dict(name=name, dest=dest, c_type_in=c_type_in, c_type_out=c_type_out, preval=preval, postval=postval, can_copy=can_copy, nogil_str=nogil_str, tab=tab) @@ -291,9 +301,3 @@ cdef _take_2d(ndarray[take_t, ndim=2] values, object idx): for j in range(K): result[i, j] = values[i, indexer[i, j]] return result - - -_take_2d_object = _take_2d[object] -_take_2d_float64 = _take_2d[float64_t] -_take_2d_int64 = _take_2d[int64_t] -_take_2d_uint64 = _take_2d[uint64_t] diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index e400ec0e608f0..f6af93f85bd5a 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -151,12 +151,14 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): if keep == 'last': {{if dtype == 'object'}} - for i from n > i >= 0: + for i in range(n - 1, -1, -1): + # equivalent: range(n)[::-1], which cython doesnt like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{else}} with nogil: - for i from n > i >= 0: + for i in range(n - 1, -1, -1): + # equivalent: range(n)[::-1], which cython doesnt like in nogil kh_put_{{ttype}}(table, values[i], &ret) out[i] = ret == 0 {{endif}} diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 238bfd0be0aa7..caf730389008a 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -13,6 +13,7 @@ from pandas._libs.algos import ( ) +@cython.boundscheck(False) def inner_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: @@ -20,6 +21,8 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, ndarray[int64_t] left_count, right_count, left_sorter, right_sorter ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset # NA group in location 0 @@ -34,11 +37,6 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, if rc > 0 and lc > 0: count += lc * rc - # group 0 is the NA group - cdef: - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 - Py_ssize_t offset - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] @@ -64,6 +62,7 @@ def inner_join(const int64_t[:] left, const int64_t[:] right, _get_result_indexer(right_sorter, right_indexer)) +@cython.boundscheck(False) def left_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups, sort=True): cdef: @@ -72,6 +71,8 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, ndarray rev ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset # NA group in location 0 @@ -85,11 +86,6 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, else: count += left_count[i] - # group 0 is the NA group - cdef: - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 - Py_ssize_t offset - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] @@ -137,6 +133,7 @@ def left_outer_join(const int64_t[:] left, const int64_t[:] right, return left_indexer, right_indexer +@cython.boundscheck(False) def full_outer_join(const int64_t[:] left, const int64_t[:] right, Py_ssize_t max_groups): cdef: @@ -144,6 +141,8 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, ndarray[int64_t] left_count, right_count, left_sorter, right_sorter 
ndarray[int64_t] left_indexer, right_indexer int64_t lc, rc + int64_t left_pos = 0, right_pos = 0 + Py_ssize_t offset, position = 0 # NA group in location 0 @@ -160,11 +159,6 @@ def full_outer_join(const int64_t[:] left, const int64_t[:] right, else: count += lc + rc - # group 0 is the NA group - cdef: - int64_t left_pos = 0, right_pos = 0 - Py_ssize_t offset, position = 0 - # exclude the NA group left_pos = left_count[0] right_pos = right_count[0] diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index 5949a3fd0ed81..62ea477167b72 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -86,7 +86,7 @@ def get_op(tup): 'and': '{0} & {1}', # logical op 'or': '{0} | {1}'} - return ops_dict[opname].format(lval, rval, dtype) + return ops_dict[opname].format(lval, rval) def get_dispatch(dtypes): diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 8de593ce36c86..a2096d389823f 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -1296,7 +1296,7 @@ cdef _roll_min_max_variable(ndarray[numeric] values, # The original impl didn't deal with variable window sizes # So the code was optimized for that - for i from starti[0] <= i < endi[0]: + for i in range(starti[0], endi[0]): ai = init_mm(values[i], &nobs, is_max) # Discard previous entries if we find new min or max @@ -1644,7 +1644,7 @@ def roll_generic(object obj, else: # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, N) - offset): + for i in range((int_min(win, N) - offset)): if counts[i] >= minp: output[i] = func(arr[0: (i + offset + 1)], *args, **kwargs) else: @@ -1654,7 +1654,7 @@ def roll_generic(object obj, buf = arr.data bufarr = np.empty(win, dtype=float) oldbuf = bufarr.data - for i from (win - offset) <= i < (N - offset): + for i in range((win - offset), (N - offset)): buf = buf + 1 bufarr.data = buf if counts[i] >= minp: @@ -1664,7 +1664,7 @@ def roll_generic(object obj, bufarr.data = oldbuf # truncated windows at the end - for i from int_max(N - offset, 0) <= i < N: + for i in range(int_max(N - offset, 0), N): if counts[i] >= minp: output[i] = func(arr[int_max(i + offset - win + 1, 0): N], *args, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 53755695c97e3..08901df963f20 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1104,7 +1104,7 @@ def _create_method(cls, op, coerce_to_dtype=True): ---------- op : function An operator that takes arguments op(a, b) - coerce_to_dtype : bool, default True + coerce_to_dtype : bool, default True boolean indicating whether to attempt to convert the result to the underlying ExtensionArray dtype. If it's not possible to create a new ExtensionArray with the diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e3e59639de56b..a300748ee5bc8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2481,7 +2481,7 @@ def to_hdf(self, path_or_buf, key, **kwargs): like searching / selecting subsets of the data. append : bool, default False For Table formats, append the input data to the existing. - data_columns : list of columns or True, optional + data_columns : list of columns or True, optional List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See :ref:`io.hdf5-query-data-columns`. 
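
The loop rewrite in `hashtable_func_helper.pxi.in` above swaps the deprecated `for i from n > i >= 0` syntax for `range(n - 1, -1, -1)`, a reversed range Cython can compile inside a `nogil` block (the slice form `range(n)[::-1]` cannot be, per the new comment). As a rough pure-Python sketch of what the `keep='last'` branch computes — an illustration only, since the real code marks first-seen values through a khash table rather than a Python set:

    def duplicated_keep_last(values):
        # Walking from the end, the first time a value is seen on this
        # pass is its *last* occurrence in original order, so only the
        # earlier occurrences are flagged as duplicates.
        seen = set()
        out = [False] * len(values)
        for i in range(len(values) - 1, -1, -1):  # i.e. range(n)[::-1]
            out[i] = values[i] in seen
            seen.add(values[i])
        return out

    duplicated_keep_last([1, 2, 1, 3, 2])  # -> [True, True, False, False, False]
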
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b27d5bb05ee8f..f622480cfe4b7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2080,7 +2080,7 @@ def rank( * dense: like 'min', but rank always increases by 1 between groups ascending : bool, default True False for ranks by high (1) to low (N). - na_option : {'keep', 'top', 'bottom'}, default 'keep' + na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1a08609ccd99a..9d6487f7a8ae4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2031,7 +2031,7 @@ def fillna(self, value=None, downcast=None): Parameters ---------- - how : {'any', 'all'}, default 'any' + how : {'any', 'all'}, default 'any' If the Index is a MultiIndex, drop the value when any or all levels are NaN. @@ -5016,12 +5016,11 @@ def _validate_indexer(self, form, key, kind): Returns ------- - label : object + label : object Notes ----- Value of `side` parameter should be validated in caller. - """ @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 477525d7ab272..49c11c5505d00 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1079,12 +1079,11 @@ def _maybe_cast_slice_bound(self, label, side, kind): Returns ------- - label : object + label : object Notes ----- Value of `side` parameter should be validated in caller. - """ assert kind in ["ix", "loc", "getitem", None] diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 62a74fefa6577..c404e205e603c 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -550,7 +550,6 @@ def _maybe_cast_slice_bound(self, label, side, kind): """ If label is a string, cast it to timedelta according to resolution. - Parameters ---------- label : object @@ -559,8 +558,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): Returns ------- - label : object - + label : object """ assert kind in ["ix", "loc", "getitem", None] diff --git a/pandas/core/missing.py b/pandas/core/missing.py index bc81fbb7e1ce0..f2655c126b9e5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -420,7 +420,7 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): ---------- xi : array_like A sorted list of x-coordinates, of length N. - yi : array_like + yi : array_like A 1-D array of real values. `yi`'s length along the interpolation axis must be equal to the length of `xi`. If N-D array, use axis parameter to select correct axis. diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 93f197366cf32..5d3f9cd92aa1a 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -387,7 +387,7 @@ def _make_flex_doc(op_name, typ): ---------- other : scalar, sequence, Series, or DataFrame Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}} +axis : {{0 or 'index', 1 or 'columns'}} Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). For Series input, axis to match Series index on. 
level : int or label @@ -541,7 +541,7 @@ def _make_flex_doc(op_name, typ): ---------- other : scalar, sequence, Series, or DataFrame Any single or multiple element data structure, or list-like object. -axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). level : int or label diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2f2e7234999f2..e50da168af4d2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1343,7 +1343,7 @@ def str_pad(arr, width, side="left", fillchar=" "): character. Equivalent to ``Series.str.pad(side='right')``. Series.str.center : Fills boths sides of strings with an arbitrary character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' + Series.str.zfill : Pad strings in the Series/Index by prepending '0' character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. Examples diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c87cad5472bd9..94f863d8970f1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1027,7 +1027,7 @@ def append( / selecting subsets of the data append : bool, default True Append the input data to the existing. - data_columns : list of columns, or True, default None + data_columns : list of columns, or True, default None List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 65d0c3d9fb17d..f0ba5f14d59c6 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -311,7 +311,7 @@ def _check_ax_scales(self, axes, xaxis="linear", yaxis="linear"): axes : matplotlib Axes object, or its list-like xaxis : {'linear', 'log'} expected xaxis scale - yaxis : {'linear', 'log'} + yaxis : {'linear', 'log'} expected yaxis scale """ axes = self._flatten_visible(axes) @@ -329,7 +329,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=None): axes_num : number expected number of axes. Unnecessary axes should be set to invisible. - layout : tuple + layout : tuple expected layout, (expected number of rows , columns) figsize : tuple expected figsize. default is matplotlib default From 1846dc657db4d27c22d280fb2b5298351277611a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Oct 2019 11:14:22 -0700 Subject: [PATCH 027/112] BUG: parse_time_string failing to raise TypeError (#29098) --- pandas/_libs/tslibs/parsing.pyx | 4 ++-- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/period.py | 9 ++------- pandas/core/indexes/timedeltas.py | 2 +- pandas/tests/tslibs/test_parsing.py | 6 ++++++ 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index bf0a0ae5a3fe9..796d1400194fd 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -233,7 +233,7 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False, return dt -def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): +def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None): """ Try hard to parse datetime string, leveraging dateutil plus some extra goodies like quarter recognition. 
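
The second hunk below is the behavioural half of this fix: where `parse_time_string` previously echoed non-string input back to the caller, it now raises, and call sites such as `PeriodIndex.get_value` (later in this patch) gain an explicit `isinstance(key, str)` guard instead of relying on the silent pass-through. A before/after sketch, assuming the 2019-era return shape of (datetime, parsed result, resolution string):

    from pandas._libs.tslibs.parsing import parse_time_string

    dt, parsed, reso = parse_time_string("4Q1984")  # quarter strings still parse

    try:
        parse_time_string((4, 5))  # not a str
    except TypeError:
        pass  # new behaviour: raise immediately
    # old behaviour: the tuple (4, 5) came back unchanged, deferring the
    # failure (or a silently wrong answer) to whoever unpacked the result.
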
@@ -253,7 +253,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): datetime, datetime/dateutil.parser._result, str """ if not isinstance(arg, str): - return arg + raise TypeError("parse_time_string argument must be str") if getattr(freq, "_typ", None) == "dateoffset": freq = freq.rule_code diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 49c11c5505d00..3535682bf182d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1106,7 +1106,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): else: return label - def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) _, parsed, reso = parsing.parse_time_string(key, freq) loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f085dff84462d..a20290e77023a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -457,11 +457,8 @@ def __contains__(self, key): try: self.get_loc(key) return True - except (ValueError, TypeError, KeyError): + except (TypeError, KeyError): # TypeError can be reached if we pass a tuple that is not hashable - # ValueError can be reached if pass a 2-tuple and parse_time_string - # raises with the wrong number of return values - # TODO: the latter is a bug in parse_time_string return False @cache_readonly @@ -608,7 +605,7 @@ def get_value(self, series, key): try: return com.maybe_box(self, super().get_value(s, key), series, key) except (KeyError, IndexError): - try: + if isinstance(key, str): asdt, parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) @@ -634,8 +631,6 @@ def get_value(self, series, key): ) else: raise KeyError(key) - except TypeError: - pass period = Period(key, self.freq) key = period.value if isna(period) else period.ordinal diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index c404e205e603c..983e68f38a4b9 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -528,7 +528,7 @@ def get_loc(self, key, method=None, tolerance=None): # the try/except clauses below tolerance = self._convert_tolerance(tolerance, np.asarray(key)) - if _is_convertible_to_td(key): + if _is_convertible_to_td(key) or key is NaT: key = Timedelta(key) return Index.get_loc(self, key, method, tolerance) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 126a1bd12ad59..9b6ed86bc2770 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -23,6 +23,12 @@ def test_parse_time_string(): assert parsed == parsed_lower +def test_parse_time_string_invalid_type(): + # Raise on invalid input, don't just return it + with pytest.raises(TypeError): + parse_time_string((4, 5)) + + @pytest.mark.parametrize( "dashed,normal", [("1988-Q2", "1988Q2"), ("2Q-1988", "2Q1988")] ) From 1f49aa839151f10177204dc515a235e3784810c4 Mon Sep 17 00:00:00 2001 From: Grigorios Giannakopoulos Date: Sun, 20 Oct 2019 16:49:54 +0300 Subject: [PATCH 028/112] Add a regression test for the timezone issue (#29097) --- pandas/tests/frame/test_apply.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/frame/test_apply.py 
b/pandas/tests/frame/test_apply.py index fe034504b8161..4b7439cd40023 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1346,3 +1346,17 @@ def test_frequency_is_original(self, num_cols): df = DataFrame(1, index=index, columns=range(num_cols)) df.apply(lambda x: x) assert index.freq == original.freq + + def test_apply_datetime_tz_issue(self): + # GH 29052 + + timestamps = [ + pd.Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"), + pd.Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"), + ] + df = DataFrame(data=[0, 1, 2], index=timestamps) + result = df.apply(lambda x: x.name, axis=1) + expected = pd.Series(index=timestamps, data=timestamps) + + tm.assert_series_equal(result, expected) From 08d66f3f722c151632c5ed52dc8eca9836d0d1e4 Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Sun, 20 Oct 2019 18:26:40 -0400 Subject: [PATCH 029/112] Fix typing errors (#29115) Thanks, @AbhijeetKrishnan --- pandas/tests/frame/test_constructors.py | 6 +++--- setup.cfg | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 583093af6d3e6..aa00cf234d9ee 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -50,13 +50,13 @@ class TestDataFrameConstructors: lambda: DataFrame({}), lambda: DataFrame(()), lambda: DataFrame([]), - lambda: DataFrame((x for x in [])), + lambda: DataFrame((_ for _ in [])), lambda: DataFrame(range(0)), lambda: DataFrame(data=None), lambda: DataFrame(data={}), lambda: DataFrame(data=()), lambda: DataFrame(data=[]), - lambda: DataFrame(data=(x for x in [])), + lambda: DataFrame(data=(_ for _ in [])), lambda: DataFrame(data=range(0)), ], ) @@ -72,7 +72,7 @@ def test_empty_constructor(self, constructor): [ ([[]], RangeIndex(1), RangeIndex(0)), ([[], []], RangeIndex(2), RangeIndex(0)), - ([(x for x in [])], RangeIndex(1), RangeIndex(0)), + ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)), ], ) def test_emptylike_constructor(self, emptylike, expected_index, expected_columns): diff --git a/setup.cfg b/setup.cfg index 0b43849ac1306..26aab29f025a0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -158,9 +158,6 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True -[mypy-pandas.tests.frame.test_constructors] -ignore_errors=True - [mypy-pandas.tests.indexes.datetimes.test_datetimelike] ignore_errors=True From 140102ba49351418c75b6c22f03de45fb63ff330 Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Sun, 20 Oct 2019 18:27:41 -0400 Subject: [PATCH 030/112] Fix typing errors (#29114) --- pandas/tests/dtypes/test_inference.py | 7 +++++-- setup.cfg | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index cfa6304909bb7..60afd768195d9 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -75,7 +75,7 @@ def coerce(request): (iter([1, 2]), True, "iterator"), # noqa: E241 (iter([]), True, "iterator-empty"), # noqa: E241 ((x for x in [1, 2]), True, "generator"), # noqa: E241 - ((x for x in []), True, "generator-empty"), # noqa: E241 + ((_ for _ in []), True, "generator-empty"), # noqa: E241 (Series([1]), True, "Series"), # noqa: E241 (Series([]), True, "Series-empty"), # noqa: E241 (Series(["a"]).str, True, "StringMethods"), # noqa: E241 @@ -288,7 +288,10 @@ class MockFile: assert 
not is_file(data) -@pytest.mark.parametrize("ll", [collections.namedtuple("Test", list("abc"))(1, 2, 3)]) +test_tuple = collections.namedtuple("Test", ["a", "b", "c"]) + + +@pytest.mark.parametrize("ll", [test_tuple(1, 2, 3)]) def test_is_names_tuple_passes(ll): assert inference.is_named_tuple(ll) diff --git a/setup.cfg b/setup.cfg index 26aab29f025a0..5dd4309ede6a3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -146,9 +146,6 @@ ignore_errors=True ignore_errors=True -[mypy-pandas.tests.dtypes.test_inference] -ignore_errors=True - [mypy-pandas.tests.extension.decimal.test_decimal] ignore_errors=True From c8dd72a7529b9c12debed18fc3cb5aa80e76f310 Mon Sep 17 00:00:00 2001 From: Abhijeet Krishnan Date: Mon, 21 Oct 2019 02:49:55 -0400 Subject: [PATCH 031/112] Fix mypy errors (#29108) --- pandas/tests/series/test_constructors.py | 4 ++-- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 65cbf5fcf91d2..ca14f0fd05869 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -45,12 +45,12 @@ class TestSeriesConstructors: (lambda: Series({}), True), (lambda: Series(()), False), # creates a RangeIndex (lambda: Series([]), False), # creates a RangeIndex - (lambda: Series((x for x in [])), False), # creates a RangeIndex + (lambda: Series((_ for _ in [])), False), # creates a RangeIndex (lambda: Series(data=None), True), (lambda: Series(data={}), True), (lambda: Series(data=()), False), # creates a RangeIndex (lambda: Series(data=[]), False), # creates a RangeIndex - (lambda: Series(data=(x for x in [])), False), # creates a RangeIndex + (lambda: Series(data=(_ for _ in [])), False), # creates a RangeIndex ], ) def test_empty_constructor(self, constructor, check_index_type): diff --git a/setup.cfg b/setup.cfg index 5dd4309ede6a3..d0a7d9f3b7755 100644 --- a/setup.cfg +++ b/setup.cfg @@ -185,9 +185,6 @@ ignore_errors=True [mypy-pandas.tests.indexing.test_loc] ignore_errors=True -[mypy-pandas.tests.series.test_constructors] -ignore_errors=True - [mypy-pandas.tests.series.test_operators] ignore_errors=True From 3c80277ad6dc5ffe77510443ff67482e9c69bcc0 Mon Sep 17 00:00:00 2001 From: yogendrasoni Date: Mon, 21 Oct 2019 17:27:28 +0530 Subject: [PATCH 032/112] fix #28926 mypy error in pandas\tests\arrays\test_array.py (#28970) --- pandas/core/arrays/period.py | 4 +++- setup.cfg | 11 +++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index a21d9e67e49e5..78cc54db4b1b8 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -831,7 +831,9 @@ def _raise_on_incompatible(left, right): def period_array( - data: Sequence[Optional[Period]], freq: Optional[Tick] = None, copy: bool = False + data: Sequence[Optional[Period]], + freq: Optional[Union[str, Tick]] = None, + copy: bool = False, ) -> PeriodArray: """ Construct a new PeriodArray from a sequence of Period scalars. 
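
Widening `freq` to `Optional[Union[str, Tick]]` matches how `period_array` is actually called: a frequency alias string is at least as common as a `Tick` offset. A minimal sketch of the call pattern the narrower annotation flagged (the values are illustrative, not taken from the failing test):

    import pandas as pd
    from pandas.core.arrays.period import period_array

    # freq passed as a plain alias string -- valid at runtime, but typed
    # only as Optional[Tick] before this change, hence the mypy error.
    arr = period_array([pd.Period("2019-10", freq="M"), None], freq="M")
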
diff --git a/setup.cfg b/setup.cfg index d0a7d9f3b7755..8be3659a202d8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -136,15 +136,17 @@ ignore_errors=True [mypy-pandas.tests.arithmetic.test_datetime64] ignore_errors=True -[mypy-pandas.tests.arrays.test_array] -ignore_errors=True - [mypy-pandas.tests.arrays.test_datetimelike] ignore_errors=True +<<<<<<< HEAD [mypy-pandas.tests.arrays.test_period] ignore_errors=True +======= +[mypy-pandas.tests.dtypes.test_common] +ignore_errors=True +>>>>>>> 6c898e6a5... fix #28926 mypy error in pandas\tests\arrays\test_array.py (#28970) [mypy-pandas.tests.extension.decimal.test_decimal] ignore_errors=True @@ -188,9 +190,6 @@ ignore_errors=True [mypy-pandas.tests.series.test_operators] ignore_errors=True -[mypy-pandas.tests.test_base] -ignore_errors=True - [mypy-pandas.tests.tseries.offsets.test_offsets] ignore_errors=True From 1e12510b22573fa9175500d68350043e6bb1cd6f Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 22 Oct 2019 02:34:37 +0200 Subject: [PATCH 033/112] TST: Test key dtype cast after merge (#29030) --- pandas/tests/reshape/merge/test_merge.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 08698133e360d..19555a0c7e4c3 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1437,6 +1437,17 @@ def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals): result = B.merge(A, left_on="Y", right_on="X") assert_frame_equal(result, expected[["Y", "X"]]) + def test_merge_key_dtype_cast(self): + # GH 17044 + df1 = DataFrame({"key": [1.0, 2.0], "v1": [10, 20]}, columns=["key", "v1"]) + df2 = DataFrame({"key": [2], "v2": [200]}, columns=["key", "v2"]) + result = df1.merge(df2, on="key", how="left") + expected = DataFrame( + {"key": [1.0, 2.0], "v1": [10, 20], "v2": [np.nan, 200.0]}, + columns=["key", "v1", "v2"], + ) + tm.assert_frame_equal(result, expected) + def test_merge_on_ints_floats_warning(self): # GH 16572 # merge will produce a warning when merging on int and From 06ba9f592abb76b34481f6ecb8fdb83ff6c8c746 Mon Sep 17 00:00:00 2001 From: lukasbk Date: Tue, 22 Oct 2019 02:36:47 +0200 Subject: [PATCH 034/112] CLN: Type error fix in tests\tseries\offsets\test_yqm_offsets.py (#28996) --- setup.cfg | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8be3659a202d8..c9ba13443e97c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -139,14 +139,8 @@ ignore_errors=True [mypy-pandas.tests.arrays.test_datetimelike] ignore_errors=True -<<<<<<< HEAD -[mypy-pandas.tests.arrays.test_period] -ignore_errors=True - -======= [mypy-pandas.tests.dtypes.test_common] ignore_errors=True ->>>>>>> 6c898e6a5... 
fix #28926 mypy error in pandas\tests\arrays\test_array.py (#28970) [mypy-pandas.tests.extension.decimal.test_decimal] ignore_errors=True From 4892735e990d09aba7403634c8897c60af53ec9c Mon Sep 17 00:00:00 2001 From: lukasbk Date: Tue, 22 Oct 2019 02:36:47 +0200 Subject: [PATCH 035/112] CLN: Type error fix in tests\tseries\offsets\test_yqm_offsets.py (#28996) --- pandas/tests/tseries/offsets/test_offsets.py | 3 ++- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index ddf2c6e65b474..5cc10bf00203d 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,4 +1,5 @@ from datetime import date, datetime, time as dt_time, timedelta +from typing import Type import numpy as np import pytest @@ -92,7 +93,7 @@ def test_to_M8(): class Base: - _offset = None + _offset = None # type: Type[DateOffset] d = Timestamp(datetime(2008, 1, 2)) timezones = [ diff --git a/setup.cfg b/setup.cfg index c9ba13443e97c..766099a9d7521 100644 --- a/setup.cfg +++ b/setup.cfg @@ -186,6 +186,3 @@ ignore_errors=True [mypy-pandas.tests.tseries.offsets.test_offsets] ignore_errors=True - -[mypy-pandas.tests.tseries.offsets.test_yqm_offsets] -ignore_errors=True From f992870c08701d356a00cabcc65f2de0a2ecd4c9 Mon Sep 17 00:00:00 2001 From: Jeroen Kant <45035434+jjlkant@users.noreply.github.com> Date: Tue, 22 Oct 2019 02:41:21 +0200 Subject: [PATCH 036/112] PERF: Benchmark merge with non-int64 and tolerance (#28922) (#28974) --- asv_bench/benchmarks/join_merge.py | 59 +++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 6aa82a43a4d6a..5cf9f6336ba0c 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -273,10 +273,10 @@ def time_merge_ordered(self): class MergeAsof: - params = [["backward", "forward", "nearest"]] - param_names = ["direction"] + params = [["backward", "forward", "nearest"], [None, 5]] + param_names = ["direction", "tolerance"] - def setup(self, direction): + def setup(self, direction, tolerance): one_count = 200000 two_count = 1000000 @@ -303,6 +303,9 @@ def setup(self, direction): df1["time32"] = np.int32(df1.time) df2["time32"] = np.int32(df2.time) + df1["timeu64"] = np.uint64(df1.time) + df2["timeu64"] = np.uint64(df2.time) + self.df1a = df1[["time", "value1"]] self.df2a = df2[["time", "value2"]] self.df1b = df1[["time", "key", "value1"]] @@ -313,22 +316,52 @@ def setup(self, direction): self.df2d = df2[["time32", "value2"]] self.df1e = df1[["time", "key", "key2", "value1"]] self.df2e = df2[["time", "key", "key2", "value2"]] + self.df1f = df1[["timeu64", "value1"]] + self.df2f = df2[["timeu64", "value2"]] + + def time_on_int(self, direction, tolerance): + merge_asof( + self.df1a, self.df2a, on="time", direction=direction, tolerance=tolerance + ) - def time_on_int(self, direction): - merge_asof(self.df1a, self.df2a, on="time", direction=direction) + def time_on_int32(self, direction, tolerance): + merge_asof( + self.df1d, self.df2d, on="time32", direction=direction, tolerance=tolerance + ) - def time_on_int32(self, direction): - merge_asof(self.df1d, self.df2d, on="time32", direction=direction) + def time_on_uint64(self, direction, tolerance): + merge_asof( + self.df1f, self.df2f, on="timeu64", direction=direction, tolerance=tolerance + ) - def time_by_object(self, direction): - 
merge_asof(self.df1b, self.df2b, on="time", by="key", direction=direction) + def time_by_object(self, direction, tolerance): + merge_asof( + self.df1b, + self.df2b, + on="time", + by="key", + direction=direction, + tolerance=tolerance, + ) - def time_by_int(self, direction): - merge_asof(self.df1c, self.df2c, on="time", by="key2", direction=direction) + def time_by_int(self, direction, tolerance): + merge_asof( + self.df1c, + self.df2c, + on="time", + by="key2", + direction=direction, + tolerance=tolerance, + ) - def time_multiby(self, direction): + def time_multiby(self, direction, tolerance): merge_asof( - self.df1e, self.df2e, on="time", by=["key", "key2"], direction=direction + self.df1e, + self.df2e, + on="time", + by=["key", "key2"], + direction=direction, + tolerance=tolerance, ) From c4f0b5c069aa3857252d519c2de2709ce958fe9c Mon Sep 17 00:00:00 2001 From: Rajat <22280243+R1j1t@users.noreply.github.com> Date: Tue, 22 Oct 2019 07:01:30 +0530 Subject: [PATCH 037/112] Pandas get_dummies validate `columns` input (#28463) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/reshape/reshape.py | 2 ++ pandas/tests/reshape/test_reshape.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 48c1173a372a7..30a9ad76eae4d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -387,6 +387,7 @@ Reshaping - Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). +- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 340e964d7c14f..1a90c845638d9 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -864,6 +864,8 @@ def get_dummies( # determine columns being encoded if columns is None: data_to_encode = data.select_dtypes(include=dtypes_to_encode) + elif not is_list_like(columns): + raise TypeError("Input must be a list-like for parameter `columns`") else: data_to_encode = data[columns] diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 0b9392a0eeb5b..66a24fa57a68c 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -608,6 +608,23 @@ def test_get_dummies_all_sparse(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("values", ["baz"]) + def test_get_dummies_with_string_values(self, values): + # issue #28383 + df = pd.DataFrame( + { + "bar": [1, 2, 3, 4, 5, 6], + "foo": ["one", "one", "one", "two", "two", "two"], + "baz": ["A", "B", "C", "A", "B", "C"], + "zoo": ["x", "y", "z", "q", "w", "t"], + } + ) + + msg = "Input must be a list-like for parameter `columns`" + + with pytest.raises(TypeError, match=msg): + pd.get_dummies(df, columns=values) + class TestCategoricalReshape: def test_reshaping_multi_index_categorical(self): From 4feb8ac2ef133b1ca99dab1b866d776ebf0c2860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20=C5=A0koda?= Date: Tue, 22 Oct 2019 03:35:36 +0200 Subject: [PATCH 038/112] fix Rolling for multi-index and reversed index. 
(#28297) --- doc/source/whatsnew/v1.0.0.rst | 3 +- pandas/core/window/rolling.py | 25 +++++---- pandas/tests/window/test_timeseries_window.py | 53 +++++++++++++++++-- 3 files changed, 66 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 30a9ad76eae4d..4bd7cc2ba2841 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -372,8 +372,9 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) +- Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue: `15584`). +- Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue: `19248`). - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 29ef2e917ae57..3e75340ed3bcf 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -70,7 +70,7 @@ def __init__( center: Optional[bool] = False, win_type: Optional[str] = None, axis: Axis = 0, - on: Optional[str] = None, + on: Optional[Union[str, Index]] = None, closed: Optional[str] = None, **kwargs ): @@ -126,7 +126,7 @@ def _create_blocks(self): obj = self._selected_obj # filter out the on from the object - if self.on is not None: + if self.on is not None and not isinstance(self.on, Index): if obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) blocks = obj._to_dict_of_blocks(copy=False).values() @@ -637,10 +637,10 @@ class Window(_Window): Provide a window type. If ``None``, all points are evenly weighted. See the notes below for further information. on : str, optional - For a DataFrame, a datetime-like column on which to calculate the rolling - window, rather than the DataFrame's index. Provided integer column is - ignored and excluded from result since an integer index is not used to - calculate the rolling window. + For a DataFrame, a datetime-like column or MultiIndex level on which + to calculate the rolling window, rather than the DataFrame's index. + Provided integer column is ignored and excluded from result since + an integer index is not used to calculate the rolling window. axis : int or str, default 0 closed : str, default None Make the interval closed on the 'right', 'left', 'both' or @@ -1651,18 +1651,19 @@ def is_datetimelike(self): @cache_readonly def _on(self): - if self.on is None: if self.axis == 0: return self.obj.index elif self.axis == 1: return self.obj.columns + elif isinstance(self.on, Index): + return self.on elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: return Index(self.obj[self.on]) else: raise ValueError( "invalid on specified as {0}, " - "must be a column (if DataFrame) " + "must be a column (of DataFrame), an Index " "or None".format(self.on) ) @@ -1706,10 +1707,12 @@ def validate(self): def _validate_monotonic(self): """ - Validate on is_monotonic. + Validate monotonic (increasing or decreasing). 
""" - if not self._on.is_monotonic: - formatted = self.on or "index" + if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): + formatted = self.on + if self.on is None: + formatted = "index" raise ValueError("{0} must be monotonic".format(formatted)) def _validate_freq(self): diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index e057eadae9da8..7055e5b538bea 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -1,7 +1,15 @@ import numpy as np import pytest -from pandas import DataFrame, Index, Series, Timestamp, date_range, to_datetime +from pandas import ( + DataFrame, + Index, + MultiIndex, + Series, + Timestamp, + date_range, + to_datetime, +) import pandas.util.testing as tm import pandas.tseries.offsets as offsets @@ -105,8 +113,16 @@ def test_monotonic_on(self): assert df.index.is_monotonic df.rolling("2s").sum() - # non-monotonic - df.index = reversed(df.index.tolist()) + def test_non_monotonic_on(self): + # GH 19248 + df = DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ) + df = df.set_index("A") + non_monotonic_index = df.index.to_list() + non_monotonic_index[0] = non_monotonic_index[3] + df.index = non_monotonic_index + assert not df.index.is_monotonic with pytest.raises(ValueError): @@ -690,3 +706,34 @@ def test_rolling_cov_offset(self): expected2 = ss.rolling(3, min_periods=1).cov() tm.assert_series_equal(result, expected2) + + def test_rolling_on_decreasing_index(self): + # GH-19248 + index = [ + Timestamp("20190101 09:00:00"), + Timestamp("20190101 09:00:02"), + Timestamp("20190101 09:00:03"), + Timestamp("20190101 09:00:05"), + Timestamp("20190101 09:00:06"), + ] + + df = DataFrame({"column": [3, 4, 4, 2, 1]}, index=reversed(index)) + result = df.rolling("2s").min() + expected = DataFrame( + {"column": [3.0, 3.0, 3.0, 2.0, 1.0]}, index=reversed(index) + ) + tm.assert_frame_equal(result, expected) + + def test_rolling_on_multi_index_level(self): + # GH-15584 + df = DataFrame( + {"column": range(6)}, + index=MultiIndex.from_product( + [date_range("20190101", periods=3), range(2)], names=["date", "seq"] + ), + ) + result = df.rolling("10d", on=df.index.get_level_values("date")).sum() + expected = DataFrame( + {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index + ) + tm.assert_frame_equal(result, expected) From cf22fcb23373174f1613d175713fb9e23dbbda0d Mon Sep 17 00:00:00 2001 From: Antonio Andraues Jr Date: Mon, 21 Oct 2019 22:53:38 -0300 Subject: [PATCH 039/112] BUG: set_precision format fixed (#13257) (#27934) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/formats/style.py | 3 +- pandas/tests/io/formats/test_style.py | 41 +++++++++++++++++++++------ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4bd7cc2ba2841..7c59cbf7cfd1e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -355,6 +355,7 @@ I/O - Bug in :func:`read_hdf` closing stores that it didn't open when Exceptions are raised (:issue:`28699`) - Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) +- Bug in :meth:`pandas.io.formats.style.Styler` formatting for floating values not displaying decimals correctly (:issue:`13257`) Plotting ^^^^^^^^ 
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6b98eaca9dacc..0200b78e02fd2 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -156,7 +156,8 @@ def __init__( def default_display_func(x): if is_float(x): - return "{:>.{precision}g}".format(x, precision=self.precision) + display_format = "{0:.{precision}f}".format(x, precision=self.precision) + return display_format else: return x diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 61c163d2cdaac..0f1402d7da389 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1157,20 +1157,43 @@ def test_display_format_raises(self): with pytest.raises(TypeError): df.style.format(True) + def test_display_set_precision(self): + # Issue #13257 + df = pd.DataFrame(data=[[1.0, 2.0090], [3.2121, 4.566]], columns=["a", "b"]) + s = Styler(df) + + ctx = s.set_precision(1)._translate() + + assert s.precision == 1 + assert ctx["body"][0][1]["display_value"] == "1.0" + assert ctx["body"][0][2]["display_value"] == "2.0" + assert ctx["body"][1][1]["display_value"] == "3.2" + assert ctx["body"][1][2]["display_value"] == "4.6" + + ctx = s.set_precision(2)._translate() + assert s.precision == 2 + assert ctx["body"][0][1]["display_value"] == "1.00" + assert ctx["body"][0][2]["display_value"] == "2.01" + assert ctx["body"][1][1]["display_value"] == "3.21" + assert ctx["body"][1][2]["display_value"] == "4.57" + + ctx = s.set_precision(3)._translate() + assert s.precision == 3 + assert ctx["body"][0][1]["display_value"] == "1.000" + assert ctx["body"][0][2]["display_value"] == "2.009" + assert ctx["body"][1][1]["display_value"] == "3.212" + assert ctx["body"][1][2]["display_value"] == "4.566" + def test_display_subset(self): df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) ctx = df.style.format( {"a": "{:0.1f}", "b": "{0:.2%}"}, subset=pd.IndexSlice[0, :] )._translate() expected = "0.1" - assert ctx["body"][0][1]["display_value"] == expected - assert ctx["body"][1][1]["display_value"] == "1.1234" - assert ctx["body"][0][2]["display_value"] == "12.34%" - - raw_11 = "1.1234" - ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, :])._translate() + raw_11 = "1.123400" assert ctx["body"][0][1]["display_value"] == expected assert ctx["body"][1][1]["display_value"] == raw_11 + assert ctx["body"][0][2]["display_value"] == "12.34%" ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, :])._translate() assert ctx["body"][0][1]["display_value"] == expected @@ -1178,7 +1201,7 @@ def test_display_subset(self): ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice["a"])._translate() assert ctx["body"][0][1]["display_value"] == expected - assert ctx["body"][0][2]["display_value"] == "0.1234" + assert ctx["body"][0][2]["display_value"] == "0.123400" ctx = df.style.format("{:0.1f}", subset=pd.IndexSlice[0, "a"])._translate() assert ctx["body"][0][1]["display_value"] == expected @@ -1189,8 +1212,8 @@ def test_display_subset(self): )._translate() assert ctx["body"][0][1]["display_value"] == expected assert ctx["body"][1][1]["display_value"] == "1.1" - assert ctx["body"][0][2]["display_value"] == "0.1234" - assert ctx["body"][1][2]["display_value"] == "1.1234" + assert ctx["body"][0][2]["display_value"] == "0.123400" + assert ctx["body"][1][2]["display_value"] == raw_11 def test_display_dict(self): df = pd.DataFrame([[0.1234, 0.1234], [1.1234, 1.1234]], columns=["a", "b"]) From 217358fed52188ffda1cafd51d5c47b98b382481 
Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 21 Oct 2019 23:07:20 -0500 Subject: [PATCH 040/112] WEB: Adding new pandas logo (#28948) --- web/pandas/_templates/layout.html | 1 + web/pandas/about/citing.md | 85 +++++++++++++- web/pandas/config.yml | 2 +- web/pandas/static/css/pandas.css | 8 +- web/pandas/static/img/favicon.ico | Bin 0 -> 1150 bytes web/pandas/static/img/pandas.svg | 2 +- web/pandas/static/img/pandas_mark.svg | 111 ++++++++++++++++++ web/pandas/static/img/pandas_mark_white.svg | 111 ++++++++++++++++++ web/pandas/static/img/pandas_secondary.svg | 1 + .../static/img/pandas_secondary_white.svg | 1 + web/pandas/static/img/pandas_white.svg | 1 + 11 files changed, 316 insertions(+), 7 deletions(-) create mode 100644 web/pandas/static/img/favicon.ico mode change 120000 => 100644 web/pandas/static/img/pandas.svg create mode 100644 web/pandas/static/img/pandas_mark.svg create mode 100644 web/pandas/static/img/pandas_mark_white.svg create mode 100644 web/pandas/static/img/pandas_secondary.svg create mode 100644 web/pandas/static/img/pandas_secondary_white.svg create mode 100644 web/pandas/static/img/pandas_white.svg diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index fe3e4d1245d93..120058afd1190 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -12,6 +12,7 @@ pandas - Python Data Analysis Library + + + + + + + + + + + +### Secondary logo + + + + + + + + +### Logo mark + + + + + + + + +### Logo usage + +The pandas logo is available in full color and white accent. +The full color logo should only appear against white backgrounds. +The white accent logo should go against contrasting color background. When using the logo, please follow the next directives: -- Leave enough margin around the logo +- Primary logo should never be seen under 1 inch in size for printing and 72px for web +- The secondary logo should never be seen under 0.75 inch in size for printing and 55px for web +- Leave enough margin around the logo (leave the height of the logo in the top, bottom and both sides) - Do not distort the logo by changing its proportions - Do not place text or other elements on top of the logo + +### Colors + + + + + + + +
+ | Color  | RGB          | HEX     |
+ |--------|--------------|---------|
+ | Blue   | R21 G4 B88   | #150458 |
+ | Yellow | R255 G202 B0 | #FFCA00 |
+ | Pink   | R231 G4 B136 | #E70488 |
diff --git a/web/pandas/config.yml b/web/pandas/config.yml index d5c505f298437..e2a95a5039884 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -16,7 +16,7 @@ main: - tables - fenced_code static: - logo: # /static/img/pandas.svg + logo: /static/img/pandas_white.svg css: - /static/css/pandas.css navbar: diff --git a/web/pandas/static/css/pandas.css b/web/pandas/static/css/pandas.css index 0a227cf8d96c9..8b5905d480ac3 100644 --- a/web/pandas/static/css/pandas.css +++ b/web/pandas/static/css/pandas.css @@ -31,7 +31,7 @@ code { color: #130654; } a.navbar-brand img { - max-height: 2em; + height: 3rem; } div.card { margin: 0 0 .2em .2em !important; @@ -52,3 +52,9 @@ div.card .card-title { .navbar-dark .navbar-nav .nav-link:hover { color: white; } +table.logo td { + text-align: center; +} +table.logo img { + height: 4rem; +} diff --git a/web/pandas/static/img/favicon.ico b/web/pandas/static/img/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..0af2443dcaa3e814d631a36bdcdb1c4d070bd6ce GIT binary patch literal 1150 zcmZQzU<5(|0R|wcz>vYhz#zuJz@P!dKp~(AL>x%r1r7`hEFtW|e?r(rwxcOz1jzx# z7}3O0xM>UwtU&$$fp{0PL@1kZ1w>rP4OtwWO=bOzo(wANp$rVn$o9uFh;s*kLkit? z1_osNgE@uGL2eG_5N;1;7xDmx$?Q-zAuW*KK;j^^=qBLO-^TV#ql5W*SS!OzadPx` zu)L_~V0r$(o#m+;vHBx81QdeVg=Pk`2?unrz0hrEeV)+C@LGyk{R}7n3j=W!5bK9< z2s;DyO$=ohQ72meng8Ytr~dy3;&$ZlCt5$q{!{;BfY^W>{UHAX?V1X7vmdegLGcMn z8=y3T9RHAXB*^ao$sai5A2~fB>qi#H=YM2%l0frB-GcX)r#*ZHW<$pXzU;yd{uLA%8 literal 0 HcmV?d00001 diff --git a/web/pandas/static/img/pandas.svg b/web/pandas/static/img/pandas.svg deleted file mode 120000 index 2e5d3872e4845..0000000000000 --- a/web/pandas/static/img/pandas.svg +++ /dev/null @@ -1 +0,0 @@ -../../../../doc/logo/pandas_logo.svg \ No newline at end of file diff --git a/web/pandas/static/img/pandas.svg b/web/pandas/static/img/pandas.svg new file mode 100644 index 0000000000000..a7af4e4d2d401 --- /dev/null +++ b/web/pandas/static/img/pandas.svg @@ -0,0 +1 @@ +Artboard 63 \ No newline at end of file diff --git a/web/pandas/static/img/pandas_mark.svg b/web/pandas/static/img/pandas_mark.svg new file mode 100644 index 0000000000000..1451f57de198e --- /dev/null +++ b/web/pandas/static/img/pandas_mark.svg @@ -0,0 +1,111 @@ + + + + + + image/svg+xml + + + + + + + + + Artboard 61 + + + + + + + + + diff --git a/web/pandas/static/img/pandas_mark_white.svg b/web/pandas/static/img/pandas_mark_white.svg new file mode 100644 index 0000000000000..ae50bf5430c3a --- /dev/null +++ b/web/pandas/static/img/pandas_mark_white.svg @@ -0,0 +1,111 @@ + + + + + + image/svg+xml + + + + + + + + + Artboard 61 copy + + + + + + + + + diff --git a/web/pandas/static/img/pandas_secondary.svg b/web/pandas/static/img/pandas_secondary.svg new file mode 100644 index 0000000000000..e74404842e5b6 --- /dev/null +++ b/web/pandas/static/img/pandas_secondary.svg @@ -0,0 +1 @@ +Artboard 57 \ No newline at end of file diff --git a/web/pandas/static/img/pandas_secondary_white.svg b/web/pandas/static/img/pandas_secondary_white.svg new file mode 100644 index 0000000000000..86bcca57a031e --- /dev/null +++ b/web/pandas/static/img/pandas_secondary_white.svg @@ -0,0 +1 @@ +Artboard 57 copy \ No newline at end of file diff --git a/web/pandas/static/img/pandas_white.svg b/web/pandas/static/img/pandas_white.svg new file mode 100644 index 0000000000000..bc7c41651182d --- /dev/null +++ b/web/pandas/static/img/pandas_white.svg @@ -0,0 +1 @@ +Artboard 63 copy 2 \ No newline at end of file From 46af1408db9852e29f36e2852a83679e5232b583 Mon Sep 17 00:00:00 2001 From: 
Simon Hawkins Date: Tue, 22 Oct 2019 10:01:02 +0100 Subject: [PATCH 041/112] CLN: remove versionadded:: 0.19.0 and earlier (#29127) --- pandas/_libs/tslibs/timestamps.pyx | 3 --- pandas/core/frame.py | 2 -- pandas/core/tools/datetimes.py | 3 --- pandas/tseries/offsets.py | 2 -- 4 files changed, 10 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index c1575ce4f48b3..50a71d062c63f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -194,13 +194,10 @@ class Timestamp(_Timestamp): valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For example, 's' means seconds and 'ms' means milliseconds. year, month, day : int - .. versionadded:: 0.19.0 hour, minute, second, microsecond : int, optional, default 0 - .. versionadded:: 0.19.0 nanosecond : int, optional, default 0 .. versionadded:: 0.23.0 tzinfo : datetime.tzinfo, optional, default None - .. versionadded:: 0.19.0 Notes ----- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c90bf4ba7151f..7f36826a893dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6446,8 +6446,6 @@ def diff(self, periods=1, axis=0): axis : {0 or 'index', 1 or 'columns'}, default 0 Take difference over rows (0) or columns (1). - .. versionadded:: 0.16.1. - Returns ------- DataFrame diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7b136fa29ecea..ea7a09881ef87 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -601,9 +601,6 @@ def to_datetime( Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil behavior). - - .. versionadded:: 0.16.1 - utc : bool, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index c0f6777fdb62b..1e3f5c1ed870e 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -980,8 +980,6 @@ def _repr_attrs(self): class BusinessHour(BusinessHourMixin, SingleConstructorOffset): """ DateOffset subclass representing possibly n business hours. - - .. 
versionadded:: 0.16.1 """ _prefix = "BH" From bd64ab7c8fdde5e57d9154ff800926e8322b16a8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 22 Oct 2019 13:23:12 +0100 Subject: [PATCH 042/112] CLN: remove versionadded:: 0.20 (#29126) --- doc/source/development/contributing.rst | 2 -- doc/source/getting_started/basics.rst | 6 ------ doc/source/user_guide/advanced.rst | 4 ---- doc/source/user_guide/categorical.rst | 2 -- doc/source/user_guide/computation.rst | 2 -- doc/source/user_guide/groupby.rst | 6 ------ doc/source/user_guide/io.rst | 21 --------------------- doc/source/user_guide/merging.rst | 2 -- doc/source/user_guide/options.rst | 2 -- doc/source/user_guide/reshaping.rst | 2 -- doc/source/user_guide/text.rst | 4 ---- doc/source/user_guide/timedeltas.rst | 2 -- doc/source/user_guide/timeseries.rst | 2 -- doc/source/user_guide/visualization.rst | 2 -- pandas/_libs/interval.pyx | 2 -- pandas/_libs/tslibs/timedeltas.pyx | 2 -- pandas/core/dtypes/concat.py | 2 -- pandas/core/dtypes/inference.py | 4 ---- pandas/core/frame.py | 4 ---- pandas/core/generic.py | 19 ------------------- pandas/core/groupby/generic.py | 2 -- pandas/core/groupby/groupby.py | 2 -- pandas/core/indexes/base.py | 6 ------ pandas/core/indexes/multi.py | 4 ---- pandas/core/resample.py | 2 -- pandas/core/reshape/melt.py | 5 ----- pandas/core/reshape/merge.py | 2 -- pandas/core/reshape/tile.py | 2 -- pandas/core/strings.py | 8 -------- pandas/core/tools/datetimes.py | 2 -- pandas/core/util/hashing.py | 6 ------ pandas/core/window/rolling.py | 2 -- pandas/errors/__init__.py | 2 -- pandas/io/excel/_base.py | 3 --- pandas/io/feather_format.py | 2 -- pandas/io/formats/style.py | 5 ----- pandas/io/json/_normalize.py | 9 --------- pandas/io/pickle.py | 4 ---- pandas/plotting/_misc.py | 2 -- 39 files changed, 162 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 949b6bd475319..62e582dffae47 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -1197,8 +1197,6 @@ submitting a pull request. For more, see the `pytest `_ documentation. - .. versionadded:: 0.20.0 - Furthermore one can run .. code-block:: python diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 36a7166f350e5..9b97aa25a9240 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -172,8 +172,6 @@ You are highly encouraged to install both libraries. See the section These are both enabled to be used by default, you can control this by setting the options: -.. versionadded:: 0.20.0 - .. code-block:: python pd.set_option('compute.use_bottleneck', False) @@ -891,8 +889,6 @@ functionality. Aggregation API ~~~~~~~~~~~~~~~ -.. versionadded:: 0.20.0 - The aggregation API allows one to express possibly multiple aggregation operations in a single concise way. This API is similar across pandas objects, see :ref:`groupby API `, the :ref:`window functions API `, and the :ref:`resample API `. @@ -1030,8 +1026,6 @@ to the built in :ref:`describe function `. Transform API ~~~~~~~~~~~~~ -.. versionadded:: 0.20.0 - The :meth:`~DataFrame.transform` method returns an object that is indexed the same (same size) as the original. This API allows you to provide *multiple* operations at the same time rather than one-by-one. Its API is quite similar to the ``.agg`` API. 
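The basics.rst passage above describes the ``.agg`` and ``.transform`` APIs; a minimal sketch of both, with an illustrative frame:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3], "B": [4.0, 5.0, 6.0]})

    # Several aggregations in one call; rows are labeled by function name.
    print(df.agg(["sum", "mean"]))

    # transform returns an object with the same shape as the input;
    # passing several functions widens the columns instead.
    print(df.transform([np.abs, lambda x: x + 1]))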
diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 4949dd580414f..c6eadd2adadce 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -206,8 +206,6 @@ highly performant. If you want to see only the used levels, you can use the To reconstruct the ``MultiIndex`` with only the used levels, the :meth:`~MultiIndex.remove_unused_levels` method may be used. -.. versionadded:: 0.20.0 - .. ipython:: python new_mi = df[['foo', 'qux']].columns.remove_unused_levels() @@ -928,8 +926,6 @@ If you need integer based selection, you should use ``iloc``: IntervalIndex ~~~~~~~~~~~~~ -.. versionadded:: 0.20.0 - :class:`IntervalIndex` together with its own dtype, :class:`~pandas.api.types.IntervalDtype` as well as the :class:`Interval` scalar type, allow first-class support in pandas for interval notation. diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 8ca96ba0daa5e..6651f656ae45d 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -874,8 +874,6 @@ The below raises ``TypeError`` because the categories are ordered and not identi Out[3]: TypeError: to union ordered Categoricals, all categories must be the same -.. versionadded:: 0.20.0 - Ordered categoricals with different categories or orderings can be combined by using the ``ignore_ordered=True`` argument. diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 4beac5e035efc..bc00cd7f13e13 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -471,8 +471,6 @@ default of the index) in a DataFrame. Rolling window endpoints ~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.20.0 - The inclusion of the interval endpoints in rolling window calculations can be specified with the ``closed`` parameter: diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 141d1708d882d..8cd229070e365 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -311,8 +311,6 @@ Grouping with multiple levels is supported. s s.groupby(level=['first', 'second']).sum() -.. versionadded:: 0.20 - Index level names may be supplied as keys. .. ipython:: python @@ -353,8 +351,6 @@ Index levels may also be specified by name. df.groupby([pd.Grouper(level='second'), 'A']).sum() -.. versionadded:: 0.20 - Index level names may be specified as keys directly to ``groupby``. .. ipython:: python @@ -1274,8 +1270,6 @@ To see the order in which each row appears within its group, use the Enumerate groups ~~~~~~~~~~~~~~~~ -.. versionadded:: 0.20.2 - To see the ordering of the groups (as opposed to the order of rows within a group given by ``cumcount``) you can use :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 6b23c814843e1..173bcf7537154 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -163,9 +163,6 @@ dtype : Type name or dict of column -> type, default ``None`` (unsupported with ``engine='python'``). Use `str` or `object` together with suitable ``na_values`` settings to preserve and not interpret dtype. - - .. versionadded:: 0.20.0 support for the Python parser. - engine : {``'c'``, ``'python'``} Parser engine to use. The C engine is faster while the Python engine is currently more feature-complete. 
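The io.rst text above notes that ``dtype`` works with both parser engines and can be combined with NA handling to keep values as text; a small sketch (the inline data is illustrative):

    import io
    import pandas as pd

    data = "a,b\nN/A,1\n2,3"

    # Keep column ``a`` as strings so "N/A" survives as text, not NaN.
    df = pd.read_csv(io.StringIO(data), dtype={"a": str}, keep_default_na=False)
    print(df["a"].tolist())  # ['N/A', '2']

    # The same dtype mapping is honored by the python engine as well.
    df2 = pd.read_csv(
        io.StringIO(data), dtype={"a": str}, engine="python", keep_default_na=False
    )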
@@ -417,10 +414,6 @@ However, if you wanted for all the data to be coerced, no matter the type, then using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be worth trying. - .. versionadded:: 0.20.0 support for the Python parser. - - The ``dtype`` option is supported by the 'python' engine. - .. note:: In some cases, reading in abnormal data with columns containing mixed dtypes will result in an inconsistent dataset. If you rely on pandas to infer the @@ -616,8 +609,6 @@ Filtering columns (``usecols``) The ``usecols`` argument allows you to select any subset of the columns in a file, either using the column names, position numbers or a callable: -.. versionadded:: 0.20.0 support for callable `usecols` arguments - .. ipython:: python data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz' @@ -1447,8 +1438,6 @@ is whitespace). df = pd.read_fwf('bar.csv', header=None, index_col=0) df -.. versionadded:: 0.20.0 - ``read_fwf`` supports the ``dtype`` parameter for specifying the types of parsed columns to be different from the inferred type. @@ -2221,8 +2210,6 @@ For line-delimited json files, pandas can also return an iterator which reads in Table schema '''''''''''' -.. versionadded:: 0.20.0 - `Table Schema`_ is a spec for describing tabular datasets as a JSON object. The JSON includes information on the field names, types, and other attributes. You can use the orient ``table`` to build @@ -3071,8 +3058,6 @@ missing data to recover integer dtype: Dtype specifications ++++++++++++++++++++ -.. versionadded:: 0.20 - As an alternative to converters, the type for an entire column can be specified using the `dtype` keyword, which takes a dictionary mapping column names to types. To interpret data with @@ -3345,8 +3330,6 @@ any pickled pandas object (or any other pickled object) from file: Compressed pickle files ''''''''''''''''''''''' -.. versionadded:: 0.20.0 - :func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing. The ``zip`` file format only supports reading and must contain only one data file @@ -4323,8 +4306,6 @@ control compression: ``complevel`` and ``complib``. - `bzip2 `_: Good compression rates. - `blosc `_: Fast compression and decompression. - .. versionadded:: 0.20.2 - Support for alternative blosc compressors: - `blosc:blosclz `_ This is the @@ -4651,8 +4632,6 @@ Performance Feather ------- -.. versionadded:: 0.20.0 - Feather provides binary columnar serialization for data frames. It is designed to make reading and writing data frames efficient, and to make sharing data across data analysis languages easy. diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 4c0d3b75a4f79..7bedc9515abb2 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -843,8 +843,6 @@ resulting dtype will be upcast. pd.merge(left, right, how='outer', on='key') pd.merge(left, right, how='outer', on='key').dtypes -.. versionadded:: 0.20.0 - Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals `. The left frame. 
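The merging.rst line above says ``category`` dtypes are preserved through a merge; a short sketch of the case where both sides share categories:

    import pandas as pd

    left = pd.DataFrame({"key": pd.Categorical(["a", "b", "c"]), "lval": [1, 2, 3]})
    right = pd.DataFrame({"key": pd.Categorical(["a", "b", "c"]), "rval": [4, 5, 6]})

    merged = pd.merge(left, right, on="key")
    # ``key`` keeps its category dtype because the mergands share categories.
    print(merged.dtypes)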
diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index a6491c6645613..5817efb31814e 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -561,8 +561,6 @@ However, setting this option incorrectly for your terminal will cause these char Table schema display -------------------- -.. versionadded:: 0.20.0 - ``DataFrame`` and ``Series`` will publish a Table Schema representation by default. False by default, this can be enabled globally with the ``display.html.table_schema`` option: diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index b2ee252495f23..8583a9312b690 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -539,8 +539,6 @@ Alternatively we can specify custom bin-edges: c = pd.cut(ages, bins=[0, 18, 35, 70]) c -.. versionadded:: 0.20.0 - If the ``bins`` keyword is an ``IntervalIndex``, then these will be used to bin the passed data.:: diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 789ff2a65355b..d521c745ccfe5 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -228,8 +228,6 @@ and ``repl`` must be strings: dollars.str.replace(r'-\$', '-') dollars.str.replace('-$', '-', regex=False) -.. versionadded:: 0.20.0 - The ``replace`` method can also take a callable as replacement. It is called on every ``pat`` using :func:`re.sub`. The callable should expect one positional argument (a regex object) and return a string. @@ -254,8 +252,6 @@ positional argument (a regex object) and return a string. pd.Series(['Foo Bar Baz', np.nan], dtype="string").str.replace(pat, repl) -.. versionadded:: 0.20.0 - The ``replace`` method also accepts a compiled regular expression object from :func:`re.compile` as a pattern. All flags should be included in the compiled regular expression object. diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 3e46140d79b8e..3439a0a4c13c7 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -327,8 +327,6 @@ similarly to the ``Series``. These are the *displayed* values of the ``Timedelta You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the ``.isoformat`` method -.. versionadded:: 0.20.0 - .. ipython:: python pd.Timedelta(days=6, minutes=50, seconds=3, diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0894edd69c2ae..17b02374050d2 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -376,8 +376,6 @@ We subtract the epoch (midnight at January 1, 1970 UTC) and then floor divide by Using the ``origin`` Parameter ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.20.0 - Using the ``origin`` parameter, one can specify an alternative starting point for creation of a ``DatetimeIndex``. For example, to use 1960-01-01 as the starting date: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index fa16b2f216610..609969b666726 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1247,8 +1247,6 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`: Automatic date tick adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. 
versionadded:: 0.20.0 - ``TimedeltaIndex`` now uses the native matplotlib tick locator methods, it is useful to call the automatic date tick adjustment from matplotlib for figures whose ticklabels overlap. diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 3c7ec70fb1f88..6a3f20928f64b 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -191,8 +191,6 @@ cdef class Interval(IntervalMixin): """ Immutable object implementing an Interval, a bounded slice-like interval. - .. versionadded:: 0.20.0 - Parameters ---------- left : orderable scalar diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 3d267b0114695..8435f1cd7d732 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1157,8 +1157,6 @@ cdef class _Timedelta(timedelta): ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the values. See https://en.wikipedia.org/wiki/ISO_8601#Durations. - .. versionadded:: 0.20.0 - Returns ------- formatted : str diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index bd1ed0bb7d318..f2176f573207c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -199,8 +199,6 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. - .. versionadded:: 0.20.0 - Returns ------- result : Categorical diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 461b5cc6232cd..e69e703f3a96c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -162,8 +162,6 @@ def is_file_like(obj): Note: file-like objects must be iterable, but iterable objects need not be file-like. - .. versionadded:: 0.20.0 - Parameters ---------- obj : The object to check @@ -281,8 +279,6 @@ def is_nested_list_like(obj): Check if the object is list-like, and that all of its elements are also list-like. - .. versionadded:: 0.20.0 - Parameters ---------- obj : The object to check diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f36826a893dd..f2074bab276ac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2082,8 +2082,6 @@ def to_feather(self, fname): """ Write out the binary feather-format for DataFrames. - .. versionadded:: 0.20.0 - Parameters ---------- fname : str @@ -7868,8 +7866,6 @@ def nunique(self, axis=0, dropna=True): Return Series with number of distinct observations. Can ignore NaN values. - .. versionadded:: 0.20.0 - Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a300748ee5bc8..61af22c6e92b3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -897,8 +897,6 @@ def squeeze(self, axis=None): A specific axis to squeeze. By default, all length-1 axes are squeezed. - .. versionadded:: 0.20.0 - Returns ------- DataFrame, Series, or scalar @@ -2163,8 +2161,6 @@ def _repr_data_resource_(self): Specifies the one-based bottommost row and rightmost column that is to be frozen. - .. versionadded:: 0.20.0. - See Also -------- to_csv : Write DataFrame to a comma-separated values (csv) file. @@ -2756,8 +2752,6 @@ def to_pickle(self, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL) default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. - - .. 
versionadded:: 0.20.0 protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible @@ -3032,22 +3026,15 @@ def to_latex( multicolumn : bool, default True Use \multicolumn to enhance MultiIndex columns. The default will be read from the config module. - - .. versionadded:: 0.20.0 multicolumn_format : str, default 'l' The alignment for multicolumns, similar to `column_format` The default will be read from the config module. - - .. versionadded:: 0.20.0 multirow : bool, default False Use \multirow to enhance MultiIndex rows. Requires adding a \usepackage{multirow} to your LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module. - - .. versionadded:: 0.20.0 - caption : str, optional The LaTeX caption to be placed inside ``\caption{}`` in the output. @@ -5133,8 +5120,6 @@ def pipe(self, func, *args, **kwargs): Call ``func`` on self producing a %(klass)s with transformed values and that has the same axis length as self. - .. versionadded:: 0.20.0 - Parameters ---------- func : function, str, list or dict @@ -5805,8 +5790,6 @@ def astype(self, dtype, copy=True, errors="raise"): - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object. - .. versionadded:: 0.20.0 - Returns ------- casted : same type as caller @@ -7946,8 +7929,6 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): Value to use for missing values, applied during upsampling (note this does not fill NaNs that already were present). - .. versionadded:: 0.20.0 - Returns ------- converted : same type as caller diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a78857423e7e0..5c7c56e2a31df 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1692,8 +1692,6 @@ def nunique(self, dropna=True): Return DataFrame with number of distinct observations per group for each column. - .. versionadded:: 0.20.0 - Parameters ---------- dropna : bool, default True diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f622480cfe4b7..f88f2e21bd595 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1942,8 +1942,6 @@ def ngroup(self, ascending=True): would be seen when iterating over the groupby object, not the order they are first observed. - .. versionadded:: 0.20.2 - Parameters ---------- ascending : bool, default True diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9d6487f7a8ae4..4c15e4b26ed46 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1897,8 +1897,6 @@ def isna(self): empty strings `''` or :attr:`numpy.inf` are not considered NA values (unless you set ``pandas.options.mode.use_inf_as_na = True``). - .. versionadded:: 0.20.0 - Returns ------- numpy.ndarray @@ -1956,8 +1954,6 @@ def notna(self): NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` values. - .. versionadded:: 0.20.0 - Returns ------- numpy.ndarray @@ -3420,8 +3416,6 @@ def _reindex_non_unique(self, target): Sort the join keys lexicographically in the result Index. If False, the order of the join keys depends on the join type (how keyword) - .. 
versionadded:: 0.20.0 - Returns ------- join_index, (left_indexer, right_indexer) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fda5c78a61e53..74dbcd4067ec0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1823,8 +1823,6 @@ def _lexsort_depth(self) -> int: def _sort_levels_monotonic(self): """ - .. versionadded:: 0.20.0 - This is an *internal* function. Create a new MultiIndex from the current to monotonically sorted @@ -1901,8 +1899,6 @@ def remove_unused_levels(self): appearance, meaning the same .values and ordering. It will also be .equals() to the original. - .. versionadded:: 0.20.0 - Returns ------- MultiIndex diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d4ae3767f6157..13cb0f9aed303 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -822,8 +822,6 @@ def asfreq(self, fill_value=None): Value to use for missing values, applied during upsampling (note this does not fill NaNs that already were present). - .. versionadded:: 0.20.0 - Returns ------- DataFrame or Series diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 6f2e264f1a4d0..c85050bc4232b 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -220,9 +220,6 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you can strip the hyphen by specifying `sep='-'` - - .. versionadded:: 0.20.0 - suffix : str, default '\\d+' A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the @@ -231,8 +228,6 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"): A-one, B-two,.., and you have an unrelated column A-rating, you can ignore the last one by specifying `suffix='(!?one|two)'` - .. versionadded:: 0.20.0 - .. versionchanged:: 0.23.0 When all suffixes are numeric, they are cast to int64/float64. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7bfc8153da568..7e593ddb91d3a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -364,8 +364,6 @@ def merge_asof( direction : 'backward' (default), 'forward', or 'nearest' Whether to search for prior, subsequent, or closest matches. - .. versionadded:: 0.20.0 - Returns ------- merged : DataFrame diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 6942a5797a7f0..2cc9f8927effb 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -300,8 +300,6 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates="raise"): duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. - .. versionadded:: 0.20.0 - Returns ------- out : Categorical or Series or array of integers if labels is False diff --git a/pandas/core/strings.py b/pandas/core/strings.py index e50da168af4d2..fcbb000acc256 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -491,18 +491,10 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): ---------- pat : str or compiled regex String can be a character sequence or regular expression. - - .. versionadded:: 0.20.0 - `pat` also accepts a compiled regex. - repl : str or callable Replacement string or a callable. The callable is passed the regex match object and must return a replacement string to be used. 
See :func:`re.sub`. - - .. versionadded:: 0.20.0 - `repl` also accepts a callable. - n : int, default -1 (all) Number of replacements to make from start. case : bool, default None diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ea7a09881ef87..70143e4603a4b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -644,8 +644,6 @@ def to_datetime( at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. - - .. versionadded:: 0.20.0 cache : bool, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index ca5279e93f678..e3617d53b000a 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -76,8 +76,6 @@ def hash_pandas_object( Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. - .. versionadded:: 0.20.0 - Returns ------- Series of uint64, same length as the object @@ -146,8 +144,6 @@ def hash_tuples(vals, encoding="utf8", hash_key=None): """ Hash an MultiIndex / list-of-tuples efficiently - .. versionadded:: 0.20.0 - Parameters ---------- vals : MultiIndex, list-of-tuples, or single tuple @@ -262,8 +258,6 @@ def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = T Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. - .. versionadded:: 0.20.0 - Returns ------- 1d uint64 numpy array of hash values, same length as the vals diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 3e75340ed3bcf..68eb1f630bfc3 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -649,8 +649,6 @@ class Window(_Window): For fixed windows, defaults to 'both'. Remaining cases not implemented for fixed windows. - .. versionadded:: 0.20.0 - Returns ------- a Window or Rolling sub-classed for the particular operation diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index a85fc8bfb1414..883af5c2e62f0 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -25,8 +25,6 @@ class UnsortedIndexError(KeyError): """ Error raised when attempting to get a slice of a MultiIndex, and the index has not been lexsorted. Subclass of `KeyError`. - - .. versionadded:: 0.20.0 """ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 039a0560af627..6eb1b9e950dfd 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -107,9 +107,6 @@ Use `object` to preserve data as stored in Excel and not interpret dtype. If converters are specified, they will be applied INSTEAD of dtype conversion. - - .. versionadded:: 0.20.0 - engine : str, default None If io is not a buffer or path, this must be set to identify io. Acceptable values are None, "xlrd", "openpyxl" or "odf". diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 25a6db675265d..dd6519275ad15 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -71,8 +71,6 @@ def read_feather(path, columns=None, use_threads=True): """ Load a feather-format object from the file path. - .. 
versionadded:: 0.20.0 - Parameters ---------- path : str, path object or file-like object diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 0200b78e02fd2..abf2caf3914e0 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -491,8 +491,6 @@ def render(self, **kwargs): This is useful when you need to provide additional variables for a custom template. - .. versionadded:: 0.20 - Returns ------- rendered : str @@ -1200,9 +1198,6 @@ def bar( - 'mid' : the center of the cell is at (max-min)/2, or if values are all negative (positive) the zero is aligned at the right (left) of the cell. - - .. versionadded:: 0.20.0 - vmin : float, optional Minimum bar value, defining the left hand limit of the bar drawing range, lower values are clipped to `vmin`. diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 24a255c78f3c0..cf8b9d901eda2 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -46,9 +46,6 @@ def nested_to_record( sep : str, default '.' Nested records will generate names separated by sep, e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar - - .. versionadded:: 0.20.0 - level: int, optional, default: 0 The number of levels in the json string. @@ -146,15 +143,9 @@ def json_normalize( always present. * 'raise' : will raise KeyError if keys listed in meta are not always present. - - .. versionadded:: 0.20.0 - sep : str, default '.' Nested records will generate names separated by sep. e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar. - - .. versionadded:: 0.20.0 - max_level : int, default None Max number of levels(depth of dict) to normalize. if None, normalizes all levels. diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 4b9a52a1fb8f3..621e8e09230b7 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -23,8 +23,6 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. - - .. versionadded:: 0.20.0 protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible @@ -99,8 +97,6 @@ def read_pickle(path, compression="infer"): or '.zip' respectively, and no decompression otherwise. Set to None for no decompression. - .. versionadded:: 0.20.0 - Returns ------- unpickled : same type as object stored in file diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 426ca9632af29..815c69bc27d7a 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -353,8 +353,6 @@ def parallel_coordinates( Options to be passed to axvline method for vertical lines. sort_labels : bool, default False Sort class_column labels, useful when assigning colors. - - .. versionadded:: 0.20.0 **kwargs Options to pass to matplotlib plotting method. 
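Several docstrings touched in the cleanup above (``to_pickle``/``read_pickle``) describe ``compression='infer'``; a minimal round-trip sketch, with an illustrative file name:

    import pandas as pd

    df = pd.DataFrame({"x": range(5)})

    # 'infer' (the default) picks gzip from the ``.gz`` suffix on both ends.
    df.to_pickle("frame.pkl.gz")
    assert pd.read_pickle("frame.pkl.gz").equals(df)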
From d63cfa226fabc14045b383d8189dfdfdef1582f1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 05:23:58 -0700 Subject: [PATCH 043/112] BUG: fix AttributeError raised in libreduction (#29100) --- pandas/_libs/reduction.pyx | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0eac0e94f0beb..7ed131e1c7608 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -203,7 +203,8 @@ cdef class SeriesBinGrouper: self.f = f values = series.values - if not values.flags.c_contiguous: + if util.is_array(values) and not values.flags.c_contiguous: + # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values self.typ = series._constructor @@ -230,7 +231,8 @@ cdef class SeriesBinGrouper: values = dummy.values if values.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') - if not values.flags.contiguous: + if util.is_array(values) and not values.flags.contiguous: + # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values if not index.flags.contiguous: @@ -332,7 +334,8 @@ cdef class SeriesGrouper: self.f = f values = series.values - if not values.flags.c_contiguous: + if util.is_array(values) and not values.flags.c_contiguous: + # e.g. Categorical has no `flags` attribute values = values.copy('C') self.arr = values self.typ = series._constructor @@ -356,7 +359,8 @@ cdef class SeriesGrouper: if (dummy.dtype != self.arr.dtype and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') - if not values.flags.contiguous: + if util.is_array(values) and not values.flags.contiguous: + # e.g. Categorical has no `flags` attribute values = values.copy() index = dummy.index.values if not index.flags.contiguous: @@ -467,12 +471,13 @@ cdef class Slider: char *orig_data def __init__(self, object values, object buf): - assert(values.ndim == 1) + assert (values.ndim == 1) - if not values.flags.contiguous: + if util.is_array(values) and not values.flags.contiguous: + # e.g. Categorical has no `flags` attribute values = values.copy() - assert(values.dtype == buf.dtype) + assert (values.dtype == buf.dtype) self.values = values self.buf = buf self.stride = values.strides[0] From d9750c13269a76f54eb59fe655ab3cd76e75c98d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 05:28:15 -0700 Subject: [PATCH 044/112] REF: avoid getattr pattern for rank_1d, rank_2d (#29137) --- pandas/_libs/algos.pyx | 22 +++++----------------- pandas/_libs/groupby.pyx | 27 ++++++++++++++++----------- pandas/core/algorithms.py | 31 +++++++++++-------------------- pandas/tests/test_algos.py | 2 +- 4 files changed, 33 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index cab8bc8e799d4..30c9af645da22 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -311,7 +311,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): ranked_mat = np.empty((N, K), dtype=np.float64) for i in range(K): - ranked_mat[:, i] = rank_1d_float64(mat[:, i]) + ranked_mat[:, i] = rank_1d(mat[:, i]) for xi in range(K): for yi in range(xi + 1): @@ -337,8 +337,8 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): j += 1 if not all_ranks: - maskedx = rank_1d_float64(maskedx) - maskedy = rank_1d_float64(maskedy) + maskedx = rank_1d(maskedx) + maskedy = rank_1d(maskedy) mean = (nobs + 1) / 2. 
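The ``nancorr_spearman`` hunk above now routes through the fused-type ``rank_1d``, and its ``mean = (nobs + 1) / 2.`` is the mean of the ranks 1..n used to center them. A pure-pandas sketch of the same math (illustrative, not the Cython implementation):

    import numpy as np
    import pandas as pd

    x = pd.Series([1.0, 4.0, 2.0, 8.0])
    y = pd.Series([2.0, 5.0, 3.0, 7.0])

    rx, ry = x.rank(), y.rank()            # mean rank is (n + 1) / 2
    dx, dy = rx - rx.mean(), ry - ry.mean()
    spearman = (dx * dy).sum() / np.sqrt((dx ** 2).sum() * (dy ** 2).sum())
    print(spearman)  # 1.0: these inputs are monotonically related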
@@ -1005,12 +1005,6 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', return ranks -rank_1d_object = rank_1d["object"] -rank_1d_float64 = rank_1d["float64_t"] -rank_1d_uint64 = rank_1d["uint64_t"] -rank_1d_int64 = rank_1d["int64_t"] - - def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', ascending=True, na_option='keep', pct=False): """ @@ -1083,8 +1077,8 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', except TypeError: values = in_arr for i in range(len(values)): - ranks[i] = rank_1d_object(in_arr[i], ties_method=ties_method, - ascending=ascending, pct=pct) + ranks[i] = rank_1d(in_arr[i], ties_method=ties_method, + ascending=ascending, pct=pct) if axis == 0: return ranks.T else: @@ -1179,12 +1173,6 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', return ranks -rank_2d_object = rank_2d["object"] -rank_2d_float64 = rank_2d["float64_t"] -rank_2d_uint64 = rank_2d["uint64_t"] -rank_2d_int64 = rank_2d["int64_t"] - - # generated from template include "algos_common_helper.pxi" include "algos_take_helper.pxi" diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 8a417d8fe3a92..c21528a7082f6 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -455,7 +455,7 @@ def _group_add(complexfloating_t[:, :] out, if len(values) != len(labels): raise ValueError("len(index) != len(labels)") - nobs = np.zeros((len(out), out.shape[1]), dtype=np.int64) + nobs = np.zeros((out).shape, dtype=np.int64) sumx = np.zeros_like(out) N, K = (values).shape @@ -507,12 +507,13 @@ def _group_prod(floating[:, :] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, count - floating[:, :] prodx, nobs + floating[:, :] prodx + int64_t[:, :] nobs if not len(values) == len(labels): raise ValueError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((out).shape, dtype=np.int64) prodx = np.ones_like(out) N, K = (values).shape @@ -555,14 +556,15 @@ def _group_var(floating[:, :] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean - floating[:, :] nobs, mean + floating[:, :] mean + int64_t[:, :] nobs assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): raise ValueError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((out).shape, dtype=np.int64) mean = np.zeros_like(out) N, K = (values).shape @@ -610,14 +612,15 @@ def _group_mean(floating[:, :] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, count - floating[:, :] sumx, nobs + floating[:, :] sumx + int64_t[:, :] nobs assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): raise ValueError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((out).shape, dtype=np.int64) sumx = np.zeros_like(out) N, K = (values).shape @@ -1243,15 +1246,16 @@ def group_max(groupby_t[:, :] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] maxx, nobs + ndarray[groupby_t, ndim=2] maxx bint runtime_error = False + int64_t[:, :] nobs assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((out).shape, dtype=np.int64) maxx = np.empty_like(out) if groupby_t is int64_t: @@ -1314,15 +1318,16 @@ def group_min(groupby_t[:, :] out, cdef: Py_ssize_t i, j, N, K, lab, ncounts = 
len(counts) groupby_t val, count, nan_val - ndarray[groupby_t, ndim=2] minx, nobs + ndarray[groupby_t, ndim=2] minx bint runtime_error = False + int64_t[:, :] nobs assert min_count == -1, "'min_count' only used in add and prod" if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") - nobs = np.zeros_like(out) + nobs = np.zeros((out).shape, dtype=np.int64) minx = np.empty_like(out) if groupby_t is int64_t: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 717c2eb26be8b..60748839ba8d8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -245,11 +245,17 @@ def _get_hashtable_algo(values): return (htable, table, values, dtype, ndtype) -def _get_data_algo(values, func_map): +def _get_values_for_rank(values): if is_categorical_dtype(values): values = values._values_for_rank() values, dtype, ndtype = _ensure_data(values) + return values, dtype, ndtype + + +def _get_data_algo(values, func_map): + values, dtype, ndtype = _get_values_for_rank(values) + if ndtype == "object": # it's cheaper to use a String Hash Table than Object; we infer @@ -900,8 +906,8 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ if values.ndim == 1: - f, values = _get_data_algo(values, _rank1d_functions) - ranks = f( + values, _, _ = _get_values_for_rank(values) + ranks = algos.rank_1d( values, ties_method=method, ascending=ascending, @@ -909,8 +915,8 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct pct=pct, ) elif values.ndim == 2: - f, values = _get_data_algo(values, _rank2d_functions) - ranks = f( + values, _, _ = _get_values_for_rank(values) + ranks = algos.rank_2d( values, axis=axis, ties_method=method, @@ -1000,21 +1006,6 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): return arr + b -_rank1d_functions = { - "float64": algos.rank_1d_float64, - "int64": algos.rank_1d_int64, - "uint64": algos.rank_1d_uint64, - "object": algos.rank_1d_object, -} - -_rank2d_functions = { - "float64": algos.rank_2d_float64, - "int64": algos.rank_2d_int64, - "uint64": algos.rank_2d_uint64, - "object": algos.rank_2d_object, -} - - def quantile(x, q, interpolation_method="fraction"): """ Compute sample quantile or quantiles of the input array. 
For example, q=0.5 diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6df2c8faf7aee..48cfc06f42e91 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1619,7 +1619,7 @@ def test_scipy_compat(self): def _check(arr): mask = ~np.isfinite(arr) arr = arr.copy() - result = libalgos.rank_1d_float64(arr) + result = libalgos.rank_1d(arr) arr[mask] = np.inf exp = rankdata(arr) exp[mask] = np.nan From 3df6488f39a09dffda89c03893632642041c0227 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 05:31:51 -0700 Subject: [PATCH 045/112] REF: avoid getattr pattern for diff_2d; use fused types (#29120) --- pandas/_libs/algos_common_helper.pxi.in | 45 +++++++++++++------------ pandas/core/algorithms.py | 11 ++---- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index eb6d689899073..0ae8094769f46 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -4,32 +4,34 @@ Template for each `dtype` helper function using 1-d template WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -{{py: - -# name, c_type, dest_type -dtypes = [('float64', 'float64_t', 'float64_t'), - ('float32', 'float32_t', 'float32_t'), - ('int8', 'int8_t', 'float32_t'), - ('int16', 'int16_t', 'float32_t'), - ('int32', 'int32_t', 'float64_t'), - ('int64', 'int64_t', 'float64_t')] - -def get_dispatch(dtypes): - - for name, c_type, dest_type, in dtypes: - yield name, c_type, dest_type +ctypedef fused diff_t: + float64_t + float32_t + int8_t + int16_t + int32_t + int64_t -}} - -{{for name, c_type, dest_type - in get_dispatch(dtypes)}} +ctypedef fused out_t: + float32_t + float64_t @cython.boundscheck(False) @cython.wraparound(False) -def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, - ndarray[{{dest_type}}, ndim=2] out, - Py_ssize_t periods, int axis): +def diff_2d(ndarray[diff_t, ndim=2] arr, + ndarray[out_t, ndim=2] out, + Py_ssize_t periods, int axis): + + # Disable for unsupported dtype combinations, + # see https://github.com/cython/cython/issues/2646 + if out_t is float32_t: + if not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t): + raise NotImplementedError + else: + if (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t): + raise NotImplementedError + cdef: Py_ssize_t i, j, sx, sy @@ -69,7 +71,6 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, for j in range(start, stop): out[i, j] = arr[i, j] - arr[i, j - periods] -{{endfor}} # ---------------------------------------------------------------------- # ensure_dtype diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 60748839ba8d8..2c9f632e8bc24 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1850,14 +1850,7 @@ def searchsorted(arr, value, side="left", sorter=None): # diff # # ---- # -_diff_special = { - "float64": algos.diff_2d_float64, - "float32": algos.diff_2d_float32, - "int64": algos.diff_2d_int64, - "int32": algos.diff_2d_int32, - "int16": algos.diff_2d_int16, - "int8": algos.diff_2d_int8, -} +_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"} def diff(arr, n: int, axis: int = 0): @@ -1905,7 +1898,7 @@ def diff(arr, n: int, axis: int = 0): out_arr[tuple(na_indexer)] = na if arr.ndim == 2 and arr.dtype.name in _diff_special: - f = _diff_special[arr.dtype.name] + f = algos.diff_2d f(arr, out_arr, n, axis) else: # To keep mypy happy, _res_indexer is a 
list while res_indexer is From d449b730fc31755f5cdf7d689cd37d0f839cf0cf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 05:32:50 -0700 Subject: [PATCH 046/112] REF: avoid getattr pattern for unstack, assorted cleanups (#29121) --- pandas/_libs/algos_take_helper.pxi.in | 4 ++-- pandas/_libs/internals.pyx | 5 +---- pandas/_libs/intervaltree.pxi.in | 14 +++++++------- pandas/_libs/reshape.pyx | 13 ------------- pandas/core/internals/concat.py | 4 +--- pandas/core/internals/managers.py | 8 ++------ pandas/core/reshape/reshape.py | 3 +-- 7 files changed, 14 insertions(+), 37 deletions(-) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index bd5a488722f6d..9dbae8170cbd0 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -146,13 +146,13 @@ def get_dispatch(dtypes): inner_take_2d_axis0 = inner_take_2d_axis0_template % args inner_take_2d_axis1 = inner_take_2d_axis1_template % args - yield (name, dest, c_type_in, c_type_out, preval, postval, can_copy, + yield (name, dest, c_type_in, c_type_out, preval, postval, inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1) }} -{{for name, dest, c_type_in, c_type_out, preval, postval, can_copy, +{{for name, dest, c_type_in, c_type_out, preval, postval, inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1 in get_dispatch(dtypes)}} diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index b7fd490532514..5f697f282fee5 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -441,24 +441,21 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): yield blkno, result -def get_blkno_placements(blknos, blk_count, group=True): +def get_blkno_placements(blknos, group=True): """ Parameters ---------- blknos : array of int64 - blk_count : int group : bool Returns ------- iterator yield (BlockPlacement, blkno) - """ blknos = ensure_int64(blknos) - # FIXME: blk_count is unused, but it may avoid the use of dicts in cython for blkno, indexer in get_blkno_indexers(blknos, group): yield blkno, BlockPlacement(indexer) diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 08bfaf21db9fb..6e3be19f2b73e 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -25,9 +25,9 @@ cdef class IntervalTree(IntervalMixin): we are emulating the IndexEngine interface """ - cdef: - readonly object left, right, root, dtype - readonly str closed + cdef readonly: + object left, right, root, dtype + str closed object _is_overlapping, _left_sorter, _right_sorter def __init__(self, left, right, closed='right', leaf_size=100): @@ -259,14 +259,14 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: Categorizes intervals by those that fall to the left, those that fall to the right, and those that overlap with the pivot. 
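
These getattr-removal commits share one mechanism, so a sketch may help: instead of a module-level table of per-dtype functions looked up by name, a single Cython entry point is declared over fused types, and the right specialization is chosen from the argument dtypes at call time. A minimal illustration with the new diff_2d, assuming a pandas build that includes these commits; the array names are mine:

    import numpy as np
    from pandas._libs import algos as libalgos

    arr = np.arange(12, dtype=np.float64).reshape(3, 4)
    out = np.full_like(arr, np.nan)

    # before: f = getattr(libalgos, "diff_2d_" + arr.dtype.name); f(arr, out, 1, 0)
    # after: one fused-typed function; dispatch happens on the arr/out dtypes
    libalgos.diff_2d(arr, out, 1, 0)   # row 0 stays NaN, later rows hold the diffs
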
""" - cdef: + cdef readonly: {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node {{dtype}}_t[:] center_left_values, center_right_values, left, right int64_t[:] center_left_indices, center_right_indices, indices {{dtype}}_t min_left, max_right - readonly {{dtype}}_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node + {{dtype}}_t pivot + int64_t n_elements, n_center, leaf_size + bint is_leaf_node def __init__(self, ndarray[{{dtype}}_t, ndim=1] left, diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index f229de002ce5c..32aa936672aab 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -83,19 +83,6 @@ def unstack(reshape_t[:, :] values, uint8_t[:] mask, nulls += 1 -unstack_uint8 = unstack["uint8_t"] -unstack_uint16 = unstack["uint16_t"] -unstack_uint32 = unstack["uint32_t"] -unstack_uint64 = unstack["uint64_t"] -unstack_int8 = unstack["int8_t"] -unstack_int16 = unstack["int16_t"] -unstack_int32 = unstack["int32_t"] -unstack_int64 = unstack["int64_t"] -unstack_float32 = unstack["float32_t"] -unstack_float64 = unstack["float64_t"] -unstack_object = unstack["object"] - - @cython.wraparound(False) @cython.boundscheck(False) def explode(ndarray[object] values): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 85bce9450d12d..36e1b06230d7e 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -61,9 +61,7 @@ def get_mgr_concatenation_plan(mgr, indexers): blklocs = mgr._blklocs plan = [] - for blkno, placements in libinternals.get_blkno_placements( - blknos, mgr.nblocks, group=False - ): + for blkno, placements in libinternals.get_blkno_placements(blknos, group=False): assert placements.is_slice_like diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5f4c9d41b340b..d8b4e4127acd1 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1079,9 +1079,7 @@ def value_getitem(placement): unfit_mgr_locs = [] unfit_val_locs = [] removed_blknos = [] - for blkno, val_locs in libinternals.get_blkno_placements( - blknos, self.nblocks, group=True - ): + for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] if blk.should_store(value): @@ -1323,9 +1321,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, # pytables serialization will break otherwise. blocks = [] - for blkno, mgr_locs in libinternals.get_blkno_placements( - blknos, self.nblocks, group=True - ): + for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=True): if blkno == -1: # If we've got here, fill_tuple was not None. 
fill_value = fill_tuple[0] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 1a90c845638d9..ad7081fb17703 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -239,8 +239,7 @@ def get_new_values(self): sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask - f = getattr(libreshape, "unstack_{name}".format(name=name)) - f( + libreshape.unstack( sorted_values, mask.view("u1"), stride, From 19f76cdcd77b00d3f6ae65e51e79329febd57cb4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 05:34:19 -0700 Subject: [PATCH 047/112] REF: avoid getattr pattern for join_indexer (#29117) --- pandas/_libs/join.pyx | 32 ----------- pandas/core/indexes/datetimes.py | 8 +-- pandas/core/indexes/timedeltas.py | 8 +-- pandas/tests/test_join.py | 91 +++++++++++++++---------------- 4 files changed, 52 insertions(+), 87 deletions(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index caf730389008a..11c56f784d378 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -290,14 +290,6 @@ def left_join_indexer_unique(join_t[:] left, join_t[:] right): return indexer -left_join_indexer_unique_float64 = left_join_indexer_unique["float64_t"] -left_join_indexer_unique_float32 = left_join_indexer_unique["float32_t"] -left_join_indexer_unique_object = left_join_indexer_unique["object"] -left_join_indexer_unique_int32 = left_join_indexer_unique["int32_t"] -left_join_indexer_unique_int64 = left_join_indexer_unique["int64_t"] -left_join_indexer_unique_uint64 = left_join_indexer_unique["uint64_t"] - - @cython.wraparound(False) @cython.boundscheck(False) def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @@ -401,14 +393,6 @@ def left_join_indexer(ndarray[join_t] left, ndarray[join_t] right): return result, lindexer, rindexer -left_join_indexer_float64 = left_join_indexer["float64_t"] -left_join_indexer_float32 = left_join_indexer["float32_t"] -left_join_indexer_object = left_join_indexer["object"] -left_join_indexer_int32 = left_join_indexer["int32_t"] -left_join_indexer_int64 = left_join_indexer["int64_t"] -left_join_indexer_uint64 = left_join_indexer["uint64_t"] - - @cython.wraparound(False) @cython.boundscheck(False) def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @@ -502,14 +486,6 @@ def inner_join_indexer(ndarray[join_t] left, ndarray[join_t] right): return result, lindexer, rindexer -inner_join_indexer_float64 = inner_join_indexer["float64_t"] -inner_join_indexer_float32 = inner_join_indexer["float32_t"] -inner_join_indexer_object = inner_join_indexer["object"] -inner_join_indexer_int32 = inner_join_indexer["int32_t"] -inner_join_indexer_int64 = inner_join_indexer["int64_t"] -inner_join_indexer_uint64 = inner_join_indexer["uint64_t"] - - @cython.wraparound(False) @cython.boundscheck(False) def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): @@ -639,14 +615,6 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): return result, lindexer, rindexer -outer_join_indexer_float64 = outer_join_indexer["float64_t"] -outer_join_indexer_float32 = outer_join_indexer["float32_t"] -outer_join_indexer_object = outer_join_indexer["object"] -outer_join_indexer_int32 = outer_join_indexer["int32_t"] -outer_join_indexer_int64 = outer_join_indexer["int64_t"] -outer_join_indexer_uint64 = outer_join_indexer["uint64_t"] - - # ---------------------------------------------------------------------- # asof_join_by # 
---------------------------------------------------------------------- diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3535682bf182d..ee2f4e0f1e85d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -230,11 +230,11 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs) - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) + _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) + _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique_int64, with_indexers=False + libjoin.left_join_indexer_unique, with_indexers=False ) _engine_type = libindex.DatetimeEngine diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 983e68f38a4b9..2324b8cf74c46 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -152,11 +152,11 @@ class TimedeltaIndex( def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs) - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) + _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) + _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique_int64, with_indexers=False + libjoin.left_join_indexer_unique, with_indexers=False ) _engine_type = libindex.TimedeltaEngine diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index e750193abb71a..2de70ceb53647 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas._libs import join as _join @@ -8,50 +9,46 @@ class TestIndexer: - def test_outer_join_indexer(self): - typemap = [ - ("int32", _join.outer_join_indexer_int32), - ("int64", _join.outer_join_indexer_int64), - ("float32", _join.outer_join_indexer_float32), - ("float64", _join.outer_join_indexer_float64), - ("object", _join.outer_join_indexer_object), - ] - - for dtype, indexer in typemap: - left = np.arange(3, dtype=dtype) - right = np.arange(2, 5, dtype=dtype) - empty = np.array([], dtype=dtype) - - result, lindexer, rindexer = indexer(left, right) - assert isinstance(result, np.ndarray) - assert isinstance(lindexer, np.ndarray) - assert isinstance(rindexer, np.ndarray) - tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype)) - exp = np.array([0, 1, 2, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) - - result, lindexer, rindexer = indexer(empty, right) - tm.assert_numpy_array_equal(result, right) - exp = np.array([-1, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) - - result, lindexer, rindexer = indexer(left, 
empty) - tm.assert_numpy_array_equal(result, left) - exp = np.array([0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) + @pytest.mark.parametrize( + "dtype", ["int32", "int64", "float32", "float64", "object"] + ) + def test_outer_join_indexer(self, dtype): + indexer = _join.outer_join_indexer + + left = np.arange(3, dtype=dtype) + right = np.arange(2, 5, dtype=dtype) + empty = np.array([], dtype=dtype) + + result, lindexer, rindexer = indexer(left, right) + assert isinstance(result, np.ndarray) + assert isinstance(lindexer, np.ndarray) + assert isinstance(rindexer, np.ndarray) + tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype)) + exp = np.array([0, 1, 2, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + result, lindexer, rindexer = indexer(empty, right) + tm.assert_numpy_array_equal(result, right) + exp = np.array([-1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + result, lindexer, rindexer = indexer(left, empty) + tm.assert_numpy_array_equal(result, left) + exp = np.array([0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([-1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) def test_left_join_indexer_unique(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - result = _join.left_join_indexer_unique_int64(b, a) + result = _join.left_join_indexer_unique(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) @@ -182,7 +179,7 @@ def test_inner_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _join.inner_join_indexer_int64(a, b) + index, ares, bres = _join.inner_join_indexer(a, b) index_exp = np.array([3, 5], dtype=np.int64) assert_almost_equal(index, index_exp) @@ -195,7 +192,7 @@ def test_inner_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _join.inner_join_indexer_int64(a, b) + index, ares, bres = _join.inner_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -205,7 +202,7 @@ def test_outer_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, bres = _join.outer_join_indexer_int64(a, b) + index, ares, bres = _join.outer_join_indexer(a, b) index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) assert_almost_equal(index, index_exp) @@ -218,7 +215,7 @@ def test_outer_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _join.outer_join_indexer_int64(a, b) + index, ares, bres = _join.outer_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -228,7 +225,7 @@ def test_left_join_indexer(): a = np.array([1, 2, 3, 4, 5], dtype=np.int64) b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - index, ares, 
bres = _join.left_join_indexer_int64(a, b) + index, ares, bres = _join.left_join_indexer(a, b) assert_almost_equal(index, a) @@ -240,7 +237,7 @@ def test_left_join_indexer(): a = np.array([5], dtype=np.int64) b = np.array([5], dtype=np.int64) - index, ares, bres = _join.left_join_indexer_int64(a, b) + index, ares, bres = _join.left_join_indexer(a, b) tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) @@ -250,7 +247,7 @@ def test_left_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.left_join_indexer(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -266,7 +263,7 @@ def test_outer_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.outer_join_indexer(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) assert_almost_equal(res, exp_res) @@ -282,7 +279,7 @@ def test_inner_join_indexer2(): idx = Index([1, 1, 2, 5]) idx2 = Index([1, 2, 5, 7, 9]) - res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values) + res, lidx, ridx = _join.inner_join_indexer(idx2.values, idx.values) exp_res = np.array([1, 1, 2, 5], dtype=np.int64) assert_almost_equal(res, exp_res) From 4ff76bd3d2a16fe6196aa9d00073c96c4bccf290 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Tue, 22 Oct 2019 14:38:16 +0200 Subject: [PATCH 048/112] Remove TestData from series-tests test_replace.py (#29147) --- pandas/tests/series/test_replace.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 06a859963cf93..e9d5a4b105a35 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -4,11 +4,9 @@ import pandas as pd import pandas.util.testing as tm -from .common import TestData - -class TestSeriesReplace(TestData): - def test_replace(self): +class TestSeriesReplace: + def test_replace(self, datetime_series): N = 100 ser = pd.Series(np.random.randn(N)) ser[0:4] = np.nan @@ -65,7 +63,7 @@ def test_replace(self): filled[4] = 0 tm.assert_series_equal(ser.replace(np.inf, 0), filled) - ser = pd.Series(self.ts.index) + ser = pd.Series(datetime_series.index) tm.assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) # malformed From f9020a2a12f3e1a3795da8e33fa4f8d4c421cf75 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 05:39:23 -0700 Subject: [PATCH 049/112] TST: stop suppressing plot test exceptions, xfail broken test (#29099) --- pandas/tests/plotting/common.py | 13 +++++++------ pandas/tests/plotting/test_frame.py | 18 ++++++++++++------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f0ba5f14d59c6..82d67d1db3510 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -12,6 +12,7 @@ from pandas.core.dtypes.api import is_list_like +import pandas as pd from pandas import DataFrame, Series import pandas.util.testing as tm from pandas.util.testing import assert_is_valid_plot_return_object, ensure_clean @@ -541,13 +542,13 @@ def _check_plot_works(f, filterwarnings="always", 
**kwargs): assert_is_valid_plot_return_object(ret) - try: - kwargs["ax"] = fig.add_subplot(212) - ret = f(**kwargs) - except Exception: - pass + if f is pd.plotting.bootstrap_plot: + assert "ax" not in kwargs else: - assert_is_valid_plot_return_object(ret) + kwargs["ax"] = fig.add_subplot(212) + + ret = f(**kwargs) + assert_is_valid_plot_return_object(ret) with ensure_clean(return_filelike=True) as path: plt.savefig(path) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 84badba271fce..fd66888fc30e4 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3,6 +3,7 @@ """ Test cases for DataFrame.plot """ from datetime import date, datetime +import itertools import string import warnings @@ -2604,12 +2605,6 @@ def test_errorbar_plot(self): ax = _check_plot_works(df.plot, yerr=np.ones((2, 12)) * 0.4) self._check_has_errorbars(ax, xerr=0, yerr=2) - # yerr is iterator - import itertools - - ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) - self._check_has_errorbars(ax, xerr=0, yerr=2) - # yerr is column name for yerr in ["yerr", "誤差"]: s_df = df.copy() @@ -2626,6 +2621,17 @@ def test_errorbar_plot(self): with pytest.raises((ValueError, TypeError)): df.plot(yerr=df_err) + @pytest.mark.xfail(reason="Iterator is consumed", raises=ValueError) + @pytest.mark.slow + def test_errorbar_plot_iterator(self): + with warnings.catch_warnings(): + d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} + df = DataFrame(d) + + # yerr is iterator + ax = _check_plot_works(df.plot, yerr=itertools.repeat(0.1, len(df))) + self._check_has_errorbars(ax, xerr=0, yerr=2) + @pytest.mark.slow def test_errorbar_with_integer_column_names(self): # test with integer column names From 49bb2e18c490a752086806d4864a1a873b00a952 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Tue, 22 Oct 2019 14:40:25 +0200 Subject: [PATCH 050/112] TST: Add regression test for Series dropping uint datatype (#18311) (#29071) --- pandas/tests/series/indexing/test_indexing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index f50b3ddbce7dc..19d3e76f52adf 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -907,3 +907,12 @@ def test_head_tail(test_data): assert_series_equal(test_data.series.head(0), test_data.series[0:0]) assert_series_equal(test_data.series.tail(), test_data.series[-5:]) assert_series_equal(test_data.series.tail(0), test_data.series[0:0]) + + +def test_uint_drop(any_int_dtype): + # see GH18311 + # assigning series.loc[0] = 4 changed series.dtype to int + series = pd.Series([1, 2, 3], dtype=any_int_dtype) + series.loc[0] = 4 + expected = pd.Series([4, 2, 3], dtype=any_int_dtype) + tm.assert_series_equal(series, expected) From 2833cb409d1e07feb06756fb932384bd70f798eb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 05:41:06 -0700 Subject: [PATCH 051/112] CLN: Exception in NDFrame._maybe_update_cacher (#29066) --- pandas/core/generic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 61af22c6e92b3..aee02ef7f6644 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3340,9 +3340,13 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): if ref is None: del self._cacher else: + # Note: we need to call ref._maybe_cache_changed even in the + # case where it 
will raise. (Uh, not clear why) try: ref._maybe_cache_changed(cacher[0], self) - except Exception: + except AssertionError: + # ref._data.setitem can raise + # AssertionError because of shape mismatch pass if verify_is_copy: From 219d18cb160553d71177b02b5d8b52b09ec6d7b8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 22 Oct 2019 07:42:15 -0500 Subject: [PATCH 052/112] REF: Store metadata in an attrs dict (#29062) --- doc/source/reference/frame.rst | 13 +++++++++++ doc/source/reference/series.rst | 13 +++++++++++ pandas/core/generic.py | 40 ++++++++++++++++++++++++++++++++- pandas/core/series.py | 30 +++++++++++-------------- 4 files changed, 78 insertions(+), 18 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 4982edeb7f85b..4b5faed0f4d2d 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -274,6 +274,19 @@ Time series-related DataFrame.tz_convert DataFrame.tz_localize +.. _api.frame.metadata: + +Metadata +~~~~~~~~ + +:attr:`DataFrame.attrs` is a dictionary for storing global metadata for this DataFrame. + +.. autosummary:: + :toctree: api/ + + DataFrame.attrs + + .. _api.dataframe.plotting: Plotting diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 5d825c8092efc..59910ba357130 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -531,6 +531,19 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.to_coo +.. _api.series.metadata: + +Metadata +~~~~~~~~ + +:attr:`Series.attrs` is a dictionary for storing global metadata for this Series. + +.. autosummary:: + :toctree: api/ + + Series.attrs + + Plotting -------- ``Series.plot`` is both a callable method and a namespace attribute for diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aee02ef7f6644..4693908e15f60 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8,12 +8,14 @@ import re from textwrap import dedent from typing import ( + TYPE_CHECKING, Any, Callable, Dict, FrozenSet, Hashable, List, + Mapping, Optional, Sequence, Set, @@ -188,6 +190,12 @@ class NDFrame(PandasObject, SelectionMixin): _is_copy = None _data = None # type: BlockManager + if TYPE_CHECKING: + # TODO(PY36): replace with _attrs : Dict[Hashable, Any] + # We need the TYPE_CHECKING, because _attrs is not a class attribute + # and Py35 doesn't support the new syntax. + _attrs = {} # type: Dict[Hashable, Any] + # ---------------------------------------------------------------------- # Constructors @@ -197,6 +205,7 @@ def __init__( axes: Optional[List[Index]] = None, copy: bool = False, dtype: Optional[Dtype] = None, + attrs: Optional[Mapping[Hashable, Any]] = None, fastpath: bool = False, ): @@ -213,6 +222,11 @@ def __init__( object.__setattr__(self, "_is_copy", None) object.__setattr__(self, "_data", data) object.__setattr__(self, "_item_cache", {}) + if attrs is None: + attrs = {} + else: + attrs = dict(attrs) + object.__setattr__(self, "_attrs", attrs) def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): """ passed a manager and a axes dict """ @@ -233,6 +247,19 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): # ---------------------------------------------------------------------- + @property + def attrs(self) -> Dict[Hashable, Any]: + """ + Dictionary of global attributes on this object. 
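
In use, the new hook is just a plain dict hung off any Series or DataFrame, and the __finalize__ change further down in this patch copies it onto derived objects. A minimal sketch, not taken from the patch itself; the key and value are invented:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    df.attrs["source"] = "sensor-3"   # arbitrary per-object metadata

    copied = df.copy()                # copy() ends in __finalize__(self)
    assert copied.attrs == {"source": "sensor-3"}
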
+ """ + if self._attrs is None: + self._attrs = {} + return self._attrs + + @attrs.setter + def attrs(self, value: Mapping[Hashable, Any]) -> None: + self._attrs = dict(value) + @property def is_copy(self): """ @@ -2027,7 +2054,13 @@ def to_dense(self): def __getstate__(self): meta = {k: getattr(self, k, None) for k in self._metadata} - return dict(_data=self._data, _typ=self._typ, _metadata=self._metadata, **meta) + return dict( + _data=self._data, + _typ=self._typ, + _metadata=self._metadata, + attrs=self.attrs, + **meta + ) def __setstate__(self, state): @@ -2036,6 +2069,8 @@ def __setstate__(self, state): elif isinstance(state, dict): typ = state.get("_typ") if typ is not None: + attrs = state.get("_attrs", {}) + object.__setattr__(self, "_attrs", attrs) # set in the order of internal names # to avoid definitional recursion @@ -5202,6 +5237,9 @@ def __finalize__(self, other, method=None, **kwargs): """ if isinstance(other, NDFrame): + for name in other.attrs: + self.attrs[name] = other.attrs[name] + # For subclasses using _metadata. for name in self._metadata: object.__setattr__(self, name, getattr(other, name, None)) return self diff --git a/pandas/core/series.py b/pandas/core/series.py index ea48b3603623a..5f1a7624f47e4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,7 +5,7 @@ from io import StringIO from shutil import get_terminal_size from textwrap import dedent -from typing import Any, Callable +from typing import Any, Callable, Hashable, List import warnings import numpy as np @@ -29,7 +29,6 @@ is_dict_like, is_extension_array_dtype, is_extension_type, - is_hashable, is_integer, is_iterator, is_list_like, @@ -45,6 +44,7 @@ ABCSeries, ABCSparseArray, ) +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import ( isna, na_value_for_dtype, @@ -173,7 +173,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Copy input data. """ - _metadata = ["name"] + _metadata = [] # type: List[str] _accessors = {"dt", "cat", "str", "sparse"} _deprecations = ( base.IndexOpsMixin._deprecations @@ -324,7 +324,6 @@ def __init__( data = SingleBlockManager(data, index, fastpath=True) generic.NDFrame.__init__(self, data, fastpath=True) - self.name = name self._set_axis(0, index, fastpath=True) @@ -457,19 +456,6 @@ def _update_inplace(self, result, **kwargs): # we want to call the generic version and not the IndexOpsMixin return generic.NDFrame._update_inplace(self, result, **kwargs) - @property - def name(self): - """ - Return name of the Series. 
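
Series.name rides on the same mechanism at this revision: as the reworked property further down shows, the setter writes the value into attrs under the "name" key instead of a private _name slot. A short illustration, valid for this commit:

    import pandas as pd

    s = pd.Series([1, 2])
    s.name = "x"                        # setter checks hashability first
    assert s.attrs.get("name") == "x"   # stored in the attrs mapping here
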
- """ - return self._name - - @name.setter - def name(self, value): - if value is not None and not is_hashable(value): - raise TypeError("Series.name must be a hashable type") - object.__setattr__(self, "_name", value) - # ndarray compatibility @property def dtype(self): @@ -485,6 +471,16 @@ def dtypes(self): """ return self._data.dtype + @property + def name(self) -> Hashable: + return self.attrs.get("name", None) + + @name.setter + def name(self, value: Hashable) -> None: + if not is_hashable(value): + raise TypeError("Series.name must be a hashable type") + self.attrs["name"] = value + @property def ftype(self): """ From c594594a191999d5a46e6b0838245676a7551ecb Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Tue, 22 Oct 2019 15:12:31 +0200 Subject: [PATCH 053/112] Remove TestData from series-tests test_sorting.py (#29149) --- pandas/tests/series/test_sorting.py | 48 ++++++++++++++++------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 125f516ab6b09..192b57d2a9007 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -7,11 +7,9 @@ import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal, assert_series_equal -from .common import TestData - -class TestSeriesSorting(TestData): - def test_sort_values(self): +class TestSeriesSorting: + def test_sort_values(self, datetime_series): # check indexes are reordered corresponding with the values ser = Series([3, 2, 4, 1], ["A", "B", "C", "D"]) @@ -19,7 +17,7 @@ def test_sort_values(self): result = ser.sort_values() tm.assert_series_equal(expected, result) - ts = self.ts.copy() + ts = datetime_series.copy() ts[:5] = np.NaN vals = ts.values @@ -69,10 +67,12 @@ def test_sort_values(self): ts.sort_values(ascending="foobar") # inplace=True - ts = self.ts.copy() + ts = datetime_series.copy() ts.sort_values(ascending=False, inplace=True) - tm.assert_series_equal(ts, self.ts.sort_values(ascending=False)) - tm.assert_index_equal(ts.index, self.ts.sort_values(ascending=False).index) + tm.assert_series_equal(ts, datetime_series.sort_values(ascending=False)) + tm.assert_index_equal( + ts.index, datetime_series.sort_values(ascending=False).index + ) # GH 5856/5853 # Series.sort_values operating on a view @@ -86,55 +86,59 @@ def test_sort_values(self): with pytest.raises(ValueError, match=msg): s.sort_values(inplace=True) - def test_sort_index(self): - rindex = list(self.ts.index) + def test_sort_index(self, datetime_series): + rindex = list(datetime_series.index) random.shuffle(rindex) - random_order = self.ts.reindex(rindex) + random_order = datetime_series.reindex(rindex) sorted_series = random_order.sort_index() - assert_series_equal(sorted_series, self.ts) + assert_series_equal(sorted_series, datetime_series) # descending sorted_series = random_order.sort_index(ascending=False) - assert_series_equal(sorted_series, self.ts.reindex(self.ts.index[::-1])) + assert_series_equal( + sorted_series, datetime_series.reindex(datetime_series.index[::-1]) + ) # compat on level sorted_series = random_order.sort_index(level=0) - assert_series_equal(sorted_series, self.ts) + assert_series_equal(sorted_series, datetime_series) # compat on axis sorted_series = random_order.sort_index(axis=0) - assert_series_equal(sorted_series, self.ts) + assert_series_equal(sorted_series, datetime_series) msg = "No axis named 1 for object type " with pytest.raises(ValueError, match=msg): random_order.sort_values(axis=1) 
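
All of these TestData removals follow one recipe: the mixin attributes self.ts and self.series become pytest fixtures that each test requests by argument. The fixture definitions are presumably in the series-level conftest; a minimal equivalent sketch using the public testing helpers, with the fixture bodies being my assumption:

    import pytest
    import pandas.util.testing as tm

    @pytest.fixture
    def datetime_series():
        # stands in for the old TestData.ts attribute
        ts = tm.makeTimeSeries()
        ts.name = "ts"
        return ts

    @pytest.fixture
    def string_series():
        # stands in for the old TestData.series attribute
        s = tm.makeStringSeries()
        s.name = "series"
        return s
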
sorted_series = random_order.sort_index(level=0, axis=0) - assert_series_equal(sorted_series, self.ts) + assert_series_equal(sorted_series, datetime_series) with pytest.raises(ValueError, match=msg): random_order.sort_index(level=0, axis=1) - def test_sort_index_inplace(self): + def test_sort_index_inplace(self, datetime_series): # For #11402 - rindex = list(self.ts.index) + rindex = list(datetime_series.index) random.shuffle(rindex) # descending - random_order = self.ts.reindex(rindex) + random_order = datetime_series.reindex(rindex) result = random_order.sort_index(ascending=False, inplace=True) assert result is None - tm.assert_series_equal(random_order, self.ts.reindex(self.ts.index[::-1])) + tm.assert_series_equal( + random_order, datetime_series.reindex(datetime_series.index[::-1]) + ) # ascending - random_order = self.ts.reindex(rindex) + random_order = datetime_series.reindex(rindex) result = random_order.sort_index(ascending=True, inplace=True) assert result is None - tm.assert_series_equal(random_order, self.ts) + tm.assert_series_equal(random_order, datetime_series) @pytest.mark.parametrize("level", ["A", 0]) # GH 21052 def test_sort_index_multiindex(self, level): From 1707f2a14a93aee73ac74ddd17622ba3ac58d8af Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Tue, 22 Oct 2019 16:59:50 +0200 Subject: [PATCH 054/112] Remove TestData from series-tests test_timeseries.py (#29150) --- pandas/tests/series/test_timeseries.py | 118 +++++++++++++------------ 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index fbe3f929cf5b5..f8c9c06900c3e 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -25,7 +25,6 @@ ) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.tests.series.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, @@ -47,32 +46,34 @@ def assert_range_equal(left, right): assert left.tz == right.tz -class TestTimeSeries(TestData): - def test_shift(self): - shifted = self.ts.shift(1) +class TestTimeSeries: + def test_shift(self, datetime_series): + shifted = datetime_series.shift(1) unshifted = shifted.shift(-1) - tm.assert_index_equal(shifted.index, self.ts.index) - tm.assert_index_equal(unshifted.index, self.ts.index) - tm.assert_numpy_array_equal(unshifted.dropna().values, self.ts.values[:-1]) + tm.assert_index_equal(shifted.index, datetime_series.index) + tm.assert_index_equal(unshifted.index, datetime_series.index) + tm.assert_numpy_array_equal( + unshifted.dropna().values, datetime_series.values[:-1] + ) offset = BDay() - shifted = self.ts.shift(1, freq=offset) + shifted = datetime_series.shift(1, freq=offset) unshifted = shifted.shift(-1, freq=offset) - assert_series_equal(unshifted, self.ts) + assert_series_equal(unshifted, datetime_series) - unshifted = self.ts.shift(0, freq=offset) - assert_series_equal(unshifted, self.ts) + unshifted = datetime_series.shift(0, freq=offset) + assert_series_equal(unshifted, datetime_series) - shifted = self.ts.shift(1, freq="B") + shifted = datetime_series.shift(1, freq="B") unshifted = shifted.shift(-1, freq="B") - assert_series_equal(unshifted, self.ts) + assert_series_equal(unshifted, datetime_series) # corner case - unshifted = self.ts.shift(0) - assert_series_equal(unshifted, self.ts) + unshifted = datetime_series.shift(0) + assert_series_equal(unshifted, 
datetime_series) # Shifting with PeriodIndex ps = tm.makePeriodSeries() @@ -208,7 +209,7 @@ def test_shift_dst(self): tm.assert_series_equal(res, exp) assert res.dtype == "datetime64[ns, US/Eastern]" - def test_tshift(self): + def test_tshift(self, datetime_series): # PeriodIndex ps = tm.makePeriodSeries() shifted = ps.tshift(1) @@ -227,34 +228,34 @@ def test_tshift(self): ps.tshift(freq="M") # DatetimeIndex - shifted = self.ts.tshift(1) + shifted = datetime_series.tshift(1) unshifted = shifted.tshift(-1) - assert_series_equal(self.ts, unshifted) + assert_series_equal(datetime_series, unshifted) - shifted2 = self.ts.tshift(freq=self.ts.index.freq) + shifted2 = datetime_series.tshift(freq=datetime_series.index.freq) assert_series_equal(shifted, shifted2) inferred_ts = Series( - self.ts.values, Index(np.asarray(self.ts.index)), name="ts" + datetime_series.values, Index(np.asarray(datetime_series.index)), name="ts" ) shifted = inferred_ts.tshift(1) unshifted = shifted.tshift(-1) - assert_series_equal(shifted, self.ts.tshift(1)) + assert_series_equal(shifted, datetime_series.tshift(1)) assert_series_equal(unshifted, inferred_ts) - no_freq = self.ts[[0, 5, 7]] + no_freq = datetime_series[[0, 5, 7]] msg = "Freq was not given and was not set in the index" with pytest.raises(ValueError, match=msg): no_freq.tshift() - def test_truncate(self): + def test_truncate(self, datetime_series): offset = BDay() - ts = self.ts[::3] + ts = datetime_series[::3] - start, end = self.ts.index[3], self.ts.index[6] - start_missing, end_missing = self.ts.index[2], self.ts.index[7] + start, end = datetime_series.index[3], datetime_series.index[6] + start_missing, end_missing = datetime_series.index[2], datetime_series.index[7] # neither specified truncated = ts.truncate() @@ -288,16 +289,17 @@ def test_truncate(self): assert_series_equal(truncated, expected) # corner case, empty series returned - truncated = ts.truncate(after=self.ts.index[0] - offset) + truncated = ts.truncate(after=datetime_series.index[0] - offset) assert len(truncated) == 0 - truncated = ts.truncate(before=self.ts.index[-1] + offset) + truncated = ts.truncate(before=datetime_series.index[-1] + offset) assert len(truncated) == 0 msg = "Truncate: 1999-12-31 00:00:00 must be after 2000-02-14 00:00:00" with pytest.raises(ValueError, match=msg): ts.truncate( - before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset + before=datetime_series.index[-1] + offset, + after=datetime_series.index[0] - offset, ) def test_truncate_nonsortedindex(self): @@ -355,20 +357,20 @@ def test_asfreq_datetimeindex_empty_series(self): ) tm.assert_index_equal(expected.index, result.index) - def test_pct_change(self): - rs = self.ts.pct_change(fill_method=None) - assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) + def test_pct_change(self, datetime_series): + rs = datetime_series.pct_change(fill_method=None) + assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) - rs = self.ts.pct_change(2) - filled = self.ts.fillna(method="pad") + rs = datetime_series.pct_change(2) + filled = datetime_series.fillna(method="pad") assert_series_equal(rs, filled / filled.shift(2) - 1) - rs = self.ts.pct_change(fill_method="bfill", limit=1) - filled = self.ts.fillna(method="bfill", limit=1) + rs = datetime_series.pct_change(fill_method="bfill", limit=1) + filled = datetime_series.fillna(method="bfill", limit=1) assert_series_equal(rs, filled / filled.shift(1) - 1) - rs = self.ts.pct_change(freq="5D") - filled = self.ts.fillna(method="pad") + rs = 
datetime_series.pct_change(freq="5D") + filled = datetime_series.fillna(method="pad") assert_series_equal( rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) ) @@ -391,26 +393,32 @@ def test_pct_change_shift_over_nas(self): ("14B", 14, None, None), ], ) - def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): + def test_pct_change_periods_freq( + self, freq, periods, fill_method, limit, datetime_series + ): # GH 7292 - rs_freq = self.ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = self.ts.pct_change(periods, fill_method=fill_method, limit=limit) + rs_freq = datetime_series.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_series.pct_change( + periods, fill_method=fill_method, limit=limit + ) assert_series_equal(rs_freq, rs_periods) - empty_ts = Series(index=self.ts.index) + empty_ts = Series(index=datetime_series.index) rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_series_equal(rs_freq, rs_periods) - def test_autocorr(self): + def test_autocorr(self, datetime_series): # Just run the function - corr1 = self.ts.autocorr() + corr1 = datetime_series.autocorr() # Now run it with the lag parameter - corr2 = self.ts.autocorr(lag=1) + corr2 = datetime_series.autocorr(lag=1) # corr() with lag needs Series of at least length 2 - if len(self.ts) <= 2: + if len(datetime_series) <= 2: assert np.isnan(corr1) assert np.isnan(corr2) else: @@ -418,19 +426,19 @@ def test_autocorr(self): # Choose a random lag between 1 and length of Series - 2 # and compare the result with the Series corr() function - n = 1 + np.random.randint(max(1, len(self.ts) - 2)) - corr1 = self.ts.corr(self.ts.shift(n)) - corr2 = self.ts.autocorr(lag=n) + n = 1 + np.random.randint(max(1, len(datetime_series) - 2)) + corr1 = datetime_series.corr(datetime_series.shift(n)) + corr2 = datetime_series.autocorr(lag=n) # corr() with lag needs Series of at least length 2 - if len(self.ts) <= 2: + if len(datetime_series) <= 2: assert np.isnan(corr1) assert np.isnan(corr2) else: assert corr1 == corr2 - def test_first_last_valid(self): - ts = self.ts.copy() + def test_first_last_valid(self, datetime_series): + ts = datetime_series.copy() ts[:5] = np.NaN index = ts.first_valid_index() @@ -462,9 +470,9 @@ def test_first_last_valid(self): assert ts.first_valid_index().freq == ts.index.freq assert ts.last_valid_index().freq == ts.index.freq - def test_mpl_compat_hack(self): - result = self.ts[:, np.newaxis] - expected = self.ts.values[:, np.newaxis] + def test_mpl_compat_hack(self, datetime_series): + result = datetime_series[:, np.newaxis] + expected = datetime_series.values[:, np.newaxis] assert_almost_equal(result, expected) def test_timeseries_coercion(self): From 4dc532fe9d814a7972481bb2c25d5d8d07c74325 Mon Sep 17 00:00:00 2001 From: Puneeth K <32433964+punndcoder28@users.noreply.github.com> Date: Tue, 22 Oct 2019 21:16:03 +0530 Subject: [PATCH 055/112] minor inconsistency in Categorical.remove_categories error message (#28677) --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/tests/arrays/categorical/test_api.py | 12 +++++++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 7c59cbf7cfd1e..01a102f269886 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -260,7 +260,7 @@ 
Categorical - Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`) - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) - Added test to assert roundtripping to parquet with :func:`DataFrame.to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`) -- +- Changed the error message in :meth:`Categorical.remove_categories` to always show the invalid removals as a set (:issue:`28669`) Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 795986127cde7..70ed411f6a3e4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1124,7 +1124,7 @@ def remove_categories(self, removals, inplace=False): # GH 10156 if any(isna(removals)): - not_included = [x for x in not_included if notna(x)] + not_included = {x for x in not_included if notna(x)} new_categories = [x for x in new_categories if notna(x)] if len(not_included) != 0: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index ab07b3c96a1db..42087b89a19b5 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -339,9 +341,13 @@ def test_remove_categories(self): tm.assert_categorical_equal(cat, new) assert res is None - # removal is not in categories - with pytest.raises(ValueError): - cat.remove_categories(["c"]) + @pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]]) + def test_remove_categories_raises(self, removals): + cat = Categorical(["a", "b", "a"]) + message = re.escape("removals must all be in old categories: {'c'}") + + with pytest.raises(ValueError, match=message): + cat.remove_categories(removals) def test_remove_unused_categories(self): c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"]) From a0446b0a4acfd30754ff58d6887108a001b5637e Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Tue, 22 Oct 2019 20:34:05 +0200 Subject: [PATCH 056/112] Remove TestData from series-tests test_api.py (#29153) --- pandas/tests/series/test_api.py | 163 ++++++++++++++++---------------- 1 file changed, 84 insertions(+), 79 deletions(-) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 998f8b6f7d8a4..f2435e8c804db 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -24,8 +24,6 @@ import pandas.io.formats.printing as printing -from .common import TestData - class SharedWithSparse: """ @@ -39,82 +37,84 @@ def _assert_series_equal(self, left, right): """Dispatch to series class dependent assertion""" raise NotImplementedError - def test_scalarop_preserve_name(self): - result = self.ts * 2 - assert result.name == self.ts.name + def test_scalarop_preserve_name(self, datetime_series): + result = datetime_series * 2 + assert result.name == datetime_series.name - def test_copy_name(self): - result = self.ts.copy() - assert result.name == self.ts.name + def test_copy_name(self, datetime_series): + result = datetime_series.copy() + assert result.name == datetime_series.name - def test_copy_index_name_checking(self): + def test_copy_index_name_checking(self, datetime_series): # don't want to be able to modify the index stored elsewhere after # making a copy - self.ts.index.name = None - assert self.ts.index.name is None - assert 
self.ts is self.ts + datetime_series.index.name = None + assert datetime_series.index.name is None + assert datetime_series is datetime_series - cp = self.ts.copy() + cp = datetime_series.copy() cp.index.name = "foo" - printing.pprint_thing(self.ts.index.name) - assert self.ts.index.name is None + printing.pprint_thing(datetime_series.index.name) + assert datetime_series.index.name is None - def test_append_preserve_name(self): - result = self.ts[:5].append(self.ts[5:]) - assert result.name == self.ts.name + def test_append_preserve_name(self, datetime_series): + result = datetime_series[:5].append(datetime_series[5:]) + assert result.name == datetime_series.name - def test_binop_maybe_preserve_name(self): + def test_binop_maybe_preserve_name(self, datetime_series): # names match, preserve - result = self.ts * self.ts - assert result.name == self.ts.name - result = self.ts.mul(self.ts) - assert result.name == self.ts.name + result = datetime_series * datetime_series + assert result.name == datetime_series.name + result = datetime_series.mul(datetime_series) + assert result.name == datetime_series.name - result = self.ts * self.ts[:-2] - assert result.name == self.ts.name + result = datetime_series * datetime_series[:-2] + assert result.name == datetime_series.name # names don't match, don't preserve - cp = self.ts.copy() + cp = datetime_series.copy() cp.name = "something else" - result = self.ts + cp + result = datetime_series + cp assert result.name is None - result = self.ts.add(cp) + result = datetime_series.add(cp) assert result.name is None ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow"] ops = ops + ["r" + op for op in ops] for op in ops: # names match, preserve - s = self.ts.copy() + s = datetime_series.copy() result = getattr(s, op)(s) - assert result.name == self.ts.name + assert result.name == datetime_series.name # names don't match, don't preserve - cp = self.ts.copy() + cp = datetime_series.copy() cp.name = "changed" result = getattr(s, op)(cp) assert result.name is None - def test_combine_first_name(self): - result = self.ts.combine_first(self.ts[:5]) - assert result.name == self.ts.name + def test_combine_first_name(self, datetime_series): + result = datetime_series.combine_first(datetime_series[:5]) + assert result.name == datetime_series.name + + def test_getitem_preserve_name(self, datetime_series): + result = datetime_series[datetime_series > 0] + assert result.name == datetime_series.name - def test_getitem_preserve_name(self): - result = self.ts[self.ts > 0] - assert result.name == self.ts.name + result = datetime_series[[0, 2, 4]] + assert result.name == datetime_series.name - result = self.ts[[0, 2, 4]] - assert result.name == self.ts.name + result = datetime_series[5:10] + assert result.name == datetime_series.name - result = self.ts[5:10] - assert result.name == self.ts.name + def test_pickle_datetimes(self, datetime_series): + unp_ts = self._pickle_roundtrip(datetime_series) + assert_series_equal(unp_ts, datetime_series) - def test_pickle(self): - unp_series = self._pickle_roundtrip(self.series) - unp_ts = self._pickle_roundtrip(self.ts) - assert_series_equal(unp_series, self.series) - assert_series_equal(unp_ts, self.ts) + def test_pickle_strings(self, string_series): + unp_series = self._pickle_roundtrip(string_series) + assert_series_equal(unp_series, string_series) def _pickle_roundtrip(self, obj): @@ -123,13 +123,13 @@ def _pickle_roundtrip(self, obj): unpickled = pd.read_pickle(path) return unpickled - def 
test_argsort_preserve_name(self): - result = self.ts.argsort() - assert result.name == self.ts.name + def test_argsort_preserve_name(self, datetime_series): + result = datetime_series.argsort() + assert result.name == datetime_series.name - def test_sort_index_name(self): - result = self.ts.sort_index(ascending=False) - assert result.name == self.ts.name + def test_sort_index_name(self, datetime_series): + result = datetime_series.sort_index(ascending=False) + assert result.name == datetime_series.name def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} @@ -211,7 +211,7 @@ def test_sparse_accessor_updates_on_inplace(self): assert s.sparse.density == 1.0 -class TestSeriesMisc(TestData, SharedWithSparse): +class TestSeriesMisc(SharedWithSparse): series_klass = Series # SharedWithSparse tests use generic, series_klass-agnostic assertion @@ -307,44 +307,49 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(s) - def test_contains(self): - tm.assert_contains_all(self.ts.index, self.ts) + def test_contains(self, datetime_series): + tm.assert_contains_all(datetime_series.index, datetime_series) - def test_iter(self): - for i, val in enumerate(self.series): - assert val == self.series[i] + def test_iter_datetimes(self, datetime_series): + for i, val in enumerate(datetime_series): + assert val == datetime_series[i] - for i, val in enumerate(self.ts): - assert val == self.ts[i] + def test_iter_strings(self, string_series): + for i, val in enumerate(string_series): + assert val == string_series[i] - def test_keys(self): + def test_keys(self, datetime_series): # HACK: By doing this in two stages, we avoid 2to3 wrapping the call # to .keys() in a list() - getkeys = self.ts.keys - assert getkeys() is self.ts.index + getkeys = datetime_series.keys + assert getkeys() is datetime_series.index - def test_values(self): - tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False) + def test_values(self, datetime_series): + tm.assert_almost_equal( + datetime_series.values, datetime_series, check_dtype=False + ) - def test_iteritems(self): - for idx, val in self.series.iteritems(): - assert val == self.series[idx] + def test_iteritems_datetimes(self, datetime_series): + for idx, val in datetime_series.iteritems(): + assert val == datetime_series[idx] - for idx, val in self.ts.iteritems(): - assert val == self.ts[idx] + def test_iteritems_strings(self, string_series): + for idx, val in string_series.iteritems(): + assert val == string_series[idx] # assert is lazy (genrators don't define reverse, lists do) - assert not hasattr(self.series.iteritems(), "reverse") + assert not hasattr(string_series.iteritems(), "reverse") - def test_items(self): - for idx, val in self.series.items(): - assert val == self.series[idx] + def test_items_datetimes(self, datetime_series): + for idx, val in datetime_series.items(): + assert val == datetime_series[idx] - for idx, val in self.ts.items(): - assert val == self.ts[idx] + def test_items_strings(self, string_series): + for idx, val in string_series.items(): + assert val == string_series[idx] # assert is lazy (genrators don't define reverse, lists do) - assert not hasattr(self.series.items(), "reverse") + assert not hasattr(string_series.items(), "reverse") def test_raise_on_info(self): s = Series(np.random.randn(10)) @@ -413,9 +418,9 @@ def test_class_axis(self): # no exception and no empty docstring assert pydoc.getdoc(Series.index) - def test_numpy_unique(self): + def test_numpy_unique(self, datetime_series): # it works! 
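
The test around this hunk leans on a small fact worth spelling out: NumPy functions accept a Series directly, coercing through its values, so swapping self.ts for the fixture changes nothing about the call. For instance, with an invented series:

    import numpy as np
    import pandas as pd

    ts = pd.Series([3.0, 1.0, 3.0, 2.0],
                   index=pd.date_range("2000-01-01", periods=4))
    np.unique(ts)   # array([1., 2., 3.]); the index is discarded
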
- np.unique(self.ts) + np.unique(datetime_series) def test_ndarray_compat(self): From 38b923a4197d1fead1147c7ffd21d67076eed3f1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 22 Oct 2019 20:03:40 +0100 Subject: [PATCH 057/112] CLN: replace Dict with Mapping to annotate arguments (#29155) --- pandas/_typing.py | 13 ++++++++++++- pandas/core/dtypes/dtypes.py | 4 ++-- pandas/core/generic.py | 6 +++--- pandas/core/ops/array_ops.py | 4 ++-- pandas/io/common.py | 20 ++++++++++---------- pandas/io/formats/format.py | 3 ++- pandas/io/formats/html.py | 6 +++--- pandas/io/formats/printing.py | 16 +++++++++++++--- pandas/io/json/_json.py | 18 ++++++++---------- pandas/util/_decorators.py | 4 ++-- 10 files changed, 57 insertions(+), 37 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 5afe64f719b8a..445eff9e19e47 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,5 +1,15 @@ from pathlib import Path -from typing import IO, TYPE_CHECKING, AnyStr, Iterable, Optional, TypeVar, Union +from typing import ( + IO, + TYPE_CHECKING, + AnyStr, + Dict, + Iterable, + List, + Optional, + TypeVar, + Union, +) import numpy as np @@ -25,6 +35,7 @@ Scalar = Union[str, int, float, bool] Axis = Union[str, int] Ordered = Optional[bool] +JSONSerializable = Union[Scalar, List, Dict] # use Collection after we drop support for py35 Axes = Iterable diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ae6f2ed289248..5c8dbd6d68a50 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1,6 +1,6 @@ """ define extension dtypes """ import re -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, List, MutableMapping, Optional, Tuple, Type, Union, cast import warnings import numpy as np @@ -351,7 +351,7 @@ def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> Non self._ordered = ordered if ordered is not ordered_sentinel else None self._ordered_from_sentinel = ordered is ordered_sentinel - def __setstate__(self, state: Dict[str_type, Any]) -> None: + def __setstate__(self, state: MutableMapping[str_type, Any]) -> None: # for pickle compat. 
__get_state__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4693908e15f60..75bbfd9c12216 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -67,7 +67,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas._typing import Dtype, FilePathOrBuffer, Scalar +from pandas._typing import Dtype, FilePathOrBuffer, JSONSerializable from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin @@ -2299,7 +2299,7 @@ def to_json( double_precision: int = 10, force_ascii: bool_t = True, date_unit: str = "ms", - default_handler: Optional[Callable[[Any], Union[Scalar, List, Dict]]] = None, + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool_t = False, compression: Optional[str] = "infer", index: bool_t = True, @@ -3155,7 +3155,7 @@ def to_csv( index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None, mode: str = "w", encoding: Optional[str] = None, - compression: Optional[Union[str, Dict[str, str]]] = "infer", + compression: Optional[Union[str, Mapping[str, str]]] = "infer", quoting: Optional[int] = None, quotechar: str = '"', line_terminator: Optional[str] = None, diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 8c9a4b94446c0..46c3b8b575af9 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -3,7 +3,7 @@ ExtensionArrays. """ import operator -from typing import Any, Dict, Union +from typing import Any, Mapping, Union import numpy as np @@ -161,7 +161,7 @@ def arithmetic_op( right: Any, op, str_rep: str, - eval_kwargs: Dict[str, bool], + eval_kwargs: Mapping[str, bool], ): """ Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... diff --git a/pandas/io/common.py b/pandas/io/common.py index 2ca2007e2925f..0b8594bbbd3e4 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -15,6 +15,7 @@ BinaryIO, Dict, List, + Mapping, Optional, TextIO, Tuple, @@ -276,16 +277,16 @@ def file_path_to_url(path: str) -> str: def _get_compression_method( - compression: Optional[Union[str, Dict[str, str]]] + compression: Optional[Union[str, Mapping[str, str]]] ) -> Tuple[Optional[str], Dict[str, str]]: """ Simplifies a compression argument to a compression method string and - a dict containing additional arguments. + a mapping containing additional arguments. Parameters ---------- - compression : str or dict - If string, specifies the compression method. If dict, value at key + compression : str or mapping + If string, specifies the compression method. If mapping, value at key 'method' specifies compression method. 
Returns @@ -295,15 +296,14 @@ def _get_compression_method( Raises ------ - ValueError on dict missing 'method' key + ValueError on mapping missing 'method' key """ - # Handle dict - if isinstance(compression, dict): - compression_args = compression.copy() + if isinstance(compression, Mapping): + compression_args = dict(compression) try: compression = compression_args.pop("method") except KeyError: - raise ValueError("If dict, compression must have key 'method'") + raise ValueError("If mapping, compression must have key 'method'") else: compression_args = {} return compression, compression_args @@ -368,7 +368,7 @@ def _get_handle( path_or_buf, mode: str, encoding=None, - compression: Optional[Union[str, Dict[str, Any]]] = None, + compression: Optional[Union[str, Mapping[str, Any]]] = None, memory_map: bool = False, is_text: bool = True, ): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b8c40e3f62221..6ddba6a297bdc 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -20,6 +20,7 @@ Dict, Iterable, List, + Mapping, Optional, Sequence, Tuple, @@ -78,7 +79,7 @@ from pandas import Series, DataFrame, Categorical formatters_type = Union[ - List[Callable], Tuple[Callable, ...], Dict[Union[str, int], Callable] + List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] float_format_type = Union[str, Callable, "EngFormatter"] diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 50fa4796f8d72..38f2e332017f0 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -4,7 +4,7 @@ from collections import OrderedDict from textwrap import dedent -from typing import IO, Any, Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option @@ -394,7 +394,7 @@ def _write_body(self, indent: int) -> None: self.write("", indent) def _write_regular_rows( - self, fmt_values: Dict[int, List[str]], indent: int + self, fmt_values: Mapping[int, List[str]], indent: int ) -> None: truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v @@ -440,7 +440,7 @@ def _write_regular_rows( ) def _write_hierarchical_rows( - self, fmt_values: Dict[int, List[str]], indent: int + self, fmt_values: Mapping[int, List[str]], indent: int ) -> None: template = 'rowspan="{span}" valign="top"' diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index ead51693da791..061103820ca83 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -3,13 +3,23 @@ """ import sys -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + Iterable, + List, + Mapping, + Optional, + Sequence, + Tuple, + Union, +) from pandas._config import get_option from pandas.core.dtypes.inference import is_sequence -EscapeChars = Union[Dict[str, str], Iterable[str]] +EscapeChars = Union[Mapping[str, str], Iterable[str]] def adjoin(space: int, *lists: List[str], **kwargs) -> str: @@ -119,7 +129,7 @@ def _pprint_seq( def _pprint_dict( - seq: Dict, _nest_lvl: int = 0, max_seq_items: Optional[int] = None, **kwds + seq: Mapping, _nest_lvl: int = 0, max_seq_items: Optional[int] = None, **kwds ) -> str: """ internal. pprinter for iterables. 
you should probably use pprint_thing() diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c71677fa3b570..6e9e0a0b01200 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -2,7 +2,7 @@ from io import StringIO from itertools import islice import os -from typing import Any, Callable, Dict, List, Optional, Type, Union +from typing import Any, Callable, Optional, Type import numpy as np @@ -13,7 +13,7 @@ from pandas.core.dtypes.common import ensure_str, is_period_dtype from pandas import DataFrame, MultiIndex, Series, compat, isna, to_datetime -from pandas._typing import Scalar +from pandas._typing import JSONSerializable from pandas.core.reshape.concat import concat from pandas.io.common import ( @@ -34,8 +34,6 @@ TABLE_SCHEMA_VERSION = "0.20.0" -Serializable = Union[Scalar, List, Dict] - # interface to/from def to_json( @@ -46,7 +44,7 @@ def to_json( double_precision: int = 10, force_ascii: bool = True, date_unit: str = "ms", - default_handler: Optional[Callable[[Any], Serializable]] = None, + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool = False, compression: Optional[str] = "infer", index: bool = True, @@ -110,7 +108,7 @@ def __init__( ensure_ascii: bool, date_unit: str, index: bool, - default_handler: Optional[Callable[[Any], Serializable]] = None, + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, indent: int = 0, ): self.obj = obj @@ -153,7 +151,7 @@ def _write( ensure_ascii: bool, date_unit: str, iso_dates: bool, - default_handler: Optional[Callable[[Any], Serializable]], + default_handler: Optional[Callable[[Any], JSONSerializable]], indent: int, ): return dumps( @@ -186,7 +184,7 @@ def _write( ensure_ascii: bool, date_unit: str, iso_dates: bool, - default_handler: Optional[Callable[[Any], Serializable]], + default_handler: Optional[Callable[[Any], JSONSerializable]], indent: int, ): if not self.index and orient == "split": @@ -233,7 +231,7 @@ def _write( ensure_ascii: bool, date_unit: str, iso_dates: bool, - default_handler: Optional[Callable[[Any], Serializable]], + default_handler: Optional[Callable[[Any], JSONSerializable]], indent: int, ): if not self.index and orient == "split": @@ -263,7 +261,7 @@ def __init__( ensure_ascii: bool, date_unit: str, index: bool, - default_handler: Optional[Callable[[Any], Serializable]] = None, + default_handler: Optional[Callable[[Any], JSONSerializable]] = None, indent: int = 0, ): """ diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index ebc015c820c14..f8c08ed8c099f 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -4,8 +4,8 @@ from typing import ( Any, Callable, - Dict, List, + Mapping, Optional, Tuple, Type, @@ -104,7 +104,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: def deprecate_kwarg( old_arg_name: str, new_arg_name: Optional[str], - mapping: Optional[Union[Dict[Any, Any], Callable[[Any], Any]]] = None, + mapping: Optional[Union[Mapping[Any, Any], Callable[[Any], Any]]] = None, stacklevel: int = 2, ) -> Callable[..., Any]: """ From 5ce7b1a8d0d0eed462d4ec29269bab28a5af208b Mon Sep 17 00:00:00 2001 From: Jan-Philip Gehrcke Date: Tue, 22 Oct 2019 21:42:58 +0200 Subject: [PATCH 058/112] DOC: attempt to fix contributors for 0.24.0, 0.25.0/1/2 (#29152) --- doc/source/whatsnew/v0.24.0.rst | 2 +- doc/source/whatsnew/v0.25.0.rst | 2 +- doc/source/whatsnew/v0.25.1.rst | 2 +- doc/source/whatsnew/v0.25.2.rst | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d9f41d2a75116..42579becd4237 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1933,4 +1933,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v0.23.4..HEAD +.. contributors:: v0.23.4..v0.24.0 diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 503f9b6bfb1f0..2106d13dd615b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1267,4 +1267,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v0.24.x..HEAD +.. contributors:: v0.24.2..v0.25.0 diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 63dd56f4a3793..2e9524fea89b1 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -116,4 +116,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v0.25.0..HEAD +.. contributors:: v0.25.0..v0.25.1 diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index a99751f9bab9f..c0c68ce4b1f44 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -46,4 +46,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v0.25.1..HEAD +.. contributors:: v0.25.1..v0.25.2 From 052a64a3f19d23c3f54ba0c828a8638a0cc4e0ac Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 22 Oct 2019 13:17:30 -0700 Subject: [PATCH 059/112] Remove unnecessary check for Python >= 3.5 (#29167) --- doc/source/conf.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 86f78d9c0f0ae..cdabf2d470839 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -608,11 +608,7 @@ def linkcode_resolve(domain, info): return None try: - # inspect.unwrap() was added in Python version 3.4 - if sys.version_info >= (3, 5): - fn = inspect.getsourcefile(inspect.unwrap(obj)) - else: - fn = inspect.getsourcefile(obj) + fn = inspect.getsourcefile(inspect.unwrap(obj)) except TypeError: fn = None if not fn: From 41933cc115533ff632e2dac5df0de0413625a14f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Oct 2019 13:18:22 -0700 Subject: [PATCH 060/112] REF: avoid getattr pattern for groupby-rank functions (#29166) --- pandas/_libs/groupby.pyx | 20 -------------------- pandas/core/groupby/ops.py | 14 +++++++++++--- pandas/tests/groupby/test_rank.py | 3 ++- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index c21528a7082f6..b2ffbb3ecb4f2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -923,12 +923,6 @@ def group_last(rank_t[:, :] out, raise RuntimeError("empty group with uint64_t") -group_last_float64 = group_last["float64_t"] -group_last_float32 = group_last["float32_t"] -group_last_int64 = group_last["int64_t"] -group_last_object = group_last["object"] - - @cython.wraparound(False) @cython.boundscheck(False) def group_nth(rank_t[:, :] out, @@ -1020,12 +1014,6 @@ def group_nth(rank_t[:, :] out, raise RuntimeError("empty group with uint64_t") -group_nth_float64 = group_nth["float64_t"] -group_nth_float32 = group_nth["float32_t"] -group_nth_int64 = group_nth["int64_t"] -group_nth_object = group_nth["object"] - - @cython.boundscheck(False) @cython.wraparound(False) def group_rank(float64_t[:, :] out, @@ -1213,14 +1201,6 @@ def group_rank(float64_t[:, :] out, out[i, 0] = out[i, 0] / grp_sizes[i, 0] -group_rank_float64 = group_rank["float64_t"] -group_rank_float32 = group_rank["float32_t"] -group_rank_int64 = 
group_rank["int64_t"] -group_rank_uint64 = group_rank["uint64_t"] -# Note: we do not have a group_rank_object because that would require a -# not-nogil implementation, see GH#19560 - - # ---------------------------------------------------------------------- # group_min, group_max # ---------------------------------------------------------------------- diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fcc646dec89d9..00e7012b40986 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -419,13 +419,21 @@ def get_func(fname): # otherwise find dtype-specific version, falling back to object for dt in [dtype_str, "object"]: - f = getattr( + f2 = getattr( libgroupby, "{fname}_{dtype_str}".format(fname=fname, dtype_str=dt), None, ) - if f is not None: - return f + if f2 is not None: + return f2 + + if hasattr(f, "__signatures__"): + # inspect what fused types are implemented + if dtype_str == "object" and "object" not in f.__signatures__: + # return None so we get a NotImplementedError below + # instead of a TypeError at runtime + return None + return f ftype = self._cython_functions[kind][how] diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index a6ea793b53c41..e407cfadac2fb 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -3,6 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, concat +from pandas.core.base import DataError from pandas.util import testing as tm @@ -384,7 +385,7 @@ def test_rank_avg_even_vals(): def test_rank_object_raises(ties_method, ascending, na_option, pct, vals): df = DataFrame({"key": ["foo"] * 5, "val": vals}) - with pytest.raises(TypeError, match="not callable"): + with pytest.raises(DataError, match="No numeric types to aggregate"): df.groupby("key").rank( method=ties_method, ascending=ascending, na_option=na_option, pct=pct ) From 22a8337a641c7faa6ce2fcde1e5c34185ac59a64 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 22 Oct 2019 22:27:28 +0100 Subject: [PATCH 061/112] TYPING : Series.name -> Optional[Hashable] (#29164) * TYPING : Series.name -> Optional[Hashable] --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 12 ++++++------ pandas/core/groupby/generic.py | 5 +++-- pandas/core/groupby/groupby.py | 2 +- pandas/core/series.py | 6 +++--- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f2074bab276ac..a4d3e7058d7de 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -871,7 +871,7 @@ def style(self): """ @Appender(_shared_docs["items"]) - def items(self) -> Iterable[Tuple[Hashable, Series]]: + def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: yield k, self._get_item_cache(k) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 75bbfd9c12216..d3c32511b40c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -194,7 +194,7 @@ class NDFrame(PandasObject, SelectionMixin): # TODO(PY36): replace with _attrs : Dict[Hashable, Any] # We need the TYPE_CHECKING, because _attrs is not a class attribute # and Py35 doesn't support the new syntax. 
- _attrs = {} # type: Dict[Hashable, Any] + _attrs = {} # type: Dict[Optional[Hashable], Any] # ---------------------------------------------------------------------- # Constructors @@ -205,7 +205,7 @@ def __init__( axes: Optional[List[Index]] = None, copy: bool = False, dtype: Optional[Dtype] = None, - attrs: Optional[Mapping[Hashable, Any]] = None, + attrs: Optional[Mapping[Optional[Hashable], Any]] = None, fastpath: bool = False, ): @@ -248,7 +248,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): # ---------------------------------------------------------------------- @property - def attrs(self) -> Dict[Hashable, Any]: + def attrs(self) -> Dict[Optional[Hashable], Any]: """ Dictionary of global attributes on this object. """ @@ -257,7 +257,7 @@ def attrs(self) -> Dict[Hashable, Any]: return self._attrs @attrs.setter - def attrs(self, value: Mapping[Hashable, Any]) -> None: + def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) @property @@ -3149,10 +3149,10 @@ def to_csv( sep: str = ",", na_rep: str = "", float_format: Optional[str] = None, - columns: Optional[Sequence[Hashable]] = None, + columns: Optional[Sequence[Optional[Hashable]]] = None, header: Union[bool_t, List[str]] = True, index: bool_t = True, - index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None, + index_label: Optional[Union[bool_t, str, Sequence[Optional[Hashable]]]] = None, mode: str = "w", encoding: Optional[str] = None, compression: Optional[Union[str, Mapping[str, str]]] = "infer", diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5c7c56e2a31df..8ba9dbcc575f7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -17,6 +17,7 @@ FrozenSet, Hashable, Iterable, + Optional, Sequence, Tuple, Type, @@ -142,7 +143,7 @@ def pinner(cls): class SeriesGroupBy(GroupBy): _apply_whitelist = base.series_apply_whitelist - def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: + def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield self._selection_name, self._selected_obj @property @@ -926,7 +927,7 @@ def aggregate(self, func=None, *args, **kwargs): agg = aggregate - def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: + def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: obj = self._selected_obj if self.axis == 1: obj = obj.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f88f2e21bd595..7d1c74e415658 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -745,7 +745,7 @@ def _python_apply_general(self, f): keys, values, not_indexed_same=mutated or self.mutated ) - def _iterate_slices(self) -> Iterable[Tuple[Hashable, Series]]: + def _iterate_slices(self) -> Iterable[Tuple[Optional[Hashable], Series]]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): diff --git a/pandas/core/series.py b/pandas/core/series.py index 5f1a7624f47e4..3e9d3d5c04559 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5,7 +5,7 @@ from io import StringIO from shutil import get_terminal_size from textwrap import dedent -from typing import Any, Callable, Hashable, List +from typing import Any, Callable, Hashable, List, Optional import warnings import numpy as np @@ -472,11 +472,11 @@ def dtypes(self): return self._data.dtype @property - def name(self) -> Hashable: + def name(self) -> Optional[Hashable]: return self.attrs.get("name", None) @name.setter - def 
name(self, value: Hashable) -> None:
+    def name(self, value: Optional[Hashable]) -> None:
         if not is_hashable(value):
             raise TypeError("Series.name must be a hashable type")
         self.attrs["name"] = value

From 586a9e72b29a64f891b194d37bfaf4010e679f33 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Tue, 22 Oct 2019 16:11:57 -0700
Subject: [PATCH 062/112] Clarify referred command for flake8 caveat (#29168)

---
 doc/source/development/contributing.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index 62e582dffae47..677e28b60c51d 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -618,7 +618,8 @@ from the root of the pandas repository. Now ``black`` and ``flake8`` will be
 run each time you commit changes. You can skip these checks with
 ``git commit --no-verify``.
 
-This command will catch any stylistic errors in your changes specifically, but
+One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this
+command will catch any stylistic errors in your changes specifically, but
 beware it may not catch all of them. For example, if you delete the only
 usage of an imported function, it is stylistically incorrect to import an
 unused function. However, style-checking the diff will not catch this because

From 1e9a0479b527e9f426f7a35bda19c7c93e51df3e Mon Sep 17 00:00:00 2001
From: Jeroen Kant <45035434+jjlkant@users.noreply.github.com>
Date: Wed, 23 Oct 2019 02:52:50 +0200
Subject: [PATCH 063/112] CLN: Fix typing in pandas/tests/arrays/test_datetimelike.py
 (#28926) (#29014)

---
 pandas/tests/arrays/test_datetimelike.py | 7 ++++++-
 setup.cfg                                | 3 ---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 7c482664bca48..117a19acbfc3a 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -1,3 +1,5 @@
+from typing import Type, Union
+
 import numpy as np
 import pytest
 
@@ -5,6 +7,9 @@
 
 import pandas as pd
 from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
+from pandas.core.indexes.datetimes import DatetimeIndex
+from pandas.core.indexes.period import PeriodIndex
+from pandas.core.indexes.timedeltas import TimedeltaIndex
 import pandas.util.testing as tm
 
 
@@ -52,7 +57,7 @@ def timedelta_index(request):
 
 
 class SharedTests:
-    index_cls = None
+    index_cls = None  # type: Type[Union[DatetimeIndex, PeriodIndex, TimedeltaIndex]]
 
     def test_compare_len1_raises(self):
         # make sure we raise when comparing with different lengths, specific
diff --git a/setup.cfg b/setup.cfg
index 766099a9d7521..f7920fb61b942 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -136,9 +136,6 @@ ignore_errors=True
 [mypy-pandas.tests.arithmetic.test_datetime64]
 ignore_errors=True
 
-[mypy-pandas.tests.arrays.test_datetimelike]
-ignore_errors=True
-
 [mypy-pandas.tests.dtypes.test_common]
 ignore_errors=True
 

From 3fbb0bcbf715ea46d6dbd154cd4c35910fd1b9fa Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 22 Oct 2019 18:38:56 -0700
Subject: [PATCH 064/112] BUG: fix TypeError raised in maybe_downcast_numeric
 (#29103)

---
 pandas/core/dtypes/cast.py                | 2 +-
 pandas/core/groupby/generic.py            | 2 +-
 pandas/tests/dtypes/cast/test_downcast.py | 9 +++++++++
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index dd001e78c07de..7fcaf60088ad2 100644
--- 
a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -202,7 +202,7 @@ def trans(x): r = result.ravel() arr = np.array([r[0]]) - if isna(arr).any() or not np.allclose(arr, trans(arr).astype(dtype), rtol=0): + if isna(arr).any(): # if we have any nulls, then we are done return result diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8ba9dbcc575f7..9c9598273edd3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -262,7 +262,7 @@ def aggregate(self, func=None, *args, **kwargs): try: return self._python_agg_general(func, *args, **kwargs) - except AssertionError: + except (AssertionError, TypeError): raise except Exception: result = self._aggregate_named(func, *args, **kwargs) diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index d574b03a8c724..9e2eca5259bc3 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -1,3 +1,5 @@ +import decimal + import numpy as np import pytest @@ -25,6 +27,13 @@ "infer", np.array([8, 8, 8, 8, 9], dtype=np.int64), ), + ( + # This is a judgement call, but we do _not_ downcast Decimal + # objects + np.array([decimal.Decimal(0.0)]), + "int64", + np.array([decimal.Decimal(0.0)]), + ), ], ) def test_downcast(arr, expected, dtype): From cc93fdc648d4907ba7f71dea1df600dbb2086c4c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 23 Oct 2019 04:46:41 -0700 Subject: [PATCH 065/112] Stop catching TypeError in _aggregate_item_by_item (#29177) --- pandas/core/groupby/generic.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9c9598273edd3..c766fcaa4f849 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1099,10 +1099,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): cast = self._transform_should_cast(func) try: - result[item] = colg.aggregate(func, *args, **kwargs) - if cast: - result[item] = self._try_cast(result[item], data) except ValueError as err: if "Must produce aggregated value" in str(err): @@ -1111,10 +1108,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): raise cannot_agg.append(item) continue - except TypeError as e: - cannot_agg.append(item) - errors = e - continue + + else: + if cast: + result[item] = self._try_cast(result[item], data) result_columns = obj.columns if cannot_agg: From 19796250b35ebf2081d376cb91532d5b91cbbc75 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 23 Oct 2019 04:47:40 -0700 Subject: [PATCH 066/112] CLN: Assorted cleanups (#29175) --- pandas/_libs/hashtable_class_helper.pxi.in | 40 ++++++++++++---------- pandas/_libs/hashtable_func_helper.pxi.in | 29 +++++----------- pandas/_libs/internals.pyx | 2 +- pandas/_libs/interval.pyx | 2 +- pandas/_libs/lib.pyx | 5 +-- pandas/core/generic.py | 2 +- pandas/core/util/hashing.py | 6 ++-- 7 files changed, 37 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 1cbdb0df6233c..c39d6d60d4ea5 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -13,7 +13,7 @@ from pandas._libs.tslibs.util cimport get_c_string {{py: -# name, dtype, arg +# name, dtype, c_type # the generated StringVector is not actually used # but is included for completeness (rather ObjectVector is used # for uniques in hashtables) @@ -24,13 +24,13 @@ dtypes 
= [('Float64', 'float64', 'float64_t'), ('UInt64', 'uint64', 'uint64_t')] }} -{{for name, dtype, arg in dtypes}} +{{for name, dtype, c_type in dtypes}} {{if dtype != 'int64'}} ctypedef struct {{name}}VectorData: - {{arg}} *data + {{c_type}} *data Py_ssize_t n, m {{endif}} @@ -39,7 +39,7 @@ ctypedef struct {{name}}VectorData: @cython.wraparound(False) @cython.boundscheck(False) cdef inline void append_data_{{dtype}}({{name}}VectorData *data, - {{arg}} x) nogil: + {{c_type}} x) nogil: data.data[data.n] = x data.n += 1 @@ -61,14 +61,14 @@ cdef inline bint needs_resize(vector_data *data) nogil: {{py: -# name, dtype, arg, idtype -dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'), - ('UInt64', 'uint64', 'uint64_t', 'np.uint64'), - ('Int64', 'int64', 'int64_t', 'np.int64')] +# name, dtype, c_type +dtypes = [('Float64', 'float64', 'float64_t'), + ('UInt64', 'uint64', 'uint64_t'), + ('Int64', 'int64', 'int64_t')] }} -{{for name, dtype, arg, idtype in dtypes}} +{{for name, dtype, c_type in dtypes}} cdef class {{name}}Vector: @@ -87,13 +87,13 @@ cdef class {{name}}Vector: self.external_view_exists = False self.data.n = 0 self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype={{idtype}}) - self.data.data = <{{arg}}*>self.ao.data + self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) + self.data.data = <{{c_type}}*>self.ao.data cdef resize(self): self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) self.ao.resize(self.data.m, refcheck=False) - self.data.data = <{{arg}}*>self.ao.data + self.data.data = <{{c_type}}*>self.ao.data def __dealloc__(self): if self.data is not NULL: @@ -113,7 +113,7 @@ cdef class {{name}}Vector: self.external_view_exists = True return self.ao - cdef inline void append(self, {{arg}} x): + cdef inline void append(self, {{c_type}} x): if needs_resize(self.data): if self.external_view_exists: @@ -123,7 +123,7 @@ cdef class {{name}}Vector: append_data_{{dtype}}(self.data, x) - cdef extend(self, const {{arg}}[:] x): + cdef extend(self, const {{c_type}}[:] x): for i in range(len(x)): self.append(x[i]) @@ -279,7 +279,8 @@ cdef class {{name}}HashTable(HashTable): self.table = NULL def __contains__(self, object key): - cdef khiter_t k + cdef: + khiter_t k k = kh_get_{{dtype}}(self.table, key) return k != self.table.n_buckets @@ -290,7 +291,8 @@ cdef class {{name}}HashTable(HashTable): sizeof(uint32_t)) # flags cpdef get_item(self, {{dtype}}_t val): - cdef khiter_t k + cdef: + khiter_t k k = kh_get_{{dtype}}(self.table, val) if k != self.table.n_buckets: return self.table.vals[k] @@ -899,7 +901,8 @@ cdef class PyObjectHashTable(HashTable): return self.table.size def __contains__(self, object key): - cdef khiter_t k + cdef: + khiter_t k hash(key) k = kh_get_pymap(self.table, key) @@ -912,7 +915,8 @@ cdef class PyObjectHashTable(HashTable): sizeof(uint32_t)) # flags cpdef get_item(self, object val): - cdef khiter_t k + cdef: + khiter_t k k = kh_get_pymap(self.table, val) if k != self.table.n_buckets: diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index f6af93f85bd5a..c4284ae403e5c 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -4,13 +4,9 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -# ---------------------------------------------------------------------- -# VectorData -# ---------------------------------------------------------------------- - {{py: -# dtype, ttype +# dtype, 
ttype, c_type dtypes = [('float64', 'float64', 'float64_t'), ('uint64', 'uint64', 'uint64_t'), ('object', 'pymap', 'object'), @@ -18,7 +14,7 @@ dtypes = [('float64', 'float64', 'float64_t'), }} -{{for dtype, ttype, scalar in dtypes}} +{{for dtype, ttype, c_type in dtypes}} @cython.wraparound(False) @@ -34,7 +30,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, khiter_t k Py_ssize_t i, n = len(values) - {{scalar}} val + {{c_type}} val int ret = 0 @@ -77,7 +73,7 @@ cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, {{if dtype == 'object'}} cpdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna): {{else}} -cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna): +cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): {{endif}} cdef: Py_ssize_t i = 0 @@ -127,13 +123,9 @@ cpdef value_count_{{dtype}}({{scalar}}[:] values, bint dropna): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} - - def duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first'): {{else}} - - -def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): +def duplicated_{{dtype}}({{c_type}}[:] values, object keep='first'): {{endif}} cdef: int ret = 0 @@ -212,15 +204,10 @@ def duplicated_{{dtype}}({{scalar}}[:] values, object keep='first'): @cython.wraparound(False) @cython.boundscheck(False) {{if dtype == 'object'}} - - -def ismember_{{dtype}}(ndarray[{{scalar}}] arr, ndarray[{{scalar}}] values): +def ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values): {{else}} - - -def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): +def ismember_{{dtype}}({{c_type}}[:] arr, {{c_type}}[:] values): {{endif}} - """ Return boolean of values in arr on an element by-element basis @@ -238,7 +225,7 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values): Py_ssize_t i, n, k int ret = 0 ndarray[uint8_t] result - {{scalar}} val + {{c_type}} val kh_{{ttype}}_t *table = kh_init_{{ttype}}() # construct the table diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 5f697f282fee5..48190d123f4a9 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -284,7 +284,7 @@ cdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): return start, stop, step, length -def slice_getitem(slice slc not None, ind): +cdef slice_getitem(slice slc, ind): cdef: Py_ssize_t s_start, s_stop, s_step, s_len Py_ssize_t ind_start, ind_stop, ind_step, ind_len diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 6a3f20928f64b..1a712d0c4efa8 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -18,7 +18,7 @@ cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.hashtable cimport Int64Vector, Int64VectorData +from pandas._libs.hashtable cimport Int64Vector from pandas._libs.tslibs.util cimport is_integer_object, is_float_object from pandas._libs.tslibs import Timestamp diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index b13246a4a969c..7fc4fede1996b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -9,12 +9,9 @@ import warnings import cython from cython import Py_ssize_t -from cpython.list cimport PyList_New -from cpython.object cimport (PyObject_Str, PyObject_RichCompareBool, Py_EQ, - Py_SIZE) +from cpython.object cimport PyObject_RichCompareBool, Py_EQ from cpython.ref cimport Py_INCREF from cpython.tuple cimport PyTuple_SET_ITEM, PyTuple_New -from cpython.unicode cimport PyUnicode_Join from cpython.datetime 
cimport (PyDateTime_Check, PyDate_Check, PyTime_Check, PyDelta_Check, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d3c32511b40c3..d59ce8db9ba8e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10852,7 +10852,7 @@ def transform(self, func, *args, **kwargs): Also returns None for empty %(klass)s. """ - def _find_valid_index(self, how): + def _find_valid_index(self, how: str): """ Retrieves the index of the first valid value. diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index e3617d53b000a..fddbea8ed0d7a 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -5,8 +5,8 @@ import numpy as np +from pandas._libs import Timestamp import pandas._libs.hashing as hashing -import pandas._libs.tslibs as tslibs from pandas.core.dtypes.cast import infer_dtype_from_scalar from pandas.core.dtypes.common import ( @@ -337,8 +337,8 @@ def _hash_scalar(val, encoding: str = "utf8", hash_key=None): # for tz-aware datetimes, we need the underlying naive UTC value and # not the tz aware object or pd extension type (as # infer_dtype_from_scalar would do) - if not isinstance(val, tslibs.Timestamp): - val = tslibs.Timestamp(val) + if not isinstance(val, Timestamp): + val = Timestamp(val) val = val.tz_convert(None) dtype, val = infer_dtype_from_scalar(val) From dd5d1c51481a1c92b1a43d54bc0db776bbf80233 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Wed, 23 Oct 2019 16:23:09 +0200 Subject: [PATCH 067/112] Remove TestData from series-tests test_repr.py (#29148) --- pandas/tests/series/test_repr.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 3c6da304dd68d..9f881f5a5aa29 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -17,10 +17,8 @@ from pandas.core.index import MultiIndex import pandas.util.testing as tm -from .common import TestData - -class TestSeriesRepr(TestData): +class TestSeriesRepr: def test_multilevel_name_print(self): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], @@ -67,24 +65,24 @@ def test_name_printing(self): s = Series(index=date_range("20010101", "20020101"), name="test") assert "Name: test" in repr(s) - def test_repr(self): - str(self.ts) - str(self.series) - str(self.series.astype(int)) - str(self.objSeries) + def test_repr(self, datetime_series, string_series, object_series): + str(datetime_series) + str(string_series) + str(string_series.astype(int)) + str(object_series) str(Series(tm.randn(1000), index=np.arange(1000))) str(Series(tm.randn(1000), index=np.arange(1000, 0, step=-1))) # empty - str(self.empty) + str(Series()) # with NaNs - self.series[5:7] = np.NaN - str(self.series) + string_series[5:7] = np.NaN + str(string_series) # with Nones - ots = self.ts.astype("O") + ots = datetime_series.astype("O") ots[::2] = None repr(ots) @@ -102,8 +100,8 @@ def test_repr(self): ("\u03B1", "\u03B2", "\u03B3"), ("\u03B1", "bar"), ]: - self.series.name = name - repr(self.series) + string_series.name = name + repr(string_series) biggie = Series( tm.randn(1000), index=np.arange(1000), name=("foo", "bar", "baz") From 091bb06eeaca4af0c7479b25e0cbb24daaa0e73f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 23 Oct 2019 17:50:58 +0200 Subject: [PATCH 068/112] BUG/TST: ensure groupby.agg preserves extension dtype (#29144) --- pandas/core/groupby/ops.py | 8 ++- .../tests/extension/decimal/test_decimal.py | 52 
+++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 00e7012b40986..e6f4f2f056058 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -672,7 +672,13 @@ def agg_series(self, obj, func):
             pass
         else:
             raise
-        return self._aggregate_series_pure_python(obj, func)
+        except TypeError as err:
+            if "ndarray" in str(err):
+                # raised in libreduction if obj's values is not an ndarray
+                pass
+            else:
+                raise
+        return self._aggregate_series_pure_python(obj, func)
 
     def _aggregate_series_fast(self, obj, func):
         func = self._is_builtin_func(func)
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 3ac9d37ccf4f3..86724d4d09819 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -426,3 +426,55 @@ def test_array_ufunc_series_defer():
 
     tm.assert_series_equal(r1, expected)
     tm.assert_series_equal(r2, expected)
+
+
+def test_groupby_agg():
+    # Ensure that the result of agg is inferred to be decimal dtype
+    # https://github.com/pandas-dev/pandas/issues/29141
+
+    data = make_data()[:5]
+    df = pd.DataFrame(
+        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
+    )
+
+    # single key, selected column
+    expected = pd.Series(to_decimal([data[0], data[3]]))
+    result = df.groupby("id1")["decimals"].agg(lambda x: x.iloc[0])
+    tm.assert_series_equal(result, expected, check_names=False)
+    result = df["decimals"].groupby(df["id1"]).agg(lambda x: x.iloc[0])
+    tm.assert_series_equal(result, expected, check_names=False)
+
+    # multiple keys, selected column
+    expected = pd.Series(
+        to_decimal([data[0], data[1], data[3]]),
+        index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 1)]),
+    )
+    result = df.groupby(["id1", "id2"])["decimals"].agg(lambda x: x.iloc[0])
+    tm.assert_series_equal(result, expected, check_names=False)
+    result = df["decimals"].groupby([df["id1"], df["id2"]]).agg(lambda x: x.iloc[0])
+    tm.assert_series_equal(result, expected, check_names=False)
+
+    # multiple columns
+    expected = pd.DataFrame({"id2": [0, 1], "decimals": to_decimal([data[0], data[3]])})
+    result = df.groupby("id1").agg(lambda x: x.iloc[0])
+    tm.assert_frame_equal(result, expected, check_names=False)
+
+
+def test_groupby_agg_ea_method(monkeypatch):
+    # Ensure that the result of agg is inferred to be decimal dtype
+    # https://github.com/pandas-dev/pandas/issues/29141
+
+    def DecimalArray__my_sum(self):
+        return np.sum(np.array(self))
+
+    monkeypatch.setattr(DecimalArray, "my_sum", DecimalArray__my_sum, raising=False)
+
+    data = make_data()[:5]
+    df = pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)})
+    expected = pd.Series(to_decimal([data[0] + data[1] + data[2], data[3] + data[4]]))
+
+    result = df.groupby("id")["decimals"].agg(lambda x: x.values.my_sum())
+    tm.assert_series_equal(result, expected, check_names=False)
+    s = pd.Series(DecimalArray(data))
+    result = s.groupby(np.array([0, 0, 0, 1, 1])).agg(lambda x: x.values.my_sum())
+    tm.assert_series_equal(result, expected, check_names=False)
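
The next patch fleshes out the ``Period`` docstrings. For orientation, a
minimal sketch of the two methods being documented (illustrative only; the
outputs shown are for pandas of this era, where '4Q2005' parses with an
inferred quarterly frequency):

    import pandas as pd

    p = pd.Period("4Q2005")   # Period('2005Q4', 'Q-DEC')
    p.asfreq("M", how="S")    # Period('2005-10', 'M'), start of the quarter
    p.asfreq("M", how="E")    # Period('2005-12', 'M'), end of the quarter
    p.to_timestamp(how="S")   # Timestamp('2005-10-01 00:00:00')
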
From 69838006def884566592c50f68e1d23f4637c093 Mon Sep 17 00:00:00 2001
From: Hugh Kelley <38143549+HughKelley@users.noreply.github.com>
Date: Wed, 23 Oct 2019 12:07:22 -0400
Subject: [PATCH 069/112] DOC: improve Period and PeriodDtype docstrings
 (GH29073) (#29159)

---
 pandas/_libs/tslibs/period.pyx | 35 ++++++++++++++++++++++------------
 pandas/core/dtypes/dtypes.py   |  2 +-
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
index 84a41b8757001..aed64aff14e0a 100644
--- a/pandas/_libs/tslibs/period.pyx
+++ b/pandas/_libs/tslibs/period.pyx
@@ -1709,14 +1709,14 @@ cdef class _Period:
 
     def asfreq(self, freq, how='E'):
         """
-        Convert Period to desired frequency, either at the start or end of the
-        interval.
+        Convert Period to desired frequency, at the start or end of the interval.
 
         Parameters
         ----------
-        freq : string
+        freq : str
+            The desired frequency.
         how : {'E', 'S', 'end', 'start'}, default 'end'
-            Start or end of the timespan
+            Start or end of the timespan.
 
         Returns
         -------
@@ -1776,17 +1776,19 @@ cdef class _Period:
 
     def to_timestamp(self, freq=None, how='start', tz=None):
         """
-        Return the Timestamp representation of the Period at the target
-        frequency at the specified end (how) of the Period.
+        Return the Timestamp representation of the Period.
+
+        Uses the target frequency, anchored at the start or end of the
+        period as specified by `how` ('Start' or 'Finish').
 
         Parameters
         ----------
-        freq : string or DateOffset
+        freq : str or DateOffset
             Target frequency. Default is 'D' if self.freq is week or
-            longer and 'S' otherwise
+            longer and 'S' otherwise.
         how : str, default 'S' (start)
-            'S', 'E'. Can be aliased as case insensitive
-            'Start', 'Finish', 'Begin', 'End'
+            One of 'S', 'E'. Can be aliased as case insensitive
+            'Start', 'Finish', 'Begin', 'End'.
 
         Returns
         -------
@@ -2385,16 +2387,25 @@ class Period(_Period):
     Parameters
     ----------
     value : Period or str, default None
-        The time period represented (e.g., '4Q2005')
+        The time period represented (e.g., '4Q2005').
     freq : str, default None
-        One of pandas period strings or corresponding objects
+        One of pandas period strings or corresponding objects.
+    ordinal : int, default None
+        The period offset from the proleptic Gregorian epoch.
     year : int, default None
+        Year value of the period.
     month : int, default 1
+        Month value of the period.
     quarter : int, default None
+        Quarter value of the period.
     day : int, default 1
+        Day value of the period.
     hour : int, default 0
+        Hour value of the period.
     minute : int, default 0
+        Minute value of the period.
     second : int, default 0
+        Second value of the period.
     """
 
     def __new__(cls, value=None, freq=None, ordinal=None,
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 5c8dbd6d68a50..7dca588e33839 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -793,7 +793,7 @@ class PeriodDtype(PandasExtensionDtype):
     Parameters
     ----------
     freq : str or DateOffset
-        The frequency of this PeriodDtype
+        The frequency of this PeriodDtype.
Attributes ---------- From 9a98680004a38d4e31e4bc4b145ef58d60f18e41 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 23 Oct 2019 10:26:15 -0700 Subject: [PATCH 070/112] TST: Test Series integer mod behavior on string (#29180) --- pandas/tests/series/test_operators.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 942ab0db37a57..4514164683015 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -878,6 +878,17 @@ def test_divmod(self): assert_series_equal(result[0], expected[0]) assert_series_equal(result[1], expected[1]) + @pytest.mark.parametrize("index", [None, range(9)]) + def test_series_integer_mod(self, index): + # see gh-24396 + s1 = Series(range(1, 10)) + s2 = Series("foo", index=index) + + msg = "not all arguments converted during string formatting" + + with pytest.raises(TypeError, match=msg): + s2 % s1 + class TestSeriesUnaryOps: # __neg__, __pos__, __inv__ From 0aa913e929ed66aa74d1caf940a326c616aba9f1 Mon Sep 17 00:00:00 2001 From: Mohit Anand Date: Wed, 23 Oct 2019 23:18:07 +0530 Subject: [PATCH 071/112] To string with encoding (#28951) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/frame.py | 7 ++++- pandas/io/formats/format.py | 10 +++++-- pandas/tests/io/formats/test_format.py | 36 ++++++++++++++++++++------ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 01a102f269886..cbc76127962d7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -109,6 +109,7 @@ Other enhancements (:issue:`28368`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`) +- Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) Build Changes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a4d3e7058d7de..ef4e3e064d85e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -754,6 +754,7 @@ def to_string( decimal: str = ".", line_width: Optional[int] = None, max_colwidth: Optional[int] = None, + encoding: Optional[str] = None, ) -> Optional[str]: """ Render a DataFrame to a console-friendly tabular output. @@ -764,6 +765,10 @@ def to_string( Max width to truncate each column in characters. By default, no limit. .. versionadded:: 1.0.0 + encoding : str, default "utf-8" + Set character encoding. + + .. 
versionadded:: 1.0 %(returns)s See Also -------- @@ -802,7 +807,7 @@ def to_string( decimal=decimal, line_width=line_width, ) - return formatter.to_string(buf=buf) + return formatter.to_string(buf=buf, encoding=encoding) # ---------------------------------------------------------------------- diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 6ddba6a297bdc..17603809c2ea6 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -486,6 +486,8 @@ def get_buffer( if encoding is None: encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") if hasattr(buf, "write"): yield buf @@ -896,8 +898,12 @@ def _join_multiline(self, *args) -> str: st = ed return "\n\n".join(str_lst) - def to_string(self, buf: Optional[FilePathOrBuffer[str]] = None) -> Optional[str]: - return self.get_result(buf=buf) + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + return self.get_result(buf=buf, encoding=encoding) def to_latex( self, diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 454e2afb8abe0..9aba4c8aa5019 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -73,17 +73,19 @@ def filepath_or_buffer(filepath_or_buffer_id, tmp_path): @pytest.fixture -def assert_filepath_or_buffer_equals(filepath_or_buffer, filepath_or_buffer_id): +def assert_filepath_or_buffer_equals( + filepath_or_buffer, filepath_or_buffer_id, encoding +): """ Assertion helper for checking filepath_or_buffer. """ def _assert_filepath_or_buffer_equals(expected): if filepath_or_buffer_id == "string": - with open(filepath_or_buffer) as f: + with open(filepath_or_buffer, encoding=encoding) as f: result = f.read() elif filepath_or_buffer_id == "pathlike": - result = filepath_or_buffer.read_text() + result = filepath_or_buffer.read_text(encoding=encoding) elif filepath_or_buffer_id == "buffer": result = filepath_or_buffer.getvalue() assert result == expected @@ -3240,14 +3242,32 @@ def test_repr_html_ipython_config(ip): @pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"]) +@pytest.mark.parametrize( + "encoding, data", + [(None, "abc"), ("utf-8", "abc"), ("gbk", "造成输出中文显示乱码"), ("foo", "abc")], +) def test_filepath_or_buffer_arg( - float_frame, method, filepath_or_buffer, assert_filepath_or_buffer_equals + method, + filepath_or_buffer, + assert_filepath_or_buffer_equals, + encoding, + data, + filepath_or_buffer_id, ): - df = float_frame - expected = getattr(df, method)() + df = DataFrame([data]) - getattr(df, method)(buf=filepath_or_buffer) - assert_filepath_or_buffer_equals(expected) + if filepath_or_buffer_id not in ["string", "pathlike"] and encoding is not None: + with pytest.raises( + ValueError, match="buf is not a file name and encoding is specified." 
+        ):
+            getattr(df, method)(buf=filepath_or_buffer, encoding=encoding)
+    elif encoding == "foo":
+        with pytest.raises(LookupError, match="unknown encoding"):
+            getattr(df, method)(buf=filepath_or_buffer, encoding=encoding)
+    else:
+        expected = getattr(df, method)()
+        getattr(df, method)(buf=filepath_or_buffer, encoding=encoding)
+        assert_filepath_or_buffer_equals(expected)
 
 
 @pytest.mark.parametrize("method", ["to_string", "to_html", "to_latex"])

From b6346369f66d8e790cf2b85a25c07a4f87d78cda Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Wed, 23 Oct 2019 14:36:22 -0500
Subject: [PATCH 072/112] DOC: Update the LaTeX author. (#29184)

The LaTeX engine did not like the newlines. It caused an undefined control sequence.
---
 doc/source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index cdabf2d470839..13d3324caf249 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -378,7 +378,7 @@
         "index",
         "pandas.tex",
         "pandas: powerful Python data analysis toolkit",
-        r"Wes McKinney\n\& PyData Development Team",
+        "Wes McKinney and the Pandas Development Team",
         "manual",
     )
 ]

From 4879491022055a230861ed27c454a1e028423f74 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Wed, 23 Oct 2019 13:08:07 -0700
Subject: [PATCH 073/112] TST: Document and test na_filter in read_excel (#29171)

---
 pandas/io/excel/_base.py              | 20 ++++++++++++++++++--
 pandas/tests/io/excel/test_readers.py | 21 +++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 6eb1b9e950dfd..8574c9ad1d425 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -133,8 +133,24 @@
     + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent="    ")
     + """'.
 keep_default_na : bool, default True
-    If na_values are specified and keep_default_na is False the default NaN
-    values are overridden, otherwise they're appended to.
+    Whether or not to include the default NaN values when parsing the data.
+    Depending on whether `na_values` is passed in, the behavior is as follows:
+
+    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
+      is appended to the default NaN values used for parsing.
+    * If `keep_default_na` is True, and `na_values` are not specified, only
+      the default NaN values are used for parsing.
+    * If `keep_default_na` is False, and `na_values` are specified, only
+      the NaN values specified in `na_values` are used for parsing.
+    * If `keep_default_na` is False, and `na_values` are not specified, no
+      strings will be parsed as NaN.
+
+    Note that if `na_filter` is passed in as False, the `keep_default_na` and
+    `na_values` parameters will be ignored.
+na_filter : bool, default True
+    Detect missing value markers (empty strings and the value of na_values). In
+    data without any NAs, passing na_filter=False can improve the performance
+    of reading a large file.
 verbose : bool, default False
     Indicate number of NA values placed in non-numeric columns.
 parse_dates : bool, list-like, or dict, default False
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 5326f2df68972..1d3653f685e1e 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -870,6 +870,27 @@ def test_excel_passes_na(self, read_ext):
         )
         tm.assert_frame_equal(parsed, expected)
 
+    @pytest.mark.parametrize("na_filter", [None, True, False])
+    def test_excel_passes_na_filter(self, read_ext, na_filter):
+        # gh-25453
+        kwargs = {}
+
+        if na_filter is not None:
+            kwargs["na_filter"] = na_filter
+
+        with pd.ExcelFile("test5" + read_ext) as excel:
+            parsed = pd.read_excel(
+                excel, "Sheet1", keep_default_na=True, na_values=["apple"], **kwargs
+            )
+
+        if na_filter is False:
+            expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]]
+        else:
+            expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]]
+
+        expected = DataFrame(expected, columns=["Test"])
+        tm.assert_frame_equal(parsed, expected)
+
     @pytest.mark.parametrize("arg", ["sheet", "sheetname", "parse_cols"])
     def test_unexpected_kwargs_raises(self, read_ext, arg):
         # gh-17964
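
A short usage sketch of the ``na_filter`` behavior documented and tested
above (the file name mirrors the ``"test5" + read_ext`` fixture from the
test; any sheet containing NA markers behaves the same way):

    import pandas as pd

    # Default na_filter=True: built-in markers ("nan", "1.#QNAN", empty
    # cells) plus any user-supplied na_values are converted to NaN.
    pd.read_excel("test5.xlsx", na_values=["apple"])

    # na_filter=False disables NA detection entirely; keep_default_na and
    # na_values are ignored and the raw strings come through unchanged.
    pd.read_excel("test5.xlsx", na_filter=False)
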
From bec7043ed31fe65eb7f33e30403755d4c26b44a2 Mon Sep 17 00:00:00 2001
From: William Ayd
Date: Wed, 23 Oct 2019 14:18:05 -0700
Subject: [PATCH 074/112] CI: Avoid timeouts in Python 3.8 build (#29185)

---
 ci/build38.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/build38.sh b/ci/build38.sh
index 903016536d240..66eb5cad38475 100644
--- a/ci/build38.sh
+++ b/ci/build38.sh
@@ -9,7 +9,7 @@ pip install python-dateutil pytz cython pytest pytest-xdist hypothesis
 pip install --pre -f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com/ numpy
 
 python setup.py build_ext -inplace
-python -m pip install --no-build-isolation -e .
+python -m pip install -v --no-build-isolation -e .
 
 python -c "import sys; print(sys.version_info)"
 python -c "import pandas as pd"

From eda17f9fba673a9605261e468000487f6bb198ae Mon Sep 17 00:00:00 2001
From: rhstanton
Date: Wed, 23 Oct 2019 17:04:12 -0700
Subject: [PATCH 075/112] BUG: Increase range of dates for holiday calculations
 (#27790)

---
 doc/source/whatsnew/v1.0.0.rst                |  3 +++
 pandas/tests/groupby/test_categorical.py      |  3 ++-
 pandas/tests/tseries/holiday/test_calendar.py | 19 ++++++++++++++++++-
 pandas/tseries/holiday.py                     |  2 +-
 4 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index cbc76127962d7..8c1ce1195369d 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -413,6 +413,9 @@ Other
 - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
 - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
 - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
+- Fix :class:`AbstractHolidayCalendar` to return correct results for
+  years after 2030 (now goes up to 2200) (:issue:`27790`)
+
 
 .. _whatsnew_1000.contributors:
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 5391cb5ce821f..0e30b104bf9d2 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -784,7 +784,8 @@ def test_categorical_no_compress():
 
 def test_sort():
 
-    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: E501
+    # http://stackoverflow.com/questions/23814368/sorting-pandas-
+    # categorical-labels-after-groupby
     # This should result in a properly sorted Series so that the plot
     # has a sorted x axis
     # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py
index 79c28942769f0..c122f92ed228c 100644
--- a/pandas/tests/tseries/holiday/test_calendar.py
+++ b/pandas/tests/tseries/holiday/test_calendar.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from pandas import DatetimeIndex
+from pandas import DatetimeIndex, offsets, to_datetime
 import pandas.util.testing as tm
 
 from pandas.tseries.holiday import (
@@ -10,6 +10,7 @@
     Holiday,
     Timestamp,
     USFederalHolidayCalendar,
+    USLaborDay,
     USThanksgivingDay,
     get_calendar,
 )
@@ -81,3 +82,19 @@ def test_calendar_observance_dates():
 def test_rule_from_name():
     us_fed_cal = get_calendar("USFederalHolidayCalendar")
     assert us_fed_cal.rule_from_name("Thanksgiving") == USThanksgivingDay
+
+
+def test_calendar_2031():
+    # See gh-27790
+    #
+    # Labor Day 2031 is on September 1. Saturday before is August 30.
+    # Next working day after August 30 ought to be Tuesday, September 2.
+
+    class testCalendar(AbstractHolidayCalendar):
+        rules = [USLaborDay]
+
+    cal = testCalendar()
+    workDay = offsets.CustomBusinessDay(calendar=cal)
+    Sat_before_Labor_Day_2031 = to_datetime("2031-08-30")
+    next_working_day = Sat_before_Labor_Day_2031 + 0 * workDay
+    assert next_working_day == to_datetime("2031-09-02")
diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py
index 1654163d2a9e0..eb8600031439f 100644
--- a/pandas/tseries/holiday.py
+++ b/pandas/tseries/holiday.py
@@ -346,7 +346,7 @@ class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass):
 
     rules = []  # type: List[Holiday]
     start_date = Timestamp(datetime(1970, 1, 1))
-    end_date = Timestamp(datetime(2030, 12, 31))
+    end_date = Timestamp(datetime(2200, 12, 31))
     _cache = None
 
     def __init__(self, name=None, rules=None):
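
The one-line ``end_date`` change above is easiest to see with business-day
arithmetic; a sketch following the new test (Labor Day 2031 falls on
Monday, September 1):

    from pandas import offsets, to_datetime
    from pandas.tseries.holiday import USFederalHolidayCalendar

    bday = offsets.CustomBusinessDay(calendar=USFederalHolidayCalendar())
    # With the old 2030 cutoff the calendar produced no 2031 holidays, so
    # rolling forward from the preceding Saturday landed on the holiday
    # itself; it now correctly gives Tuesday, September 2.
    to_datetime("2031-08-30") + 0 * bday
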
From c56d68a7f0b9626c0156cc4d72f2494e5d1b4a53 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 24 Oct 2019 05:03:19 -0700
Subject: [PATCH 076/112] CLN: preempt TypeError for EAs in groupby agg_series
 (#29186)

---
 pandas/core/groupby/generic.py |  7 ++++++-
 pandas/core/groupby/ops.py     | 10 ++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index c766fcaa4f849..2afb77a619a80 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -264,7 +264,12 @@ def aggregate(self, func=None, *args, **kwargs):
             return self._python_agg_general(func, *args, **kwargs)
         except (AssertionError, TypeError):
             raise
-        except Exception:
+        except (ValueError, KeyError, AttributeError, IndexError):
+            # TODO: IndexError can be removed here following GH#29106
+            # TODO: AttributeError is caused by _index_data hijinx in
+            #  libreduction, can be removed after GH#29160
+            # TODO: KeyError is raised in _python_agg_general,
+            #  see test_groupby.test_basic
             result = self._aggregate_named(func, *args, **kwargs)
 
         index = Index(sorted(result), name=self.grouper.names[0])
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index e6f4f2f056058..fbe1598767736 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -26,6 +26,7 @@
     is_complex_dtype,
     is_datetime64_any_dtype,
     is_datetime64tz_dtype,
+    is_extension_array_dtype,
     is_integer_dtype,
     is_numeric_dtype,
     is_sparse,
@@ -659,6 +660,12 @@ def _transform(
         return result
 
     def agg_series(self, obj, func):
+        if is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
+            # _aggregate_series_fast would raise TypeError when
+            #  calling libreduction.Slider
+            # TODO: is the datetime64tz case supposed to go through here?
+            return self._aggregate_series_pure_python(obj, func)
+
         try:
             return self._aggregate_series_fast(obj, func)
         except AssertionError:
@@ -683,6 +690,8 @@ def agg_series(self, obj, func):
 
     def _aggregate_series_fast(self, obj, func):
         func = self._is_builtin_func(func)
+        # TODO: pre-empt this, also pre-empt get_result raising TypeError if we pass an EA
+
         # for EAs backed by ndarray we may have a performant workaround
         if obj.index._has_complex_internals:
             raise TypeError("Incompatible index for Cython grouper")
@@ -717,6 +726,7 @@ def _aggregate_series_pure_python(self, obj, func):
                 result[label] = res
 
         result = lib.maybe_convert_objects(result, try_float=0)
+        # TODO: try_cast back to EA?
         return result, counts
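
Since the change above routes every non-datetimelike extension dtype
straight to the pure-Python aggregator, the affected call looks roughly
like this (nullable ``Int64`` stands in here for any extension array):

    import numpy as np
    import pandas as pd

    # "Int64" is ExtensionArray-backed, so agg_series() now returns
    # _aggregate_series_pure_python(...) without trying the fast path.
    s = pd.Series(pd.array([1, 2, 3, 4], dtype="Int64"))
    s.groupby(np.array([0, 0, 1, 1])).agg(lambda x: x.sum())  # 0 -> 3, 1 -> 7
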
From 051e1528e7affaf2188dd0416b31b24159de018f Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 24 Oct 2019 05:06:26 -0700
Subject: [PATCH 077/112] CLN: don't catch Exception in _aggregate_frame (#29194)

---
 pandas/core/groupby/generic.py       | 11 ++---------
 pandas/tests/groupby/test_groupby.py |  2 +-
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 2afb77a619a80..695823e29ef1b 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1079,15 +1079,8 @@ def _aggregate_frame(self, func, *args, **kwargs):
         else:
             for name in self.indices:
                 data = self.get_group(name, obj=obj)
-                try:
-                    fres = func(data, *args, **kwargs)
-                except AssertionError:
-                    raise
-                except Exception:
-                    wrapper = lambda x: func(x, *args, **kwargs)
-                    result[name] = data.apply(wrapper, axis=axis)
-                else:
-                    result[name] = self._try_cast(fres, data)
+                fres = func(data, *args, **kwargs)
+                result[name] = self._try_cast(fres, data)
 
         return self._wrap_frame_output(result, obj)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index dff5baa9b5984..43e2a6f040414 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -775,7 +775,7 @@ def test_omit_nuisance(df):
 
     # won't work with axis = 1
     grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
-    msg = r'\("unsupported operand type\(s\) for \+: ' "'Timestamp' and 'float'\", 0"
+    msg = r"unsupported operand type\(s\) for \+: 'Timestamp'"
     with pytest.raises(TypeError, match=msg):
         grouped.agg(lambda x: x.sum(0, numeric_only=False))

From fbe52f804e2f66afe0c2e48e1afa718ff902d623 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 24 Oct 2019 05:10:07 -0700
Subject: [PATCH 078/112] CLN: remove Block._try_coerce_arg (#29139)

---
 pandas/_libs/index.pyx                   |  22 ++--
 pandas/core/internals/blocks.py          | 145 +++++-----------------
 pandas/tests/internals/test_internals.py |  24 ++--
 3 files changed, 60 insertions(+), 131 deletions(-)

diff --git a/pandas/_libs/index.pyx 
From fbe52f804e2f66afe0c2e48e1afa718ff902d623 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Thu, 24 Oct 2019 05:10:07 -0700
Subject: [PATCH 078/112] CLN: remove Block._try_coerce_arg (#29139)

---
 pandas/_libs/index.pyx                   |  22 ++--
 pandas/core/internals/blocks.py          | 145 +++++------------------
 pandas/tests/internals/test_internals.py |  24 ++--
 3 files changed, 60 insertions(+), 131 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 144d555258c50..255fd85531d14 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -18,6 +18,7 @@ cnp.import_array()
 cimport pandas._libs.util as util

 from pandas._libs.tslibs.conversion cimport maybe_datetimelike_to_i8
+from pandas._libs.tslibs.nattype cimport c_NaT as NaT

 from pandas._libs.hashtable cimport HashTable

@@ -547,30 +548,31 @@ cpdef convert_scalar(ndarray arr, object value):
         if util.is_array(value):
             pass
         elif isinstance(value, (datetime, np.datetime64, date)):
-            return Timestamp(value).value
+            return Timestamp(value).to_datetime64()
         elif util.is_timedelta64_object(value):
             # exclude np.timedelta64("NaT") from value != value below
             pass
         elif value is None or value != value:
-            return NPY_NAT
-        elif isinstance(value, str):
-            return Timestamp(value).value
-        raise ValueError("cannot set a Timestamp with a non-timestamp")
+            return np.datetime64("NaT", "ns")
+        raise ValueError("cannot set a Timestamp with a non-timestamp {typ}"
+                         .format(typ=type(value).__name__))

     elif arr.descr.type_num == NPY_TIMEDELTA:
         if util.is_array(value):
             pass
         elif isinstance(value, timedelta) or util.is_timedelta64_object(value):
-            return Timedelta(value).value
+            value = Timedelta(value)
+            if value is NaT:
+                return np.timedelta64("NaT", "ns")
+            return value.to_timedelta64()
         elif util.is_datetime64_object(value):
             # exclude np.datetime64("NaT") which would otherwise be picked up
             # by the `value != value` check below
             pass
         elif value is None or value != value:
-            return NPY_NAT
-        elif isinstance(value, str):
-            return Timedelta(value).value
-        raise ValueError("cannot set a Timedelta with a non-timedelta")
+            return np.timedelta64("NaT", "ns")
+        raise ValueError("cannot set a Timedelta with a non-timedelta {typ}"
+                         .format(typ=type(value).__name__))

     if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and
             not issubclass(arr.dtype.type, np.bool_)):
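# A sketch (not from the patch) of the coercions convert_scalar now performs,
# expressed with public numpy/pandas equivalents:
#
#     import numpy as np
#     import pandas as pd
#
#     arr = np.array(["2019-01-01", "2019-01-02"], dtype="M8[ns]")
#     arr[0] = pd.Timestamp("2019-06-01").to_datetime64()
#     arr[1] = np.datetime64("NaT", "ns")  # what None/NaN coerce to
#
#     tds = np.array([0, 0], dtype="m8[ns]")
#     tds[0] = pd.Timedelta("1 hour").to_timedelta64()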
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 1495be1f26df5..51108d9a5a573 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1,4 +1,4 @@
-from datetime import date, datetime, timedelta
+from datetime import datetime, timedelta
 import functools
 import inspect
 import re
@@ -7,7 +7,8 @@

 import numpy as np

-from pandas._libs import NaT, Timestamp, lib, tslib, writers
+from pandas._libs import NaT, lib, tslib, writers
+from pandas._libs.index import convert_scalar
 import pandas._libs.internals as libinternals
 from pandas._libs.tslibs import Timedelta, conversion
 from pandas._libs.tslibs.timezones import tz_compare
@@ -54,7 +55,6 @@
 from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
-    ABCDatetimeIndex,
     ABCExtensionArray,
     ABCPandasArray,
     ABCSeries,
@@ -64,7 +64,6 @@
     array_equivalent,
     is_valid_nat_for_dtype,
     isna,
-    notna,
 )

 import pandas.core.algorithms as algos
@@ -663,28 +662,6 @@ def _can_hold_element(self, element: Any) -> bool:
             return issubclass(tipo.type, dtype)
         return isinstance(element, dtype)

-    def _try_coerce_args(self, other):
-        """ provide coercion to our input arguments """
-
-        if np.any(notna(other)) and not self._can_hold_element(other):
-            # coercion issues
-            # let higher levels handle
-            raise TypeError(
-                "cannot convert {} to an {}".format(
-                    type(other).__name__,
-                    type(self).__name__.lower().replace("Block", ""),
-                )
-            )
-        if np.any(isna(other)) and not self._can_hold_na:
-            raise TypeError(
-                "cannot convert {} to an {}".format(
-                    type(other).__name__,
-                    type(self).__name__.lower().replace("Block", ""),
-                )
-            )
-
-        return other
-
     def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs):
         """ convert to our native types format, slicing if desired """
         values = self.get_values()
@@ -766,7 +743,11 @@ def replace(
         )

         values = self.values
-        to_replace = self._try_coerce_args(to_replace)
+        if lib.is_scalar(to_replace) and isinstance(values, np.ndarray):
+            # The only non-DatetimeLike class that also has a non-trivial
+            #  try_coerce_args is ObjectBlock, but that overrides replace,
+            #  so does not get here.
+            to_replace = convert_scalar(values, to_replace)

         mask = missing.mask_missing(values, to_replace)

         if filter is not None:
@@ -813,7 +794,8 @@ def _replace_single(self, *args, **kwargs):
         return self if kwargs["inplace"] else self.copy()

     def setitem(self, indexer, value):
-        """Set the value inplace, returning a a maybe different typed block.
+        """
+        Set the value inplace, returning a maybe different typed block.

         Parameters
         ----------
@@ -841,7 +823,10 @@ def setitem(self, indexer, value):
         # coerce if block dtype can store value
         values = self.values
         if self._can_hold_element(value):
-            value = self._try_coerce_args(value)
+            # We only get here for non-Extension Blocks, so _try_coerce_args
+            #  is only relevant for DatetimeBlock and TimedeltaBlock
+            if lib.is_scalar(value):
+                value = convert_scalar(values, value)
         else:
             # current dtype cannot store value, coerce to common dtype
@@ -862,7 +847,12 @@ def setitem(self, indexer, value):
             return b.setitem(indexer, value)

         # value must be storeable at this moment
-        arr_value = np.array(value)
+        if is_extension_array_dtype(getattr(value, "dtype", None)):
+            # We need to be careful not to allow through strings that
+            #  can be parsed to EADtypes
+            arr_value = value
+        else:
+            arr_value = np.array(value)

         # cast the values to a type that can hold nan (if necessary)
         if not self._can_hold_element(value):
@@ -938,7 +928,10 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False)
             new = self.fill_value

         if self._can_hold_element(new):
-            new = self._try_coerce_args(new)
+            # We only get here for non-Extension Blocks, so _try_coerce_args
+            #  is only relevant for DatetimeBlock and TimedeltaBlock
+            if lib.is_scalar(new):
+                new = convert_scalar(new_values, new)

         if transpose:
             new_values = new_values.T
@@ -1176,7 +1169,10 @@ def _interpolate_with_fill(
             return [self.copy()]

         values = self.values if inplace else self.values.copy()
-        fill_value = self._try_coerce_args(fill_value)
+
+        # We only get here for non-ExtensionBlock
+        fill_value = convert_scalar(self.values, fill_value)
+
         values = missing.interpolate_2d(
             values,
             method=method,
@@ -1375,7 +1371,10 @@ def func(cond, values, other):
                 and np.isnan(other)
             ):
                 # np.where will cast integer array to floats in this case
-                other = self._try_coerce_args(other)
+                if not self._can_hold_element(other):
+                    raise TypeError
+                if lib.is_scalar(other) and isinstance(values, np.ndarray):
+                    other = convert_scalar(values, other)
                 fastres = expressions.where(cond, values, other)
                 return fastres

@@ -1641,7 +1640,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False)
         # use block's copy logic.
# .values may be an Index which does shallow copy by default new_values = self.values if inplace else self.copy().values - new = self._try_coerce_args(new) if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] @@ -2194,38 +2192,6 @@ def _can_hold_element(self, element: Any) -> bool: return is_valid_nat_for_dtype(element, self.dtype) - def _try_coerce_args(self, other): - """ - Coerce other to dtype 'i8'. NaN and NaT convert to - the smallest i8, and will correctly round-trip to NaT if converted - back in _try_coerce_result. values is always ndarray-like, other - may not be - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - if is_valid_nat_for_dtype(other, self.dtype): - other = np.datetime64("NaT", "ns") - elif isinstance(other, (datetime, np.datetime64, date)): - other = Timestamp(other) - if other.tz is not None: - raise TypeError("cannot coerce a Timestamp with a tz on a naive Block") - other = other.asm8 - elif hasattr(other, "dtype") and is_datetime64_dtype(other): - # TODO: can we get here with non-nano? - pass - else: - # coercion issues - # let higher levels handle - raise TypeError(other) - - return other - def to_native_types( self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs ): @@ -2364,10 +2330,6 @@ def _slice(self, slicer): return self.values[loc] return self.values[slicer] - def _try_coerce_args(self, other): - # DatetimeArray handles this for us - return other - def diff(self, n: int, axis: int = 0) -> List["Block"]: """ 1st discrete difference. @@ -2505,34 +2467,6 @@ def fillna(self, value, **kwargs): value = Timedelta(value, unit="s") return super().fillna(value, **kwargs) - def _try_coerce_args(self, other): - """ - Coerce values and other to datetime64[ns], with null values - converted to datetime64("NaT", "ns"). - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - - if is_valid_nat_for_dtype(other, self.dtype): - other = np.timedelta64("NaT", "ns") - elif isinstance(other, (timedelta, np.timedelta64)): - other = Timedelta(other).to_timedelta64() - elif hasattr(other, "dtype") and is_timedelta64_dtype(other): - # TODO: can we get here with non-nano dtype? - pass - else: - # coercion issues - # let higher levels handle - raise TypeError(other) - - return other - def should_store(self, value): return issubclass( value.dtype.type, np.timedelta64 @@ -2668,21 +2602,6 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] def _can_hold_element(self, element: Any) -> bool: return True - def _try_coerce_args(self, other): - """ provide coercion to our input arguments """ - - if isinstance(other, ABCDatetimeIndex): - # May get a DatetimeIndex here. Unbox it. 
- other = other.array - - if isinstance(other, DatetimeArray): - # hit in pandas/tests/indexing/test_coercion.py - # ::TestWhereCoercion::test_where_series_datetime64[datetime64tz] - # when falling back to ObjectBlock.where - other = other.astype(object) - - return other - def should_store(self, value): return not ( issubclass( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 5eb9a067b11e4..ed1a321a3d7e6 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -327,19 +327,27 @@ def test_make_block_same_class(self): class TestDatetimeBlock: - def test_try_coerce_arg(self): + def test_can_hold_element(self): block = create_block("datetime", [0]) + # We will check that block._can_hold_element iff arr.__setitem__ works + arr = pd.array(block.values.ravel()) + # coerce None - none_coerced = block._try_coerce_args(None) - assert pd.Timestamp(none_coerced) is pd.NaT + assert block._can_hold_element(None) + arr[0] = None + assert arr[0] is pd.NaT - # coerce different types of date bojects - vals = (np.datetime64("2010-10-10"), datetime(2010, 10, 10), date(2010, 10, 10)) + # coerce different types of datetime objects + vals = [np.datetime64("2010-10-10"), datetime(2010, 10, 10)] for val in vals: - coerced = block._try_coerce_args(val) - assert np.datetime64 == type(coerced) - assert pd.Timestamp("2010-10-10") == pd.Timestamp(coerced) + assert block._can_hold_element(val) + arr[0] = val + + val = date(2010, 10, 10) + assert not block._can_hold_element(val) + with pytest.raises(TypeError): + arr[0] = val class TestBlockManager: From 96833c30cd73b746ca12c7ff5b6847a8ca0140ef Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Thu, 24 Oct 2019 14:10:56 +0200 Subject: [PATCH 079/112] Remove TestData from series-tests test_rank.py (#29101) --- pandas/tests/series/test_rank.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 5dd27e4c20dcf..6a79edca01c99 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -9,12 +9,11 @@ from pandas import NaT, Series, Timestamp, date_range from pandas.api.types import CategoricalDtype -from pandas.tests.series.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_series_equal -class TestSeriesRank(TestData): +class TestSeriesRank: s = Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]) results = { @@ -25,20 +24,20 @@ class TestSeriesRank(TestData): "dense": np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3]), } - def test_rank(self): + def test_rank(self, datetime_series): pytest.importorskip("scipy.stats.special") rankdata = pytest.importorskip("scipy.stats.rankdata") - self.ts[::2] = np.nan - self.ts[:10][::3] = 4.0 + datetime_series[::2] = np.nan + datetime_series[:10][::3] = 4.0 - ranks = self.ts.rank() - oranks = self.ts.astype("O").rank() + ranks = datetime_series.rank() + oranks = datetime_series.astype("O").rank() assert_series_equal(ranks, oranks) - mask = np.isnan(self.ts) - filled = self.ts.fillna(np.inf) + mask = np.isnan(datetime_series) + filled = datetime_series.fillna(np.inf) # rankdata returns a ndarray exp = Series(rankdata(filled), index=filled.index, name="ts") From cfd0bb33f40cca571cbb4db784a450d22ef5dbdf Mon Sep 17 00:00:00 2001 From: Karthigeyan Date: Fri, 25 Oct 2019 00:08:03 +0530 Subject: [PATCH 080/112] CLN: remove noqa E241 #29207 (#29209) --- 
pandas/tests/dtypes/test_inference.py | 80 +++++++++++++-------------- pandas/tests/test_strings.py | 74 ++++++++++++------------- 2 files changed, 77 insertions(+), 77 deletions(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 60afd768195d9..aeec12b9ad14e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -62,46 +62,46 @@ def coerce(request): # collect all objects to be tested for list-like-ness; use tuples of objects, # whether they are list-like or not (special casing for sets), and their ID ll_params = [ - ([1], True, "list"), # noqa: E241 - ([], True, "list-empty"), # noqa: E241 - ((1,), True, "tuple"), # noqa: E241 - (tuple(), True, "tuple-empty"), # noqa: E241 - ({"a": 1}, True, "dict"), # noqa: E241 - (dict(), True, "dict-empty"), # noqa: E241 - ({"a", 1}, "set", "set"), # noqa: E241 - (set(), "set", "set-empty"), # noqa: E241 - (frozenset({"a", 1}), "set", "frozenset"), # noqa: E241 - (frozenset(), "set", "frozenset-empty"), # noqa: E241 - (iter([1, 2]), True, "iterator"), # noqa: E241 - (iter([]), True, "iterator-empty"), # noqa: E241 - ((x for x in [1, 2]), True, "generator"), # noqa: E241 - ((_ for _ in []), True, "generator-empty"), # noqa: E241 - (Series([1]), True, "Series"), # noqa: E241 - (Series([]), True, "Series-empty"), # noqa: E241 - (Series(["a"]).str, True, "StringMethods"), # noqa: E241 - (Series([], dtype="O").str, True, "StringMethods-empty"), # noqa: E241 - (Index([1]), True, "Index"), # noqa: E241 - (Index([]), True, "Index-empty"), # noqa: E241 - (DataFrame([[1]]), True, "DataFrame"), # noqa: E241 - (DataFrame(), True, "DataFrame-empty"), # noqa: E241 - (np.ndarray((2,) * 1), True, "ndarray-1d"), # noqa: E241 - (np.array([]), True, "ndarray-1d-empty"), # noqa: E241 - (np.ndarray((2,) * 2), True, "ndarray-2d"), # noqa: E241 - (np.array([[]]), True, "ndarray-2d-empty"), # noqa: E241 - (np.ndarray((2,) * 3), True, "ndarray-3d"), # noqa: E241 - (np.array([[[]]]), True, "ndarray-3d-empty"), # noqa: E241 - (np.ndarray((2,) * 4), True, "ndarray-4d"), # noqa: E241 - (np.array([[[[]]]]), True, "ndarray-4d-empty"), # noqa: E241 - (np.array(2), False, "ndarray-0d"), # noqa: E241 - (1, False, "int"), # noqa: E241 - (b"123", False, "bytes"), # noqa: E241 - (b"", False, "bytes-empty"), # noqa: E241 - ("123", False, "string"), # noqa: E241 - ("", False, "string-empty"), # noqa: E241 - (str, False, "string-type"), # noqa: E241 - (object(), False, "object"), # noqa: E241 - (np.nan, False, "NaN"), # noqa: E241 - (None, False, "None"), # noqa: E241 + ([1], True, "list"), + ([], True, "list-empty"), + ((1,), True, "tuple"), + (tuple(), True, "tuple-empty"), + ({"a": 1}, True, "dict"), + (dict(), True, "dict-empty"), + ({"a", 1}, "set", "set"), + (set(), "set", "set-empty"), + (frozenset({"a", 1}), "set", "frozenset"), + (frozenset(), "set", "frozenset-empty"), + (iter([1, 2]), True, "iterator"), + (iter([]), True, "iterator-empty"), + ((x for x in [1, 2]), True, "generator"), + ((_ for _ in []), True, "generator-empty"), + (Series([1]), True, "Series"), + (Series([]), True, "Series-empty"), + (Series(["a"]).str, True, "StringMethods"), + (Series([], dtype="O").str, True, "StringMethods-empty"), + (Index([1]), True, "Index"), + (Index([]), True, "Index-empty"), + (DataFrame([[1]]), True, "DataFrame"), + (DataFrame(), True, "DataFrame-empty"), + (np.ndarray((2,) * 1), True, "ndarray-1d"), + (np.array([]), True, "ndarray-1d-empty"), + (np.ndarray((2,) * 2), True, "ndarray-2d"), + 
(np.array([[]]), True, "ndarray-2d-empty"), + (np.ndarray((2,) * 3), True, "ndarray-3d"), + (np.array([[[]]]), True, "ndarray-3d-empty"), + (np.ndarray((2,) * 4), True, "ndarray-4d"), + (np.array([[[[]]]]), True, "ndarray-4d-empty"), + (np.array(2), False, "ndarray-0d"), + (1, False, "int"), + (b"123", False, "bytes"), + (b"", False, "bytes-empty"), + ("123", False, "string"), + ("", False, "string-empty"), + (str, False, "string-type"), + (object(), False, "object"), + (np.nan, False, "NaN"), + (None, False, "None"), ] objs, expected, ids = zip(*ll_params) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 53d74f74dc439..cfaf123045b1f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -21,46 +21,46 @@ def assert_series_or_index_equal(left, right): _any_string_method = [ - ("cat", (), {"sep": ","}), # noqa: E241 - ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}), # noqa: E241 - ("center", (10,), {}), # noqa: E241 - ("contains", ("a",), {}), # noqa: E241 - ("count", ("a",), {}), # noqa: E241 - ("decode", ("UTF-8",), {}), # noqa: E241 - ("encode", ("UTF-8",), {}), # noqa: E241 - ("endswith", ("a",), {}), # noqa: E241 - ("extract", ("([a-z]*)",), {"expand": False}), # noqa: E241 - ("extract", ("([a-z]*)",), {"expand": True}), # noqa: E241 - ("extractall", ("([a-z]*)",), {}), # noqa: E241 - ("find", ("a",), {}), # noqa: E241 - ("findall", ("a",), {}), # noqa: E241 - ("get", (0,), {}), # noqa: E241 + ("cat", (), {"sep": ","}), + ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}), + ("center", (10,), {}), + ("contains", ("a",), {}), + ("count", ("a",), {}), + ("decode", ("UTF-8",), {}), + ("encode", ("UTF-8",), {}), + ("endswith", ("a",), {}), + ("extract", ("([a-z]*)",), {"expand": False}), + ("extract", ("([a-z]*)",), {"expand": True}), + ("extractall", ("([a-z]*)",), {}), + ("find", ("a",), {}), + ("findall", ("a",), {}), + ("get", (0,), {}), # because "index" (and "rindex") fail intentionally # if the string is not found, search only for empty string - ("index", ("",), {}), # noqa: E241 - ("join", (",",), {}), # noqa: E241 - ("ljust", (10,), {}), # noqa: E241 - ("match", ("a",), {}), # noqa: E241 - ("normalize", ("NFC",), {}), # noqa: E241 - ("pad", (10,), {}), # noqa: E241 - ("partition", (" ",), {"expand": False}), # noqa: E241 - ("partition", (" ",), {"expand": True}), # noqa: E241 - ("repeat", (3,), {}), # noqa: E241 - ("replace", ("a", "z"), {}), # noqa: E241 - ("rfind", ("a",), {}), # noqa: E241 - ("rindex", ("",), {}), # noqa: E241 - ("rjust", (10,), {}), # noqa: E241 - ("rpartition", (" ",), {"expand": False}), # noqa: E241 - ("rpartition", (" ",), {"expand": True}), # noqa: E241 - ("slice", (0, 1), {}), # noqa: E241 - ("slice_replace", (0, 1, "z"), {}), # noqa: E241 - ("split", (" ",), {"expand": False}), # noqa: E241 - ("split", (" ",), {"expand": True}), # noqa: E241 - ("startswith", ("a",), {}), # noqa: E241 + ("index", ("",), {}), + ("join", (",",), {}), + ("ljust", (10,), {}), + ("match", ("a",), {}), + ("normalize", ("NFC",), {}), + ("pad", (10,), {}), + ("partition", (" ",), {"expand": False}), + ("partition", (" ",), {"expand": True}), + ("repeat", (3,), {}), + ("replace", ("a", "z"), {}), + ("rfind", ("a",), {}), + ("rindex", ("",), {}), + ("rjust", (10,), {}), + ("rpartition", (" ",), {"expand": False}), + ("rpartition", (" ",), {"expand": True}), + ("slice", (0, 1), {}), + ("slice_replace", (0, 1, "z"), {}), + ("split", (" ",), {"expand": False}), + ("split", (" ",), {"expand": True}), + 
("startswith", ("a",), {}), # translating unicode points of "a" to "d" - ("translate", ({97: 100},), {}), # noqa: E241 - ("wrap", (2,), {}), # noqa: E241 - ("zfill", (10,), {}), # noqa: E241 + ("translate", ({97: 100},), {}), + ("wrap", (2,), {}), + ("zfill", (10,), {}), ] + list( zip( [ From 8d9005df3eebfe423359cddc1f438cb6bcd3936d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 25 Oct 2019 07:17:13 -0500 Subject: [PATCH 081/112] API: Change matplotlib formatter registration (#28722) --- ci/code_checks.sh | 2 +- doc/source/user_guide/visualization.rst | 15 +++++ doc/source/whatsnew/v1.0.0.rst | 27 ++++++++- pandas/core/config_init.py | 6 +- pandas/plotting/_core.py | 6 -- pandas/plotting/_matplotlib/__init__.py | 5 -- pandas/plotting/_matplotlib/boxplot.py | 3 - pandas/plotting/_matplotlib/converter.py | 69 ++++++++++++----------- pandas/plotting/_matplotlib/core.py | 9 +-- pandas/plotting/_matplotlib/hist.py | 3 - pandas/plotting/_matplotlib/timeseries.py | 4 +- pandas/plotting/_misc.py | 4 +- pandas/tests/plotting/test_converter.py | 42 +++----------- 13 files changed, 96 insertions(+), 99 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 0b9aae6676710..f4761c5663c9f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -208,7 +208,7 @@ import sys import pandas blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis', - 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', + 'lxml', 'matplotlib', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} # GH#28227 for some of these check for top-level modules, while others are diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 609969b666726..39051440e9d9a 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1190,6 +1190,21 @@ with "(right)" in the legend. To turn off the automatic marking, use the plt.close('all') +.. _plotting.formatters: + +Custom formatters for timeseries plots +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionchanged:: 1.0.0 + +Pandas provides custom formatters for timeseries plots. These change the +formatting of the axis labels for dates and times. By default, +the custom formatters are applied only to plots created by pandas with +:meth:`DataFrame.plot` or :meth:`Series.plot`. To have them apply to all +plots, including those made by matplotlib, set the option +``pd.options.plotting.matplotlib.register_converters = True`` or use +:meth:`pandas.plotting.register_matplotlib_converters`. + Suppressing tick resolution adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 8c1ce1195369d..736264a1196cf 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -175,7 +175,6 @@ Backwards incompatible API changes pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) - .. _whatsnew_1000.api.other: Other API changes @@ -187,8 +186,13 @@ Other API changes - In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). To see which attributes are excluded, see an object's ``_deprecations`` attribute, for example ``pd.DataFrame._deprecations`` (:issue:`28805`). - The returned dtype of ::func:`pd.unique` now matches the input dtype. 
 Suppressing tick resolution adjustment
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 8c1ce1195369d..736264a1196cf 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -175,7 +175,6 @@ Backwards incompatible API changes

    pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])

-
 .. _whatsnew_1000.api.other:

 Other API changes
@@ -187,8 +186,13 @@ Other API changes
 - In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). To see which attributes are excluded, see an object's ``_deprecations`` attribute, for example ``pd.DataFrame._deprecations`` (:issue:`28805`).
 - The returned dtype of :func:`pd.unique` now matches the input dtype. (:issue:`27874`)
+- Changed the default configuration value for ``options.matplotlib.register_converters`` from ``True`` to ``"auto"`` (:issue:`18720`).
+  Now, pandas custom formatters will only be applied to plots created by pandas, through :meth:`~DataFrame.plot`.
+  Previously, pandas' formatters would be applied to all plots created *after* a :meth:`~DataFrame.plot`.
+  See :ref:`units registration <whatsnew_1000.matplotlib_units>` for more.
-
+
 .. _whatsnew_1000.api.documentation:

 Documentation Improvements
@@ -221,6 +225,27 @@ with migrating existing code.

 Removal of prior version deprecations/changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. _whatsnew_1000.matplotlib_units:
+
+**Matplotlib unit registration**
+
+Previously, pandas would register converters with matplotlib as a side effect of importing pandas (:issue:`18720`).
+This changed the output of plots made via matplotlib after pandas was imported, even if you were using
+matplotlib directly rather than :meth:`~DataFrame.plot`.
+
+To use pandas formatters with a matplotlib plot, specify
+
+.. code-block:: python
+
+   >>> import pandas as pd
+   >>> pd.options.plotting.matplotlib.register_converters = True
+
+Note that plots created by :meth:`DataFrame.plot` and :meth:`Series.plot` *do* register the converters
+automatically. The only behavior change is when plotting a date-like object via ``matplotlib.pyplot.plot``
+or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more.
+
+**Other removals**
+
 - Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`)
 - Changed the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`)
 - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`)
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index bc2eb3511629d..ba0a4d81a88d3 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -599,7 +599,7 @@ def register_plotting_backend_cb(key):


 register_converter_doc = """
-: bool
+: bool or 'auto'.
     Whether to register converters with matplotlib's units registry for
     dates, times, datetimes, and Periods. Toggling to False will remove
     the converters, restoring any converters that pandas overwrote.
 """


 def register_converter_cb(key):
@@ -619,8 +619,8 @@ def register_converter_cb(key):
 with cf.config_prefix("plotting.matplotlib"):
     cf.register_option(
         "register_converters",
-        True,
+        "auto",
         register_converter_doc,
-        validator=bool,
+        validator=is_one_of_factory(["auto", True, False]),
         cb=register_converter_cb,
     )
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index eaf5b336bb8f6..127fdffafcf36 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -3,7 +3,6 @@

 from pandas._config import get_option

-from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import Appender

 from pandas.core.dtypes.common import is_integer, is_list_like
@@ -11,11 +10,6 @@

 from pandas.core.base import PandasObject

-# Trigger matplotlib import, which implicitly registers our
-# converts. Implicit registration is deprecated, and when enforced
-# we can lazily import matplotlib.
-import_optional_dependency("pandas.plotting._matplotlib", raise_on_missing=False) - def hist_series( self, diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index d3b7a34b6c923..206600ad37acc 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,5 +1,3 @@ -from pandas._config import get_option - from pandas.plotting._matplotlib.boxplot import ( BoxPlot, boxplot, @@ -42,9 +40,6 @@ "hexbin": HexBinPlot, } -if get_option("plotting.matplotlib.register_converters"): - register(explicit=False) - def plot(data, kind, **kwargs): # Importing pyplot at the top of the file (before the converters are diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index eed328131da92..cfd6c3519d82c 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -11,7 +11,6 @@ import pandas as pd from pandas.io.formats.printing import pprint_thing -from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import _flatten, _subplots @@ -364,7 +363,6 @@ def boxplot_frame( ): import matplotlib.pyplot as plt - converter._WARN = False # no warning for pandas plots ax = boxplot( self, column=column, @@ -396,7 +394,6 @@ def boxplot_frame_groupby( sharey=True, **kwds ): - converter._WARN = False # no warning for pandas plots if subplots is True: naxes = len(grouped) fig, axes = _subplots( diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 446350cb5d915..946ce8bcec97f 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -1,6 +1,7 @@ +import contextlib import datetime as pydt from datetime import datetime, timedelta -import warnings +import functools from dateutil.relativedelta import relativedelta import matplotlib.dates as dates @@ -23,6 +24,7 @@ ) from pandas.core.dtypes.generic import ABCSeries +from pandas import get_option import pandas.core.common as com from pandas.core.index import Index from pandas.core.indexes.datetimes import date_range @@ -39,7 +41,6 @@ MUSEC_PER_DAY = 1e6 * SEC_PER_DAY -_WARN = True # Global for whether pandas has registered the units explicitly _mpl_units = {} # Cache for units overwritten by us @@ -55,13 +56,42 @@ def get_pairs(): return pairs -def register(explicit=True): - # Renamed in pandas.plotting.__init__ - global _WARN +def register_pandas_matplotlib_converters(func): + """ + Decorator applying pandas_converters. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with pandas_converters(): + return func(*args, **kwargs) - if explicit: - _WARN = False + return wrapper + +@contextlib.contextmanager +def pandas_converters(): + """ + Context manager registering pandas' converters for a plot. + + See Also + -------- + register_pandas_matplotlib_converters : Decorator that applies this. 
+ """ + value = get_option("plotting.matplotlib.register_converters") + + if value: + # register for True or "auto" + register() + try: + yield + finally: + if value == "auto": + # only deregister for "auto" + deregister() + + +def register(): pairs = get_pairs() for type_, cls in pairs: # Cache previous converter if present @@ -86,24 +116,6 @@ def deregister(): units.registry[unit] = formatter -def _check_implicitly_registered(): - global _WARN - - if _WARN: - msg = ( - "Using an implicitly registered datetime converter for a " - "matplotlib plotting method. The converter was registered " - "by pandas on import. Future versions of pandas will require " - "you to explicitly register matplotlib converters.\n\n" - "To register the converters:\n\t" - ">>> from pandas.plotting import register_matplotlib_converters" - "\n\t" - ">>> register_matplotlib_converters()" - ) - warnings.warn(msg, FutureWarning) - _WARN = False - - def _to_ordinalf(tm): tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + float(tm.microsecond / 1e6) return tot_sec @@ -253,7 +265,6 @@ class DatetimeConverter(dates.DateConverter): @staticmethod def convert(values, unit, axis): # values might be a 1-d array, or a list-like of arrays. - _check_implicitly_registered() if is_nested_list_like(values): values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values] else: @@ -330,7 +341,6 @@ def __init__(self, locator, tz=None, defaultfmt="%Y-%m-%d"): class PandasAutoDateLocator(dates.AutoDateLocator): def get_locator(self, dmin, dmax): """Pick the best locator based on a distance.""" - _check_implicitly_registered() delta = relativedelta(dmax, dmin) num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days @@ -372,7 +382,6 @@ def get_unit_generic(freq): def __call__(self): # if no data have been set, this will tank with a ValueError - _check_implicitly_registered() try: dmin, dmax = self.viewlim_to_dt() except ValueError: @@ -990,7 +999,6 @@ def _get_default_locs(self, vmin, vmax): def __call__(self): "Return the locations of the ticks." # axis calls Locator.set_axis inside set_m_formatter - _check_implicitly_registered() vi = tuple(self.axis.get_view_interval()) if vi != self.plot_obj.view_interval: @@ -1075,7 +1083,6 @@ def set_locs(self, locs): "Sets the locations of the ticks" # don't actually use the locs. This is just needed to work with # matplotlib. 
Force to use vmin, vmax - _check_implicitly_registered() self.locs = locs @@ -1088,7 +1095,6 @@ def set_locs(self, locs): self._set_default_format(vmin, vmax) def __call__(self, x, pos=0): - _check_implicitly_registered() if self.formatdict is None: return "" @@ -1120,7 +1126,6 @@ def format_timedelta_ticks(x, pos, n_decimals): return s def __call__(self, x, pos=0): - _check_implicitly_registered() (vmin, vmax) = tuple(self.axis.get_view_interval()) n_decimals = int(np.ceil(np.log10(100 * 1e9 / (vmax - vmin)))) if n_decimals > 9: diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index a729951b3d7db..541dca715e814 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -4,8 +4,6 @@ import numpy as np -from pandas._config import get_option - from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -28,8 +26,8 @@ import pandas.core.common as com from pandas.io.formats.printing import pprint_thing -from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 +from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import ( _flatten, @@ -41,9 +39,6 @@ table, ) -if get_option("plotting.matplotlib.register_converters"): - converter.register(explicit=False) - class MPLPlot: """ @@ -112,7 +107,6 @@ def __init__( import matplotlib.pyplot as plt - converter._WARN = False # no warning for pandas plots self.data = data self.by = by @@ -648,6 +642,7 @@ def _get_xticks(self, convert_period=False): return x @classmethod + @register_pandas_matplotlib_converters def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): mask = isna(y) if mask.any(): diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index f95ff2578d882..c4ac9ead3f3d3 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -9,7 +9,6 @@ import pandas.core.common as com from pandas.io.formats.printing import pprint_thing -from pandas.plotting._matplotlib import converter from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots @@ -255,7 +254,6 @@ def _grouped_hist( def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) - converter._WARN = False # no warning for pandas plots xrot = xrot or rot fig, axes = _grouped_plot( @@ -363,7 +361,6 @@ def hist_frame( bins=10, **kwds ): - converter._WARN = False # no warning for pandas plots if by is not None: axes = _grouped_hist( data, diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index f160e50d8d99b..931c699d9b9fd 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -25,6 +25,7 @@ TimeSeries_DateFormatter, TimeSeries_DateLocator, TimeSeries_TimedeltaFormatter, + register_pandas_matplotlib_converters, ) import pandas.tseries.frequencies as frequencies from pandas.tseries.offsets import DateOffset @@ -33,6 +34,7 @@ # Plotting functions and monkey patches +@register_pandas_matplotlib_converters def tsplot(series, plotf, ax=None, **kwargs): """ Plots a Series on the given Matplotlib axes or the current axes @@ -56,7 +58,7 @@ def tsplot(series, plotf, ax=None, **kwargs): "'tsplot' is deprecated and will be removed 
in a " "future version. Please use Series.plot() instead.", FutureWarning, - stacklevel=2, + stacklevel=3, ) # Used inferred freq is possible, need a test case for inferred diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 815c69bc27d7a..8435569d8bc61 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -30,7 +30,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): ) -def register(explicit=True): +def register(): """ Register Pandas Formatters and Converters with matplotlib. @@ -49,7 +49,7 @@ def register(explicit=True): deregister_matplotlib_converters """ plot_backend = _get_plot_backend("matplotlib") - plot_backend.register(explicit=explicit) + plot_backend.register() def deregister(): diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index aabe16d5050f9..ccc2afbb8b824 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -28,18 +28,6 @@ pytest.importorskip("matplotlib.pyplot") -def test_initial_warning(): - code = ( - "import pandas as pd; import matplotlib.pyplot as plt; " - "s = pd.Series(1, pd.date_range('2000', periods=12)); " - "fig, ax = plt.subplots(); " - "ax.plot(s.index, s.values)" - ) - call = [sys.executable, "-c", code] - out = subprocess.check_output(call, stderr=subprocess.STDOUT).decode() - assert "Using an implicitly" in out - - def test_registry_mpl_resets(): # Check that Matplotlib converters are properly reset (see issue #27481) code = ( @@ -71,27 +59,12 @@ def test_register_by_default(self): call = [sys.executable, "-c", code] assert subprocess.check_call(call) == 0 - def test_warns(self): - plt = pytest.importorskip("matplotlib.pyplot") - s = Series(range(12), index=date_range("2017", periods=12)) - _, ax = plt.subplots() - - # Set to the "warning" state, in case this isn't the first test run - converter._WARN = True - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: - ax.plot(s.index, s.values) - plt.close() - - assert len(w) == 1 - assert "Using an implicitly registered datetime converter" in str(w[0]) - def test_registering_no_warning(self): plt = pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() # Set to the "warn" state, in case this isn't the first test run - converter._WARN = True register_matplotlib_converters() with tm.assert_produces_warning(None) as w: ax.plot(s.index, s.values) @@ -102,7 +75,6 @@ def test_pandas_plots_register(self): pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) # Set to the "warn" state, in case this isn't the first test run - converter._WARN = True with tm.assert_produces_warning(None) as w: s.plot() @@ -110,13 +82,15 @@ def test_pandas_plots_register(self): def test_matplotlib_formatters(self): units = pytest.importorskip("matplotlib.units") - assert Timestamp in units.registry - ctx = cf.option_context("plotting.matplotlib.register_converters", False) - with ctx: - assert Timestamp not in units.registry + # Can't make any assertion about the start state. + # We we check that toggling converters off remvoes it, and toggling it + # on restores it. 
- assert Timestamp in units.registry + with cf.option_context("plotting.matplotlib.register_converters", True): + with cf.option_context("plotting.matplotlib.register_converters", False): + assert Timestamp not in units.registry + assert Timestamp in units.registry def test_option_no_warning(self): pytest.importorskip("matplotlib.pyplot") @@ -125,7 +99,6 @@ def test_option_no_warning(self): s = Series(range(12), index=date_range("2017", periods=12)) _, ax = plt.subplots() - converter._WARN = True # Test without registering first, no warning with ctx: with tm.assert_produces_warning(None) as w: @@ -134,7 +107,6 @@ def test_option_no_warning(self): assert len(w) == 0 # Now test with registering - converter._WARN = True register_matplotlib_converters() with ctx: with tm.assert_produces_warning(None) as w: From cb4130b193c320fef9c43cee1b6aec525455b3e5 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Fri, 25 Oct 2019 14:41:36 +0200 Subject: [PATCH 082/112] Remove TestData in frame tests in multiple files - part2 (#29222) --- pandas/tests/frame/test_reshape.py | 13 ++- pandas/tests/frame/test_sorting.py | 9 +- pandas/tests/frame/test_subclass.py | 3 +- pandas/tests/frame/test_timeseries.py | 133 +++++++++++++------------- pandas/tests/frame/test_to_csv.py | 95 +++++++++--------- 5 files changed, 126 insertions(+), 127 deletions(-) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index eb654be3f12e6..5ce811712b989 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -6,12 +6,11 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -class TestDataFrameReshape(TestData): +class TestDataFrameReshape: def test_pivot(self): data = { "index": ["A", "B", "C", "C", "B", "A"], @@ -101,8 +100,8 @@ def test_pivot_index_none(self): expected.columns.name = "columns" tm.assert_frame_equal(result, expected) - def test_stack_unstack(self): - df = self.frame.copy() + def test_stack_unstack(self, float_frame): + df = float_frame.copy() df[:] = np.arange(np.prod(df.shape)).reshape(df.shape) stacked = df.stack() @@ -515,13 +514,13 @@ def test_unstack_level_binding(self): assert_frame_equal(result, expected) - def test_unstack_to_series(self): + def test_unstack_to_series(self, float_frame): # check reversibility - data = self.frame.unstack() + data = float_frame.unstack() assert isinstance(data, Series) undo = data.unstack().T - assert_frame_equal(undo, self.frame) + assert_frame_equal(undo, float_frame) # check NA handling data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 24833f8c02df0..2b4b20d318adf 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -15,12 +15,11 @@ date_range, ) from pandas.api.types import CategoricalDtype -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -class TestDataFrameSorting(TestData): +class TestDataFrameSorting: def test_sort_values(self): frame = DataFrame( [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC") @@ -295,8 +294,8 @@ def test_sort_datetimes(self): df2 = df.sort_values(by=["C", "B"]) assert_frame_equal(df1, df2) - def 
test_frame_column_inplace_sort_exception(self): - s = self.frame["A"] + def test_frame_column_inplace_sort_exception(self, float_frame): + s = float_frame["A"] with pytest.raises(ValueError, match="This Series is a view"): s.sort_values(inplace=True) @@ -379,7 +378,7 @@ def test_sort_nat(self): tm.assert_frame_equal(sorted_df, expected) -class TestDataFrameSortIndexKinds(TestData): +class TestDataFrameSortIndexKinds: def test_sort_index_multicolumn(self): A = np.arange(5).repeat(20) B = np.tile(np.arange(5), 20) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 649a78b785d21..e1e546256f7cd 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -3,11 +3,10 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -from pandas.tests.frame.common import TestData import pandas.util.testing as tm -class TestDataFrameSubclassing(TestData): +class TestDataFrameSubclassing: def test_frame_subclassing_and_slicing(self): # Subclass frame and ensure it returns the right class on slicing it # In reference to PR 9632 diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index b8708e6ca1871..3355d6e746db2 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -17,7 +17,6 @@ period_range, to_datetime, ) -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( assert_frame_equal, @@ -33,12 +32,12 @@ def close_open_fixture(request): return request.param -class TestDataFrameTimeSeriesMethods(TestData): - def test_diff(self): - the_diff = self.tsframe.diff(1) +class TestDataFrameTimeSeriesMethods: + def test_diff(self, datetime_frame): + the_diff = datetime_frame.diff(1) assert_series_equal( - the_diff["A"], self.tsframe["A"] - self.tsframe["A"].shift(1) + the_diff["A"], datetime_frame["A"] - datetime_frame["A"].shift(1) ) # int dtype @@ -50,7 +49,7 @@ def test_diff(self): assert rs.s[1] == 1 # mixed numeric - tf = self.tsframe.astype("float32") + tf = datetime_frame.astype("float32") the_diff = tf.diff(1) assert_series_equal(the_diff["A"], tf["A"] - tf["A"].shift(1)) @@ -126,14 +125,14 @@ def test_diff_mixed_dtype(self): result = df.diff() assert result[0].dtype == np.float64 - def test_diff_neg_n(self): - rs = self.tsframe.diff(-1) - xp = self.tsframe - self.tsframe.shift(-1) + def test_diff_neg_n(self, datetime_frame): + rs = datetime_frame.diff(-1) + xp = datetime_frame - datetime_frame.shift(-1) assert_frame_equal(rs, xp) - def test_diff_float_n(self): - rs = self.tsframe.diff(1.0) - xp = self.tsframe.diff(1) + def test_diff_float_n(self, datetime_frame): + rs = datetime_frame.diff(1.0) + xp = datetime_frame.diff(1) assert_frame_equal(rs, xp) def test_diff_axis(self): @@ -142,20 +141,20 @@ def test_diff_axis(self): assert_frame_equal(df.diff(axis=1), DataFrame([[np.nan, 1.0], [np.nan, 1.0]])) assert_frame_equal(df.diff(axis=0), DataFrame([[np.nan, np.nan], [2.0, 2.0]])) - def test_pct_change(self): - rs = self.tsframe.pct_change(fill_method=None) - assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1) + def test_pct_change(self, datetime_frame): + rs = datetime_frame.pct_change(fill_method=None) + assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) - rs = self.tsframe.pct_change(2) - filled = self.tsframe.fillna(method="pad") + rs = datetime_frame.pct_change(2) + filled = datetime_frame.fillna(method="pad") assert_frame_equal(rs, filled / 
filled.shift(2) - 1) - rs = self.tsframe.pct_change(fill_method="bfill", limit=1) - filled = self.tsframe.fillna(method="bfill", limit=1) + rs = datetime_frame.pct_change(fill_method="bfill", limit=1) + filled = datetime_frame.fillna(method="bfill", limit=1) assert_frame_equal(rs, filled / filled.shift(1) - 1) - rs = self.tsframe.pct_change(freq="5D") - filled = self.tsframe.fillna(method="pad") + rs = datetime_frame.pct_change(freq="5D") + filled = datetime_frame.fillna(method="pad") assert_frame_equal( rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) ) @@ -181,17 +180,19 @@ def test_pct_change_shift_over_nas(self): ("14B", 14, None, None), ], ) - def test_pct_change_periods_freq(self, freq, periods, fill_method, limit): + def test_pct_change_periods_freq( + self, datetime_frame, freq, periods, fill_method, limit + ): # GH 7292 - rs_freq = self.tsframe.pct_change( + rs_freq = datetime_frame.pct_change( freq=freq, fill_method=fill_method, limit=limit ) - rs_periods = self.tsframe.pct_change( + rs_periods = datetime_frame.pct_change( periods, fill_method=fill_method, limit=limit ) assert_frame_equal(rs_freq, rs_periods) - empty_ts = DataFrame(index=self.tsframe.index, columns=self.tsframe.columns) + empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) assert_frame_equal(rs_freq, rs_periods) @@ -249,39 +250,39 @@ def test_frame_append_datetime64_col_other_units(self): assert (tmp["dates"].values == ex_vals).all() - def test_shift(self): + def test_shift(self, datetime_frame, int_frame): # naive shift - shiftedFrame = self.tsframe.shift(5) - tm.assert_index_equal(shiftedFrame.index, self.tsframe.index) + shiftedFrame = datetime_frame.shift(5) + tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) - shiftedSeries = self.tsframe["A"].shift(5) + shiftedSeries = datetime_frame["A"].shift(5) assert_series_equal(shiftedFrame["A"], shiftedSeries) - shiftedFrame = self.tsframe.shift(-5) - tm.assert_index_equal(shiftedFrame.index, self.tsframe.index) + shiftedFrame = datetime_frame.shift(-5) + tm.assert_index_equal(shiftedFrame.index, datetime_frame.index) - shiftedSeries = self.tsframe["A"].shift(-5) + shiftedSeries = datetime_frame["A"].shift(-5) assert_series_equal(shiftedFrame["A"], shiftedSeries) # shift by 0 - unshifted = self.tsframe.shift(0) - assert_frame_equal(unshifted, self.tsframe) + unshifted = datetime_frame.shift(0) + assert_frame_equal(unshifted, datetime_frame) # shift by DateOffset - shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay()) - assert len(shiftedFrame) == len(self.tsframe) + shiftedFrame = datetime_frame.shift(5, freq=offsets.BDay()) + assert len(shiftedFrame) == len(datetime_frame) - shiftedFrame2 = self.tsframe.shift(5, freq="B") + shiftedFrame2 = datetime_frame.shift(5, freq="B") assert_frame_equal(shiftedFrame, shiftedFrame2) - d = self.tsframe.index[0] + d = datetime_frame.index[0] shifted_d = d + offsets.BDay(5) assert_series_equal( - self.tsframe.xs(d), shiftedFrame.xs(shifted_d), check_names=False + datetime_frame.xs(d), shiftedFrame.xs(shifted_d), check_names=False ) # shift int frame - int_shifted = self.intframe.shift(1) # noqa + int_shifted = int_frame.shift(1) # noqa # Shifting with PeriodIndex ps = tm.makePeriodFrame() @@ -387,7 +388,7 @@ def test_shift_duplicate_columns(self): assert_frame_equal(shifted[0], shifted[1]) 
assert_frame_equal(shifted[0], shifted[2]) - def test_tshift(self): + def test_tshift(self, datetime_frame): # PeriodIndex ps = tm.makePeriodFrame() shifted = ps.tshift(1) @@ -405,36 +406,36 @@ def test_tshift(self): ps.tshift(freq="M") # DatetimeIndex - shifted = self.tsframe.tshift(1) + shifted = datetime_frame.tshift(1) unshifted = shifted.tshift(-1) - assert_frame_equal(self.tsframe, unshifted) + assert_frame_equal(datetime_frame, unshifted) - shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq) + shifted2 = datetime_frame.tshift(freq=datetime_frame.index.freq) assert_frame_equal(shifted, shifted2) inferred_ts = DataFrame( - self.tsframe.values, - Index(np.asarray(self.tsframe.index)), - columns=self.tsframe.columns, + datetime_frame.values, + Index(np.asarray(datetime_frame.index)), + columns=datetime_frame.columns, ) shifted = inferred_ts.tshift(1) unshifted = shifted.tshift(-1) - assert_frame_equal(shifted, self.tsframe.tshift(1)) + assert_frame_equal(shifted, datetime_frame.tshift(1)) assert_frame_equal(unshifted, inferred_ts) - no_freq = self.tsframe.iloc[[0, 5, 7], :] + no_freq = datetime_frame.iloc[[0, 5, 7], :] msg = "Freq was not given and was not set in the index" with pytest.raises(ValueError, match=msg): no_freq.tshift() - def test_truncate(self): - ts = self.tsframe[::3] + def test_truncate(self, datetime_frame): + ts = datetime_frame[::3] - start, end = self.tsframe.index[3], self.tsframe.index[6] + start, end = datetime_frame.index[3], datetime_frame.index[6] - start_missing = self.tsframe.index[2] - end_missing = self.tsframe.index[7] + start_missing = datetime_frame.index[2] + end_missing = datetime_frame.index[7] # neither specified truncated = ts.truncate() @@ -473,11 +474,11 @@ def test_truncate(self): before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq ) - def test_truncate_copy(self): - index = self.tsframe.index - truncated = self.tsframe.truncate(index[5], index[10]) + def test_truncate_copy(self, datetime_frame): + index = datetime_frame.index + truncated = datetime_frame.truncate(index[5], index[10]) truncated.values[:] = 5.0 - assert not (self.tsframe.values[5:11] == 5).any() + assert not (datetime_frame.values[5:11] == 5).any() def test_truncate_nonsortedindex(self): # GH 17935 @@ -510,9 +511,9 @@ def test_truncate_nonsortedindex(self): with pytest.raises(ValueError, match=msg): df.truncate(before=2, after=20, axis=1) - def test_asfreq(self): - offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd()) - rule_monthly = self.tsframe.asfreq("BM") + def test_asfreq(self, datetime_frame): + offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd()) + rule_monthly = datetime_frame.asfreq("BM") tm.assert_almost_equal(offset_monthly["A"], rule_monthly["A"]) @@ -523,7 +524,7 @@ def test_asfreq(self): filled_dep = rule_monthly.asfreq("B", method="pad") # noqa # test does not blow up on length-0 DataFrame - zero_length = self.tsframe.reindex([]) + zero_length = datetime_frame.reindex([]) result = zero_length.asfreq("BM") assert result is not zero_length @@ -569,13 +570,15 @@ def test_asfreq_fillvalue(self): ({"A": [1, np.nan, 3]}, [1, 2, 2], 1, 2), ], ) - def test_first_last_valid(self, data, idx, expected_first, expected_last): - N = len(self.frame.index) + def test_first_last_valid( + self, float_frame, data, idx, expected_first, expected_last + ): + N = len(float_frame.index) mat = np.random.randn(N) mat[:5] = np.nan mat[-5:] = np.nan - frame = DataFrame({"foo": mat}, index=self.frame.index) + frame = DataFrame({"foo": mat}, 
index=float_frame.index) index = frame.first_valid_index() assert index == frame.index[5] diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 8fb028a0f0326..67c748227a43d 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -19,7 +19,6 @@ to_datetime, ) import pandas.core.common as com -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, @@ -44,37 +43,37 @@ ] -class TestDataFrameToCSV(TestData): +class TestDataFrameToCSV: def read_csv(self, path, **kwargs): params = dict(index_col=0, parse_dates=True) params.update(**kwargs) return pd.read_csv(path, **params) - def test_to_csv_from_csv1(self): + def test_to_csv_from_csv1(self, float_frame, datetime_frame): with ensure_clean("__tmp_to_csv_from_csv1__") as path: - self.frame["A"][:5] = np.nan + float_frame["A"][:5] = np.nan - self.frame.to_csv(path) - self.frame.to_csv(path, columns=["A", "B"]) - self.frame.to_csv(path, header=False) - self.frame.to_csv(path, index=False) + float_frame.to_csv(path) + float_frame.to_csv(path, columns=["A", "B"]) + float_frame.to_csv(path, header=False) + float_frame.to_csv(path, index=False) # test roundtrip - self.tsframe.to_csv(path) + datetime_frame.to_csv(path) recons = self.read_csv(path) - assert_frame_equal(self.tsframe, recons) + assert_frame_equal(datetime_frame, recons) - self.tsframe.to_csv(path, index_label="index") + datetime_frame.to_csv(path, index_label="index") recons = self.read_csv(path, index_col=None) - assert len(recons.columns) == len(self.tsframe.columns) + 1 + assert len(recons.columns) == len(datetime_frame.columns) + 1 # no index - self.tsframe.to_csv(path, index=False) + datetime_frame.to_csv(path, index=False) recons = self.read_csv(path, index_col=None) - assert_almost_equal(self.tsframe.values, recons.values) + assert_almost_equal(datetime_frame.values, recons.values) # corner case dm = DataFrame( @@ -88,7 +87,7 @@ def test_to_csv_from_csv1(self): recons = self.read_csv(path) assert_frame_equal(dm, recons) - def test_to_csv_from_csv2(self): + def test_to_csv_from_csv2(self, float_frame): with ensure_clean("__tmp_to_csv_from_csv2__") as path: @@ -109,16 +108,16 @@ def test_to_csv_from_csv2(self): # column aliases col_aliases = Index(["AA", "X", "Y", "Z"]) - self.frame2.to_csv(path, header=col_aliases) + float_frame.to_csv(path, header=col_aliases) rs = self.read_csv(path) - xp = self.frame2.copy() + xp = float_frame.copy() xp.columns = col_aliases assert_frame_equal(xp, rs) msg = "Writing 4 cols but got 2 aliases" with pytest.raises(ValueError, match=msg): - self.frame2.to_csv(path, header=["AA", "X"]) + float_frame.to_csv(path, header=["AA", "X"]) def test_to_csv_from_csv3(self): @@ -153,22 +152,22 @@ def test_to_csv_from_csv4(self): assert_frame_equal(df, result, check_index_type=True) - def test_to_csv_from_csv5(self): + def test_to_csv_from_csv5(self, timezone_frame): # tz, 8260 with ensure_clean("__tmp_to_csv_from_csv5__") as path: - self.tzframe.to_csv(path) + timezone_frame.to_csv(path) result = pd.read_csv(path, index_col=0, parse_dates=["A"]) converter = ( lambda c: to_datetime(result[c]) .dt.tz_convert("UTC") - .dt.tz_convert(self.tzframe[c].dt.tz) + .dt.tz_convert(timezone_frame[c].dt.tz) ) result["B"] = converter("B") result["C"] = converter("C") - assert_frame_equal(result, self.tzframe) + assert_frame_equal(result, timezone_frame) def test_to_csv_cols_reordering(self): # GH3454 @@ -493,37 +492,37 @@ def 
_to_uni(x): cnlvl=2, ) - def test_to_csv_from_csv_w_some_infs(self): + def test_to_csv_from_csv_w_some_infs(self, float_frame): # test roundtrip with inf, -inf, nan, as full columns and mix - self.frame["G"] = np.nan + float_frame["G"] = np.nan f = lambda x: [np.inf, np.nan][np.random.rand() < 0.5] - self.frame["H"] = self.frame.index.map(f) + float_frame["H"] = float_frame.index.map(f) with ensure_clean() as path: - self.frame.to_csv(path) + float_frame.to_csv(path) recons = self.read_csv(path) # TODO to_csv drops column name - assert_frame_equal(self.frame, recons, check_names=False) + assert_frame_equal(float_frame, recons, check_names=False) assert_frame_equal( - np.isinf(self.frame), np.isinf(recons), check_names=False + np.isinf(float_frame), np.isinf(recons), check_names=False ) - def test_to_csv_from_csv_w_all_infs(self): + def test_to_csv_from_csv_w_all_infs(self, float_frame): # test roundtrip with inf, -inf, nan, as full columns and mix - self.frame["E"] = np.inf - self.frame["F"] = -np.inf + float_frame["E"] = np.inf + float_frame["F"] = -np.inf with ensure_clean() as path: - self.frame.to_csv(path) + float_frame.to_csv(path) recons = self.read_csv(path) # TODO to_csv drops column name - assert_frame_equal(self.frame, recons, check_names=False) + assert_frame_equal(float_frame, recons, check_names=False) assert_frame_equal( - np.isinf(self.frame), np.isinf(recons), check_names=False + np.isinf(float_frame), np.isinf(recons), check_names=False ) def test_to_csv_no_index(self): @@ -563,9 +562,9 @@ def test_to_csv_headers(self): recons.reset_index(inplace=True) assert_frame_equal(to_df, recons) - def test_to_csv_multiindex(self): + def test_to_csv_multiindex(self, float_frame, datetime_frame): - frame = self.frame + frame = float_frame old_index = frame.index arrays = np.arange(len(old_index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) @@ -586,10 +585,10 @@ def test_to_csv_multiindex(self): assert frame.index.names == df.index.names # needed if setUp becomes a class method - self.frame.index = old_index + float_frame.index = old_index # try multiindex with dates - tsframe = self.tsframe + tsframe = datetime_frame old_index = tsframe.index new_index = [old_index, np.arange(len(old_index))] tsframe.index = MultiIndex.from_arrays(new_index) @@ -608,10 +607,10 @@ def test_to_csv_multiindex(self): # no index tsframe.to_csv(path, index=False) recons = self.read_csv(path, index_col=None) - assert_almost_equal(recons.values, self.tsframe.values) + assert_almost_equal(recons.values, datetime_frame.values) # needed if setUp becomes class method - self.tsframe.index = old_index + datetime_frame.index = old_index with ensure_clean("__tmp_to_csv_multiindex__") as path: # GH3571, GH1651, GH3141 @@ -889,13 +888,13 @@ def test_to_csv_unicode_index_col(self): df2 = read_csv(buf, index_col=0, encoding="UTF-8") assert_frame_equal(df, df2) - def test_to_csv_stringio(self): + def test_to_csv_stringio(self, float_frame): buf = StringIO() - self.frame.to_csv(buf) + float_frame.to_csv(buf) buf.seek(0) recons = read_csv(buf, index_col=0) # TODO to_csv drops column name - assert_frame_equal(recons, self.frame, check_names=False) + assert_frame_equal(recons, float_frame, check_names=False) def test_to_csv_float_format(self): @@ -1013,14 +1012,14 @@ def test_to_csv_from_csv_categorical(self): assert res.getvalue() == exp.getvalue() - def test_to_csv_path_is_none(self): + def test_to_csv_path_is_none(self, float_frame): # GH 8215 # Make sure we return string for 
consistency with # Series.to_csv() - csv_str = self.frame.to_csv(path_or_buf=None) + csv_str = float_frame.to_csv(path_or_buf=None) assert isinstance(csv_str, str) recons = pd.read_csv(StringIO(csv_str), index_col=0) - assert_frame_equal(self.frame, recons) + assert_frame_equal(float_frame, recons) @pytest.mark.parametrize( "df,encoding", @@ -1077,9 +1076,9 @@ def test_to_csv_compression(self, df, encoding, compression): with tm.decompress_file(filename, compression) as fh: assert_frame_equal(df, read_csv(fh, index_col=0, encoding=encoding)) - def test_to_csv_date_format(self): + def test_to_csv_date_format(self, datetime_frame): with ensure_clean("__tmp_to_csv_date_format__") as path: - dt_index = self.tsframe.index + dt_index = datetime_frame.index datetime_frame = DataFrame( {"A": dt_index, "B": dt_index.shift(1)}, index=dt_index ) From 5ea715e18c65c2d8d679f2efb4ec0345b452c502 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 25 Oct 2019 05:44:19 -0700 Subject: [PATCH 083/112] CLN: Remove unnecessary sys.version_info checks (#29210) --- pandas/tests/io/formats/test_to_csv.py | 2 +- versioneer.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index a85f3677bc3ab..095dfb7876154 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -11,7 +11,7 @@ class TestToCSV: @pytest.mark.xfail( - (3, 6, 5) > sys.version_info >= (3, 5), + (3, 6, 5) > sys.version_info, reason=("Python csv library bug (see https://bugs.python.org/issue32255)"), ) def test_to_csv_with_single_column(self): diff --git a/versioneer.py b/versioneer.py index 24d8105c307c0..8a4710da5958a 100644 --- a/versioneer.py +++ b/versioneer.py @@ -8,7 +8,6 @@ * https://github.com/warner/python-versioneer * Brian Warner * License: Public Domain -* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, and pypy * [![Latest Version] (https://pypip.in/version/versioneer/badge.svg?style=flat) ](https://pypi.org/project/versioneer/) @@ -464,9 +463,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): if verbose: print("unable to find command, tried %s" % (commands,)) return None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() + + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) @@ -561,9 +560,9 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): if verbose: print("unable to find command, tried %%s" %% (commands,)) return None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() + + stdout = p.communicate()[0].strip().decode() + if p.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) From a05829b21cd1468da11e3a28c4de489efb1a4754 Mon Sep 17 00:00:00 2001 From: Martin Winkel Date: Fri, 25 Oct 2019 14:50:02 +0200 Subject: [PATCH 084/112] Remove TestData in frame tests in multiple files (#29172) --- pandas/tests/frame/test_convert_to.py | 3 +- pandas/tests/frame/test_nonunique_indexes.py | 3 +- pandas/tests/frame/test_query_eval.py | 2 +- pandas/tests/frame/test_replace.py | 83 ++++++++++---------- pandas/tests/frame/test_repr_info.py | 37 +++++---- 5 files changed, 62 insertions(+), 66 deletions(-) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index c9a7507969f5b..17edd48e36563 100644 --- 
a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -13,11 +13,10 @@ Timestamp, date_range, ) -from pandas.tests.frame.common import TestData import pandas.util.testing as tm -class TestDataFrameConvertTo(TestData): +class TestDataFrameConvertTo: def test_to_dict_timestamp(self): # GH11247 diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 4faa0d0e3f941..430d9ad135c80 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -3,12 +3,11 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -from pandas.tests.frame.common import TestData import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal -class TestDataFrameNonuniqueIndexes(TestData): +class TestDataFrameNonuniqueIndexes: def test_column_dups_operations(self): def check(result, expected=None): if expected is not None: diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index f5f6c9ad6b3da..13b994d116c76 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -82,7 +82,7 @@ def test_query_numexpr(self): df.eval("A+1", engine="numexpr") -class TestDataFrameEval(TestData): +class TestDataFrameEval: def test_ops(self): # tst ops and reversed ops in evaluation diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index b341ed6a52ca5..fdb450da53137 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -8,7 +8,6 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -from pandas.tests.frame.common import TestData from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -22,27 +21,27 @@ def mix_abc() -> Dict[str, list]: return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} -class TestDataFrameReplace(TestData): - def test_replace_inplace(self): - self.tsframe["A"][:5] = np.nan - self.tsframe["A"][-5:] = np.nan +class TestDataFrameReplace: + def test_replace_inplace(self, datetime_frame, float_string_frame): + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan - tsframe = self.tsframe.copy() + tsframe = datetime_frame.copy() tsframe.replace(np.nan, 0, inplace=True) - assert_frame_equal(tsframe, self.tsframe.fillna(0)) + assert_frame_equal(tsframe, datetime_frame.fillna(0)) # mixed type - mf = self.mixed_frame + mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan - result = self.mixed_frame.replace(np.nan, 0) - expected = self.mixed_frame.fillna(value=0) + result = float_string_frame.replace(np.nan, 0) + expected = float_string_frame.fillna(value=0) assert_frame_equal(result, expected) - tsframe = self.tsframe.copy() + tsframe = datetime_frame.copy() tsframe.replace([np.nan], [0], inplace=True) - assert_frame_equal(tsframe, self.tsframe.fillna(0)) + assert_frame_equal(tsframe, datetime_frame.fillna(0)) def test_regex_replace_scalar(self, mix_ab): obj = {"a": list("ab.."), "b": list("efgh")} @@ -583,17 +582,17 @@ def test_replace_regex_metachar(self, metachar): expected = DataFrame({"a": ["paren", "else"]}) assert_frame_equal(result, expected) - def test_replace(self): - self.tsframe["A"][:5] = np.nan - self.tsframe["A"][-5:] = np.nan + def test_replace(self, datetime_frame): + datetime_frame["A"][:5] = np.nan + 
datetime_frame["A"][-5:] = np.nan - zero_filled = self.tsframe.replace(np.nan, -1e8) - assert_frame_equal(zero_filled, self.tsframe.fillna(-1e8)) - assert_frame_equal(zero_filled.replace(-1e8, np.nan), self.tsframe) + zero_filled = datetime_frame.replace(np.nan, -1e8) + assert_frame_equal(zero_filled, datetime_frame.fillna(-1e8)) + assert_frame_equal(zero_filled.replace(-1e8, np.nan), datetime_frame) - self.tsframe["A"][:5] = np.nan - self.tsframe["A"][-5:] = np.nan - self.tsframe["B"][:5] = -1e8 + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan + datetime_frame["B"][:5] = -1e8 # empty df = DataFrame(index=["a", "b"]) @@ -684,20 +683,20 @@ def test_replace_convert(self): res = rep.dtypes assert_series_equal(expec, res) - def test_replace_mixed(self): - mf = self.mixed_frame + def test_replace_mixed(self, float_string_frame): + mf = float_string_frame mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan - result = self.mixed_frame.replace(np.nan, -18) - expected = self.mixed_frame.fillna(value=-18) + result = float_string_frame.replace(np.nan, -18) + expected = float_string_frame.fillna(value=-18) assert_frame_equal(result, expected) - assert_frame_equal(result.replace(-18, np.nan), self.mixed_frame) + assert_frame_equal(result.replace(-18, np.nan), float_string_frame) - result = self.mixed_frame.replace(np.nan, -1e8) - expected = self.mixed_frame.fillna(value=-1e8) + result = float_string_frame.replace(np.nan, -1e8) + expected = float_string_frame.fillna(value=-1e8) assert_frame_equal(result, expected) - assert_frame_equal(result.replace(-1e8, np.nan), self.mixed_frame) + assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame) # int block upcasting df = DataFrame( @@ -793,30 +792,30 @@ def test_replace_simple_nested_dict_with_nonexistent_value(self): result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}}) assert_frame_equal(expected, result) - def test_replace_value_is_none(self): - orig_value = self.tsframe.iloc[0, 0] - orig2 = self.tsframe.iloc[1, 0] + def test_replace_value_is_none(self, datetime_frame): + orig_value = datetime_frame.iloc[0, 0] + orig2 = datetime_frame.iloc[1, 0] - self.tsframe.iloc[0, 0] = np.nan - self.tsframe.iloc[1, 0] = 1 + datetime_frame.iloc[0, 0] = np.nan + datetime_frame.iloc[1, 0] = 1 - result = self.tsframe.replace(to_replace={np.nan: 0}) - expected = self.tsframe.T.replace(to_replace={np.nan: 0}).T + result = datetime_frame.replace(to_replace={np.nan: 0}) + expected = datetime_frame.T.replace(to_replace={np.nan: 0}).T assert_frame_equal(result, expected) - result = self.tsframe.replace(to_replace={np.nan: 0, 1: -1e8}) - tsframe = self.tsframe.copy() + result = datetime_frame.replace(to_replace={np.nan: 0, 1: -1e8}) + tsframe = datetime_frame.copy() tsframe.iloc[0, 0] = 0 tsframe.iloc[1, 0] = -1e8 expected = tsframe assert_frame_equal(expected, result) - self.tsframe.iloc[0, 0] = orig_value - self.tsframe.iloc[1, 0] = orig2 + datetime_frame.iloc[0, 0] = orig_value + datetime_frame.iloc[1, 0] = orig2 - def test_replace_for_new_dtypes(self): + def test_replace_for_new_dtypes(self, datetime_frame): # dtypes - tsframe = self.tsframe.copy().astype(np.float32) + tsframe = datetime_frame.copy().astype(np.float32) tsframe["A"][:5] = np.nan tsframe["A"][-5:] = np.nan diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 48f42b5f101ce..318b1c6add91e 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -18,7 
+18,6 @@ option_context, period_range, ) -from pandas.tests.frame.common import TestData import pandas.util.testing as tm import pandas.io.formats.format as fmt @@ -27,21 +26,21 @@ # structure -class TestDataFrameReprInfoEtc(TestData): +class TestDataFrameReprInfoEtc: def test_repr_empty(self): # empty - foo = repr(self.empty) # noqa + foo = repr(DataFrame()) # noqa # empty with index frame = DataFrame(index=np.arange(1000)) foo = repr(frame) # noqa - def test_repr_mixed(self): + def test_repr_mixed(self, float_string_frame): buf = StringIO() # mixed - foo = repr(self.mixed_frame) # noqa - self.mixed_frame.info(verbose=False, buf=buf) + foo = repr(float_string_frame) # noqa + float_string_frame.info(verbose=False, buf=buf) @pytest.mark.slow def test_repr_mixed_big(self): @@ -54,16 +53,16 @@ def test_repr_mixed_big(self): foo = repr(biggie) # noqa - def test_repr(self): + def test_repr(self, float_frame): buf = StringIO() # small one - foo = repr(self.frame) - self.frame.info(verbose=False, buf=buf) + foo = repr(float_frame) + float_frame.info(verbose=False, buf=buf) # even smaller - self.frame.reindex(columns=["A"]).info(verbose=False, buf=buf) - self.frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) + float_frame.reindex(columns=["A"]).info(verbose=False, buf=buf) + float_frame.reindex(columns=["A", "B"]).info(verbose=False, buf=buf) # exhausting cases in DataFrame.info @@ -72,7 +71,7 @@ def test_repr(self): foo = repr(no_index) # noqa # no columns or index - self.empty.info(buf=buf) + DataFrame().info(buf=buf) df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"]) assert "\t" not in repr(df) @@ -96,7 +95,7 @@ def test_repr_big(self): biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200)) repr(biggie) - def test_repr_unsortable(self): + def test_repr_unsortable(self, float_frame): # columns are not sortable import warnings @@ -115,13 +114,13 @@ def test_repr_unsortable(self): repr(unsortable) fmt.set_option("display.precision", 3, "display.column_space", 10) - repr(self.frame) + repr(float_frame) fmt.set_option("display.max_rows", 10, "display.max_columns", 2) - repr(self.frame) + repr(float_frame) fmt.set_option("display.max_rows", 1000, "display.max_columns", 1000) - repr(self.frame) + repr(float_frame) tm.reset_display_options() @@ -196,10 +195,10 @@ def test_latex_repr(self): # GH 12182 assert df._repr_latex_() is None - def test_info(self): + def test_info(self, float_frame, datetime_frame): io = StringIO() - self.frame.info(buf=io) - self.tsframe.info(buf=io) + float_frame.info(buf=io) + datetime_frame.info(buf=io) frame = DataFrame(np.random.randn(5, 3)) From 4abe6d83093a684cea037546bc16681bf4e35436 Mon Sep 17 00:00:00 2001 From: Elle <42851573+ellequelle@users.noreply.github.com> Date: Fri, 25 Oct 2019 09:06:30 -0400 Subject: [PATCH 085/112] BUG: fix non-existent variable in NDFrame.interpolate (#29142) --- doc/source/whatsnew/v1.0.0.rst | 3 ++- pandas/core/generic.py | 5 +++-- pandas/tests/frame/test_missing.py | 21 ++++++++++++++++++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 736264a1196cf..d3ec07e37c0d9 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -274,7 +274,6 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Bug in :meth:`DataFrame.to_html` when using ``formatters=`` and ``max_cols`` together. 
(:issue:`25955`) Categorical ^^^^^^^^^^^ @@ -322,6 +321,7 @@ Numeric - Bug in :meth:`DataFrame.quantile` with zero-column :class:`DataFrame` incorrectly raising (:issue:`23925`) - :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth: `DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`) - Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) +- Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) - Conversion @@ -382,6 +382,7 @@ I/O - Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) - Bug in :meth:`pandas.io.formats.style.Styler` formatting for floating values not displaying decimals correctly (:issue:`13257`) +- Bug in :meth:`DataFrame.to_html` when using ``formatters=`` and ``max_cols`` together. (:issue:`25955`) Plotting ^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d59ce8db9ba8e..fe0923f096493 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7048,14 +7048,15 @@ def interpolate( """ inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + if axis == 0: ax = self._info_axis_name _maybe_transposed_self = self elif axis == 1: _maybe_transposed_self = self.T ax = 1 - else: - _maybe_transposed_self = self + ax = _maybe_transposed_self._get_axis_number(ax) if _maybe_transposed_self.ndim == 2: diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 94667ecfa837d..1f4bbcb358378 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -391,13 +391,15 @@ def test_fillna_categorical_nan(self): cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) - res = df.fillna(df.median()) + with tm.assert_produces_warning(RuntimeWarning): + res = df.fillna(df.median()) v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") tm.assert_frame_equal(res, df_exp) result = df.cats.fillna(np.nan) tm.assert_series_equal(result, df.cats) + result = df.vals.fillna(np.nan) tm.assert_series_equal(result, df.vals) @@ -876,6 +878,23 @@ def test_interp_rowwise(self): expected = df.interpolate() assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "axis_name, axis_number", + [ + pytest.param("rows", 0, id="rows_0"), + pytest.param("index", 0, id="index_0"), + pytest.param("columns", 1, id="columns_1"), + ], + ) + def test_interp_axis_names(self, axis_name, axis_number): + # GH 29132: test axis names + data = {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]} + + df = DataFrame(data, dtype=np.float64) + result = df.interpolate(axis=axis_name, method="linear") + expected = df.interpolate(axis=axis_number, method="linear") + assert_frame_equal(result, expected) + def test_rowwise_alt(self): df = DataFrame( { From 4d519f91b49b215d0d7773e4dda19a367d67fb57 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 25 Oct 2019 15:34:09 +0200 Subject: [PATCH 086/112] ENH: Add StringArray.__arrow_array__ for conversion to Arrow (#29182) --- 
doc/source/whatsnew/v1.0.0.rst | 11 ++++++----- pandas/core/arrays/string_.py | 10 ++++++++++ pandas/tests/arrays/string_/test_string.py | 13 +++++++++++++ pandas/tests/arrays/test_integer.py | 2 +- pandas/tests/io/test_parquet.py | 19 +++++++++++++------ 5 files changed, 43 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index d3ec07e37c0d9..5514265f37f05 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -102,11 +102,12 @@ Other enhancements - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) -- The :ref:`integer dtype ` with support for missing values can now be converted to - ``pyarrow`` (>= 0.15.0), which means that it is supported in writing to the Parquet file format - when using the ``pyarrow`` engine. It is currently not yet supported when converting back to - pandas (so it will become an integer or float dtype depending on the presence of missing data). - (:issue:`28368`) +- The :ref:`integer dtype ` with support for missing values and the + new :ref:`string dtype ` can now be converted to ``pyarrow`` (>= + 0.15.0), which means that it is supported in writing to the Parquet file + format when using the ``pyarrow`` engine. It is currently not yet supported + when converting back to pandas, so it will become an integer or float + (depending on the presence of missing data) or object dtype column. (:issue:`28368`) - :meth:`DataFrame.to_json` now accepts an ``indent`` integer argument to enable pretty printing of JSON output (:issue:`12004`) - :meth:`read_stata` can read Stata 119 dta files. (:issue:`28250`) - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 87649ac651127..7c487b227de20 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -182,6 +182,16 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow as pa + + if type is None: + type = pa.string() + return pa.array(self._ndarray, type=type, from_pandas=True) + def __setitem__(self, key, value): value = extract_array(value, extract_numpy=True) if isinstance(value, type(self)): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 40221c34116ae..efe2b4e0b2deb 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas.util.testing as tm @@ -158,3 +160,14 @@ def test_reduce_missing(skipna): assert result == "abc" else: assert pd.isna(result) + + +@td.skip_if_no("pyarrow", min_version="0.15.0") +def test_arrow_array(): + # protocol added in 0.15.0 + import pyarrow as pa + + data = pd.array(["a", "b", "c"], dtype="string") + arr = pa.array(data) + expected = pa.array(list(data), type=pa.string(), from_pandas=True) + assert arr.equals(expected) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 55e25caafc4ee..793de66767cc3 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -819,7 +819,7 @@ def test_ufunc_reduce_raises(values): np.add.reduce(a) -@td.skip_if_no("pyarrow", min_version="0.14.1.dev") +@td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(data): # protocol added in 0.15.0 import pyarrow as pa diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2a95904d5668d..26bfefecc632d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -504,15 +504,22 @@ def test_empty_dataframe(self, pa): df = pd.DataFrame() check_round_trip(df, pa) - @td.skip_if_no("pyarrow", min_version="0.14.1.dev") - def test_nullable_integer(self, pa): - df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype="Int64")}) - # currently de-serialized as plain int - expected = df.assign(a=df.a.astype("int64")) + @td.skip_if_no("pyarrow", min_version="0.15.0") + def test_additional_extension_arrays(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype="Int64"), + "b": pd.Series(["a", None, "c"], dtype="string"), + } + ) + # currently de-serialized as plain int / object + expected = df.assign(a=df.a.astype("int64"), b=df.b.astype("object")) check_round_trip(df, pa, expected=expected) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) - # if missing values currently de-serialized as float + # if missing values in integer, currently de-serialized as float expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) From 08cf353461d91ed2acba4ffa69d19ad0405413ef Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Oct 2019 09:04:18 -0700 Subject: [PATCH 087/112] CLN: Add types in a handful of places (#29178) --- pandas/_libs/index.pyx | 12 +++++------ pandas/core/algorithms.py | 18 ++++++++++------- pandas/core/indexes/interval.py | 2 +- pandas/core/nanops.py | 36 ++++++++++++++++++--------------- pandas/io/common.py | 4 ++-- pandas/io/excel/_odfreader.py | 3 ++- 6 files changed, 42 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 255fd85531d14..d9bde2a471e06 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -171,17 +171,17 @@ cdef class IndexEngine: 
raise KeyError(val) - def sizeof(self, deep=False): + def sizeof(self, deep: bool = False) -> int: """ return the sizeof our mapping """ if not self.is_mapping_populated: return 0 return self.mapping.sizeof(deep=deep) - def __sizeof__(self): + def __sizeof__(self) -> int: return self.sizeof() @property - def is_unique(self): + def is_unique(self) -> bool: if self.need_unique_check: self._do_unique_check() @@ -193,14 +193,14 @@ cdef class IndexEngine: self._ensure_mapping_populated() @property - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: if self.need_monotonic_check: self._do_monotonic_check() return self.monotonic_inc == 1 @property - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: if self.need_monotonic_check: self._do_monotonic_check() @@ -243,7 +243,7 @@ cdef class IndexEngine: hash(val) @property - def is_mapping_populated(self): + def is_mapping_populated(self) -> bool: return self.mapping is not None cdef inline _ensure_mapping_populated(self): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2c9f632e8bc24..7760c8ec397a9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -692,7 +692,12 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): def value_counts( - values, sort=True, ascending=False, normalize=False, bins=None, dropna=True + values, + sort: bool = True, + ascending: bool = False, + normalize: bool = False, + bins=None, + dropna: bool = True, ): """ Compute a histogram of the counts of non-null values. @@ -700,22 +705,21 @@ def value_counts( Parameters ---------- values : ndarray (1-d) - sort : boolean, default True + sort : bool, default True Sort by values - ascending : boolean, default False + ascending : bool, default False Sort in ascending order - normalize: boolean, default False + normalize: bool, default False If True then compute a relative histogram bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data - dropna : boolean, default True + dropna : bool, default True Don't include counts of NaN Returns ------- - value_counts : Series - + Series """ from pandas.core.series import Series, Index diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a2d48b5100a2e..3b6ac25e7c6b4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1340,7 +1340,7 @@ def _intersection_non_unique(self, other: "IntervalIndex") -> "IntervalIndex": return self[mask] - def _setop(op_name, sort=None): + def _setop(op_name: str, sort=None): @SetopCheck(op_name=op_name) def func(self, other, sort=sort): result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 09b80d1b3a9ac..5dd4cc946572c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -7,7 +7,7 @@ from pandas._config import get_option -from pandas._libs import iNaT, lib, tslibs +from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask @@ -53,7 +53,7 @@ def __init__(self, *dtypes): super().__init__() self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes) - def check(self, obj): + def check(self, obj) -> bool: return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes) def __call__(self, f): @@ -128,7 +128,7 
@@ def f(values, axis=None, skipna=True, **kwds): return f -def _bn_ok_dtype(dt, name): +def _bn_ok_dtype(dt, name: str) -> bool: # Bottleneck chokes on datetime64 if not is_object_dtype(dt) and not ( is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt) @@ -149,7 +149,7 @@ def _bn_ok_dtype(dt, name): return False -def _has_infs(result): +def _has_infs(result) -> bool: if isinstance(result, np.ndarray): if result.dtype == "f8": return lib.has_infs_f8(result.ravel()) @@ -176,19 +176,22 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): return -np.inf else: if fill_value_typ is None: - return tslibs.iNaT + return iNaT else: if fill_value_typ == "+inf": # need the max int here return _int64_max else: - return tslibs.iNaT + return iNaT def _maybe_get_mask( values: np.ndarray, skipna: bool, mask: Optional[np.ndarray] ) -> Optional[np.ndarray]: - """ This function will compute a mask iff it is necessary. Otherwise, + """ + Compute a mask if and only if necessary. + + This function will compute a mask iff it is necessary. Otherwise, return the provided mask (potentially None) when a mask does not need to be computed. @@ -214,7 +217,6 @@ def _maybe_get_mask( Returns ------- Optional[np.ndarray] - """ if mask is None: @@ -346,7 +348,7 @@ def _wrap_results(result, dtype, fill_value=None): assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan - result = tslibs.Timestamp(result, tz=tz) + result = Timestamp(result, tz=tz) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): @@ -358,21 +360,22 @@ def _wrap_results(result, dtype, fill_value=None): if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") - result = tslibs.Timedelta(result, unit="ns") + result = Timedelta(result, unit="ns") else: result = result.astype("m8[ns]").view(dtype) return result -def _na_for_min_count(values, axis): - """Return the missing value for `values` +def _na_for_min_count(values, axis: Optional[int]): + """ + Return the missing value for `values`. Parameters ---------- values : ndarray axis : int or None - axis for the reduction + axis for the reduction, required if values.ndim > 1. Returns ------- @@ -388,13 +391,14 @@ def _na_for_min_count(values, axis): if values.ndim == 1: return fill_value else: + assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] result = np.empty(result_shape, dtype=values.dtype) result.fill(fill_value) return result -def nanany(values, axis=None, skipna=True, mask=None): +def nanany(values, axis=None, skipna: bool = True, mask=None): """ Check if any elements along an axis evaluate to True. @@ -426,7 +430,7 @@ def nanany(values, axis=None, skipna=True, mask=None): return values.any(axis) -def nanall(values, axis=None, skipna=True, mask=None): +def nanall(values, axis=None, skipna: bool = True, mask=None): """ Check if all elements along an axis evaluate to True. 
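(A brief usage sketch of the two reductions whose signatures are annotated in the hunk above. These are internal helpers, so this assumes the private pandas.core.nanops API as it stands in this patch; it is illustration only, not part of the diff.)

    import numpy as np

    from pandas.core import nanops

    values = np.array([1.0, np.nan, 0.0])

    # With skipna=True the NaN is masked out before reducing:
    # "any" reduces over [1.0, 0.0] -> True; "all" over [1.0, 0.0] -> False.
    nanops.nanany(values, skipna=True)  # True
    nanops.nanall(values, skipna=True)  # False
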
@@ -1195,7 +1199,7 @@ def _maybe_null_out( else: # GH12941, use None to auto cast null result[null_mask] = None - elif result is not tslibs.NaT: + elif result is not NaT: if mask is not None: null_mask = mask.size - mask.sum() else: diff --git a/pandas/io/common.py b/pandas/io/common.py index 0b8594bbbd3e4..0bef14e4999c7 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -569,11 +569,11 @@ def __iter__(self) -> "MMapWrapper": return self def __next__(self) -> str: - newline = self.mmap.readline() + newbytes = self.mmap.readline() # readline returns bytes, not str, but Python's CSV reader # expects str, so convert the output to str before continuing - newline = newline.decode("utf-8") + newline = newbytes.decode("utf-8") # mmap doesn't raise if reading past the allocated # data but instead returns an empty string, so raise diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 3be36663bac79..66a186161e01b 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -9,7 +9,8 @@ class _ODFReader(_BaseExcelReader): - """Read tables out of OpenDocument formatted files + """ + Read tables out of OpenDocument formatted files. Parameters ---------- From f2c42dc03b91ca5c3e215b196ca9baf69100d4b0 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Fri, 25 Oct 2019 11:18:34 -0500 Subject: [PATCH 088/112] Changing logo in README and removing old logo (#29215) --- README.md | 2 +- doc/logo/pandas_logo.png | Bin 21842 -> 0 bytes doc/logo/pandas_logo.py | 47 --- doc/logo/pandas_logo.svg | 879 --------------------------------------- 4 files changed, 1 insertion(+), 927 deletions(-) delete mode 100644 doc/logo/pandas_logo.png delete mode 100644 doc/logo/pandas_logo.py delete mode 100644 doc/logo/pandas_logo.svg diff --git a/README.md b/README.md index 7786eeb0ec5c7..c299241722b7e 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
 <div align="center">
-  <img src="https://github.com/pandas-dev/pandas/blob/master/doc/logo/pandas_logo.png"><br>
+  <img src="https://dev.pandas.io/static/img/pandas.svg"><br>
 </div>
zJ+L)Bijp_3JXYReALfxQSlAF_J4 zj+ClvYE*0ge39mTH1Lb3lke&4V+3%dq-VA)HX(t{70D;_`%nd$jO!*u-u~E^$gjfA zmJq9!J+ihh?8!O8lX$$LdmG)li zWv%1jpggGa&EwG)fPwihL>H+X)1+=z|A5>Wo=ad7Uk5oic|@afd3iaK%z{v-rdhH& zIvN%ki2|_Ffx*G=3=ETn$OE*wWEt+L$jHb9?iT_nCxhMHegMdE^YGX>IQ-6;jEsr- zDJTd5jph@kRu&Hg4Fn$_-&xasx==-hQVHcp|+RFORQd@$0=Bp!t;~XWN^7M3aMn> z@@7T(xG-;R&y}~b z?5#pl%P8EbzJy-6Hz%s-w6u_IZ>~9lg4DYaJ$#Ai+(nCXXevS_SSkouw8VUT?N86& zfBwA~qN<>GI8wmF!z*5{Q=|DniV~%=t((K?x;=np1JA;DvpaCQ$<8?Bexl98Enhlg zY^F&55XhL{zket73YsxxfkDFC7^Z7yHknkZ)By*;z19?&MYYwL0gG0n@a=7k>)haQ zDle3PXNf%W_2u!p_3i2SPqkNYIh`wDX^oX zLv21gJNp6H^YNr+ycp_$s-ji!{q1#BtATfAWhIf>1eM8XMCW3o^X6!F*gys(RY1hZ z82uTS^_TEC3{zxiEO?)H7qA7#jo>M0bhQpXZ5!wyRFo|^36CeWk-d~u8<7t?h z;}a74_eN7jwVb;Dem+^O=`ZvEo{C1Zeb9b}*P4h1xPM^a^<_Vy-N8iGk-XjR?k+%$ zrcUZ4CrgxpNw7MejoRi;kHH} zUNl;z-4i7H%rws3J((li^#1mIjIIm@%;r!WmSDDZ_ZT3)t)Fg8Ldt?5d{_={px0I~ zK6U652_DRNH$qtBvSE6U*%%8%oA3Q#j*XMGy+Q}uZ>DU-%*|x4{S+-F{I*9EmfkL= z!y#__+DQO@hImffFN|L{E}1Bc?S=7Yi+^Iqse=o$=vz{#@xK1VAM3VL*4F(olN(Q( z-#OFP%(s7PDwZ+X(`=IQFS!zH%#hqr!}uDK{gmh`i=FQlja`n*8gL}MGUbY#fo$rs zmt-7T{`z!Wak2(lF1`K8C;&@LVzo$CBwZhjA*%sc&&jD-N+z4-%AY&9nkN~`IP>+k zG#CQsj?y4bN%@#+))1^A{GWK+yKNH_V zwhtk_kOZ}sW;giMt9KO`FBfs#HSnEHQ!h=}7o;L`Vi`x96Co%K9!`tw z?(VpfH_EDp?Onh=b`pCHV+g*9P zwP%y}Lf!7V%M|^2-xEWpiA;vkvASBrRBfoczU{35Codw`J?zYDb$a5tf%d35%aVjG z1?~YLhCCsCt@mqgz&LalzFv>>v6;2fyd3=ei^zcLiWAnC+cG8>PpfHqc{l)2d)&G- z4!ge6jiU^&x1^Dg5j6|=0AH)NM`&#XsS!k6))8|m*`LgJ1dmTo$)Qb80A<9*ONoxw zn_)E^ip3oCB1K6IAr>{~{N7TRAaUYTV295j(vWHI&}+CL1NPTQ;|E7YZl^uVo{WR* zg@&>W`A$Sm8NOVl**bxT7Qyc@c5XQ(*}ghoGcbewq4K_u_&Poc6a=5Io-sh5voUQ@ z)4^760;gPYjl?EVKYo3R-#A#e=YYo51IliD=wal8m!zQ4u-!ju7Q<_Q6F*Zw>3?xC zbQ!IWVEtaX=5en{Brfxlot^yxK)0`T=Sy`ZW0^boI&bdVx%L1#vzkezrKKe;6aCEK z`QrTYVv0pGduT7zd6iG^fO_5be5U_!v59%U-HWBoJiIlFCy~NBdcec5|Avl%jir8j zaZ$P02I-s6pCxmV;f1)pTLg0c^Uts<^K82hoSvEJuUuOC$yQ&6>nvU|P#bi;c3;|D z3FwpO?4(zL?N}`6)J3+WdSQcd`{xG}cBmh(bBw*Y(rNGfVFr~LV=M(7SPPlJplXjT zY|lz(OD0Q(t#4|&>6o$VUp0AM79h~gLO7Dv8m&Y4aJ7ZKEUI4>`Y4RfQzaD~uX|eS zU41Z6P&$CjRffX_AU+V-$|Ysl*50DY7kJ(%--HF=Id zXYwl?ruf0?a_!<0o4ysE2S_SwgXlBZ_gZgDlrs@+G%sREvxiWQ^$K8Y^UJp)P zU0nb)!JV9(V#N&8?)w$2U4FuR@%8hQ3@#NW^oWHabg!IL67$Jq*-}c5kEj@7wPk{^fcb z9Hbo76MjLed49J&#Ugk-jUuH=T|K(iS#?~tU?p6Sbl6gIJX;z3D21IbG6#QU&&kNh z=%7FA)6O-*l+ZhDm1kX12L%NQS%iV;?(XjV0|I7;_6=z!CnrD3HGD2d89BX_Y016R z8~58;h?%Cq*MYq#^>AwuiYpIaz)~Q$SU$*hFvLi1qHKSoZ1S%fsyA3cSt&A#DIdoYr3<$vjh^&+a7$8A;QwZ2R|d(AB)m< z0VBP*otJG^$ZwD#*Lix5w8gQl=uB7E>4S*}eP`VODDT%L&yP%;yx?q-(+mF2(8RD4 zx3_6()iNcmSnG-qU(_lVhOe!lU=S}eG%%p^baVK@L;+f}ZnOpeM*)sNS$X;G8#DSZ zHKs{3jE3gf(yS^w1qFpl`$IWs#IGGd8E8&g-qHGeJ@FIyazQ_0)`f?HB50iVIW&;K z^`BH4=VX<>@ZMP3Fi;^%kdkA-s&R$u>MA@uZRgJG5*mWIN&%jpJ0(Zy&Ld(9lf|30 zK(V|3N=Y?0coXx$eFX~^&~7elCE%K}(5gaAywp@n#2{W@8wfokRp`!yM0IbJQ;u#+ zsV`@?)4}+sE|*RnDcXOZ=YE6rDt@V&me-a7AKluKr)QD1?k~rQ_?mKs8Fkh-pLdN% zgTNK!$cl8R(CGzLO6frfi0$ircMWxSmq zmoQ>zKEXTTh`n>w0TIzl?d>UI-rTi#B2_1JrCQ?$li_3u(?MgThtKWgu?y>#s*n}- z=*TIP%j?7OiNwNS;688dK-?VW0$xK>H2gA;!LqN?`44k*xuNF1gxFXGV~lnTzs?Sz zECH0k;KAdzwl+jU-qec=Yfi@#k>|Tp8$go-iVFjc$x_vtfq{X_x8Jq&`Bw&uKRkyN z_FU{-4)YQofTBZE%g)i!2;jzu1l)~FwmU#M854sh1&EQa%Hqkpl^`ePw9xil|2zf__;;0@0IfZ7-j1;97@-2oex<()O)#{hLN^$S> z;#E5oP+UaCxp=xguc50b`kdLg*r3N^Z7_N99@4c-yxp&FKQH8vkBBVFV{8x{%zOd~ zbgd4CgGDTPS-CdccMSKY11db0CAzI!rPJv{$&mL$^|10lk%T;2qdPVw8ri=R>~=iL zTKcPuI`I?cXe0~_NY3XN*QGYW&{bUee{`6uZ@Chg1&z7kf+sf-&F*0Vr<%z1Y0$dL=4< zuBdrlLXtQ{Vu-}Rz~KI{87OoLJb;g}8Oh*V|G{ReDY%U4dVAE1%=f}CAOODHU}e1C z<^f239R`9^>Om0^#zV0bt_vna-1QB^c~B| zPxkfuiS?eJ?!b&ELky2EMx1G_HYr($$zvBJ+Rrmr&y{BVV2a6Jb-PHNv|dV2)M!GF z%+VUV#Q;?E*hG~g1^Vn1!WPYQi~A? 
zIsQAt8bc%@4vI?rj?)yjJ@sO{|9OFjv~fh=Y`=wxXY&=><8EITHH~D6(-*bp?;7jQ zTwGi$E1}mZ3YU|@gxGaqxNPCu{SkKet4`_~@VyF%^6Fg*V+DUTO80b5@iIK_lA_dU zzJ2p%G9J5_k(XA*%P|^$-rn8q1q$%RMjIj^S^@vC+GLs(lSa*;x|$y&n)q@ngl>s= zxSf9@mK-0y36r{?4C6GU(z*5N;)_dq8)!f&ZX?q^vqbqcT2ZlciELUr+upjvZS1GF zOOS9#T`w^v34~036dFz-bmE&E-_~3;M^YRM>?Z@CIN&m4fB(+1wdrDGj>e{;-g%<-SYbKR+SofBV})IU&acI0IT_e zcZhiRZ&|Cn+S-}?&UqT2qUrF@BxEds%^H=TtEPWFh62)J1|UF{>hwc z1!Q+aOP1})K4)NS$lTstoj%nEf5gVqN}S#nR#d=zSA0dr;-O?C9M4Ffm3np4#K(l? z^UGv+nh2r8{-#Rveac*YfA(5{fmFHSv>5Hk6=}sUz2BSpttb0~4$nDjG=6z8t}+Bs z@))txcrsx)R+OK;i(Y;!rt-MW6M!K5vh}2rqVFPMQ2srlTc9& z7dO@drZ7NTP3xMPnixtLDYEhs)hP#*ZcY|s;;58u0n%zTlBjFX!^@kl-yhDh>{WD~7Oro=M1A{j(1>W=m*DQ=Y`r3UKY)Of7bAHD=iwW6B$ zGN>@{uC6ZgW>59)aIR2&>aDFUK&q&79Rszy>MB zF&=*Y1Q4A35MNf9^73-hWApl5v;yCD=*~9URK(Z-S(_fun*Ji>VHQbE;m~jS{EoW; zkBOe%Wl81W9Td#y^PAnBI^Vpe?;Xm&q!G0|a>r@Xi)`%bMYOB0BI_vHsO+*)g7AGj zWC^+du~NY0wQyPL&w=8$7GSA^TCBPbn!ZF@qPVztjp^&@a)Ua)@yt>pn^n5(GC-OQ;%>&Vb=al!_kV>i3>rLNw)VCT!ZnoM`lM(@92bJR%ssQ{t@FNS9e=+&9Xrle8Qa96GS zF>_}TsJ+?Noc&6vhoxi;3=OMncg1q5L z=SH%BhlDOg$HYjs_1|XAAG-hsN!eI1ka^s4>T`36_4M^=l7*v*F{xE1ew=kCCsMxP zqodD2aD9Ot=^r2U(a5;VZF=&R3fxgJ-lAW&gsu80Sb1_R(1 z`;&DHrbKy21zmmWeW%8fe+MNs^$wt~usnBH=H~jV))-+oVJcE8`ehsM3`7Y@a}^a8 zxnB*E8J#x(g1FmRQ=mP9FP}OWkL!)j-bj+rX^C+&j@C@LVy?Km4omasfLJS1}G zUL2KFB=ZCki&Cpr`}4TnsemE?dnH;jg_Y%~N88=Kbr3De5BhVptCUYBhYUHo{hh<^ zugsU0Le_3c6$jUeCT+fKhw4NI3@^oioS`W2Q)n=@?Miep!#WHqt8FKd1{JrGt%{Gz zTP#1dA;oaFbB#inPk^m&nG8wCdx}{q@g}C*Yb$cDeCkf=f4hE z^Or|a%1MB3*X{Ln?4fMi<>kI#)xj2nujjSn@@6}AoOk_1Z{L0H@hBr`P6E)=74`NH zCUXFx>&*o~k!LKN4|5n$(Q)hOG@E0?;*7SR{)L8q=yr7w zam%jRla#z~5Be*sX75$e#6kT8ZlMXfj;O`{DWa zXq@h{hi%{d6oF%GU~tj#33*0L!F2Q$DC{-?;++ow$_F{>>Frfze%tB|0qm>UfdG6! z9Px)kqh5>PpIGsKXtvE}L=yM7$c$QOile;GJU)<}jn99A#9Wb>x2Akma7fu7Nh0Ur znOtr3arLs$Ri>$Yd9o&v=M`9pghJ5qZ8-TZN`a7hp#0cbLRr{Wx2dKzf={4{d$u5= zcdhsM?dsD&=~1 zDs=!z=ptfb*{!X3VM0IM-MLfPtoA;{EP!a&$^GPhx#+e(S(pq~UVA!Ns3HQY76#M# z)pxV`r92_$sKqk@U>Y5 zNJvHH>+93)_F+NB-m+CY34n;WY?hgmeeoxaGN{R%WqpPIB0h+K!q7& z_sMpR^HwRQB8Tlj)zzBfFu8sc-}E-mcFkoUKb(MPsA*`&*0e5Jn(s9h*k(>RT3cHe z+B{ko9X^vur;oI{Tmxou%w!#)-&a{>xD-$WO=1Ip?I+mo`8u*@X0{KQN#96G{{r-E z{Xk7g36s$4g#p-XKz6le^WkOc4b^(R@FjN}`}_TXfZ_6T*&kL8lm{RBPBKdbVCsK( z)bCI8?WIfoWxs@laY>5m{2#CPuSKXlOfg|C^8ixfnip)Q*>#9-hv)DMlJ*l!@`<|Z z8#i{q+KA!@(8<{o8?*s81EF~hx%3F@~Fl*;q2_JYO{`F?VU9z71nfpp4#K^+xUzO;- zEL3Aog4!E@XbT9pmy<j zSP{@EBMpb+Re5-U-~T|+7rRMxa&&kAa~}DAiy?Pv2kH<$+OG!jFTyLfzVoMf%qpIB zMiB*#CafOvw7PC&7W2i6(@MU zoVcgtLtGfc=fJ~SQh?<_Jnk$+5ecg_T4(*1#sHLhy8ZDC;|_0d%!VLHxM=T#g1?Udv%a<>tHLWRKV`Jd~2w1jnFD@xT$pFQz1_+b8 zhX)PLeM9?e7{CM>oiB7s=F^#skq|Jc>66H3Pyy?`>*LflMv^6-dO(nnEGUO(tsF0X zJ~^4_>(4185>nE{jyY1&ewp1}jWkYWz5<>BsN%`@4{e7(vStimiW8L%F z0+UW_1Yk}*z}Pf7UuI7x0}{asVCk$l%xeNpskgVfsS*glcp);JQ4A;m_`_U3|2Ka$ z?HU*X*#^Kh;a|TXm+LLIfqo2PV&Y7t5*vH_p8sn?vRq;Ru;O!Ue;&_Q7z3~rw$$y1 z%sUW@&GY~y=Lf)3WVhM!m#qYez8^=cB}@5D@NyPlkAP4w5PRqZUxJlRS6EI4-Cl(5-0){3iMSYEcP zFEMQ7gmt>myP*y}9;@2m9aW4S^ACR^9g)KslD zfCIA2M=D8561B9XjV9jQ+!QE~O0%{uT*jD8e5FE%f4V=H2F!1>Jg~vBvHZ+T?=8JY z(WUP0Ptwxkd}WGEHn*CXx5zGoja`D8c4~|bBpx)2Kjm*hx5xGVX=0+nR>r7y;^YME z=m|^(GQwXGs7G}dU3X_1=qRnMrZGzoEUw{K2PIqK2%qa+} zSYM~!jEb1<;m|G`k7)suTkqjXq@IUzvcCa2!*CBA#WXyGe*1?@f}xYK#sm2b5-na+ z{4`K3Jinm3I^jr4*yz<%tadix59?X(@m`bD?07armC`ke1Z4Rtk||%_?(`K{(u+WA zao_x4n{h}GDvl^x^WexA&6hFLWJ&Cs+JjV*1P1DGwnVw6{BUn?u5gOlH@~;D(+AL5 zRW(zWl^&j--%WaB`DK0^YbyhHDguu$@>Rz6j{2!fP=SS#R$;f@7MCR$Q4;!b_i=&Z z~4ENEN)yYo7e@UE7wty2nv+-PjU9)G>jDwqH&i?P?T+gon+|e{U%_5m< zF``W^r55u=qD$k)&h^h_0>ee2@-N93RMX8;W;1GpLY@Ov%-gZzdl$LJe3-&#**%kC 
zWjw-FvI(L}s!}<3`V!|VmVS%a2vQ)GRoAtU+MolmXZST-O10Z0=;WY}zgtBDl zvs>``-FDQjn`251ebc0CO1DdG^&DI&@BTKI7}^g>8br^JR;G>Gq%@^EEoM6e*Y#3! zRQgB54^Q(T-ko$FA$%dvYeuW0B>5CWN4WK#jdU51{mGBVjUO!y6dKqlve{?qg#z2cydrHHGHWSNPBg*2x;oW>=y}3cT4} z#6NS2M*kZO%9@b$+2Y`Myl8D)KP(t8aXOFIl72I4D)t-yCXIeqJ#1@${OEr_$@-&` z)QC)d#;Z_Sx~=5GFj0B@FW0EH6Fcoys^7n1%Z*k1a$EbB}_oRP;x z9A-4WFe$9Oph(W=D#MI9dHwk=E8tPdc5v9V>F0V(gn7OkNK1tr=SMKkz-C7hen6^T znLQ0ZIRB^2O+Pt`w+IW)p7or6)cJhP9o6CA8^!*^)e50|`2jz2!`SC=+~e7l#e_F{ zvoNhghR{OYUd{}UbnWt2y=tpu&aijVmjt%(_RiANny+<+=rC4z%w}oH_V#c)HOI=< zR5f?CnJy3$a{n(F*%wP-FCuC9M4NzkRBVR3Kd+FYJ(_7@bM*9_ohz}_TD7jWgAnyy zNW1z4Ixlzo62~Kay;k zykF~ryIZ=UwJI*-*sSVQnHGqa1#T&ViYGl#cSB?Hg_OPCs&d@^ysea@y&*op_*WO) z8=6NbTk21feFM(kD4?9E&}MKPp6lOssatQjCt5GaEcG6CNmxW}2*5&a8_0c^(7Te3 z`FOs74;5zyH6K}u!pR%H<{p(4rfrkotDln0nwBpBtSd?Ot4}T|=77DkA*m@$-DyR& z#O1{c@cYz^%T|oQ&=QUVWnI%@r|9jbMvT4+=B9kW+ScjN8ptPJQL7y^ooh?U7jL2+ z78uBCM^V7&*P=~WZd5dAbGOwma9_F6gIT@)4t8&|Z4HZ5&Z2B`Ln1bw3Ps|4Kf$ka zDrZjA{A@!aKN@AmA#prk>1<*`x?_0Z!Lt_))B1dAx>G?e0sA%2Ku2jrE1$jbLUTc- zjY5=Pp6JaaZw!hs%cV z^3_$F@=|3raUIdA(Hlfvrja#q2Tj3qumfgv(Sl7mOJ*5WKYDqo%?YB3v^y&0Y-^jW zoe}J-2J@oyp9x97Co8TPJTua9E)K~1U1v24O6Vx_d~;V&<*t(RovUbMMP2)var)(W z_9p|q(#;yH`6bTY)Bt_@nWM$+ZI3WB8)AO~iT(K?bj1(MGTagIfH+~u1n6yt3t+o; zq>#A?Hv1Z(O+N->Nc(;9l3giQl}F2`2ST$uNQEy;yFawyI%&r%oi-rOeO**TQVLOVd|1~WyRDt#E4P7d0^0aK3@4HXC z&U!$wC(A}Qp<=mVb-nI3NyizYu9~5O@!X=`!DA?wP^-joRJQJO`(K|n5XHqtUber7 zWYxy{((plS*a_#&4*7RLENSqT>INCx)Y%E-$z0~kI}rr0$s21A`rfxPNPuh;za@aB zjIynikS$NuV0+zKcc-a?{}|BxhQiGC<4@}?(DJHtTQvLPtcv-g=BM%XpLOA+$6*O6 zpR9|I&av;Q5P z!xn|tr|#2A$-(*{f-($-3l2?$757iM^1dw)D}c-oesG^voU%O%s=w3x8^%Akisu;( z=JJooX_(!1YjiNOtBQ8Pa%!Nkf%ueOW3;UxyioF5u+U6-&AS`c2jlV zB^%S{KAUcOuOsQc$0v7Muh87M!NQ7nv%9f@r1hP2zDk+G#YOGw9VOHk8&wH4w|^|{ z+W890t|9!($GB3VPXg)GGK8Y(f7i+m5eT`IVlOLmc5hKJ$ekCc0JE&%Fv>)=DQ(X8 zf2+POu=W9n+B-?^o_#5K;CQa`9C9VKPnRi)$fzL%)^`uL`=@mTd~&n9Tp5}YXBA)2$HfcY78i0=NHpR`4tDmbmTONpoglPgrPR{`Pl zujxL#UuP7~z+DA&v}Kk)>szl)=s410j}{Vm0;>jrMQ&k1@4N$N2RE!Pu;QnD-X2Z1 zC#p(KJQ>X|IjVCW8&|R`Cz_*Y>tlln<8V8@cJFnCHsw6Jja8e7f`ZF9+ek8Je(Zd9E$&P>$9oF-4=%!V`RV?(7#gxj1f1(Yw>Z(d(;544iA&3*AwjIn&PG}oZ_AKzb< zicDZCg39A=qKFo5$Poz zn9jd{XUOh`UBkU5utc(S2*0$yF#zJr-Th6Um*X8SMvzpuqPAW2X3-z2B;JkwuH9Du z0@j*EGr1Sf6uz|0(?g(WOS7Asck^>I**mNYowxLSGuc2Px(ayyc7^3{fJ^6DebX0w zeLU%4;pL0%*ev0YaCt`oR&j17!SPtznKwts^H3^;rT(mT+D*gl>w2sjd43$dgbcpw z5-OMB<6+V}NYa%WuRla$4ye?rqvyj9+IpmGkk7JOn#Kdx10MAz&+)*V zH~qo}v{fAJ0@~l)-uY>3!^l-Wc-A(?Tm4M7_)BhV31awZ+K=UMcN#%jc>Kr(L9F&< z>SoLPjWre;|HK1y)){Rtual(PSsYa37rcLdGD!H2H|no-T1YSAZbR+^XNFYy>TVj- z^jV(7D|qR%r?kT>7-E-WDQP-i_zeM?r-l|OvA0jgl1e$1N`66 zp%6`GULf>)3S7}5Bh!cff(j0CQ3ufE1Fv>!b%_mjsx9w>)Ly4-5eBJfb@ko}Xlmts z_%D`Aw)*|wHzpsZ&1&4n_NId}w)QrZ(mFL~+mLTEh@2iglFZAR;}Dx_kvy}E z=J`uQ;B>)HR2h@8NCUPEDt2~mxzC<1V69Fq|l)e>r6}MN^sZnW<8>% zqtUr@(j#(hYn?3z*LsV2SYV=u?vi$B1YoIgV^Y(`-wMsR863#dUU!KGfR5z0P9MA= z(cIb>Bk+-sAZ+%flCP^3>f#pj)U}+;n1|*_OIztszWG9V3YVGrcmE zU!c&E<0(a1!h_2N+j7#RA>$KM55;fezC3bGBKLI$x! 
z>6ZhoQ-YflXi?!^A*yMnnf|zB5?gN0j*Eg_1UGHJV0GJ{s@Pt$DEJ`F2lKVl52Q=T zzo|Z4QFKqh{N<^$TFxskBF42QNvNZ=-j60EceYxoBKM5EL0E3qbU&1~LT+aeWLc#+ zSCLQr{lL=E=#h!W%OnJj|9hqRz?z+MA~QWY4A=HGfpNuZrX82nO4=@=AgGgHLX0#9 z+Mo5Fl}5;l9+0OkeCivGeKJ)K-004b-x0_AyLK83vUDeaMVOoYE|vLYFZV7Y92QHx zNE_Fl8H($Y!bIuv}M*$NA{?1$ARORUI8znT&l8_OSi#AWFZ|E7Ih|wK@+~dlQ%x4(7HcwG>4~>@soqF z%`u06y{<>`U}J~(RbM_Hk6pu z!R!0*ENNi1-1~8o%L+%PWldgwX{v=VT{fGD0VcYg@VL(3dnDa!9}!|WWIpIq356JA1gBGJS(#nJhNz9XSXxI1-MDL?%Zt z9Q~l zoib&BHTh%-#T4XJ-y~z8`73&uuDmOgSl3iSMMSuh?d|4Q4%%EIlZ?adW&*;i5k5}cS3@2I;V4bC!&VaBI@bw)bqRd{r%=Q z^ZxbzxAV;I%?6b2w`w`CIGNdvOhE;R&_;8{%;jjA&_(mFiK*sDmX}7NVA4d&g z3lW_>W;Qun&-aL_ay|(1yL4SoBy%6Ic`>0ATjI4dQk#>K^a4RCoEJ)_a^)g-1zl%5DPV7lJV<;reR|JmzlBKZei;&it9i^{FFa*U zQ)<|8{OTgFvp=^6*LQu5r%l?mmO?*NMn;9klZhN1M5tPYkb;f?aWxTr71bY$HQ(G` zj<4^mFFvaHO7dvsj&p11xEMQKvru9~o*Nw<;$z_C(WMY_c5VdJZn^W-jAC&EixH zP@<5i5zva43gkhy5PzV=qgh~t!k*a2=u(tDsdpRxD_f;6NbP$NczAeg{L=|r;$w3# zE7V;K&TTj-^kUf89RZ0n~U zv6`IZiT7j~+jX7D5EZD#(W9RDOZ7xU!#fvw(>%KcH%gt32O0!*-&OQ#JUefS65#2~ zlv0%KT2lv~49;kt-Q4dl`Fd*48x)5;pi zif3KCNFCh~m&%G18L7{+Psul|8?|4^{c89aPkm9Df${MhTG@~g!!S~_qbIH@@pQiC zXPe}{R9~D4IaLvdr{r60XzO-w=r{EG^2*)Ps*UuqOt5zPhAy>PrN7<0l{@1ZjNJcI zURP)DoXNqNzgSPv13fEMrqij4rHp|N-Y24bnbQyC-uV)6;Or>h`GN#H&^)R(#WhFI zRG47n>b73+J0RR8y{ybG(m+Jync75j8)e+Njb-=VEdcpg3;`Xox(Q1ze2%)OFX0-% zO-fFAqd2^p{jBkq=s}vXjr>;0Qyx*IjsM|YUYeK~;$*sE?_M-x*H1Wc1^Uv`ZKGnJ zKYSz}`bb6+z$X2$0srXcX*I^`UG--AIE!!wY@Tmi2ihgL`{H}U$PE>;iU}6nLwcqE zQ$~J1V^un-&>%`WxDg<*M-Vts_m>P%eUVdGs$!}VaP^?|X6$9%WuXZ2K-$|MX3am) zGa)WQU2=U{$FV7n;aM>{J1-Rfyu`C;pnvRGveNkcJx>&|(zv6vE?am15%lyEhVD{C zSK>}O;lwUvCWpdoYJQX^ELw?fw3^2!WA4;Koiz#2A>#OiVF<3UyPIuvHB%{fi<&yP zdYIU&G4&?EdNAoG$Eac|e^G+Dd`v`e2{L-?ZV!}Zj8G=*A=g||Sq$)#NSwO*070kI zqkPs=tsu5aySI0jLkl;PB-*7f8tb#8qbI`>nk`@a!@&H`b9q#6l2cJyuG&%0w zRfegXzzm<&IG7rXQBbAt7k%KwSA)o)14GY$+F<_q=&Iik`dgSCqpB9`vpEYg^q77e(3+u%K3yZZ=AlB{oC`2c6dtU;R77R+p0CpBSVeKJpq*bpD|LnmaU z3c~V7iFb3JG+fcr!LG2lCaaHu$_Z|)lYZ+?C9J)#PY3trFZwtRBn9<`M)u3O@=)78CV6oM12Lh_HQrNa6aVzw_2)9EszE)AxRb}wuQT-@mSy2cGYNrx%?3QUWoyEUeE(;tfYJPqt_&?oa&a!Ac#Y zOxpnW-SmXx7@YFb+Ar0QS*!@o9_mhu{`!ejyl(3HK<<*J@9>xPQNqm%RYsG~*wq1| z04F%87P8F@lMG9S5k^DMS;h<8Wi8{=-^3m&0h0FY9h^&gbtZ=WojduICg`f|`or1H zfU8JmCnoP^6C+Z@U#62Pm3XiEdT&3|KTT#^;3mvw#>A1QKk*s_(>1%+)p?7Xd-L6M4am(+}~YveaA>@~61wZAy#<$Zx$;lacqKM4~ywFp&*tmGw&L z*h#ePoeG5ek;XJ;$U>oJq;$+1rRm{eK3o)_yf-rA-q3-EX0oMNI7OV!E|*7hpw(B@J4#7QJiAp= zNT6rVH>Oi2apE=rzQLkgoMxAEP{}~O#lbR?3y#3WdY!94SF8`Od>eNvYV&0lwGaL%H2}LmiXq> zQizMQw$S#o9qQL_n0cD`F1d(Rd)CW;*aLfVac8oPFGq$B4L%-Hx8lFsAL19*($S4g zRFqS)rQ@9DUu-GaD? 
z)t!6h>Tz^XZmVetWT=HO7kXB_CWm_Idage4B%)P1YpIoRzQ){GF6XL1fUecP;0^%1 zUNr1O4%O^TJ!jub@FNHjwrZ>{I8u%ZEjP=FL)GAsKw;BW0_7nhCm zb|@7cYh34S#(@j5uvUKoH_;L;v9LDP<=v-L>*b_6kMBr%2NO;sa;Z`GkvLe(A?rAe z-cetl>;W`8w2+(Lff1X-s|+ML;=PDj4x_V@nI~|kwCtVi*O$Udd;;DRRP5&k+-``U z443Wf!mA}-=ao`8CPH5|f?P2)h#%!2M#13X=88NtIh38BJq9J^h|elzK6ql7+T`NG zaAKsWsu3d7Ip35UCa!W+x-zx44}xELav=ROl2ct=;EWSd3T-fNKr|53dgBz~O;hthyJWY&PoW8NJY z9T)A#Y|mV+tTtp8m(1B-WhBP5a}_WTu0QxvR>qA;)F0hhgh+J81(pb))7>)D=;e%+ z=(tW*t};rT98FAK(q{?HdYFdc(*Px#xnqAQe`M}78C|?dDB`AAE!i00-4rwZDv1D( zOp$*eU+4(TyJKKqMvAsy>7!_E+VBJGQ-mxU)^l1QB{ddnX0Ln8=S|!4oELb#*Is z#9zw$MEQ@`=SnSv-L?drY4b84yIhUYp#zSMBviG(P#7QoE(p9|uDQ@tAE}UJwP=TL zaqEwO+M1pDKVgDO%Z58rGqUoT!J8qy$5g&gzx66mP|;#5ff_;Yt(dOf>wF+afJR@) z>vmhQc0bP)?~HHag`kwLMs&pF>H%E&Gs#oL#8vnZQi#KdjdVnaYpsOkRb><1;z?_ zO8sT|mXpO5UUN+-J+o0@zi+Nw@Dd^#%zZp+7sVl&AEv9LtGbZM{qVNlfR9#M%$*>% z>|}KJN^~HPUnEINJUV7UF?&(s8m@2R%K20v)X0OlIC7KezRh?h%ec)_rK5GxX5`mz zJk)Jokg1D3(4KG-5?20|#EXBt<+leU{8bQb#1YXyLv2|nq9uH()diojn3(;t%F6!g z0<|oh3hVscKwSQQRLB7(f`1Cq-}{E#^R){PgXcXO3?0{_vW|)uo+?+^oEN z22!X9)aM00?uRp!VP@V%w$Qr%iN{7m32CmJF0F^&hQ-{3$5Vm zh_bTG>;;}uIDyG5Vk8b)MIOuLFAb85tCZ4RW;6MOXCMn;H{muq1Q$Tbv7T^R`k3t+ z7e*{12Dy8e>`eT4v56+Wig)J$qFU#&j} z91p%w$O1AJBxJHGPL(>hYbloHuN@+=XY3|w5Bb=!k^s6?c3JjtZ6jmj2ZLi{=SNw{ zJioTwoCs-&r_+y1@tad=bt^z^jbD&heakRwrj z%ZHz1jj^Jl0*ll3FVwBVLd8nLr|6Nozd=i8x$WjX$vCP2td8A%oI|GnM<6J%x`(^H m{I}KrVa5OR{&yA(n{cBOB`z`BIu;ZaM_EB#zEajA^nU=J - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From 946781d5780eb4afb14c5d580fab6ac54f6a9cc7 Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 26 Oct 2019 01:34:51 +0900 Subject: [PATCH 089/112] ENH: groupby missing data in index (#28097) --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/core/indexes/multi.py | 5 ++++- pandas/tests/groupby/test_grouping.py | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5514265f37f05..97fba5a8e1f38 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -401,6 +401,8 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- +- Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue: `15584`). - Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue: `19248`). 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 74dbcd4067ec0..dc2abfb0cb6eb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1308,7 +1308,10 @@ def _get_grouper_for_level(self, mapper, level): # Remove unobserved levels from level_index level_index = level_index.take(uniques) - grouper = level_index.take(codes) + if len(level_index): + grouper = level_index.take(codes) + else: + grouper = level_index.take(codes, fill_value=True) return grouper, codes, level_index diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 403f5f11ee768..ab25d183ae3ff 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -628,6 +628,24 @@ def test_groupby_empty(self): # check name assert s.groupby(s).grouper.names == ["name"] + def test_groupby_level_index_value_all_na(self): + # issue 20519 + df = DataFrame( + [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"] + ).set_index(["A", "B"]) + result = df.groupby(level=["A", "B"]).sum() + expected = DataFrame( + data=[], + index=MultiIndex( + levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + codes=[[], []], + names=["A", "B"], + ), + columns=["C"], + dtype="int64", + ) + tm.assert_frame_equal(result, expected) + # get_group # -------------------------------- From 291afc360fbb41214be2ae4d0d170ffb748183b4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 25 Oct 2019 09:54:34 -0700 Subject: [PATCH 090/112] TST: Add test for nested JSON normalization (#29225) --- pandas/tests/io/json/test_normalize.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 3ceddfc3c1db4..aa4f522ef45ba 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -457,6 +457,14 @@ def test_max_level_with_records_path(self, max_level, expected): expected_df = DataFrame(data=expected, columns=result.columns.values) tm.assert_equal(expected_df, result) + def test_nested_flattening_consistent(self): + # see gh-21537 + df1 = json_normalize([{"A": {"B": 1}}]) + df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy") + + # They should be the same. + tm.assert_frame_equal(df1, df2) + class TestNestedToRecord: def test_flat_stays_flat(self): From a23a58960ce6171df40d69b3e3745f0dd884fe5f Mon Sep 17 00:00:00 2001 From: "Jasper J.F. 
van den Bosch" Date: Fri, 25 Oct 2019 18:01:29 +0100 Subject: [PATCH 091/112] BUG: fixes formatted value error for missing sheet (#27676) (#27677) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/excel/_odfreader.py | 2 +- pandas/tests/io/excel/test_odf.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 97fba5a8e1f38..4007ecd5a9412 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -382,6 +382,7 @@ I/O - Bug in :func:`read_hdf` closing stores that it didn't open when Exceptions are raised (:issue:`28699`) - Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) +- Bug in :meth:`DataFrame.read_excel` with ``engine='ods'`` when ``sheet_name`` argument references a non-existent sheet (:issue:`27676`) - Bug in :meth:`pandas.io.formats.style.Styler` formatting for floating values not displaying decimals correctly (:issue:`13257`) - Bug in :meth:`DataFrame.to_html` when using ``formatters=`` and ``max_cols`` together. (:issue:`25955`) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 66a186161e01b..3a67f8306fff1 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -61,7 +61,7 @@ def get_sheet_by_name(self, name: str): if table.getAttribute("name") == name: return table - raise ValueError("sheet {name} not found".format(name)) + raise ValueError("sheet {} not found".format(name)) def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: """Parse an ODF Table into a list of lists diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index 76871eddf1cee..47e610562a388 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -36,3 +36,11 @@ def test_read_writer_table(): result = pd.read_excel("writertable.odt", "Table1", index_col=0) tm.assert_frame_equal(result, expected) + + +def test_nonexistent_sheetname_raises(read_ext): + # GH-27676 + # Specifying a non-existent sheet_name parameter should throw an error + # with the sheet name. 
+ with pytest.raises(ValueError, match="sheet xyz not found"): + pd.read_excel("blank.ods", sheet_name="xyz") From 8f9769c68e07ac3c3f5a3368affee278bf028b14 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Oct 2019 10:10:32 -0700 Subject: [PATCH 092/112] CLN: AttributeError in _wrap_applied_output (#29195) --- pandas/core/base.py | 4 ---- pandas/core/groupby/generic.py | 19 ++++++++++++++----- pandas/core/groupby/ops.py | 10 ++++++++-- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 5ae3926952a67..9586d49c555ff 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -571,8 +571,6 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): except (TypeError, DataError): pass - except SpecificationError: - raise else: results.append(new_res) @@ -591,8 +589,6 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis): except ValueError: # cannot aggregate continue - except SpecificationError: - raise else: results.append(new_res) keys.append(col) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 695823e29ef1b..647eb5a2daa28 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -480,7 +480,7 @@ def _transform_fast(self, func, func_nm): out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) - def filter(self, func, dropna=True, *args, **kwargs): # noqa + def filter(self, func, dropna=True, *args, **kwargs): """ Return a copy of a Series excluding elements from groups that do not satisfy the boolean criterion specified by func. @@ -1228,7 +1228,7 @@ def first_not_none(values): return self._concat_objects(keys, values, not_indexed_same=True) try: - if self.axis == 0: + if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, # then propagate that name to the result. index = v.index.copy() @@ -1264,15 +1264,24 @@ def first_not_none(values): axis=self.axis, ).unstack() result.columns = index - else: + elif isinstance(v, ABCSeries): stacked_values = np.vstack([np.asarray(v) for v in values]) result = DataFrame( stacked_values.T, index=v.index, columns=key_index ) + else: + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? we used to do this + # after raising AttributeError above + return Series( + values, index=key_index, name=self._selection_name + ) - except (ValueError, AttributeError): + except ValueError: + # TODO: not reached in tests; is this still needed? # GH1738: values is list of arrays of unequal lengths fall - # through to the outer else caluse + # through to the outer else clause return Series(values, index=key_index, name=self._selection_name) # if we have date/time like in the original, then coerce dates diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index fbe1598767736..79b51ef57cd37 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -199,6 +199,9 @@ def apply(self, f, data, axis=0): f_name not in base.plotting_methods and hasattr(splitter, "fast_apply") and axis == 0 + # with MultiIndex, apply_frame_axis0 would raise InvalidApply + # TODO: can we make this check prettier? 
+ and not splitter._get_sorted_data().index._has_complex_internals ): try: result_values, mutated = splitter.fast_apply(f, group_keys) @@ -208,11 +211,14 @@ def apply(self, f, data, axis=0): if len(result_values) == len(group_keys): return group_keys, result_values, mutated - except libreduction.InvalidApply: + except libreduction.InvalidApply as err: # Cannot fast apply on MultiIndex (_has_complex_internals). # This Exception is also raised if `f` triggers an exception # but it is preferable to raise the exception in Python. - pass + if "Let this error raise above us" not in str(err): + # TODO: can we infer anything about whether this is + # worth-retrying in pure-python? + raise except TypeError as err: if "Cannot convert" in str(err): # via apply_frame_axis0 if we pass a non-ndarray From adfd4decae8557266887564e6696cbb7a76ba73d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Oct 2019 10:49:33 -0700 Subject: [PATCH 093/112] CLN: simplify core.algorithms (#29199) --- pandas/core/algorithms.py | 77 ++++++++++++++----------------- pandas/core/arrays/categorical.py | 14 ++---- pandas/core/dtypes/cast.py | 11 ++--- pandas/core/sorting.py | 4 +- 4 files changed, 42 insertions(+), 64 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7760c8ec397a9..8f72245b1f4eb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -47,7 +47,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com -from pandas.core.construction import array +from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices _shared_docs = {} # type: Dict[str, str] @@ -82,9 +82,12 @@ def _ensure_data(values, dtype=None): """ # we check some simple dtypes first + if is_object_dtype(dtype): + return ensure_object(np.asarray(values)), "object", "object" + elif is_object_dtype(values) and dtype is None: + return ensure_object(np.asarray(values)), "object", "object" + try: - if is_object_dtype(dtype): - return ensure_object(np.asarray(values)), "object", "object" if is_bool_dtype(values) or is_bool_dtype(dtype): # we are actually coercing to uint64 # until our algos support uint8 directly (see TODO) @@ -95,8 +98,6 @@ def _ensure_data(values, dtype=None): return ensure_uint64(values), "uint64", "uint64" elif is_float_dtype(values) or is_float_dtype(dtype): return ensure_float64(values), "float64", "float64" - elif is_object_dtype(values) and dtype is None: - return ensure_object(np.asarray(values)), "object", "object" elif is_complex_dtype(values) or is_complex_dtype(dtype): # ignore the fact that we are casting to float @@ -207,11 +208,11 @@ def _ensure_arraylike(values): _hashtables = { - "float64": (htable.Float64HashTable, htable.Float64Vector), - "uint64": (htable.UInt64HashTable, htable.UInt64Vector), - "int64": (htable.Int64HashTable, htable.Int64Vector), - "string": (htable.StringHashTable, htable.ObjectVector), - "object": (htable.PyObjectHashTable, htable.ObjectVector), + "float64": htable.Float64HashTable, + "uint64": htable.UInt64HashTable, + "int64": htable.Int64HashTable, + "string": htable.StringHashTable, + "object": htable.PyObjectHashTable, } @@ -223,11 +224,9 @@ def _get_hashtable_algo(values): Returns ------- - tuples(hashtable class, - vector class, - values, - dtype, - ndtype) + htable : HashTable subclass + values : ndarray + dtype : str or dtype """ values, dtype, ndtype = _ensure_data(values) @@ -238,23 +237,21 @@ def _get_hashtable_algo(values): # 
StringHashTable and ObjectHashtable if lib.infer_dtype(values, skipna=False) in ["string"]: ndtype = "string" - else: - ndtype = "object" - htable, table = _hashtables[ndtype] - return (htable, table, values, dtype, ndtype) + htable = _hashtables[ndtype] + return htable, values, dtype def _get_values_for_rank(values): if is_categorical_dtype(values): values = values._values_for_rank() - values, dtype, ndtype = _ensure_data(values) - return values, dtype, ndtype + values, _, ndtype = _ensure_data(values) + return values, ndtype -def _get_data_algo(values, func_map): - values, dtype, ndtype = _get_values_for_rank(values) +def _get_data_algo(values): + values, ndtype = _get_values_for_rank(values) if ndtype == "object": @@ -264,7 +261,7 @@ def _get_data_algo(values, func_map): if lib.infer_dtype(values, skipna=False) in ["string"]: ndtype = "string" - f = func_map.get(ndtype, func_map["object"]) + f = _hashtables.get(ndtype, _hashtables["object"]) return f, values @@ -295,7 +292,7 @@ def match(to_match, values, na_sentinel=-1): match : ndarray of integers """ values = com.asarray_tuplesafe(values) - htable, _, values, dtype, ndtype = _get_hashtable_algo(values) + htable, values, dtype = _get_hashtable_algo(values) to_match, _, _ = _ensure_data(to_match, dtype) table = htable(min(len(to_match), 1000000)) table.map_locations(values) @@ -398,7 +395,7 @@ def unique(values): return values.unique() original = values - htable, _, values, dtype, ndtype = _get_hashtable_algo(values) + htable, values, _ = _get_hashtable_algo(values) table = htable(len(values)) uniques = table.unique(values) @@ -480,7 +477,8 @@ def isin(comps, values): def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): - """Factorize an array-like to labels and uniques. + """ + Factorize an array-like to labels and uniques. This doesn't do any coercion of types or unboxing before factorization. @@ -498,9 +496,10 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, na_value=None): Returns ------- - labels, uniques : ndarray + labels : ndarray + uniques : ndarray """ - (hash_klass, _), values = _get_data_algo(values, _hashtables) + hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) uniques, labels = table.factorize( @@ -652,17 +651,13 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): original = values if is_extension_array_dtype(values): - values = getattr(values, "_values", values) + values = extract_array(values) labels, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype, _ = _ensure_data(values) - if ( - is_datetime64_any_dtype(original) - or is_timedelta64_dtype(original) - or is_period_dtype(original) - ): + if original.dtype.kind in ["m", "M"]: na_value = na_value_for_dtype(original.dtype) else: na_value = None @@ -835,7 +830,7 @@ def duplicated(values, keep="first"): duplicated : ndarray """ - values, dtype, ndtype = _ensure_data(values) + values, _, ndtype = _ensure_data(values) f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) return f(values, keep=keep) @@ -872,7 +867,7 @@ def mode(values, dropna: bool = True): mask = values.isnull() values = values[~mask] - values, dtype, ndtype = _ensure_data(values) + values, _, ndtype = _ensure_data(values) f = getattr(htable, "mode_{dtype}".format(dtype=ndtype)) result = f(values, dropna=dropna) @@ -910,7 +905,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct (e.g. 
1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1). """ if values.ndim == 1: - values, _, _ = _get_values_for_rank(values) + values, _ = _get_values_for_rank(values) ranks = algos.rank_1d( values, ties_method=method, @@ -919,7 +914,7 @@ def rank(values, axis=0, method="average", na_option="keep", ascending=True, pct pct=pct, ) elif values.ndim == 2: - values, _, _ = _get_values_for_rank(values) + values, _ = _get_values_for_rank(values) ranks = algos.rank_2d( values, axis=axis, @@ -1634,9 +1629,7 @@ def take_nd( if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - if isinstance(arr, (ABCIndexClass, ABCSeries)): - arr = arr._values - + arr = extract_array(arr) arr = np.asarray(arr) if indexer is None: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 70ed411f6a3e4..4d065bd234e0b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -47,14 +47,7 @@ from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms -from pandas.core.algorithms import ( - _get_data_algo, - _hashtables, - factorize, - take, - take_1d, - unique1d, -) +from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array @@ -2097,7 +2090,6 @@ def __setitem__(self, key, value): """ Item assignment. - Raises ------ ValueError @@ -2631,8 +2623,8 @@ def _get_codes_for_values(values, categories): values = ensure_object(values) categories = ensure_object(categories) - (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) - (_, _), cats = _get_data_algo(categories, _hashtables) + hash_klass, vals = _get_data_algo(values) + _, cats = _get_data_algo(categories) t = hash_klass(len(cats)) t.map_locations(cats) return coerce_indexer_dtype(t.lookup(vals), cats) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7fcaf60088ad2..3e92906be706c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -21,7 +21,6 @@ ensure_str, is_bool, is_bool_dtype, - is_categorical_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, @@ -1325,14 +1324,10 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): np.ndarray / pandas type of length, filled with value """ - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - - subarr = DatetimeIndex([value] * length, dtype=dtype) - elif is_categorical_dtype(dtype): - from pandas import Categorical + if is_extension_array_dtype(dtype): + cls = dtype.construct_array_type() + subarr = cls._from_sequence([value] * length, dtype=dtype) - subarr = Categorical([value] * length, dtype=dtype) else: if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 94810369785d3..706f6159bcafe 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -484,9 +484,7 @@ def sort_mixed(values): if sorter is None: # mixed types - (hash_klass, _), values = algorithms._get_data_algo( - values, algorithms._hashtables - ) + hash_klass, values = algorithms._get_data_algo(values) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) From 58aa92b429126ea56a542f4588c0491912ce6cf0 Mon Sep 17 00:00:00 2001 
From: William Ayd Date: Fri, 25 Oct 2019 14:25:54 -0700 Subject: [PATCH 094/112] Update apt-get archives (#29230) --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 62c46b6970969..6fb8241d6d600 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -33,6 +33,7 @@ jobs: condition: true - script: | + sudo apt-get update sudo apt-get install -y libc6-dev-i386 ci/setup_env.sh displayName: 'Setup environment and build pandas' From 53cc89236c0455893478f8cffdb7b0679bf1f8aa Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 26 Oct 2019 01:28:59 -0700 Subject: [PATCH 095/112] Removed raise_from_traceback (#29174) --- pandas/compat/__init__.py | 10 -- pandas/core/frame.py | 9 +- pandas/core/internals/construction.py | 7 +- pandas/io/html.py | 4 +- pandas/io/sql.py | 7 +- pandas/tests/util/test_util.py | 29 ------ pandas/util/testing.py | 140 +------------------------- 7 files changed, 12 insertions(+), 194 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9c778f68727c6..81431db5b867c 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -37,16 +37,6 @@ def set_function_name(f, name, cls): return f -def raise_with_traceback(exc, traceback=Ellipsis): - """ - Raise exception with existing traceback. - If traceback is not passed, uses sys.exc_info() to get traceback. - """ - if traceback == Ellipsis: - _, _, traceback = sys.exc_info() - raise exc.with_traceback(traceback) - - # https://github.com/pandas-dev/pandas/pull/9123 def is_platform_little_endian(): """ am I little endian """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ef4e3e064d85e..23611055d6f01 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -34,7 +34,7 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib -from pandas.compat import PY36, raise_with_traceback +from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -485,7 +485,7 @@ def __init__( "DataFrame constructor called with " "incompatible data and dtype: {e}".format(e=e) ) - raise_with_traceback(exc) + raise exc from e if arr.ndim == 0 and index is not None and columns is not None: values = cast_scalar_to_array( @@ -7821,11 +7821,10 @@ def f(x): elif filter_type == "bool": data = self._get_bool_data() else: # pragma: no cover - e = NotImplementedError( + raise NotImplementedError( "Handling exception with filter_type {f} not" "implemented.".format(f=filter_type) - ) - raise_with_traceback(e) + ) from e with np.errstate(all="ignore"): result = f(data.values) labels = data._get_agg_axis(axis) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3126b9d9d3e2e..176f4acd113fe 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -9,7 +9,7 @@ from pandas._libs import lib import pandas.compat as compat -from pandas.compat import PY36, raise_with_traceback +from pandas.compat import PY36 from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -164,11 +164,10 @@ def init_ndarray(values, index, columns, dtype=None, copy=False): try: values = values.astype(dtype) except Exception as orig: - e = ValueError( + raise ValueError( "failed to cast to '{dtype}' (Exception " "was: {orig})".format(dtype=dtype, orig=orig) - ) - raise_with_traceback(e) + ) from orig index, columns = _get_axes(*values.shape, index=index, 
columns=columns) values = values.T diff --git a/pandas/io/html.py b/pandas/io/html.py index 490c574463b9b..7da7a819f81e8 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -9,7 +9,6 @@ import os import re -from pandas.compat import raise_with_traceback from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError, EmptyDataError @@ -889,7 +888,6 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here - # hack around python 3 deleting the exception variable retained = None for flav in flavor: parser = _parser_dispatch(flav) @@ -916,7 +914,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): else: break else: - raise_with_traceback(retained) + raise retained ret = [] for table in tables: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 822b3288c82d9..820aeaeb11649 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -12,7 +12,6 @@ import numpy as np import pandas._libs.lib as lib -from pandas.compat import raise_with_traceback from pandas.core.dtypes.common import is_datetime64tz_dtype, is_dict_like, is_list_like from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -1596,17 +1595,17 @@ def execute(self, *args, **kwargs): except Exception as exc: try: self.con.rollback() - except Exception: # pragma: no cover + except Exception as inner_exc: # pragma: no cover ex = DatabaseError( "Execution failed on sql: {sql}\n{exc}\nunable " "to rollback".format(sql=args[0], exc=exc) ) - raise_with_traceback(ex) + raise ex from inner_exc ex = DatabaseError( "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc) ) - raise_with_traceback(ex) + raise ex from exc @staticmethod def _query_iterator( diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 83d9be1ad235f..60124c8e943ad 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -1,10 +1,8 @@ import os -import sys import pytest import pandas.compat as compat -from pandas.compat import raise_with_traceback import pandas.util.testing as tm @@ -34,23 +32,6 @@ def test_numpy_err_state_is_default(): assert np.geterr() == expected -def test_raise_with_traceback(): - with pytest.raises(LookupError, match="error_text"): - try: - raise ValueError("THIS IS AN ERROR") - except ValueError: - e = LookupError("error_text") - raise_with_traceback(e) - - with pytest.raises(LookupError, match="error_text"): - try: - raise ValueError("This is another error") - except ValueError: - e = LookupError("error_text") - _, _, traceback = sys.exc_info() - raise_with_traceback(e, traceback) - - def test_convert_rows_list_to_csv_str(): rows_list = ["aaa", "bbb", "ccc"] ret = tm.convert_rows_list_to_csv_str(rows_list) @@ -70,16 +51,6 @@ def test_create_temp_directory(): assert not os.path.exists(path) -def test_assert_raises_regex_deprecated(): - # see gh-23592 - - with tm.assert_produces_warning(FutureWarning): - msg = "Not equal!" 
- - with tm.assert_raises_regex(AssertionError, msg): - assert 1 == 2, msg - - @pytest.mark.parametrize("strict_data_files", [True, False]) def test_datapath_missing(datapath): with pytest.raises(ValueError, match="Could not find file"): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 73535e55d4fa5..f3b0226547c78 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -5,7 +5,6 @@ from functools import wraps import gzip import os -import re from shutil import rmtree import string import tempfile @@ -23,7 +22,7 @@ ) import pandas._libs.testing as _testing -from pandas.compat import _get_lzma_file, _import_lzma, raise_with_traceback +from pandas.compat import _get_lzma_file, _import_lzma from pandas.core.dtypes.common import ( is_bool, @@ -2404,143 +2403,6 @@ def wrapper(*args, **kwargs): with_connectivity_check = network -def assert_raises_regex(_exception, _regexp, _callable=None, *args, **kwargs): - r""" - Check that the specified Exception is raised and that the error message - matches a given regular expression pattern. This may be a regular - expression object or a string containing a regular expression suitable - for use by `re.search()`. This is a port of the `assertRaisesRegexp` - function from unittest in Python 2.7. - - .. deprecated:: 0.24.0 - Use `pytest.raises` instead. - - Examples - -------- - >>> assert_raises_regex(ValueError, 'invalid literal for.*XYZ', int, 'XYZ') - >>> import re - >>> assert_raises_regex(ValueError, re.compile('literal'), int, 'XYZ') - - If an exception of a different type is raised, it bubbles up. - - >>> assert_raises_regex(TypeError, 'literal', int, 'XYZ') - Traceback (most recent call last): - ... - ValueError: invalid literal for int() with base 10: 'XYZ' - >>> dct = dict() - >>> assert_raises_regex(KeyError, 'pear', dct.__getitem__, 'apple') - Traceback (most recent call last): - ... - AssertionError: "pear" does not match "'apple'" - - You can also use this in a with statement. - - >>> with assert_raises_regex(TypeError, r'unsupported operand type\(s\)'): - ... 1 + {} - >>> with assert_raises_regex(TypeError, 'banana'): - ... 'apple'[0] = 'b' - Traceback (most recent call last): - ... - AssertionError: "banana" does not match "'str' object does not support \ -item assignment" - """ - warnings.warn( - ( - "assert_raises_regex has been deprecated and will " - "be removed in the next release. Please use " - "`pytest.raises` instead." - ), - FutureWarning, - stacklevel=2, - ) - - manager = _AssertRaisesContextmanager(exception=_exception, regexp=_regexp) - if _callable is not None: - with manager: - _callable(*args, **kwargs) - else: - return manager - - -class _AssertRaisesContextmanager: - """ - Context manager behind `assert_raises_regex`. - """ - - def __init__(self, exception, regexp=None): - """ - Initialize an _AssertRaisesContextManager instance. - - Parameters - ---------- - exception : class - The expected Exception class. - regexp : str, default None - The regex to compare against the Exception message. 
- """ - - self.exception = exception - - if regexp is not None and not hasattr(regexp, "search"): - regexp = re.compile(regexp, re.DOTALL) - - self.regexp = regexp - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, trace_back): - expected = self.exception - - if not exc_type: - exp_name = getattr(expected, "__name__", str(expected)) - raise AssertionError("{name} not raised.".format(name=exp_name)) - - return self.exception_matches(exc_type, exc_value, trace_back) - - def exception_matches(self, exc_type, exc_value, trace_back): - """ - Check that the Exception raised matches the expected Exception - and expected error message regular expression. - - Parameters - ---------- - exc_type : class - The type of Exception raised. - exc_value : Exception - The instance of `exc_type` raised. - trace_back : stack trace object - The traceback object associated with `exc_value`. - - Returns - ------- - is_matched : bool - Whether or not the Exception raised matches the expected - Exception class and expected error message regular expression. - - Raises - ------ - AssertionError : The error message provided does not match - the expected error message regular expression. - """ - - if issubclass(exc_type, self.exception): - if self.regexp is not None: - val = str(exc_value) - - if not self.regexp.search(val): - msg = '"{pat}" does not match "{val}"'.format( - pat=self.regexp.pattern, val=val - ) - e = AssertionError(msg) - raise_with_traceback(e, trace_back) - - return True - else: - # Failed, so allow Exception to bubble up. - return False - - @contextmanager def assert_produces_warning( expected_warning=Warning, From 846bf915ae1892171b19574be8ef98951fe12cf5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 26 Oct 2019 10:40:44 +0100 Subject: [PATCH 096/112] CLN: remove simplejson (#29169) --- pandas/io/msgpack/__init__.py | 2 +- pandas/io/msgpack/_packer.pyx | 1 - pandas/io/msgpack/_unpacker.pyx | 3 +-- pandas/util/_print_versions.py | 6 +----- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py index 7107263c180cb..11407c8282660 100644 --- a/pandas/io/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -48,7 +48,7 @@ def packb(o, **kwargs): return Packer(**kwargs).pack(o) -# alias for compatibility to simplejson/marshal/pickle. +# alias for compatibility to json/marshal/pickle. load = unpack loads = unpackb diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index 19307e2334f1e..aa71c5cc39667 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -60,7 +60,6 @@ cdef class Packer: :param callable default: Convert user type to builtin type that Packer supports. - See also simplejson's document. :param str encoding: Convert unicode to bytes with this encoding. (default: 'utf-8') :param str unicode_errors: diff --git a/pandas/io/msgpack/_unpacker.pyx b/pandas/io/msgpack/_unpacker.pyx index d7ebb194ef5c5..cf9b2c7c04d42 100644 --- a/pandas/io/msgpack/_unpacker.pyx +++ b/pandas/io/msgpack/_unpacker.pyx @@ -200,11 +200,10 @@ cdef class Unpacker: :param callable object_hook: When specified, it should be callable. Unpacker calls it with a dict argument after unpacking msgpack map. - (See also simplejson) :param callable object_pairs_hook: When specified, it should be callable. Unpacker calls it with a list - of key-value pairs after unpacking msgpack map. (See also simplejson) + of key-value pairs after unpacking msgpack map. 
:param str encoding: Encoding used for decoding msgpack raw. diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py index 25795859d8018..289a32c51a916 100644 --- a/pandas/util/_print_versions.py +++ b/pandas/util/_print_versions.py @@ -1,4 +1,5 @@ import codecs +import json import locale import os import platform @@ -105,11 +106,6 @@ def show_versions(as_json=False): deps_blob.append((modname, ver)) if as_json: - try: - import json - except ImportError: - import simplejson as json - j = dict(system=dict(sys_info), dependencies=dict(deps_blob)) if as_json is True: From 0d69e54321b01f885cbddf39de635d942441c9fc Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Mon, 28 Oct 2019 11:40:12 -0700 Subject: [PATCH 097/112] BENCH: Improve perf of rolling.Apply.time_rolling (#29239) --- asv_bench/benchmarks/rolling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 493f96d46d5e7..7a72622fd5fe3 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -28,15 +28,15 @@ def peakmem_rolling(self, constructor, window, dtype, method): class Apply: params = ( ["DataFrame", "Series"], - [10, 1000], + [3, 300], ["int", "float"], [sum, np.sum, lambda x: np.sum(x) + 5], [True, False], ) - param_names = ["contructor", "window", "dtype", "function", "raw"] + param_names = ["constructor", "window", "dtype", "function", "raw"] def setup(self, constructor, window, dtype, function, raw): - N = 10 ** 5 + N = 10 ** 3 arr = (100 * np.random.random(N)).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) From c1f19f28d4a52276907dc6dd5f7d77522f32f332 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 28 Oct 2019 11:40:56 -0700 Subject: [PATCH 098/112] CLN: remove algorithms.match (#29249) --- pandas/core/algorithms.py | 45 +++----------------------------------- pandas/tests/test_algos.py | 35 ----------------------------- 2 files changed, 3 insertions(+), 77 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8f72245b1f4eb..5139fdfeeb916 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -226,9 +226,8 @@ def _get_hashtable_algo(values): ------- htable : HashTable subclass values : ndarray - dtype : str or dtype """ - values, dtype, ndtype = _ensure_data(values) + values, _, ndtype = _ensure_data(values) if ndtype == "object": @@ -239,7 +238,7 @@ def _get_hashtable_algo(values): ndtype = "string" htable = _hashtables[ndtype] - return htable, values, dtype + return htable, values def _get_values_for_rank(values): @@ -271,44 +270,6 @@ def _get_data_algo(values): # --------------- # -def match(to_match, values, na_sentinel=-1): - """ - Compute locations of to_match into values - - Parameters - ---------- - to_match : array-like - values to find positions of - values : array-like - Unique set of values - na_sentinel : int, default -1 - Value to mark "not found" - - Examples - -------- - - Returns - ------- - match : ndarray of integers - """ - values = com.asarray_tuplesafe(values) - htable, values, dtype = _get_hashtable_algo(values) - to_match, _, _ = _ensure_data(to_match, dtype) - table = htable(min(len(to_match), 1000000)) - table.map_locations(values) - result = table.lookup(to_match) - - if na_sentinel != -1: - # replace but return a numpy array - # use a Series because it handles dtype conversions properly - from pandas import Series - - result = Series(result.ravel()).replace(-1, 
na_sentinel) - result = result.values.reshape(result.shape) - - return result - - def unique(values): """ Hash table-based unique. Uniques are returned in order @@ -395,7 +356,7 @@ def unique(values): return values.unique() original = values - htable, values, _ = _get_hashtable_algo(values) + htable, values = _get_hashtable_algo(values) table = htable(len(values)) uniques = table.unique(values) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 48cfc06f42e91..738afaea4b532 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -31,41 +31,6 @@ from pandas.util.testing import assert_almost_equal -class TestMatch: - def test_ints(self): - values = np.array([0, 2, 1]) - to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0]) - - result = algos.match(to_match, values) - expected = np.array([0, 2, 1, 1, 0, 2, -1, 0], dtype=np.int64) - tm.assert_numpy_array_equal(result, expected) - - result = Series(algos.match(to_match, values, np.nan)) - expected = Series(np.array([0, 2, 1, 1, 0, 2, np.nan, 0])) - tm.assert_series_equal(result, expected) - - s = Series(np.arange(5), dtype=np.float32) - result = algos.match(s, [2, 4]) - expected = np.array([-1, -1, 0, -1, 1], dtype=np.int64) - tm.assert_numpy_array_equal(result, expected) - - result = Series(algos.match(s, [2, 4], np.nan)) - expected = Series(np.array([np.nan, np.nan, 0, np.nan, 1])) - tm.assert_series_equal(result, expected) - - def test_strings(self): - values = ["foo", "bar", "baz"] - to_match = ["bar", "foo", "qux", "foo", "bar", "baz", "qux"] - - result = algos.match(to_match, values) - expected = np.array([1, 0, -1, 0, 1, 2, -1], dtype=np.int64) - tm.assert_numpy_array_equal(result, expected) - - result = Series(algos.match(to_match, values, np.nan)) - expected = Series(np.array([1, 0, np.nan, 0, 1, 2, np.nan])) - tm.assert_series_equal(result, expected) - - class TestFactorize: def test_basic(self): From c006074ae8f3b304df4f600d53c4b3006e1349a7 Mon Sep 17 00:00:00 2001 From: Scott Cole Date: Mon, 28 Oct 2019 12:56:52 -0700 Subject: [PATCH 099/112] CLN: fix mypy errors in pandas\tests\indexes\interval\test_base.py #28926 (#28961) --- pandas/tests/indexes/common.py | 3 ++- setup.cfg | 21 --------------------- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index b657d8d16df81..1ac6370860ba6 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,4 +1,5 @@ import gc +from typing import Optional, Type import numpy as np import pytest @@ -30,7 +31,7 @@ class Base: """ base class for index sub-class tests """ - _holder = None + _holder = None # type: Optional[Type[Index]] _compat_props = ["shape", "ndim", "size", "nbytes"] def test_pickle_compat_construction(self): diff --git a/setup.cfg b/setup.cfg index f7920fb61b942..d4657100c1291 100644 --- a/setup.cfg +++ b/setup.cfg @@ -148,33 +148,12 @@ ignore_errors=True [mypy-pandas.tests.extension.json.test_json] ignore_errors=True -[mypy-pandas.tests.indexes.datetimes.test_datetimelike] -ignore_errors=True - -[mypy-pandas.tests.indexes.interval.test_base] -ignore_errors=True - [mypy-pandas.tests.indexes.interval.test_interval_tree] ignore_errors=True -[mypy-pandas.tests.indexes.period.test_period] -ignore_errors=True - [mypy-pandas.tests.indexes.test_base] ignore_errors=True -[mypy-pandas.tests.indexes.test_category] -ignore_errors=True - -[mypy-pandas.tests.indexes.test_numeric] -ignore_errors=True - -[mypy-pandas.tests.indexes.test_range] 
-ignore_errors=True - -[mypy-pandas.tests.indexes.timedeltas.test_timedelta] -ignore_errors=True - [mypy-pandas.tests.indexing.test_loc] ignore_errors=True From 8c143b643c3921d100b73787720740871a796175 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Mon, 28 Oct 2019 19:26:48 -0500 Subject: [PATCH 100/112] Update setup.cfg --- setup.cfg | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index d4657100c1291..bfb08a26f32ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -136,9 +136,6 @@ ignore_errors=True [mypy-pandas.tests.arithmetic.test_datetime64] ignore_errors=True -[mypy-pandas.tests.dtypes.test_common] -ignore_errors=True - [mypy-pandas.tests.extension.decimal.test_decimal] ignore_errors=True From 9708036d1fa18bf29277e9d14a1df89aa7d9590c Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Thu, 31 Oct 2019 21:43:00 -0500 Subject: [PATCH 101/112] Update test_common.py --- pandas/tests/dtypes/test_common.py | 51 ++++++++++++++++++------------ 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index cb9572d6a1c0d..7b0f52268ead5 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -23,6 +23,9 @@ ) import pandas.util.testing as tm +from typing import TYPE_CHECKING, List, Union, AnyStr +from pandas._typing import Dtype + # EA & Actual Dtypes def to_ea_dtypes(dtypes): @@ -322,15 +325,17 @@ def test_is_datetimelike(): assert com.is_datetimelike(s) +integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype]] + + @pytest.mark.parametrize( "dtype", - [ - pd.Series([1, 2]), - *ALL_INT_DTYPES, - *to_numpy_dtypes(ALL_INT_DTYPES), - *ALL_EA_INT_DTYPES, - *to_ea_dtypes(ALL_EA_INT_DTYPES), - ], + integer_dtypes + + pd.Series([1, 2]) + + ALL_INT_DTYPES + + to_numpy_dtypes(ALL_INT_DTYPES) + + ALL_EA_INT_DTYPES + + to_ea_dtypes(ALL_EA_INT_DTYPES), ) def test_is_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @@ -352,15 +357,17 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) +signed_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype]] + + @pytest.mark.parametrize( "dtype", - [ - pd.Series([1, 2]), - *SIGNED_INT_DTYPES, - *to_numpy_dtypes(SIGNED_INT_DTYPES), - *SIGNED_EA_INT_DTYPES, - *to_ea_dtypes(SIGNED_EA_INT_DTYPES), - ], + signed_integer_dtypes + + pd.Series([1, 2]) + + SIGNED_INT_DTYPES + + to_numpy_dtypes(SIGNED_INT_DTYPES) + + SIGNED_EA_INT_DTYPES + + to_ea_dtypes(SIGNED_EA_INT_DTYPES), ) def test_is_signed_integer_dtype(dtype): assert com.is_integer_dtype(dtype) @@ -386,15 +393,17 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) +unsigned_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype]] + + @pytest.mark.parametrize( "dtype", - [ - pd.Series([1, 2], dtype=np.uint32), - *UNSIGNED_INT_DTYPES, - *to_numpy_dtypes(UNSIGNED_INT_DTYPES), - *UNSIGNED_EA_INT_DTYPES, - *to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), - ], + unsigned_integer_dtypes + + pd.Series([1, 2], dtype=np.uint32) + + UNSIGNED_INT_DTYPES + + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + + UNSIGNED_EA_INT_DTYPES + + to_ea_dtypes(UNSIGNED_EA_INT_DTYPES), ) def test_is_unsigned_integer_dtype(dtype): assert com.is_unsigned_integer_dtype(dtype) From 84fe5d0348fa218ac22728d80a19225eb198bf19 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Thu, 31 Oct 2019 21:45:30 -0500 Subject: [PATCH 102/112] Applied black formatting to test_common.py --- pandas/tests/dtypes/test_common.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 7b0f52268ead5..e80500588ea4a 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -362,7 +362,7 @@ def test_is_not_integer_dtype(dtype): @pytest.mark.parametrize( "dtype", - signed_integer_dtypes + signed_integer_dtypes + pd.Series([1, 2]) + SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) From e92a6eb787751c2e6be50e4dc1ec4abc2da08c75 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Thu, 31 Oct 2019 23:19:18 -0500 Subject: [PATCH 103/112] Update test_common.py Trying to fix Azure Pipelines --- pandas/tests/dtypes/test_common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e80500588ea4a..8b7735c3fceff 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -331,7 +331,7 @@ def test_is_datetimelike(): @pytest.mark.parametrize( "dtype", integer_dtypes - + pd.Series([1, 2]) + + [pd.Series([1, 2])] + ALL_INT_DTYPES + to_numpy_dtypes(ALL_INT_DTYPES) + ALL_EA_INT_DTYPES @@ -363,7 +363,7 @@ def test_is_not_integer_dtype(dtype): @pytest.mark.parametrize( "dtype", signed_integer_dtypes - + pd.Series([1, 2]) + + [pd.Series([1, 2])] + SIGNED_INT_DTYPES + to_numpy_dtypes(SIGNED_INT_DTYPES) + SIGNED_EA_INT_DTYPES @@ -399,7 +399,7 @@ def test_is_not_signed_integer_dtype(dtype): @pytest.mark.parametrize( "dtype", unsigned_integer_dtypes - + pd.Series([1, 2], dtype=np.uint32) + + [pd.Series([1, 2], dtype=np.uint32)] + UNSIGNED_INT_DTYPES + to_numpy_dtypes(UNSIGNED_INT_DTYPES) + UNSIGNED_EA_INT_DTYPES From ca180a61d9aa3f51cfc2a7f873614cd744c5fa7d Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Thu, 31 Oct 2019 23:55:52 -0500 Subject: [PATCH 104/112] Update test_common.py Trying to fix Azure Pipelines again --- pandas/tests/dtypes/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8b7735c3fceff..e3b9393477603 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -23,7 +23,7 @@ ) import pandas.util.testing as tm -from typing import TYPE_CHECKING, List, Union, AnyStr +from typing import List, Union from pandas._typing import Dtype From d21b9b2f06465f2d0465334ec04342a3d092d569 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Fri, 1 Nov 2019 00:24:24 -0500 Subject: [PATCH 105/112] Update test_common.py --- pandas/tests/dtypes/test_common.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e3b9393477603..bef15374de7f7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,3 +1,5 @@ +from typing import Any, List, Union + import numpy as np import pytest @@ -13,6 +15,7 @@ ) import pandas as pd +from pandas._typing import Dtype, ExtensionDtype from pandas.conftest import ( ALL_EA_INT_DTYPES, ALL_INT_DTYPES, @@ -23,9 +26,6 @@ ) import pandas.util.testing as tm -from typing import List, Union -from pandas._typing import Dtype - # EA & Actual Dtypes def to_ea_dtypes(dtypes): @@ -325,7 +325,7 @@ def test_is_datetimelike(): assert com.is_datetimelike(s) -integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype]] +integer_dtypes = [] # type: List[Union[Series, str, str, Any, ExtensionDtype] @pytest.mark.parametrize( @@ -393,7 
+393,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype]] +unsigned_integer_dtypes = [] # type: List[Union[Series, str, str, Any, ExtensionDtype] @pytest.mark.parametrize( From 0dc0cc54ba09f265f022c6837d8032b08f08abd0 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Sat, 2 Nov 2019 17:44:35 -0500 Subject: [PATCH 106/112] Updating type annotation in test_common.py --- pandas/tests/dtypes/test_common.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index bef15374de7f7..771352c81a33e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,4 +1,4 @@ -from typing import Any, List, Union +from typing import TYPE_CHECKING, List, Union import numpy as np import pytest @@ -10,12 +10,13 @@ CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, + ExtensionDtype, IntervalDtype, PeriodDtype, ) import pandas as pd -from pandas._typing import Dtype, ExtensionDtype +from pandas._typing import Dtype from pandas.conftest import ( ALL_EA_INT_DTYPES, ALL_INT_DTYPES, @@ -325,7 +326,7 @@ def test_is_datetimelike(): assert com.is_datetimelike(s) -integer_dtypes = [] # type: List[Union[Series, str, str, Any, ExtensionDtype] +integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, ExtensionDtype]] @pytest.mark.parametrize( @@ -357,7 +358,7 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype]] +signed_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, ExtensionDtype]] @pytest.mark.parametrize( @@ -393,7 +394,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = [] # type: List[Union[Series, str, str, Any, ExtensionDtype] +unsigned_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, ExtensionDtype]] @pytest.mark.parametrize( From 05f3b16fb26d3d7fa874b387436c3136a64cc356 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Sat, 2 Nov 2019 17:46:48 -0500 Subject: [PATCH 107/112] Remove unused import from test_common.py --- pandas/tests/dtypes/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 771352c81a33e..4511996ccfeeb 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, List, Union +from typing import List, Union import numpy as np import pytest From 8d21fe0045592586cf135b7781cbffbc81a633a8 Mon Sep 17 00:00:00 2001 From: Blake Date: Sat, 2 Nov 2019 23:37:56 -0500 Subject: [PATCH 108/112] Trying to fix Azure Pipelines --- pandas/tests/dtypes/test_common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 4511996ccfeeb..e8ef237ea045b 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List, Union, Any import numpy as np import pytest @@ -326,7 +326,7 @@ def test_is_datetimelike(): assert com.is_datetimelike(s) -integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, ExtensionDtype]] +integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, Any, ExtensionDtype]] 
@pytest.mark.parametrize( @@ -358,7 +358,7 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, ExtensionDtype]] +signed_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, Any, ExtensionDtype]] @@ -394,7 +394,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, ExtensionDtype]] +unsigned_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, Any, ExtensionDtype]] From a3eea24245049a41907bd5bffaa34a5a4be7905d Mon Sep 17 00:00:00 2001 From: Blake Date: Sun, 3 Nov 2019 14:39:43 -0600 Subject: [PATCH 109/112] Replacing List with Sequence and removing Any --- pandas/tests/dtypes/test_common.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e8ef237ea045b..ed97e2fcad152 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,4 +1,4 @@ -from typing import List, Union, Any +from typing import Sequence, Union import numpy as np import pytest @@ -326,7 +326,7 @@ def test_is_datetimelike(): assert com.is_datetimelike(s) -integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, Any, ExtensionDtype]] +integer_dtypes = [] # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] @@ -358,7 +358,7 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, Any, ExtensionDtype]] +signed_integer_dtypes = [] # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] @@ -394,7 +394,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = [] # type: List[Union[pd.Series, str, Dtype, Any, ExtensionDtype]] +unsigned_integer_dtypes = [] # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] From d01d5bbcaa9edffdf1bf6cdf369b46769ae5f745 Mon Sep 17 00:00:00 2001 From: Blake Date: Sun, 3 Nov 2019 14:47:38 -0600 Subject: [PATCH 110/112] Applying black formatting --- pandas/tests/dtypes/test_common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ed97e2fcad152..4f6e61d8e73a6 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -358,7 +358,9 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes = [] # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] +signed_integer_dtypes = ( + [] +) # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] @pytest.mark.parametrize( @@ -394,7 +396,9 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = [] # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] +unsigned_integer_dtypes = ( + [] +) # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] @pytest.mark.parametrize( From 3f3755364d1aa07a6123afd872947e4542e22cff Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Tue, 5 Nov 2019 18:00:52 -0600 Subject: [PATCH 111/112] Replacing type stubs with List --- pandas/tests/dtypes/test_common.py | 13 ++++--------- 1 file changed,
4 insertions(+), 9 deletions(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 4f6e61d8e73a6..24f727f9a804e 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,4 +1,4 @@ -from typing import Sequence, Union +from typing import List import numpy as np import pytest @@ -16,7 +16,6 @@ ) import pandas as pd -from pandas._typing import Dtype from pandas.conftest import ( ALL_EA_INT_DTYPES, ALL_INT_DTYPES, @@ -326,7 +325,7 @@ def test_is_datetimelike(): assert com.is_datetimelike(s) -integer_dtypes = [] # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] +integer_dtypes = [] # type: List @pytest.mark.parametrize( @@ -358,9 +357,7 @@ def test_is_not_integer_dtype(dtype): assert not com.is_integer_dtype(dtype) -signed_integer_dtypes = ( - [] -) # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] +signed_integer_dtypes = [] # type: List @pytest.mark.parametrize( @@ -396,9 +393,7 @@ def test_is_not_signed_integer_dtype(dtype): assert not com.is_signed_integer_dtype(dtype) -unsigned_integer_dtypes = ( - [] -) # type: Sequence[Union[pd.Series, str, Dtype, ExtensionDtype]] +unsigned_integer_dtypes = [] # type: List @pytest.mark.parametrize( From 3ba0b50339ac7dfd9c78a457e2aeaa1753b66de7 Mon Sep 17 00:00:00 2001 From: Blake Hawkins Date: Tue, 5 Nov 2019 18:31:08 -0600 Subject: [PATCH 112/112] Remove unused import from test_common.py --- pandas/tests/dtypes/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 24f727f9a804e..894d6a40280b7 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -10,7 +10,6 @@ CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, - ExtensionDtype, IntervalDtype, PeriodDtype, )
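
---

Notes on selected patches above. These are illustrative Python sketches, not part of any diff; any name or value that does not appear in a patch is an assumption.

On PATCH 089: grouping on a MultiIndex level whose values are all NaN used to raise, because after unobserved levels are removed the level_index for that level is empty and a plain take() cannot service the -1 codes; the fix routes through take(codes, fill_value=True). A usage sketch mirroring the new regression test (the exact printed output may vary by pandas version):

    import numpy as np
    import pandas as pd

    # Level "B" is entirely NaN, so its pruned level_index is empty and
    # every code for it is -1; fill_value=True fills those with NaN
    # instead of raising on the empty index.
    df = pd.DataFrame(
        [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
    ).set_index(["A", "B"])

    result = df.groupby(level=["A", "B"]).sum()
    print(result)  # empty frame: all-NaN group keys are dropped, columns=["C"]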
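The patched branch itself can be exercised directly; this touches internal Index behavior, so treat it as a sketch that may differ across versions:

    import numpy as np
    import pandas as pd

    level_index = pd.Index([], dtype="float64")
    codes = np.array([-1, -1], dtype=np.intp)

    # Without a fill value, a non-empty take from an empty index raises;
    # passing fill_value=True enables the allow_fill path, so the -1
    # codes become the index's NA value (NaN here).
    grouper = level_index.take(codes, fill_value=True)
    print(grouper)  # Index of NaN values, one per code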
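On PATCH 091: the one-character change in _odfreader.py works because "{name}" is a named replacement field, so str.format must receive the value as a keyword argument; passing it positionally raises KeyError('name') instead of producing the intended message. Plain Python, nothing pandas-specific:

    name = "xyz"

    # Buggy form from before the patch: named field, positional argument.
    try:
        "sheet {name} not found".format(name)
    except KeyError as err:
        print("format() raised", repr(err))  # KeyError('name')

    # The fix uses a positional field; the keyword spelling also works.
    print("sheet {} not found".format(name))
    print("sheet {name} not found".format(name=name))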
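On PATCH 095: raise_with_traceback was a Python-2 holdover; native "raise ... from ..." chaining sets __cause__ on the new exception, and the printed traceback shows both errors joined by "The above exception was the direct cause of the following exception". A minimal sketch:

    def parse_int(raw):
        try:
            return int(raw)
        except ValueError as err:
            # Chain the original error instead of discarding its traceback.
            raise RuntimeError("failed to parse {!r}".format(raw)) from err

    try:
        parse_int("XYZ")
    except RuntimeError as exc:
        assert isinstance(exc.__cause__, ValueError)  # original preserved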
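The same patch deletes the long-deprecated assert_raises_regex helper; pytest.raises with match= is the replacement, asserting the exception type and re.search()-ing the message in one step:

    import pytest

    def test_invalid_literal():
        # Equivalent to the deleted
        # assert_raises_regex(ValueError, "invalid literal for.*XYZ", int, "XYZ")
        with pytest.raises(ValueError, match="invalid literal for.*XYZ"):
            int("XYZ")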
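On PATCH 098: algorithms.match is removed without a named replacement in the diff; Index.get_indexer is the public equivalent, returning for each element of to_match its position in the unique values, with -1 for misses. A sketch reusing the deleted TestMatch fixtures:

    import numpy as np
    import pandas as pd

    values = np.array([0, 2, 1])
    to_match = np.array([0, 1, 2, 2, 0, 1, 3, 0])

    # Same positions the removed algorithms.match(to_match, values) produced.
    result = pd.Index(values).get_indexer(to_match)
    np.testing.assert_array_equal(result, [0, 2, 1, 1, 0, 2, -1, 0])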
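On PATCHES 101 through 112: the "# type:" trailers being shuffled around are mypy's comment-style annotations, which the pandas codebase still used at the time instead of PEP 526 variable annotations. Seeding each parametrize list with an annotated empty list types it as List (that is, List[Any]), so a later concatenation like integer_dtypes + [pd.Series([1, 2])] + ALL_INT_DTYPES type-checks despite the mixed element types. The two equivalent spellings:

    from typing import List

    # Comment style, as committed in test_common.py: works on any Python
    # version mypy supports and types the empty seed list as List[Any].
    integer_dtypes = []  # type: List

    # PEP 526 variable annotation, the modern spelling (Python 3.6+).
    signed_integer_dtypes: List = []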